From f4f5a015b654cd3825b3d50086223a5f37148212 Mon Sep 17 00:00:00 2001 From: Anton Zabaznov Date: Fri, 25 Sep 2020 21:38:18 +0300 Subject: [PATCH] OpenCL C 3.0 patch update: define predefined macros in header if they are not defined by FE compiler --- patches/clang/0006-OpenCL-3.0-support.patch | 1208 +++++------------ ...-cl_khr_extended_subgroup-extensions.patch | 10 +- 2 files changed, 355 insertions(+), 863 deletions(-) diff --git a/patches/clang/0006-OpenCL-3.0-support.patch b/patches/clang/0006-OpenCL-3.0-support.patch index 9cec90f2..ef2ee438 100644 --- a/patches/clang/0006-OpenCL-3.0-support.patch +++ b/patches/clang/0006-OpenCL-3.0-support.patch @@ -1,7 +1,7 @@ -From d91e758930a7e59d29525659b5b698c6e9456cee Mon Sep 17 00:00:00 2001 +From bd852341c8f89af12bcf9c160bb4699193cac986 Mon Sep 17 00:00:00 2001 From: Anton Zabaznov Date: Thu, 24 Sep 2020 00:12:24 +0300 -Subject: [PATCH] OpenCL 3.0 support +Subject: [PATCH 1/2] OpenCL 3.0 support --- include/clang/Basic/Builtins.def | 67 +- @@ -21,8 +21,7 @@ Subject: [PATCH] OpenCL 3.0 support lib/CodeGen/CodeGenFunction.cpp | 6 +- lib/Frontend/CompilerInvocation.cpp | 7 +- lib/Frontend/InitPreprocessor.cpp | 8 +- - lib/Headers/opencl-c-base.h | 578 +++ - lib/Headers/opencl-c.h | 3358 ++++++++++++++--- + lib/Headers/opencl-c.h | 3422 ++++++++++++++--- lib/Parse/ParseDecl.cpp | 9 +- lib/Parse/ParsePragma.cpp | 10 +- lib/Sema/Sema.cpp | 47 +- @@ -67,8 +66,7 @@ Subject: [PATCH] OpenCL 3.0 support .../SemaOpenCL/forget-unsupported-builtins.cl | 23 + test/SemaOpenCL/invalid-pipe-builtin-cl2.0.cl | 1 + test/SemaOpenCL/storageclass-cl20.cl | 1 + - 63 files changed, 4129 insertions(+), 722 deletions(-) - create mode 100644 lib/Headers/opencl-c-base.h + 62 files changed, 3614 insertions(+), 723 deletions(-) create mode 100644 test/CodeGenOpenCL/generic-address-space-feature.cl create mode 100644 test/Sema/feature-extensions-simult-support.cl create mode 100644 test/Sema/features-ignore-pragma.cl @@ -776,595 +774,89 @@ index 4cde22ce9a..6b3f75cb1a 100644 Builder.defineMacro(#Ext); #include "clang/Basic/OpenCLExtensions.def" -diff --git a/lib/Headers/opencl-c-base.h b/lib/Headers/opencl-c-base.h -new file mode 100644 -index 0000000000..d81cbdb8a7 ---- /dev/null -+++ b/lib/Headers/opencl-c-base.h -@@ -0,0 +1,578 @@ -+//===----- opencl-c-base.h - OpenCL C language base definitions -----------===// -+// -+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -+// See https://llvm.org/LICENSE.txt for license information. -+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -+// -+//===----------------------------------------------------------------------===// -+ -+#ifndef _OPENCL_BASE_H_ -+#define _OPENCL_BASE_H_ -+ -+// built-in scalar data types: -+ -+/** -+ * An unsigned 8-bit integer. -+ */ -+typedef unsigned char uchar; -+ -+/** -+ * An unsigned 16-bit integer. -+ */ -+typedef unsigned short ushort; -+ -+/** -+ * An unsigned 32-bit integer. -+ */ -+typedef unsigned int uint; -+ -+/** -+ * An unsigned 64-bit integer. -+ */ -+typedef unsigned long ulong; -+ -+/** -+ * The unsigned integer type of the result of the sizeof operator. This -+ * is a 32-bit unsigned integer if CL_DEVICE_ADDRESS_BITS -+ * defined in table 4.3 is 32-bits and is a 64-bit unsigned integer if -+ * CL_DEVICE_ADDRESS_BITS is 64-bits. -+ */ -+typedef __SIZE_TYPE__ size_t; -+ -+/** -+ * A signed integer type that is the result of subtracting two pointers. -+ * This is a 32-bit signed integer if CL_DEVICE_ADDRESS_BITS -+ * defined in table 4.3 is 32-bits and is a 64-bit signed integer if -+ * CL_DEVICE_ADDRESS_BITS is 64-bits. -+ */ -+typedef __PTRDIFF_TYPE__ ptrdiff_t; -+ -+/** -+ * A signed integer type with the property that any valid pointer to -+ * void can be converted to this type, then converted back to pointer -+ * to void, and the result will compare equal to the original pointer. -+ */ -+typedef __INTPTR_TYPE__ intptr_t; -+ -+/** -+ * An unsigned integer type with the property that any valid pointer to -+ * void can be converted to this type, then converted back to pointer -+ * to void, and the result will compare equal to the original pointer. -+ */ -+typedef __UINTPTR_TYPE__ uintptr_t; -+ -+// built-in vector data types: -+typedef char char2 __attribute__((ext_vector_type(2))); -+typedef char char3 __attribute__((ext_vector_type(3))); -+typedef char char4 __attribute__((ext_vector_type(4))); -+typedef char char8 __attribute__((ext_vector_type(8))); -+typedef char char16 __attribute__((ext_vector_type(16))); -+typedef uchar uchar2 __attribute__((ext_vector_type(2))); -+typedef uchar uchar3 __attribute__((ext_vector_type(3))); -+typedef uchar uchar4 __attribute__((ext_vector_type(4))); -+typedef uchar uchar8 __attribute__((ext_vector_type(8))); -+typedef uchar uchar16 __attribute__((ext_vector_type(16))); -+typedef short short2 __attribute__((ext_vector_type(2))); -+typedef short short3 __attribute__((ext_vector_type(3))); -+typedef short short4 __attribute__((ext_vector_type(4))); -+typedef short short8 __attribute__((ext_vector_type(8))); -+typedef short short16 __attribute__((ext_vector_type(16))); -+typedef ushort ushort2 __attribute__((ext_vector_type(2))); -+typedef ushort ushort3 __attribute__((ext_vector_type(3))); -+typedef ushort ushort4 __attribute__((ext_vector_type(4))); -+typedef ushort ushort8 __attribute__((ext_vector_type(8))); -+typedef ushort ushort16 __attribute__((ext_vector_type(16))); -+typedef int int2 __attribute__((ext_vector_type(2))); -+typedef int int3 __attribute__((ext_vector_type(3))); -+typedef int int4 __attribute__((ext_vector_type(4))); -+typedef int int8 __attribute__((ext_vector_type(8))); -+typedef int int16 __attribute__((ext_vector_type(16))); -+typedef uint uint2 __attribute__((ext_vector_type(2))); -+typedef uint uint3 __attribute__((ext_vector_type(3))); -+typedef uint uint4 __attribute__((ext_vector_type(4))); -+typedef uint uint8 __attribute__((ext_vector_type(8))); -+typedef uint uint16 __attribute__((ext_vector_type(16))); -+typedef long long2 __attribute__((ext_vector_type(2))); -+typedef long long3 __attribute__((ext_vector_type(3))); -+typedef long long4 __attribute__((ext_vector_type(4))); -+typedef long long8 __attribute__((ext_vector_type(8))); -+typedef long long16 __attribute__((ext_vector_type(16))); -+typedef ulong ulong2 __attribute__((ext_vector_type(2))); -+typedef ulong ulong3 __attribute__((ext_vector_type(3))); -+typedef ulong ulong4 __attribute__((ext_vector_type(4))); -+typedef ulong ulong8 __attribute__((ext_vector_type(8))); -+typedef ulong ulong16 __attribute__((ext_vector_type(16))); -+typedef float float2 __attribute__((ext_vector_type(2))); -+typedef float float3 __attribute__((ext_vector_type(3))); -+typedef float float4 __attribute__((ext_vector_type(4))); -+typedef float float8 __attribute__((ext_vector_type(8))); -+typedef float float16 __attribute__((ext_vector_type(16))); -+#ifdef cl_khr_fp16 -+#pragma OPENCL EXTENSION cl_khr_fp16 : enable -+typedef half half2 __attribute__((ext_vector_type(2))); -+typedef half half3 __attribute__((ext_vector_type(3))); -+typedef half half4 __attribute__((ext_vector_type(4))); -+typedef half half8 __attribute__((ext_vector_type(8))); -+typedef half half16 __attribute__((ext_vector_type(16))); -+#endif -+#if defined(cl_khr_fp64) || defined(__opencl_c_fp64) -+#if __OPENCL_C_VERSION__ < CL_VERSION_1_2 -+#pragma OPENCL EXTENSION cl_khr_fp64 : enable +diff --git a/lib/Headers/opencl-c.h b/lib/Headers/opencl-c.h +index 514c710c11..7def52945a 100644 +--- a/lib/Headers/opencl-c.h ++++ b/lib/Headers/opencl-c.h +@@ -10,6 +10,63 @@ + #ifndef _OPENCL_H_ + #define _OPENCL_H_ + ++ ++// Add predefined macros to build headers with standalone executable ++#ifndef CL_VERSION_3_0 ++ #define CL_VERSION_3_0 300 +#endif -+typedef double double2 __attribute__((ext_vector_type(2))); -+typedef double double3 __attribute__((ext_vector_type(3))); -+typedef double double4 __attribute__((ext_vector_type(4))); -+typedef double double8 __attribute__((ext_vector_type(8))); -+typedef double double16 __attribute__((ext_vector_type(16))); ++#ifndef __OPENCL_MEMORY_SCOPE_ALL_DEVICES ++ #define __OPENCL_MEMORY_SCOPE_ALL_DEVICES 5 +#endif + -+#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) -+#define NULL ((void*)0) ++// Define features for 2.0 for header backward compatibility ++#ifndef __opencl_c_int64 ++ #define __opencl_c_int64 1 +#endif -+ -+/** -+ * Value of maximum non-infinite single-precision floating-point -+ * number. -+ */ -+#define MAXFLOAT 0x1.fffffep127f -+ -+/** -+ * A positive float constant expression. HUGE_VALF evaluates -+ * to +infinity. Used as an error value returned by the built-in -+ * math functions. -+ */ -+#define HUGE_VALF (__builtin_huge_valf()) -+ -+/** -+ * A positive double constant expression. HUGE_VAL evaluates -+ * to +infinity. Used as an error value returned by the built-in -+ * math functions. -+ */ -+#define HUGE_VAL (__builtin_huge_val()) -+ -+/** -+ * A constant expression of type float representing positive or -+ * unsigned infinity. -+ */ -+#define INFINITY (__builtin_inff()) -+ -+/** -+ * A constant expression of type float representing a quiet NaN. -+ */ -+#define NAN as_float(INT_MAX) -+ -+#define FP_ILOGB0 INT_MIN -+#define FP_ILOGBNAN INT_MAX -+ -+#define FLT_DIG 6 -+#define FLT_MANT_DIG 24 -+#define FLT_MAX_10_EXP +38 -+#define FLT_MAX_EXP +128 -+#define FLT_MIN_10_EXP -37 -+#define FLT_MIN_EXP -125 -+#define FLT_RADIX 2 -+#define FLT_MAX 0x1.fffffep127f -+#define FLT_MIN 0x1.0p-126f -+#define FLT_EPSILON 0x1.0p-23f -+ -+#define M_E_F 2.71828182845904523536028747135266250f -+#define M_LOG2E_F 1.44269504088896340735992468100189214f -+#define M_LOG10E_F 0.434294481903251827651128918916605082f -+#define M_LN2_F 0.693147180559945309417232121458176568f -+#define M_LN10_F 2.30258509299404568401799145468436421f -+#define M_PI_F 3.14159265358979323846264338327950288f -+#define M_PI_2_F 1.57079632679489661923132169163975144f -+#define M_PI_4_F 0.785398163397448309615660845819875721f -+#define M_1_PI_F 0.318309886183790671537767526745028724f -+#define M_2_PI_F 0.636619772367581343075535053490057448f -+#define M_2_SQRTPI_F 1.12837916709551257389615890312154517f -+#define M_SQRT2_F 1.41421356237309504880168872420969808f -+#define M_SQRT1_2_F 0.707106781186547524400844362104849039f -+ -+#define DBL_DIG 15 -+#define DBL_MANT_DIG 53 -+#define DBL_MAX_10_EXP +308 -+#define DBL_MAX_EXP +1024 -+#define DBL_MIN_10_EXP -307 -+#define DBL_MIN_EXP -1021 -+#define DBL_RADIX 2 -+#define DBL_MAX 0x1.fffffffffffffp1023 -+#define DBL_MIN 0x1.0p-1022 -+#define DBL_EPSILON 0x1.0p-52 -+ -+#define M_E 0x1.5bf0a8b145769p+1 -+#define M_LOG2E 0x1.71547652b82fep+0 -+#define M_LOG10E 0x1.bcb7b1526e50ep-2 -+#define M_LN2 0x1.62e42fefa39efp-1 -+#define M_LN10 0x1.26bb1bbb55516p+1 -+#define M_PI 0x1.921fb54442d18p+1 -+#define M_PI_2 0x1.921fb54442d18p+0 -+#define M_PI_4 0x1.921fb54442d18p-1 -+#define M_1_PI 0x1.45f306dc9c883p-2 -+#define M_2_PI 0x1.45f306dc9c883p-1 -+#define M_2_SQRTPI 0x1.20dd750429b6dp+0 -+#define M_SQRT2 0x1.6a09e667f3bcdp+0 -+#define M_SQRT1_2 0x1.6a09e667f3bcdp-1 -+ -+#ifdef cl_khr_fp16 -+ -+#define HALF_DIG 3 -+#define HALF_MANT_DIG 11 -+#define HALF_MAX_10_EXP +4 -+#define HALF_MAX_EXP +16 -+#define HALF_MIN_10_EXP -4 -+#define HALF_MIN_EXP -13 -+#define HALF_RADIX 2 -+#define HALF_MAX ((0x1.ffcp15h)) -+#define HALF_MIN ((0x1.0p-14h)) -+#define HALF_EPSILON ((0x1.0p-10h)) -+ -+#define M_E_H 2.71828182845904523536028747135266250h -+#define M_LOG2E_H 1.44269504088896340735992468100189214h -+#define M_LOG10E_H 0.434294481903251827651128918916605082h -+#define M_LN2_H 0.693147180559945309417232121458176568h -+#define M_LN10_H 2.30258509299404568401799145468436421h -+#define M_PI_H 3.14159265358979323846264338327950288h -+#define M_PI_2_H 1.57079632679489661923132169163975144h -+#define M_PI_4_H 0.785398163397448309615660845819875721h -+#define M_1_PI_H 0.318309886183790671537767526745028724h -+#define M_2_PI_H 0.636619772367581343075535053490057448h -+#define M_2_SQRTPI_H 1.12837916709551257389615890312154517h -+#define M_SQRT2_H 1.41421356237309504880168872420969808h -+#define M_SQRT1_2_H 0.707106781186547524400844362104849039h -+ -+#endif //cl_khr_fp16 -+ -+#define CHAR_BIT 8 -+#define SCHAR_MAX 127 -+#define SCHAR_MIN (-128) -+#define UCHAR_MAX 255 -+#define CHAR_MAX SCHAR_MAX -+#define CHAR_MIN SCHAR_MIN -+#define USHRT_MAX 65535 -+#define SHRT_MAX 32767 -+#define SHRT_MIN (-32768) -+#define UINT_MAX 0xffffffff -+#define INT_MAX 2147483647 -+#define INT_MIN (-2147483647-1) -+#define ULONG_MAX 0xffffffffffffffffUL -+#define LONG_MAX 0x7fffffffffffffffL -+#define LONG_MIN (-0x7fffffffffffffffL-1) -+ -+// OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8 - Synchronization Functions -+ -+// Flag type and values for barrier, mem_fence, read_mem_fence, write_mem_fence -+typedef uint cl_mem_fence_flags; -+ -+/** -+ * Queue a memory fence to ensure correct -+ * ordering of memory operations to local memory -+ */ -+#define CLK_LOCAL_MEM_FENCE 0x01 -+ -+/** -+ * Queue a memory fence to ensure correct -+ * ordering of memory operations to global memory -+ */ -+#define CLK_GLOBAL_MEM_FENCE 0x02 -+ -+#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) -+ -+typedef enum memory_scope { -+ memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, -+ memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, -+#ifdef __opencl_c_atomic_scope_device -+ memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, ++#if __OPENCL_C_VERSION__ != CL_VERSION_3_0 ++ #ifndef __opencl_c_images ++ #define __opencl_c_images 1 ++ #endif +#endif -+#ifdef __opencl_c_atomic_scope_all_devices -+ memory_scope_all_devices = __OPENCL_MEMORY_SCOPE_ALL_DEVICES, -+ memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, ++#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ == CL_VERSION_2_0) ++#ifndef __opencl_c_pipes ++ #define __opencl_c_pipes 1 +#endif -+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) || \ -+ defined(__opencl_c_subgroups) -+ memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP ++#ifndef __opencl_c_generic_address_space ++ #define __opencl_c_generic_address_space 1 +#endif -+} memory_scope; -+ -+/** -+ * Queue a memory fence to ensure correct ordering of memory -+ * operations between work-items of a work-group to -+ * image memory. -+ */ -+#define CLK_IMAGE_MEM_FENCE 0x04 -+ -+#ifndef ATOMIC_VAR_INIT -+#define ATOMIC_VAR_INIT(x) (x) -+#endif //ATOMIC_VAR_INIT -+#define ATOMIC_FLAG_INIT 0 -+ -+// enum values aligned with what clang uses in EmitAtomicExpr() -+typedef enum memory_order { -+ memory_order_relaxed = __ATOMIC_RELAXED, -+ memory_order_acquire = __ATOMIC_ACQUIRE, -+ memory_order_release = __ATOMIC_RELEASE, -+ memory_order_acq_rel = __ATOMIC_ACQ_REL, -+#ifdef __opencl_c_atomic_order_seq_cst -+ memory_order_seq_cst = __ATOMIC_SEQ_CST -+#endif //__opencl_c_atomic_order_seq_cst -+} memory_order; -+ -+#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) -+ -+// OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions -+ -+// These values need to match the runtime equivalent -+// -+// Addressing Mode. -+// -+#define CLK_ADDRESS_NONE 0 -+#define CLK_ADDRESS_CLAMP_TO_EDGE 2 -+#define CLK_ADDRESS_CLAMP 4 -+#define CLK_ADDRESS_REPEAT 6 -+#define CLK_ADDRESS_MIRRORED_REPEAT 8 -+ -+// -+// Coordination Normalization -+// -+#define CLK_NORMALIZED_COORDS_FALSE 0 -+#define CLK_NORMALIZED_COORDS_TRUE 1 -+ -+// -+// Filtering Mode. -+// -+#define CLK_FILTER_NEAREST 0x10 -+#define CLK_FILTER_LINEAR 0x20 -+ -+#ifdef cl_khr_gl_msaa_sharing -+#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable -+#endif //cl_khr_gl_msaa_sharing -+ -+// -+// Channel Datatype. -+// -+#define CLK_SNORM_INT8 0x10D0 -+#define CLK_SNORM_INT16 0x10D1 -+#define CLK_UNORM_INT8 0x10D2 -+#define CLK_UNORM_INT16 0x10D3 -+#define CLK_UNORM_SHORT_565 0x10D4 -+#define CLK_UNORM_SHORT_555 0x10D5 -+#define CLK_UNORM_INT_101010 0x10D6 -+#define CLK_SIGNED_INT8 0x10D7 -+#define CLK_SIGNED_INT16 0x10D8 -+#define CLK_SIGNED_INT32 0x10D9 -+#define CLK_UNSIGNED_INT8 0x10DA -+#define CLK_UNSIGNED_INT16 0x10DB -+#define CLK_UNSIGNED_INT32 0x10DC -+#define CLK_HALF_FLOAT 0x10DD -+#define CLK_FLOAT 0x10DE -+#define CLK_UNORM_INT24 0x10DF -+ -+// Channel order, numbering must be aligned with cl_channel_order in cl.h -+// -+#define CLK_R 0x10B0 -+#define CLK_A 0x10B1 -+#define CLK_RG 0x10B2 -+#define CLK_RA 0x10B3 -+#define CLK_RGB 0x10B4 -+#define CLK_RGBA 0x10B5 -+#define CLK_BGRA 0x10B6 -+#define CLK_ARGB 0x10B7 -+#define CLK_INTENSITY 0x10B8 -+#define CLK_LUMINANCE 0x10B9 -+#define CLK_Rx 0x10BA -+#define CLK_RGx 0x10BB -+#define CLK_RGBx 0x10BC -+#define CLK_DEPTH 0x10BD -+#define CLK_DEPTH_STENCIL 0x10BE -+#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -+#define CLK_sRGB 0x10BF -+#define CLK_sRGBx 0x10C0 -+#define CLK_sRGBA 0x10C1 -+#define CLK_sBGRA 0x10C2 -+#define CLK_ABGR 0x10C3 -+#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 -+ -+// OpenCL v2.0 s6.13.16 - Pipe Functions -+#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) -+#define CLK_NULL_RESERVE_ID (__builtin_astype(((void*)(__SIZE_MAX__)), reserve_id_t)) ++#ifndef __opencl_c_work_group_collective_functions ++ #define __opencl_c_work_group_collective_functions 1 ++#endif ++#ifndef __opencl_c_atomic_order_acq_rel ++ #define __opencl_c_atomic_order_acq_rel 1 ++#endif ++#ifndef __opencl_c_atomic_order_seq_cst ++ #define __opencl_c_atomic_order_seq_cst 1 ++#endif ++#ifndef __opencl_c_atomic_scope_device ++ #define __opencl_c_atomic_scope_device 1 ++#endif ++#ifndef __opencl_c_atomic_scope_all_devices ++ #define __opencl_c_atomic_scope_all_devices 1 ++#endif ++#ifndef __opencl_c_subgroups ++ #define __opencl_c_subgroups 1 ++#endif ++#ifndef __opencl_c_3d_image_writes ++ #define __opencl_c_3d_image_writes 1 ++#endif ++#ifndef __opencl_c_device_enqueue ++ #define __opencl_c_device_enqueue 1 ++#endif ++#ifndef __opencl_c_read_write_images ++ #define __opencl_c_read_write_images 1 ++#endif ++#ifndef __opencl_c_program_scope_global_variables ++ #define __opencl_c_program_scope_global_variables 1 ++#endif ++#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ == CL_VERSION_2_0) + -+// OpenCL v2.0 s6.13.17 - Enqueue Kernels -+#define CL_COMPLETE 0x0 -+#define CL_RUNNING 0x1 -+#define CL_SUBMITTED 0x2 -+#define CL_QUEUED 0x3 -+ -+#define CLK_SUCCESS 0 -+#define CLK_ENQUEUE_FAILURE -101 -+#define CLK_INVALID_QUEUE -102 -+#define CLK_INVALID_NDRANGE -160 -+#define CLK_INVALID_EVENT_WAIT_LIST -57 -+#define CLK_DEVICE_QUEUE_FULL -161 -+#define CLK_INVALID_ARG_SIZE -51 -+#define CLK_EVENT_ALLOCATION_FAILURE -100 -+#define CLK_OUT_OF_RESOURCES -5 -+ -+#define CLK_NULL_QUEUE 0 -+#define CLK_NULL_EVENT (__builtin_astype(((void*)(__SIZE_MAX__)), clk_event_t)) -+ -+// execution model related definitions -+#define CLK_ENQUEUE_FLAGS_NO_WAIT 0x0 -+#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL 0x1 -+#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP 0x2 -+ -+typedef int kernel_enqueue_flags_t; -+typedef int clk_profiling_info; -+ -+// Profiling info name (see capture_event_profiling_info) -+#define CLK_PROFILING_COMMAND_EXEC_TIME 0x1 -+ -+#define MAX_WORK_DIM 3 -+ -+typedef struct { -+ unsigned int workDimension; -+ size_t globalWorkOffset[MAX_WORK_DIM]; -+ size_t globalWorkSize[MAX_WORK_DIM]; -+ size_t localWorkSize[MAX_WORK_DIM]; -+} ndrange_t; + #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 + #ifndef cl_khr_depth_images + #define cl_khr_depth_images +@@ -143,7 +200,12 @@ typedef half half4 __attribute__((ext_vector_type(4))); + typedef half half8 __attribute__((ext_vector_type(8))); + typedef half half16 __attribute__((ext_vector_type(16))); + #endif +-#ifdef cl_khr_fp64 ++#if defined(cl_khr_fp64) || defined(__opencl_c_fp64) + -+#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) ++#ifndef __opencl_c_fp64 ++ #define __opencl_c_fp64 1 ++#endif + -+#ifdef cl_intel_device_side_avc_motion_estimation -+#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin -+ -+#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0 -+#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1 -+#define CLK_AVC_ME_MAJOR_8x16_INTEL 0x2 -+#define CLK_AVC_ME_MAJOR_8x8_INTEL 0x3 -+ -+#define CLK_AVC_ME_MINOR_8x8_INTEL 0x0 -+#define CLK_AVC_ME_MINOR_8x4_INTEL 0x1 -+#define CLK_AVC_ME_MINOR_4x8_INTEL 0x2 -+#define CLK_AVC_ME_MINOR_4x4_INTEL 0x3 -+ -+#define CLK_AVC_ME_MAJOR_FORWARD_INTEL 0x0 -+#define CLK_AVC_ME_MAJOR_BACKWARD_INTEL 0x1 -+#define CLK_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2 -+ -+#define CLK_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0 -+#define CLK_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E -+#define CLK_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D -+#define CLK_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B -+#define CLK_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77 -+#define CLK_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F -+#define CLK_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F -+#define CLK_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F -+ -+#define CLK_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0 -+#define CLK_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1 -+#define CLK_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2 -+ -+#define CLK_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0 -+#define CLK_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1 -+#define CLK_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2 -+#define CLK_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3 -+#define CLK_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4 -+#define CLK_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5 -+#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6 -+#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7 -+#define CLK_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8 -+ -+#define CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 -+#define CLK_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2 -+ -+#define CLK_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 -+#define CLK_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 -+#define CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3 -+ -+#define CLK_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0 -+#define CLK_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1 -+#define CLK_AVC_ME_COST_PRECISION_PEL_INTEL 0x2 -+#define CLK_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3 -+ -+#define CLK_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10 -+#define CLK_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15 -+#define CLK_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20 -+#define CLK_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B -+#define CLK_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30 -+ -+#define CLK_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0 -+#define CLK_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2 -+#define CLK_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4 -+#define CLK_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8 -+ -+#define CLK_AVC_ME_INTRA_16x16_INTEL 0x0 -+#define CLK_AVC_ME_INTRA_8x8_INTEL 0x1 -+#define CLK_AVC_ME_INTRA_4x4_INTEL 0x2 -+ -+#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0 -+#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000 -+ -+#define CLK_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL (0x1 << 24) -+#define CLK_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL (0x2 << 24) -+#define CLK_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL (0x3 << 24) -+#define CLK_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL (0x55 << 24) -+#define CLK_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL (0xAA << 24) -+#define CLK_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL (0xFF << 24) -+#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL (0x1 << 24) -+#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL (0x2 << 24) -+#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL (0x1 << 26) -+#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL (0x2 << 26) -+#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL (0x1 << 28) -+#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL (0x2 << 28) -+#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL (0x1 << 30) -+#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL (0x2 << 30) -+ -+#define CLK_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00 -+#define CLK_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80 -+ -+#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL 0x0 -+#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6 -+#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5 -+#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3 -+ -+#define CLK_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60 -+#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10 -+#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8 -+#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4 -+ -+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 -+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 -+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 -+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 -+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 -+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 -+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 -+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 -+#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 -+#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 -+#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 -+#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 -+#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 -+ -+#define CLK_AVC_ME_FRAME_FORWARD_INTEL 0x1 -+#define CLK_AVC_ME_FRAME_BACKWARD_INTEL 0x2 -+#define CLK_AVC_ME_FRAME_DUAL_INTEL 0x3 -+ -+#define CLK_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0 -+#define CLK_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1 -+ -+#define CLK_AVC_ME_INITIALIZE_INTEL 0x0 -+ -+#define CLK_AVC_IME_PAYLOAD_INITIALIZE_INTEL 0x0 -+#define CLK_AVC_REF_PAYLOAD_INITIALIZE_INTEL 0x0 -+#define CLK_AVC_SIC_PAYLOAD_INITIALIZE_INTEL 0x0 -+ -+#define CLK_AVC_IME_RESULT_INITIALIZE_INTEL 0x0 -+#define CLK_AVC_REF_RESULT_INITIALIZE_INTEL 0x0 -+#define CLK_AVC_SIC_RESULT_INITIALIZE_INTEL 0x0 -+ -+#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0 -+#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0 -+#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0 -+#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0 -+ -+#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end -+#endif // cl_intel_device_side_avc_motion_estimation -+ -+#endif //_OPENCL_BASE_H_ -diff --git a/lib/Headers/opencl-c.h b/lib/Headers/opencl-c.h -index 514c710c11..9dcd10d54f 100644 ---- a/lib/Headers/opencl-c.h -+++ b/lib/Headers/opencl-c.h -@@ -4883,7 +4883,7 @@ float16 __ovld __cnfn convert_float16(float16); + #if __OPENCL_C_VERSION__ < CL_VERSION_1_2 + #pragma OPENCL EXTENSION cl_khr_fp64 : enable + #endif +@@ -4883,7 +4945,7 @@ float16 __ovld __cnfn convert_float16(float16); // Conversions with double data type parameters or return value. @@ -1373,7 +865,7 @@ index 514c710c11..9dcd10d54f 100644 char __ovld __cnfn convert_char(double); char __ovld __cnfn convert_char_rte(double); char __ovld __cnfn convert_char_rtn(double); -@@ -5703,7 +5703,7 @@ double16 __ovld __cnfn convert_double16_rtz(uchar16); +@@ -5703,7 +5765,7 @@ double16 __ovld __cnfn convert_double16_rtz(uchar16); double16 __ovld __cnfn convert_double16_rtz(uint16); double16 __ovld __cnfn convert_double16_rtz(ulong16); double16 __ovld __cnfn convert_double16_rtz(ushort16); @@ -1382,7 +874,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 // Convert half types to non-double types. -@@ -6521,7 +6521,7 @@ half16 __ovld __cnfn convert_half16_rtz(float16); +@@ -6521,7 +6583,7 @@ half16 __ovld __cnfn convert_half16_rtz(float16); half16 __ovld __cnfn convert_half16_rtz(half16); // Convert half types to double types. @@ -1391,7 +883,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld __cnfn convert_double(half); double __ovld __cnfn convert_double_rte(half); double __ovld __cnfn convert_double_rtp(half); -@@ -6584,7 +6584,7 @@ half16 __ovld __cnfn convert_half16_rte(double16); +@@ -6584,7 +6646,7 @@ half16 __ovld __cnfn convert_half16_rte(double16); half16 __ovld __cnfn convert_half16_rtp(double16); half16 __ovld __cnfn convert_half16_rtn(double16); half16 __ovld __cnfn convert_half16_rtz(double16); @@ -1400,7 +892,7 @@ index 514c710c11..9dcd10d54f 100644 #endif // cl_khr_fp16 -@@ -6655,14 +6655,14 @@ half16 __ovld __cnfn convert_half16_rtz(double16); +@@ -6655,14 +6717,14 @@ half16 __ovld __cnfn convert_half16_rtz(double16); #define as_float8(x) __builtin_astype((x), float8) #define as_float16(x) __builtin_astype((x), float16) @@ -1417,7 +909,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 #define as_half(x) __builtin_astype((x), half) -@@ -6785,14 +6785,14 @@ float3 __ovld __cnfn acos(float3); +@@ -6785,14 +6847,14 @@ float3 __ovld __cnfn acos(float3); float4 __ovld __cnfn acos(float4); float8 __ovld __cnfn acos(float8); float16 __ovld __cnfn acos(float16); @@ -1434,7 +926,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn acos(half); half2 __ovld __cnfn acos(half2); -@@ -6811,14 +6811,14 @@ float3 __ovld __cnfn acosh(float3); +@@ -6811,14 +6873,14 @@ float3 __ovld __cnfn acosh(float3); float4 __ovld __cnfn acosh(float4); float8 __ovld __cnfn acosh(float8); float16 __ovld __cnfn acosh(float16); @@ -1451,7 +943,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn acosh(half); half2 __ovld __cnfn acosh(half2); -@@ -6837,14 +6837,14 @@ float3 __ovld __cnfn acospi(float3 x); +@@ -6837,14 +6899,14 @@ float3 __ovld __cnfn acospi(float3 x); float4 __ovld __cnfn acospi(float4 x); float8 __ovld __cnfn acospi(float8 x); float16 __ovld __cnfn acospi(float16 x); @@ -1468,7 +960,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn acospi(half x); half2 __ovld __cnfn acospi(half2 x); -@@ -6863,14 +6863,14 @@ float3 __ovld __cnfn asin(float3); +@@ -6863,14 +6925,14 @@ float3 __ovld __cnfn asin(float3); float4 __ovld __cnfn asin(float4); float8 __ovld __cnfn asin(float8); float16 __ovld __cnfn asin(float16); @@ -1485,7 +977,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn asin(half); half2 __ovld __cnfn asin(half2); -@@ -6889,14 +6889,14 @@ float3 __ovld __cnfn asinh(float3); +@@ -6889,14 +6951,14 @@ float3 __ovld __cnfn asinh(float3); float4 __ovld __cnfn asinh(float4); float8 __ovld __cnfn asinh(float8); float16 __ovld __cnfn asinh(float16); @@ -1502,7 +994,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn asinh(half); half2 __ovld __cnfn asinh(half2); -@@ -6915,14 +6915,14 @@ float3 __ovld __cnfn asinpi(float3 x); +@@ -6915,14 +6977,14 @@ float3 __ovld __cnfn asinpi(float3 x); float4 __ovld __cnfn asinpi(float4 x); float8 __ovld __cnfn asinpi(float8 x); float16 __ovld __cnfn asinpi(float16 x); @@ -1519,7 +1011,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn asinpi(half x); half2 __ovld __cnfn asinpi(half2 x); -@@ -6941,14 +6941,14 @@ float3 __ovld __cnfn atan(float3 y_over_x); +@@ -6941,14 +7003,14 @@ float3 __ovld __cnfn atan(float3 y_over_x); float4 __ovld __cnfn atan(float4 y_over_x); float8 __ovld __cnfn atan(float8 y_over_x); float16 __ovld __cnfn atan(float16 y_over_x); @@ -1536,7 +1028,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn atan(half y_over_x); half2 __ovld __cnfn atan(half2 y_over_x); -@@ -6967,14 +6967,14 @@ float3 __ovld __cnfn atan2(float3 y, float3 x); +@@ -6967,14 +7029,14 @@ float3 __ovld __cnfn atan2(float3 y, float3 x); float4 __ovld __cnfn atan2(float4 y, float4 x); float8 __ovld __cnfn atan2(float8 y, float8 x); float16 __ovld __cnfn atan2(float16 y, float16 x); @@ -1553,7 +1045,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn atan2(half y, half x); half2 __ovld __cnfn atan2(half2 y, half2 x); -@@ -6993,14 +6993,14 @@ float3 __ovld __cnfn atanh(float3); +@@ -6993,14 +7055,14 @@ float3 __ovld __cnfn atanh(float3); float4 __ovld __cnfn atanh(float4); float8 __ovld __cnfn atanh(float8); float16 __ovld __cnfn atanh(float16); @@ -1570,7 +1062,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn atanh(half); half2 __ovld __cnfn atanh(half2); -@@ -7019,14 +7019,14 @@ float3 __ovld __cnfn atanpi(float3 x); +@@ -7019,14 +7081,14 @@ float3 __ovld __cnfn atanpi(float3 x); float4 __ovld __cnfn atanpi(float4 x); float8 __ovld __cnfn atanpi(float8 x); float16 __ovld __cnfn atanpi(float16 x); @@ -1587,7 +1079,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn atanpi(half x); half2 __ovld __cnfn atanpi(half2 x); -@@ -7045,14 +7045,14 @@ float3 __ovld __cnfn atan2pi(float3 y, float3 x); +@@ -7045,14 +7107,14 @@ float3 __ovld __cnfn atan2pi(float3 y, float3 x); float4 __ovld __cnfn atan2pi(float4 y, float4 x); float8 __ovld __cnfn atan2pi(float8 y, float8 x); float16 __ovld __cnfn atan2pi(float16 y, float16 x); @@ -1604,7 +1096,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn atan2pi(half y, half x); half2 __ovld __cnfn atan2pi(half2 y, half2 x); -@@ -7071,14 +7071,14 @@ float3 __ovld __cnfn cbrt(float3); +@@ -7071,14 +7133,14 @@ float3 __ovld __cnfn cbrt(float3); float4 __ovld __cnfn cbrt(float4); float8 __ovld __cnfn cbrt(float8); float16 __ovld __cnfn cbrt(float16); @@ -1621,7 +1113,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn cbrt(half); half2 __ovld __cnfn cbrt(half2); -@@ -7098,14 +7098,14 @@ float3 __ovld __cnfn ceil(float3); +@@ -7098,14 +7160,14 @@ float3 __ovld __cnfn ceil(float3); float4 __ovld __cnfn ceil(float4); float8 __ovld __cnfn ceil(float8); float16 __ovld __cnfn ceil(float16); @@ -1638,7 +1130,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn ceil(half); half2 __ovld __cnfn ceil(half2); -@@ -7124,14 +7124,14 @@ float3 __ovld __cnfn copysign(float3 x, float3 y); +@@ -7124,14 +7186,14 @@ float3 __ovld __cnfn copysign(float3 x, float3 y); float4 __ovld __cnfn copysign(float4 x, float4 y); float8 __ovld __cnfn copysign(float8 x, float8 y); float16 __ovld __cnfn copysign(float16 x, float16 y); @@ -1655,7 +1147,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn copysign(half x, half y); half2 __ovld __cnfn copysign(half2 x, half2 y); -@@ -7150,14 +7150,14 @@ float3 __ovld __cnfn cos(float3); +@@ -7150,14 +7212,14 @@ float3 __ovld __cnfn cos(float3); float4 __ovld __cnfn cos(float4); float8 __ovld __cnfn cos(float8); float16 __ovld __cnfn cos(float16); @@ -1672,7 +1164,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn cos(half); half2 __ovld __cnfn cos(half2); -@@ -7176,14 +7176,14 @@ float3 __ovld __cnfn cosh(float3); +@@ -7176,14 +7238,14 @@ float3 __ovld __cnfn cosh(float3); float4 __ovld __cnfn cosh(float4); float8 __ovld __cnfn cosh(float8); float16 __ovld __cnfn cosh(float16); @@ -1689,7 +1181,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn cosh(half); half2 __ovld __cnfn cosh(half2); -@@ -7202,14 +7202,14 @@ float3 __ovld __cnfn cospi(float3 x); +@@ -7202,14 +7264,14 @@ float3 __ovld __cnfn cospi(float3 x); float4 __ovld __cnfn cospi(float4 x); float8 __ovld __cnfn cospi(float8 x); float16 __ovld __cnfn cospi(float16 x); @@ -1706,7 +1198,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn cospi(half x); half2 __ovld __cnfn cospi(half2 x); -@@ -7228,14 +7228,14 @@ float3 __ovld __cnfn erfc(float3); +@@ -7228,14 +7290,14 @@ float3 __ovld __cnfn erfc(float3); float4 __ovld __cnfn erfc(float4); float8 __ovld __cnfn erfc(float8); float16 __ovld __cnfn erfc(float16); @@ -1723,7 +1215,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn erfc(half); half2 __ovld __cnfn erfc(half2); -@@ -7255,14 +7255,14 @@ float3 __ovld __cnfn erf(float3); +@@ -7255,14 +7317,14 @@ float3 __ovld __cnfn erf(float3); float4 __ovld __cnfn erf(float4); float8 __ovld __cnfn erf(float8); float16 __ovld __cnfn erf(float16); @@ -1740,7 +1232,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn erf(half); half2 __ovld __cnfn erf(half2); -@@ -7281,14 +7281,14 @@ float3 __ovld __cnfn exp(float3 x); +@@ -7281,14 +7343,14 @@ float3 __ovld __cnfn exp(float3 x); float4 __ovld __cnfn exp(float4 x); float8 __ovld __cnfn exp(float8 x); float16 __ovld __cnfn exp(float16 x); @@ -1757,7 +1249,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn exp(half x); half2 __ovld __cnfn exp(half2 x); -@@ -7307,14 +7307,14 @@ float3 __ovld __cnfn exp2(float3); +@@ -7307,14 +7369,14 @@ float3 __ovld __cnfn exp2(float3); float4 __ovld __cnfn exp2(float4); float8 __ovld __cnfn exp2(float8); float16 __ovld __cnfn exp2(float16); @@ -1774,7 +1266,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn exp2(half); half2 __ovld __cnfn exp2(half2); -@@ -7333,14 +7333,14 @@ float3 __ovld __cnfn exp10(float3); +@@ -7333,14 +7395,14 @@ float3 __ovld __cnfn exp10(float3); float4 __ovld __cnfn exp10(float4); float8 __ovld __cnfn exp10(float8); float16 __ovld __cnfn exp10(float16); @@ -1791,7 +1283,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn exp10(half); half2 __ovld __cnfn exp10(half2); -@@ -7359,14 +7359,14 @@ float3 __ovld __cnfn expm1(float3 x); +@@ -7359,14 +7421,14 @@ float3 __ovld __cnfn expm1(float3 x); float4 __ovld __cnfn expm1(float4 x); float8 __ovld __cnfn expm1(float8 x); float16 __ovld __cnfn expm1(float16 x); @@ -1808,7 +1300,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn expm1(half x); half2 __ovld __cnfn expm1(half2 x); -@@ -7385,14 +7385,14 @@ float3 __ovld __cnfn fabs(float3); +@@ -7385,14 +7447,14 @@ float3 __ovld __cnfn fabs(float3); float4 __ovld __cnfn fabs(float4); float8 __ovld __cnfn fabs(float8); float16 __ovld __cnfn fabs(float16); @@ -1825,7 +1317,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn fabs(half); half2 __ovld __cnfn fabs(half2); -@@ -7411,14 +7411,14 @@ float3 __ovld __cnfn fdim(float3 x, float3 y); +@@ -7411,14 +7473,14 @@ float3 __ovld __cnfn fdim(float3 x, float3 y); float4 __ovld __cnfn fdim(float4 x, float4 y); float8 __ovld __cnfn fdim(float8 x, float8 y); float16 __ovld __cnfn fdim(float16 x, float16 y); @@ -1842,7 +1334,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn fdim(half x, half y); half2 __ovld __cnfn fdim(half2 x, half2 y); -@@ -7438,14 +7438,14 @@ float3 __ovld __cnfn floor(float3); +@@ -7438,14 +7500,14 @@ float3 __ovld __cnfn floor(float3); float4 __ovld __cnfn floor(float4); float8 __ovld __cnfn floor(float8); float16 __ovld __cnfn floor(float16); @@ -1859,7 +1351,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn floor(half); half2 __ovld __cnfn floor(half2); -@@ -7468,14 +7468,14 @@ float3 __ovld __cnfn fma(float3 a, float3 b, float3 c); +@@ -7468,14 +7530,14 @@ float3 __ovld __cnfn fma(float3 a, float3 b, float3 c); float4 __ovld __cnfn fma(float4 a, float4 b, float4 c); float8 __ovld __cnfn fma(float8 a, float8 b, float8 c); float16 __ovld __cnfn fma(float16 a, float16 b, float16 c); @@ -1876,7 +1368,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn fma(half a, half b, half c); half2 __ovld __cnfn fma(half2 a, half2 b, half2 c); -@@ -7502,7 +7502,7 @@ float3 __ovld __cnfn fmax(float3 x, float y); +@@ -7502,7 +7564,7 @@ float3 __ovld __cnfn fmax(float3 x, float y); float4 __ovld __cnfn fmax(float4 x, float y); float8 __ovld __cnfn fmax(float8 x, float y); float16 __ovld __cnfn fmax(float16 x, float y); @@ -1885,7 +1377,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld __cnfn fmax(double x, double y); double2 __ovld __cnfn fmax(double2 x, double2 y); double3 __ovld __cnfn fmax(double3 x, double3 y); -@@ -7514,7 +7514,7 @@ double3 __ovld __cnfn fmax(double3 x, double y); +@@ -7514,7 +7576,7 @@ double3 __ovld __cnfn fmax(double3 x, double y); double4 __ovld __cnfn fmax(double4 x, double y); double8 __ovld __cnfn fmax(double8 x, double y); double16 __ovld __cnfn fmax(double16 x, double y); @@ -1894,7 +1386,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn fmax(half x, half y); half2 __ovld __cnfn fmax(half2 x, half2 y); -@@ -7546,7 +7546,7 @@ float3 __ovld __cnfn fmin(float3 x, float y); +@@ -7546,7 +7608,7 @@ float3 __ovld __cnfn fmin(float3 x, float y); float4 __ovld __cnfn fmin(float4 x, float y); float8 __ovld __cnfn fmin(float8 x, float y); float16 __ovld __cnfn fmin(float16 x, float y); @@ -1903,7 +1395,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld __cnfn fmin(double x, double y); double2 __ovld __cnfn fmin(double2 x, double2 y); double3 __ovld __cnfn fmin(double3 x, double3 y); -@@ -7558,7 +7558,7 @@ double3 __ovld __cnfn fmin(double3 x, double y); +@@ -7558,7 +7620,7 @@ double3 __ovld __cnfn fmin(double3 x, double y); double4 __ovld __cnfn fmin(double4 x, double y); double8 __ovld __cnfn fmin(double8 x, double y); double16 __ovld __cnfn fmin(double16 x, double y); @@ -1912,7 +1404,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn fmin(half x, half y); half2 __ovld __cnfn fmin(half2 x, half2 y); -@@ -7582,14 +7582,14 @@ float3 __ovld __cnfn fmod(float3 x, float3 y); +@@ -7582,14 +7644,14 @@ float3 __ovld __cnfn fmod(float3 x, float3 y); float4 __ovld __cnfn fmod(float4 x, float4 y); float8 __ovld __cnfn fmod(float8 x, float8 y); float16 __ovld __cnfn fmod(float16 x, float16 y); @@ -1929,7 +1421,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn fmod(half x, half y); half2 __ovld __cnfn fmod(half2 x, half2 y); -@@ -7603,21 +7603,21 @@ half16 __ovld __cnfn fmod(half16 x, half16 y); +@@ -7603,21 +7665,21 @@ half16 __ovld __cnfn fmod(half16 x, half16 y); * Returns fmin(x - floor (x), 0x1.fffffep-1f ). * floor(x) is returned in iptr. */ @@ -1954,7 +1446,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld fract(half x, half *iptr); half2 __ovld fract(half2 x, half2 *iptr); -@@ -7626,7 +7626,9 @@ half4 __ovld fract(half4 x, half4 *iptr); +@@ -7626,7 +7688,9 @@ half4 __ovld fract(half4 x, half4 *iptr); half8 __ovld fract(half8 x, half8 *iptr); half16 __ovld fract(half16 x, half16 *iptr); #endif //cl_khr_fp16 @@ -1965,7 +1457,7 @@ index 514c710c11..9dcd10d54f 100644 float __ovld fract(float x, __global float *iptr); float2 __ovld fract(float2 x, __global float2 *iptr); float3 __ovld fract(float3 x, __global float3 *iptr); -@@ -7645,7 +7647,7 @@ float3 __ovld fract(float3 x, __private float3 *iptr); +@@ -7645,7 +7709,7 @@ float3 __ovld fract(float3 x, __private float3 *iptr); float4 __ovld fract(float4 x, __private float4 *iptr); float8 __ovld fract(float8 x, __private float8 *iptr); float16 __ovld fract(float16 x, __private float16 *iptr); @@ -1974,7 +1466,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld fract(double x, __global double *iptr); double2 __ovld fract(double2 x, __global double2 *iptr); double3 __ovld fract(double3 x, __global double3 *iptr); -@@ -7664,7 +7666,7 @@ double3 __ovld fract(double3 x, __private double3 *iptr); +@@ -7664,7 +7728,7 @@ double3 __ovld fract(double3 x, __private double3 *iptr); double4 __ovld fract(double4 x, __private double4 *iptr); double8 __ovld fract(double8 x, __private double8 *iptr); double16 __ovld fract(double16 x, __private double16 *iptr); @@ -1983,7 +1475,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld fract(half x, __global half *iptr); half2 __ovld fract(half2 x, __global half2 *iptr); -@@ -7685,29 +7687,29 @@ half4 __ovld fract(half4 x, __private half4 *iptr); +@@ -7685,29 +7749,29 @@ half4 __ovld fract(half4 x, __private half4 *iptr); half8 __ovld fract(half8 x, __private half8 *iptr); half16 __ovld fract(half16 x, __private half16 *iptr); #endif //cl_khr_fp16 @@ -2018,7 +1510,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld frexp(half x, int *exp); half2 __ovld frexp(half2 x, int2 *exp); -@@ -7716,7 +7718,9 @@ half4 __ovld frexp(half4 x, int4 *exp); +@@ -7716,7 +7780,9 @@ half4 __ovld frexp(half4 x, int4 *exp); half8 __ovld frexp(half8 x, int8 *exp); half16 __ovld frexp(half16 x, int16 *exp); #endif //cl_khr_fp16 @@ -2029,7 +1521,7 @@ index 514c710c11..9dcd10d54f 100644 float __ovld frexp(float x, __global int *exp); float2 __ovld frexp(float2 x, __global int2 *exp); float3 __ovld frexp(float3 x, __global int3 *exp); -@@ -7735,7 +7739,7 @@ float3 __ovld frexp(float3 x, __private int3 *exp); +@@ -7735,7 +7801,7 @@ float3 __ovld frexp(float3 x, __private int3 *exp); float4 __ovld frexp(float4 x, __private int4 *exp); float8 __ovld frexp(float8 x, __private int8 *exp); float16 __ovld frexp(float16 x, __private int16 *exp); @@ -2038,7 +1530,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld frexp(double x, __global int *exp); double2 __ovld frexp(double2 x, __global int2 *exp); double3 __ovld frexp(double3 x, __global int3 *exp); -@@ -7754,7 +7758,7 @@ double3 __ovld frexp(double3 x, __private int3 *exp); +@@ -7754,7 +7820,7 @@ double3 __ovld frexp(double3 x, __private int3 *exp); double4 __ovld frexp(double4 x, __private int4 *exp); double8 __ovld frexp(double8 x, __private int8 *exp); double16 __ovld frexp(double16 x, __private int16 *exp); @@ -2047,7 +1539,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld frexp(half x, __global int *exp); half2 __ovld frexp(half2 x, __global int2 *exp); -@@ -7775,7 +7779,8 @@ half4 __ovld frexp(half4 x, __private int4 *exp); +@@ -7775,7 +7841,8 @@ half4 __ovld frexp(half4 x, __private int4 *exp); half8 __ovld frexp(half8 x, __private int8 *exp); half16 __ovld frexp(half16 x, __private int16 *exp); #endif //cl_khr_fp16 @@ -2057,7 +1549,7 @@ index 514c710c11..9dcd10d54f 100644 /** * Compute the value of the square root of x^2 + y^2 -@@ -7787,14 +7792,14 @@ float3 __ovld __cnfn hypot(float3 x, float3 y); +@@ -7787,14 +7854,14 @@ float3 __ovld __cnfn hypot(float3 x, float3 y); float4 __ovld __cnfn hypot(float4 x, float4 y); float8 __ovld __cnfn hypot(float8 x, float8 y); float16 __ovld __cnfn hypot(float16 x, float16 y); @@ -2074,7 +1566,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn hypot(half x, half y); half2 __ovld __cnfn hypot(half2 x, half2 y); -@@ -7813,14 +7818,14 @@ int3 __ovld __cnfn ilogb(float3 x); +@@ -7813,14 +7880,14 @@ int3 __ovld __cnfn ilogb(float3 x); int4 __ovld __cnfn ilogb(float4 x); int8 __ovld __cnfn ilogb(float8 x); int16 __ovld __cnfn ilogb(float16 x); @@ -2091,7 +1583,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 int __ovld __cnfn ilogb(half x); int2 __ovld __cnfn ilogb(half2 x); -@@ -7844,7 +7849,7 @@ float3 __ovld __cnfn ldexp(float3 x, int n); +@@ -7844,7 +7911,7 @@ float3 __ovld __cnfn ldexp(float3 x, int n); float4 __ovld __cnfn ldexp(float4 x, int n); float8 __ovld __cnfn ldexp(float8 x, int n); float16 __ovld __cnfn ldexp(float16 x, int n); @@ -2100,7 +1592,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld __cnfn ldexp(double x, int n); double2 __ovld __cnfn ldexp(double2 x, int2 n); double3 __ovld __cnfn ldexp(double3 x, int3 n); -@@ -7856,7 +7861,7 @@ double3 __ovld __cnfn ldexp(double3 x, int n); +@@ -7856,7 +7923,7 @@ double3 __ovld __cnfn ldexp(double3 x, int n); double4 __ovld __cnfn ldexp(double4 x, int n); double8 __ovld __cnfn ldexp(double8 x, int n); double16 __ovld __cnfn ldexp(double16 x, int n); @@ -2109,7 +1601,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn ldexp(half x, int n); half2 __ovld __cnfn ldexp(half2 x, int2 n); -@@ -7883,14 +7888,14 @@ float3 __ovld __cnfn lgamma(float3 x); +@@ -7883,14 +7950,14 @@ float3 __ovld __cnfn lgamma(float3 x); float4 __ovld __cnfn lgamma(float4 x); float8 __ovld __cnfn lgamma(float8 x); float16 __ovld __cnfn lgamma(float16 x); @@ -2126,7 +1618,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn lgamma(half x); half2 __ovld __cnfn lgamma(half2 x); -@@ -7900,21 +7905,21 @@ half8 __ovld __cnfn lgamma(half8 x); +@@ -7900,21 +7967,21 @@ half8 __ovld __cnfn lgamma(half8 x); half16 __ovld __cnfn lgamma(half16 x); #endif //cl_khr_fp16 @@ -2151,7 +1643,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld lgamma_r(half x, int *signp); half2 __ovld lgamma_r(half2 x, int2 *signp); -@@ -7923,7 +7928,9 @@ half4 __ovld lgamma_r(half4 x, int4 *signp); +@@ -7923,7 +7990,9 @@ half4 __ovld lgamma_r(half4 x, int4 *signp); half8 __ovld lgamma_r(half8 x, int8 *signp); half16 __ovld lgamma_r(half16 x, int16 *signp); #endif //cl_khr_fp16 @@ -2162,7 +1654,7 @@ index 514c710c11..9dcd10d54f 100644 float __ovld lgamma_r(float x, __global int *signp); float2 __ovld lgamma_r(float2 x, __global int2 *signp); float3 __ovld lgamma_r(float3 x, __global int3 *signp); -@@ -7942,7 +7949,7 @@ float3 __ovld lgamma_r(float3 x, __private int3 *signp); +@@ -7942,7 +8011,7 @@ float3 __ovld lgamma_r(float3 x, __private int3 *signp); float4 __ovld lgamma_r(float4 x, __private int4 *signp); float8 __ovld lgamma_r(float8 x, __private int8 *signp); float16 __ovld lgamma_r(float16 x, __private int16 *signp); @@ -2171,7 +1663,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld lgamma_r(double x, __global int *signp); double2 __ovld lgamma_r(double2 x, __global int2 *signp); double3 __ovld lgamma_r(double3 x, __global int3 *signp); -@@ -7961,7 +7968,7 @@ double3 __ovld lgamma_r(double3 x, __private int3 *signp); +@@ -7961,7 +8030,7 @@ double3 __ovld lgamma_r(double3 x, __private int3 *signp); double4 __ovld lgamma_r(double4 x, __private int4 *signp); double8 __ovld lgamma_r(double8 x, __private int8 *signp); double16 __ovld lgamma_r(double16 x, __private int16 *signp); @@ -2180,7 +1672,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld lgamma_r(half x, __global int *signp); half2 __ovld lgamma_r(half2 x, __global int2 *signp); -@@ -7982,8 +7989,8 @@ half4 __ovld lgamma_r(half4 x, __private int4 *signp); +@@ -7982,8 +8051,8 @@ half4 __ovld lgamma_r(half4 x, __private int4 *signp); half8 __ovld lgamma_r(half8 x, __private int8 *signp); half16 __ovld lgamma_r(half16 x, __private int16 *signp); #endif //cl_khr_fp16 @@ -2191,7 +1683,7 @@ index 514c710c11..9dcd10d54f 100644 /** * Compute natural logarithm. */ -@@ -7993,14 +8000,14 @@ float3 __ovld __cnfn log(float3); +@@ -7993,14 +8062,14 @@ float3 __ovld __cnfn log(float3); float4 __ovld __cnfn log(float4); float8 __ovld __cnfn log(float8); float16 __ovld __cnfn log(float16); @@ -2208,7 +1700,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn log(half); half2 __ovld __cnfn log(half2); -@@ -8011,7 +8018,7 @@ half16 __ovld __cnfn log(half16); +@@ -8011,7 +8080,7 @@ half16 __ovld __cnfn log(half16); #endif //cl_khr_fp16 /** @@ -2217,7 +1709,7 @@ index 514c710c11..9dcd10d54f 100644 */ float __ovld __cnfn log2(float); float2 __ovld __cnfn log2(float2); -@@ -8019,14 +8026,14 @@ float3 __ovld __cnfn log2(float3); +@@ -8019,14 +8088,14 @@ float3 __ovld __cnfn log2(float3); float4 __ovld __cnfn log2(float4); float8 __ovld __cnfn log2(float8); float16 __ovld __cnfn log2(float16); @@ -2234,7 +1726,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn log2(half); half2 __ovld __cnfn log2(half2); -@@ -8045,14 +8052,14 @@ float3 __ovld __cnfn log10(float3); +@@ -8045,14 +8114,14 @@ float3 __ovld __cnfn log10(float3); float4 __ovld __cnfn log10(float4); float8 __ovld __cnfn log10(float8); float16 __ovld __cnfn log10(float16); @@ -2251,7 +1743,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn log10(half); half2 __ovld __cnfn log10(half2); -@@ -8071,14 +8078,14 @@ float3 __ovld __cnfn log1p(float3 x); +@@ -8071,14 +8140,14 @@ float3 __ovld __cnfn log1p(float3 x); float4 __ovld __cnfn log1p(float4 x); float8 __ovld __cnfn log1p(float8 x); float16 __ovld __cnfn log1p(float16 x); @@ -2268,7 +1760,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn log1p(half x); half2 __ovld __cnfn log1p(half2 x); -@@ -8098,14 +8105,14 @@ float3 __ovld __cnfn logb(float3 x); +@@ -8098,14 +8167,14 @@ float3 __ovld __cnfn logb(float3 x); float4 __ovld __cnfn logb(float4 x); float8 __ovld __cnfn logb(float8 x); float16 __ovld __cnfn logb(float16 x); @@ -2285,7 +1777,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn logb(half x); half2 __ovld __cnfn logb(half2 x); -@@ -8128,14 +8135,14 @@ float3 __ovld __cnfn mad(float3 a, float3 b, float3 c); +@@ -8128,14 +8197,14 @@ float3 __ovld __cnfn mad(float3 a, float3 b, float3 c); float4 __ovld __cnfn mad(float4 a, float4 b, float4 c); float8 __ovld __cnfn mad(float8 a, float8 b, float8 c); float16 __ovld __cnfn mad(float16 a, float16 b, float16 c); @@ -2302,7 +1794,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn mad(half a, half b, half c); half2 __ovld __cnfn mad(half2 a, half2 b, half2 c); -@@ -8155,14 +8162,14 @@ float3 __ovld __cnfn maxmag(float3 x, float3 y); +@@ -8155,14 +8224,14 @@ float3 __ovld __cnfn maxmag(float3 x, float3 y); float4 __ovld __cnfn maxmag(float4 x, float4 y); float8 __ovld __cnfn maxmag(float8 x, float8 y); float16 __ovld __cnfn maxmag(float16 x, float16 y); @@ -2319,7 +1811,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn maxmag(half x, half y); half2 __ovld __cnfn maxmag(half2 x, half2 y); -@@ -8182,14 +8189,14 @@ float3 __ovld __cnfn minmag(float3 x, float3 y); +@@ -8182,14 +8251,14 @@ float3 __ovld __cnfn minmag(float3 x, float3 y); float4 __ovld __cnfn minmag(float4 x, float4 y); float8 __ovld __cnfn minmag(float8 x, float8 y); float16 __ovld __cnfn minmag(float16 x, float16 y); @@ -2336,7 +1828,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn minmag(half x, half y); half2 __ovld __cnfn minmag(half2 x, half2 y); -@@ -8206,21 +8213,21 @@ half16 __ovld __cnfn minmag(half16 x, half16 y); +@@ -8206,21 +8275,21 @@ half16 __ovld __cnfn minmag(half16 x, half16 y); * the argument. It stores the integral part in the object * pointed to by iptr. */ @@ -2361,7 +1853,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld modf(half x, half *iptr); half2 __ovld modf(half2 x, half2 *iptr); -@@ -8229,7 +8236,9 @@ half4 __ovld modf(half4 x, half4 *iptr); +@@ -8229,7 +8298,9 @@ half4 __ovld modf(half4 x, half4 *iptr); half8 __ovld modf(half8 x, half8 *iptr); half16 __ovld modf(half16 x, half16 *iptr); #endif //cl_khr_fp16 @@ -2372,7 +1864,7 @@ index 514c710c11..9dcd10d54f 100644 float __ovld modf(float x, __global float *iptr); float2 __ovld modf(float2 x, __global float2 *iptr); float3 __ovld modf(float3 x, __global float3 *iptr); -@@ -8248,7 +8257,7 @@ float3 __ovld modf(float3 x, __private float3 *iptr); +@@ -8248,7 +8319,7 @@ float3 __ovld modf(float3 x, __private float3 *iptr); float4 __ovld modf(float4 x, __private float4 *iptr); float8 __ovld modf(float8 x, __private float8 *iptr); float16 __ovld modf(float16 x, __private float16 *iptr); @@ -2381,7 +1873,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld modf(double x, __global double *iptr); double2 __ovld modf(double2 x, __global double2 *iptr); double3 __ovld modf(double3 x, __global double3 *iptr); -@@ -8267,7 +8276,7 @@ double3 __ovld modf(double3 x, __private double3 *iptr); +@@ -8267,7 +8338,7 @@ double3 __ovld modf(double3 x, __private double3 *iptr); double4 __ovld modf(double4 x, __private double4 *iptr); double8 __ovld modf(double8 x, __private double8 *iptr); double16 __ovld modf(double16 x, __private double16 *iptr); @@ -2390,7 +1882,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld modf(half x, __global half *iptr); half2 __ovld modf(half2 x, __global half2 *iptr); -@@ -8288,7 +8297,8 @@ half4 __ovld modf(half4 x, __private half4 *iptr); +@@ -8288,7 +8359,8 @@ half4 __ovld modf(half4 x, __private half4 *iptr); half8 __ovld modf(half8 x, __private half8 *iptr); half16 __ovld modf(half16 x, __private half16 *iptr); #endif //cl_khr_fp16 @@ -2400,7 +1892,7 @@ index 514c710c11..9dcd10d54f 100644 /** * Returns a quiet NaN. The nancode may be placed -@@ -8300,14 +8310,14 @@ float3 __ovld __cnfn nan(uint3 nancode); +@@ -8300,14 +8372,14 @@ float3 __ovld __cnfn nan(uint3 nancode); float4 __ovld __cnfn nan(uint4 nancode); float8 __ovld __cnfn nan(uint8 nancode); float16 __ovld __cnfn nan(uint16 nancode); @@ -2417,7 +1909,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn nan(ushort nancode); half2 __ovld __cnfn nan(ushort2 nancode); -@@ -8330,14 +8340,14 @@ float3 __ovld __cnfn nextafter(float3 x, float3 y); +@@ -8330,14 +8402,14 @@ float3 __ovld __cnfn nextafter(float3 x, float3 y); float4 __ovld __cnfn nextafter(float4 x, float4 y); float8 __ovld __cnfn nextafter(float8 x, float8 y); float16 __ovld __cnfn nextafter(float16 x, float16 y); @@ -2434,7 +1926,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn nextafter(half x, half y); half2 __ovld __cnfn nextafter(half2 x, half2 y); -@@ -8356,14 +8366,14 @@ float3 __ovld __cnfn pow(float3 x, float3 y); +@@ -8356,14 +8428,14 @@ float3 __ovld __cnfn pow(float3 x, float3 y); float4 __ovld __cnfn pow(float4 x, float4 y); float8 __ovld __cnfn pow(float8 x, float8 y); float16 __ovld __cnfn pow(float16 x, float16 y); @@ -2451,7 +1943,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn pow(half x, half y); half2 __ovld __cnfn pow(half2 x, half2 y); -@@ -8382,14 +8392,14 @@ float3 __ovld __cnfn pown(float3 x, int3 y); +@@ -8382,14 +8454,14 @@ float3 __ovld __cnfn pown(float3 x, int3 y); float4 __ovld __cnfn pown(float4 x, int4 y); float8 __ovld __cnfn pown(float8 x, int8 y); float16 __ovld __cnfn pown(float16 x, int16 y); @@ -2468,7 +1960,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn pown(half x, int y); half2 __ovld __cnfn pown(half2 x, int2 y); -@@ -8408,14 +8418,14 @@ float3 __ovld __cnfn powr(float3 x, float3 y); +@@ -8408,14 +8480,14 @@ float3 __ovld __cnfn powr(float3 x, float3 y); float4 __ovld __cnfn powr(float4 x, float4 y); float8 __ovld __cnfn powr(float8 x, float8 y); float16 __ovld __cnfn powr(float16 x, float16 y); @@ -2485,7 +1977,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn powr(half x, half y); half2 __ovld __cnfn powr(half2 x, half2 y); -@@ -8437,14 +8447,14 @@ float3 __ovld __cnfn remainder(float3 x, float3 y); +@@ -8437,14 +8509,14 @@ float3 __ovld __cnfn remainder(float3 x, float3 y); float4 __ovld __cnfn remainder(float4 x, float4 y); float8 __ovld __cnfn remainder(float8 x, float8 y); float16 __ovld __cnfn remainder(float16 x, float16 y); @@ -2502,7 +1994,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn remainder(half x, half y); half2 __ovld __cnfn remainder(half2 x, half2 y); -@@ -8466,21 +8476,21 @@ half16 __ovld __cnfn remainder(half16 x, half16 y); +@@ -8466,21 +8538,21 @@ half16 __ovld __cnfn remainder(half16 x, half16 y); * sign as x/y. It stores this signed value in the object * pointed to by quo. */ @@ -2527,7 +2019,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld remquo(half x, half y, int *quo); half2 __ovld remquo(half2 x, half2 y, int2 *quo); -@@ -8488,9 +8498,10 @@ half3 __ovld remquo(half3 x, half3 y, int3 *quo); +@@ -8488,9 +8560,10 @@ half3 __ovld remquo(half3 x, half3 y, int3 *quo); half4 __ovld remquo(half4 x, half4 y, int4 *quo); half8 __ovld remquo(half8 x, half8 y, int8 *quo); half16 __ovld remquo(half16 x, half16 y, int16 *quo); @@ -2540,7 +2032,7 @@ index 514c710c11..9dcd10d54f 100644 float __ovld remquo(float x, float y, __global int *quo); float2 __ovld remquo(float2 x, float2 y, __global int2 *quo); float3 __ovld remquo(float3 x, float3 y, __global int3 *quo); -@@ -8509,7 +8520,7 @@ float3 __ovld remquo(float3 x, float3 y, __private int3 *quo); +@@ -8509,7 +8582,7 @@ float3 __ovld remquo(float3 x, float3 y, __private int3 *quo); float4 __ovld remquo(float4 x, float4 y, __private int4 *quo); float8 __ovld remquo(float8 x, float8 y, __private int8 *quo); float16 __ovld remquo(float16 x, float16 y, __private int16 *quo); @@ -2549,7 +2041,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld remquo(double x, double y, __global int *quo); double2 __ovld remquo(double2 x, double2 y, __global int2 *quo); double3 __ovld remquo(double3 x, double3 y, __global int3 *quo); -@@ -8528,7 +8539,7 @@ double3 __ovld remquo(double3 x, double3 y, __private int3 *quo); +@@ -8528,7 +8601,7 @@ double3 __ovld remquo(double3 x, double3 y, __private int3 *quo); double4 __ovld remquo(double4 x, double4 y, __private int4 *quo); double8 __ovld remquo(double8 x, double8 y, __private int8 *quo); double16 __ovld remquo(double16 x, double16 y, __private int16 *quo); @@ -2558,7 +2050,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld remquo(half x, half y, __global int *quo); half2 __ovld remquo(half2 x, half2 y, __global int2 *quo); -@@ -8549,7 +8560,8 @@ half4 __ovld remquo(half4 x, half4 y, __private int4 *quo); +@@ -8549,7 +8622,8 @@ half4 __ovld remquo(half4 x, half4 y, __private int4 *quo); half8 __ovld remquo(half8 x, half8 y, __private int8 *quo); half16 __ovld remquo(half16 x, half16 y, __private int16 *quo); #endif //cl_khr_fp16 @@ -2568,7 +2060,7 @@ index 514c710c11..9dcd10d54f 100644 /** * Round to integral value (using round to nearest * even rounding mode) in floating-point format. -@@ -8562,14 +8574,14 @@ float3 __ovld __cnfn rint(float3); +@@ -8562,14 +8636,14 @@ float3 __ovld __cnfn rint(float3); float4 __ovld __cnfn rint(float4); float8 __ovld __cnfn rint(float8); float16 __ovld __cnfn rint(float16); @@ -2585,7 +2077,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn rint(half); half2 __ovld __cnfn rint(half2); -@@ -8588,14 +8600,14 @@ float3 __ovld __cnfn rootn(float3 x, int3 y); +@@ -8588,14 +8662,14 @@ float3 __ovld __cnfn rootn(float3 x, int3 y); float4 __ovld __cnfn rootn(float4 x, int4 y); float8 __ovld __cnfn rootn(float8 x, int8 y); float16 __ovld __cnfn rootn(float16 x, int16 y); @@ -2602,7 +2094,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn rootn(half x, int y); half2 __ovld __cnfn rootn(half2 x, int2 y); -@@ -8616,14 +8628,14 @@ float3 __ovld __cnfn round(float3 x); +@@ -8616,14 +8690,14 @@ float3 __ovld __cnfn round(float3 x); float4 __ovld __cnfn round(float4 x); float8 __ovld __cnfn round(float8 x); float16 __ovld __cnfn round(float16 x); @@ -2619,7 +2111,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn round(half x); half2 __ovld __cnfn round(half2 x); -@@ -8642,14 +8654,14 @@ float3 __ovld __cnfn rsqrt(float3); +@@ -8642,14 +8716,14 @@ float3 __ovld __cnfn rsqrt(float3); float4 __ovld __cnfn rsqrt(float4); float8 __ovld __cnfn rsqrt(float8); float16 __ovld __cnfn rsqrt(float16); @@ -2636,7 +2128,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn rsqrt(half); half2 __ovld __cnfn rsqrt(half2); -@@ -8668,14 +8680,14 @@ float3 __ovld __cnfn sin(float3); +@@ -8668,14 +8742,14 @@ float3 __ovld __cnfn sin(float3); float4 __ovld __cnfn sin(float4); float8 __ovld __cnfn sin(float8); float16 __ovld __cnfn sin(float16); @@ -2653,7 +2145,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn sin(half); half2 __ovld __cnfn sin(half2); -@@ -8690,21 +8702,21 @@ half16 __ovld __cnfn sin(half16); +@@ -8690,21 +8764,21 @@ half16 __ovld __cnfn sin(half16); * is the return value and computed cosine is returned * in cosval. */ @@ -2678,7 +2170,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld sincos(half x, half *cosval); half2 __ovld sincos(half2 x, half2 *cosval); -@@ -8713,7 +8725,9 @@ half4 __ovld sincos(half4 x, half4 *cosval); +@@ -8713,7 +8787,9 @@ half4 __ovld sincos(half4 x, half4 *cosval); half8 __ovld sincos(half8 x, half8 *cosval); half16 __ovld sincos(half16 x, half16 *cosval); #endif //cl_khr_fp16 @@ -2689,7 +2181,7 @@ index 514c710c11..9dcd10d54f 100644 float __ovld sincos(float x, __global float *cosval); float2 __ovld sincos(float2 x, __global float2 *cosval); float3 __ovld sincos(float3 x, __global float3 *cosval); -@@ -8732,7 +8746,7 @@ float3 __ovld sincos(float3 x, __private float3 *cosval); +@@ -8732,7 +8808,7 @@ float3 __ovld sincos(float3 x, __private float3 *cosval); float4 __ovld sincos(float4 x, __private float4 *cosval); float8 __ovld sincos(float8 x, __private float8 *cosval); float16 __ovld sincos(float16 x, __private float16 *cosval); @@ -2698,7 +2190,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld sincos(double x, __global double *cosval); double2 __ovld sincos(double2 x, __global double2 *cosval); double3 __ovld sincos(double3 x, __global double3 *cosval); -@@ -8751,7 +8765,7 @@ double3 __ovld sincos(double3 x, __private double3 *cosval); +@@ -8751,7 +8827,7 @@ double3 __ovld sincos(double3 x, __private double3 *cosval); double4 __ovld sincos(double4 x, __private double4 *cosval); double8 __ovld sincos(double8 x, __private double8 *cosval); double16 __ovld sincos(double16 x, __private double16 *cosval); @@ -2707,7 +2199,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld sincos(half x, __global half *cosval); half2 __ovld sincos(half2 x, __global half2 *cosval); -@@ -8772,8 +8786,8 @@ half4 __ovld sincos(half4 x, __private half4 *cosval); +@@ -8772,8 +8848,8 @@ half4 __ovld sincos(half4 x, __private half4 *cosval); half8 __ovld sincos(half8 x, __private half8 *cosval); half16 __ovld sincos(half16 x, __private half16 *cosval); #endif //cl_khr_fp16 @@ -2718,7 +2210,7 @@ index 514c710c11..9dcd10d54f 100644 /** * Compute hyperbolic sine. */ -@@ -8783,14 +8797,14 @@ float3 __ovld __cnfn sinh(float3); +@@ -8783,14 +8859,14 @@ float3 __ovld __cnfn sinh(float3); float4 __ovld __cnfn sinh(float4); float8 __ovld __cnfn sinh(float8); float16 __ovld __cnfn sinh(float16); @@ -2735,7 +2227,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn sinh(half); half2 __ovld __cnfn sinh(half2); -@@ -8809,14 +8823,14 @@ float3 __ovld __cnfn sinpi(float3 x); +@@ -8809,14 +8885,14 @@ float3 __ovld __cnfn sinpi(float3 x); float4 __ovld __cnfn sinpi(float4 x); float8 __ovld __cnfn sinpi(float8 x); float16 __ovld __cnfn sinpi(float16 x); @@ -2752,7 +2244,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn sinpi(half x); half2 __ovld __cnfn sinpi(half2 x); -@@ -8835,14 +8849,14 @@ float3 __ovld __cnfn sqrt(float3); +@@ -8835,14 +8911,14 @@ float3 __ovld __cnfn sqrt(float3); float4 __ovld __cnfn sqrt(float4); float8 __ovld __cnfn sqrt(float8); float16 __ovld __cnfn sqrt(float16); @@ -2769,7 +2261,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn sqrt(half); half2 __ovld __cnfn sqrt(half2); -@@ -8861,14 +8875,14 @@ float3 __ovld __cnfn tan(float3); +@@ -8861,14 +8937,14 @@ float3 __ovld __cnfn tan(float3); float4 __ovld __cnfn tan(float4); float8 __ovld __cnfn tan(float8); float16 __ovld __cnfn tan(float16); @@ -2786,7 +2278,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn tan(half); half2 __ovld __cnfn tan(half2); -@@ -8887,14 +8901,14 @@ float3 __ovld __cnfn tanh(float3); +@@ -8887,14 +8963,14 @@ float3 __ovld __cnfn tanh(float3); float4 __ovld __cnfn tanh(float4); float8 __ovld __cnfn tanh(float8); float16 __ovld __cnfn tanh(float16); @@ -2803,7 +2295,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn tanh(half); half2 __ovld __cnfn tanh(half2); -@@ -8913,14 +8927,14 @@ float3 __ovld __cnfn tanpi(float3 x); +@@ -8913,14 +8989,14 @@ float3 __ovld __cnfn tanpi(float3 x); float4 __ovld __cnfn tanpi(float4 x); float8 __ovld __cnfn tanpi(float8 x); float16 __ovld __cnfn tanpi(float16 x); @@ -2820,7 +2312,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn tanpi(half x); half2 __ovld __cnfn tanpi(half2 x); -@@ -8939,14 +8953,14 @@ float3 __ovld __cnfn tgamma(float3); +@@ -8939,14 +9015,14 @@ float3 __ovld __cnfn tgamma(float3); float4 __ovld __cnfn tgamma(float4); float8 __ovld __cnfn tgamma(float8); float16 __ovld __cnfn tgamma(float16); @@ -2837,7 +2329,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn tgamma(half); half2 __ovld __cnfn tgamma(half2); -@@ -8966,14 +8980,14 @@ float3 __ovld __cnfn trunc(float3); +@@ -8966,14 +9042,14 @@ float3 __ovld __cnfn trunc(float3); float4 __ovld __cnfn trunc(float4); float8 __ovld __cnfn trunc(float8); float16 __ovld __cnfn trunc(float16); @@ -2854,7 +2346,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn trunc(half); half2 __ovld __cnfn trunc(half2); -@@ -10383,7 +10397,7 @@ float3 __ovld __cnfn clamp(float3 x, float minval, float maxval); +@@ -10383,7 +10459,7 @@ float3 __ovld __cnfn clamp(float3 x, float minval, float maxval); float4 __ovld __cnfn clamp(float4 x, float minval, float maxval); float8 __ovld __cnfn clamp(float8 x, float minval, float maxval); float16 __ovld __cnfn clamp(float16 x, float minval, float maxval); @@ -2863,7 +2355,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld __cnfn clamp(double x, double minval, double maxval); double2 __ovld __cnfn clamp(double2 x, double2 minval, double2 maxval); double3 __ovld __cnfn clamp(double3 x, double3 minval, double3 maxval); -@@ -10395,7 +10409,7 @@ double3 __ovld __cnfn clamp(double3 x, double minval, double maxval); +@@ -10395,7 +10471,7 @@ double3 __ovld __cnfn clamp(double3 x, double minval, double maxval); double4 __ovld __cnfn clamp(double4 x, double minval, double maxval); double8 __ovld __cnfn clamp(double8 x, double minval, double maxval); double16 __ovld __cnfn clamp(double16 x, double minval, double maxval); @@ -2872,7 +2364,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn clamp(half x, half minval, half maxval); half2 __ovld __cnfn clamp(half2 x, half2 minval, half2 maxval); -@@ -10420,14 +10434,14 @@ float3 __ovld __cnfn degrees(float3 radians); +@@ -10420,14 +10496,14 @@ float3 __ovld __cnfn degrees(float3 radians); float4 __ovld __cnfn degrees(float4 radians); float8 __ovld __cnfn degrees(float8 radians); float16 __ovld __cnfn degrees(float16 radians); @@ -2889,7 +2381,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn degrees(half radians); half2 __ovld __cnfn degrees(half2 radians); -@@ -10452,7 +10466,7 @@ float3 __ovld __cnfn max(float3 x, float y); +@@ -10452,7 +10528,7 @@ float3 __ovld __cnfn max(float3 x, float y); float4 __ovld __cnfn max(float4 x, float y); float8 __ovld __cnfn max(float8 x, float y); float16 __ovld __cnfn max(float16 x, float y); @@ -2898,7 +2390,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld __cnfn max(double x, double y); double2 __ovld __cnfn max(double2 x, double2 y); double3 __ovld __cnfn max(double3 x, double3 y); -@@ -10464,7 +10478,7 @@ double3 __ovld __cnfn max(double3 x, double y); +@@ -10464,7 +10540,7 @@ double3 __ovld __cnfn max(double3 x, double y); double4 __ovld __cnfn max(double4 x, double y); double8 __ovld __cnfn max(double8 x, double y); double16 __ovld __cnfn max(double16 x, double y); @@ -2907,7 +2399,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn max(half x, half y); half2 __ovld __cnfn max(half2 x, half2 y); -@@ -10494,7 +10508,7 @@ float3 __ovld __cnfn min(float3 x, float y); +@@ -10494,7 +10570,7 @@ float3 __ovld __cnfn min(float3 x, float y); float4 __ovld __cnfn min(float4 x, float y); float8 __ovld __cnfn min(float8 x, float y); float16 __ovld __cnfn min(float16 x, float y); @@ -2916,7 +2408,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld __cnfn min(double x, double y); double2 __ovld __cnfn min(double2 x, double2 y); double3 __ovld __cnfn min(double3 x, double3 y); -@@ -10506,7 +10520,7 @@ double3 __ovld __cnfn min(double3 x, double y); +@@ -10506,7 +10582,7 @@ double3 __ovld __cnfn min(double3 x, double y); double4 __ovld __cnfn min(double4 x, double y); double8 __ovld __cnfn min(double8 x, double y); double16 __ovld __cnfn min(double16 x, double y); @@ -2925,7 +2417,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn min(half x, half y); half2 __ovld __cnfn min(half2 x, half2 y); -@@ -10539,7 +10553,7 @@ float3 __ovld __cnfn mix(float3 x, float3 y, float a); +@@ -10539,7 +10615,7 @@ float3 __ovld __cnfn mix(float3 x, float3 y, float a); float4 __ovld __cnfn mix(float4 x, float4 y, float a); float8 __ovld __cnfn mix(float8 x, float8 y, float a); float16 __ovld __cnfn mix(float16 x, float16 y, float a); @@ -2934,7 +2426,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld __cnfn mix(double x, double y, double a); double2 __ovld __cnfn mix(double2 x, double2 y, double2 a); double3 __ovld __cnfn mix(double3 x, double3 y, double3 a); -@@ -10551,7 +10565,7 @@ double3 __ovld __cnfn mix(double3 x, double3 y, double a); +@@ -10551,7 +10627,7 @@ double3 __ovld __cnfn mix(double3 x, double3 y, double a); double4 __ovld __cnfn mix(double4 x, double4 y, double a); double8 __ovld __cnfn mix(double8 x, double8 y, double a); double16 __ovld __cnfn mix(double16 x, double16 y, double a); @@ -2943,7 +2435,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn mix(half x, half y, half a); half2 __ovld __cnfn mix(half2 x, half2 y, half2 a); -@@ -10576,14 +10590,14 @@ float3 __ovld __cnfn radians(float3 degrees); +@@ -10576,14 +10652,14 @@ float3 __ovld __cnfn radians(float3 degrees); float4 __ovld __cnfn radians(float4 degrees); float8 __ovld __cnfn radians(float8 degrees); float16 __ovld __cnfn radians(float16 degrees); @@ -2960,7 +2452,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn radians(half degrees); half2 __ovld __cnfn radians(half2 degrees); -@@ -10607,7 +10621,7 @@ float3 __ovld __cnfn step(float edge, float3 x); +@@ -10607,7 +10683,7 @@ float3 __ovld __cnfn step(float edge, float3 x); float4 __ovld __cnfn step(float edge, float4 x); float8 __ovld __cnfn step(float edge, float8 x); float16 __ovld __cnfn step(float edge, float16 x); @@ -2969,7 +2461,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld __cnfn step(double edge, double x); double2 __ovld __cnfn step(double2 edge, double2 x); double3 __ovld __cnfn step(double3 edge, double3 x); -@@ -10619,7 +10633,7 @@ double3 __ovld __cnfn step(double edge, double3 x); +@@ -10619,7 +10695,7 @@ double3 __ovld __cnfn step(double edge, double3 x); double4 __ovld __cnfn step(double edge, double4 x); double8 __ovld __cnfn step(double edge, double8 x); double16 __ovld __cnfn step(double edge, double16 x); @@ -2978,7 +2470,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn step(half edge, half x); half2 __ovld __cnfn step(half2 edge, half2 x); -@@ -10659,7 +10673,7 @@ float3 __ovld __cnfn smoothstep(float edge0, float edge1, float3 x); +@@ -10659,7 +10735,7 @@ float3 __ovld __cnfn smoothstep(float edge0, float edge1, float3 x); float4 __ovld __cnfn smoothstep(float edge0, float edge1, float4 x); float8 __ovld __cnfn smoothstep(float edge0, float edge1, float8 x); float16 __ovld __cnfn smoothstep(float edge0, float edge1, float16 x); @@ -2987,7 +2479,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld __cnfn smoothstep(double edge0, double edge1, double x); double2 __ovld __cnfn smoothstep(double2 edge0, double2 edge1, double2 x); double3 __ovld __cnfn smoothstep(double3 edge0, double3 edge1, double3 x); -@@ -10671,7 +10685,7 @@ double3 __ovld __cnfn smoothstep(double edge0, double edge1, double3 x); +@@ -10671,7 +10747,7 @@ double3 __ovld __cnfn smoothstep(double edge0, double edge1, double3 x); double4 __ovld __cnfn smoothstep(double edge0, double edge1, double4 x); double8 __ovld __cnfn smoothstep(double edge0, double edge1, double8 x); double16 __ovld __cnfn smoothstep(double edge0, double edge1, double16 x); @@ -2996,7 +2488,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn smoothstep(half edge0, half edge1, half x); half2 __ovld __cnfn smoothstep(half2 edge0, half2 edge1, half2 x); -@@ -10697,14 +10711,14 @@ float3 __ovld __cnfn sign(float3 x); +@@ -10697,14 +10773,14 @@ float3 __ovld __cnfn sign(float3 x); float4 __ovld __cnfn sign(float4 x); float8 __ovld __cnfn sign(float8 x); float16 __ovld __cnfn sign(float16 x); @@ -3013,7 +2505,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn sign(half x); half2 __ovld __cnfn sign(half2 x); -@@ -10722,10 +10736,10 @@ half16 __ovld __cnfn sign(half16 x); +@@ -10722,10 +10798,10 @@ half16 __ovld __cnfn sign(half16 x); */ float4 __ovld __cnfn cross(float4 p0, float4 p1); float3 __ovld __cnfn cross(float3 p0, float3 p1); @@ -3026,7 +2518,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half4 __ovld __cnfn cross(half4 p0, half4 p1); half3 __ovld __cnfn cross(half3 p0, half3 p1); -@@ -10738,12 +10752,12 @@ float __ovld __cnfn dot(float p0, float p1); +@@ -10738,12 +10814,12 @@ float __ovld __cnfn dot(float p0, float p1); float __ovld __cnfn dot(float2 p0, float2 p1); float __ovld __cnfn dot(float3 p0, float3 p1); float __ovld __cnfn dot(float4 p0, float4 p1); @@ -3041,7 +2533,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn dot(half p0, half p1); half __ovld __cnfn dot(half2 p0, half2 p1); -@@ -10759,12 +10773,12 @@ float __ovld __cnfn distance(float p0, float p1); +@@ -10759,12 +10835,12 @@ float __ovld __cnfn distance(float p0, float p1); float __ovld __cnfn distance(float2 p0, float2 p1); float __ovld __cnfn distance(float3 p0, float3 p1); float __ovld __cnfn distance(float4 p0, float4 p1); @@ -3056,7 +2548,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn distance(half p0, half p1); half __ovld __cnfn distance(half2 p0, half2 p1); -@@ -10780,12 +10794,12 @@ float __ovld __cnfn length(float p); +@@ -10780,12 +10856,12 @@ float __ovld __cnfn length(float p); float __ovld __cnfn length(float2 p); float __ovld __cnfn length(float3 p); float __ovld __cnfn length(float4 p); @@ -3071,7 +2563,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn length(half p); half __ovld __cnfn length(half2 p); -@@ -10801,12 +10815,12 @@ float __ovld __cnfn normalize(float p); +@@ -10801,12 +10877,12 @@ float __ovld __cnfn normalize(float p); float2 __ovld __cnfn normalize(float2 p); float3 __ovld __cnfn normalize(float3 p); float4 __ovld __cnfn normalize(float4 p); @@ -3086,7 +2578,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn normalize(half p); half2 __ovld __cnfn normalize(half2 p); -@@ -10887,14 +10901,14 @@ int3 __ovld __cnfn isequal(float3 x, float3 y); +@@ -10887,14 +10963,14 @@ int3 __ovld __cnfn isequal(float3 x, float3 y); int4 __ovld __cnfn isequal(float4 x, float4 y); int8 __ovld __cnfn isequal(float8 x, float8 y); int16 __ovld __cnfn isequal(float16 x, float16 y); @@ -3103,7 +2595,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 int __ovld __cnfn isequal(half x, half y); short2 __ovld __cnfn isequal(half2 x, half2 y); -@@ -10913,14 +10927,14 @@ int3 __ovld __cnfn isnotequal(float3 x, float3 y); +@@ -10913,14 +10989,14 @@ int3 __ovld __cnfn isnotequal(float3 x, float3 y); int4 __ovld __cnfn isnotequal(float4 x, float4 y); int8 __ovld __cnfn isnotequal(float8 x, float8 y); int16 __ovld __cnfn isnotequal(float16 x, float16 y); @@ -3120,7 +2612,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 int __ovld __cnfn isnotequal(half x, half y); short2 __ovld __cnfn isnotequal(half2 x, half2 y); -@@ -10939,14 +10953,14 @@ int3 __ovld __cnfn isgreater(float3 x, float3 y); +@@ -10939,14 +11015,14 @@ int3 __ovld __cnfn isgreater(float3 x, float3 y); int4 __ovld __cnfn isgreater(float4 x, float4 y); int8 __ovld __cnfn isgreater(float8 x, float8 y); int16 __ovld __cnfn isgreater(float16 x, float16 y); @@ -3137,7 +2629,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 int __ovld __cnfn isgreater(half x, half y); short2 __ovld __cnfn isgreater(half2 x, half2 y); -@@ -10965,14 +10979,14 @@ int3 __ovld __cnfn isgreaterequal(float3 x, float3 y); +@@ -10965,14 +11041,14 @@ int3 __ovld __cnfn isgreaterequal(float3 x, float3 y); int4 __ovld __cnfn isgreaterequal(float4 x, float4 y); int8 __ovld __cnfn isgreaterequal(float8 x, float8 y); int16 __ovld __cnfn isgreaterequal(float16 x, float16 y); @@ -3154,7 +2646,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 int __ovld __cnfn isgreaterequal(half x, half y); short2 __ovld __cnfn isgreaterequal(half2 x, half2 y); -@@ -10991,14 +11005,14 @@ int3 __ovld __cnfn isless(float3 x, float3 y); +@@ -10991,14 +11067,14 @@ int3 __ovld __cnfn isless(float3 x, float3 y); int4 __ovld __cnfn isless(float4 x, float4 y); int8 __ovld __cnfn isless(float8 x, float8 y); int16 __ovld __cnfn isless(float16 x, float16 y); @@ -3171,7 +2663,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 int __ovld __cnfn isless(half x, half y); short2 __ovld __cnfn isless(half2 x, half2 y); -@@ -11017,14 +11031,14 @@ int3 __ovld __cnfn islessequal(float3 x, float3 y); +@@ -11017,14 +11093,14 @@ int3 __ovld __cnfn islessequal(float3 x, float3 y); int4 __ovld __cnfn islessequal(float4 x, float4 y); int8 __ovld __cnfn islessequal(float8 x, float8 y); int16 __ovld __cnfn islessequal(float16 x, float16 y); @@ -3188,7 +2680,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 int __ovld __cnfn islessequal(half x, half y); short2 __ovld __cnfn islessequal(half2 x, half2 y); -@@ -11044,14 +11058,14 @@ int3 __ovld __cnfn islessgreater(float3 x, float3 y); +@@ -11044,14 +11120,14 @@ int3 __ovld __cnfn islessgreater(float3 x, float3 y); int4 __ovld __cnfn islessgreater(float4 x, float4 y); int8 __ovld __cnfn islessgreater(float8 x, float8 y); int16 __ovld __cnfn islessgreater(float16 x, float16 y); @@ -3205,7 +2697,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 int __ovld __cnfn islessgreater(half x, half y); short2 __ovld __cnfn islessgreater(half2 x, half2 y); -@@ -11070,14 +11084,14 @@ int3 __ovld __cnfn isfinite(float3); +@@ -11070,14 +11146,14 @@ int3 __ovld __cnfn isfinite(float3); int4 __ovld __cnfn isfinite(float4); int8 __ovld __cnfn isfinite(float8); int16 __ovld __cnfn isfinite(float16); @@ -3222,7 +2714,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 int __ovld __cnfn isfinite(half); short2 __ovld __cnfn isfinite(half2); -@@ -11096,14 +11110,14 @@ int3 __ovld __cnfn isinf(float3); +@@ -11096,14 +11172,14 @@ int3 __ovld __cnfn isinf(float3); int4 __ovld __cnfn isinf(float4); int8 __ovld __cnfn isinf(float8); int16 __ovld __cnfn isinf(float16); @@ -3239,7 +2731,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 int __ovld __cnfn isinf(half); short2 __ovld __cnfn isinf(half2); -@@ -11122,14 +11136,14 @@ int3 __ovld __cnfn isnan(float3); +@@ -11122,14 +11198,14 @@ int3 __ovld __cnfn isnan(float3); int4 __ovld __cnfn isnan(float4); int8 __ovld __cnfn isnan(float8); int16 __ovld __cnfn isnan(float16); @@ -3256,7 +2748,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 int __ovld __cnfn isnan(half); short2 __ovld __cnfn isnan(half2); -@@ -11148,14 +11162,14 @@ int3 __ovld __cnfn isnormal(float3); +@@ -11148,14 +11224,14 @@ int3 __ovld __cnfn isnormal(float3); int4 __ovld __cnfn isnormal(float4); int8 __ovld __cnfn isnormal(float8); int16 __ovld __cnfn isnormal(float16); @@ -3273,7 +2765,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 int __ovld __cnfn isnormal(half); short2 __ovld __cnfn isnormal(half2); -@@ -11176,14 +11190,14 @@ int3 __ovld __cnfn isordered(float3 x, float3 y); +@@ -11176,14 +11252,14 @@ int3 __ovld __cnfn isordered(float3 x, float3 y); int4 __ovld __cnfn isordered(float4 x, float4 y); int8 __ovld __cnfn isordered(float8 x, float8 y); int16 __ovld __cnfn isordered(float16 x, float16 y); @@ -3290,7 +2782,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 int __ovld __cnfn isordered(half x, half y); short2 __ovld __cnfn isordered(half2 x, half2 y); -@@ -11204,14 +11218,14 @@ int3 __ovld __cnfn isunordered(float3 x, float3 y); +@@ -11204,14 +11280,14 @@ int3 __ovld __cnfn isunordered(float3 x, float3 y); int4 __ovld __cnfn isunordered(float4 x, float4 y); int8 __ovld __cnfn isunordered(float8 x, float8 y); int16 __ovld __cnfn isunordered(float16 x, float16 y); @@ -3307,7 +2799,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 int __ovld __cnfn isunordered(half x, half y); short2 __ovld __cnfn isunordered(half2 x, half2 y); -@@ -11234,14 +11248,14 @@ int3 __ovld __cnfn signbit(float3); +@@ -11234,14 +11310,14 @@ int3 __ovld __cnfn signbit(float3); int4 __ovld __cnfn signbit(float4); int8 __ovld __cnfn signbit(float8); int16 __ovld __cnfn signbit(float16); @@ -3324,7 +2816,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 int __ovld __cnfn signbit(half); short2 __ovld __cnfn signbit(half2); -@@ -11368,14 +11382,14 @@ float3 __ovld __cnfn bitselect(float3 a, float3 b, float3 c); +@@ -11368,14 +11444,14 @@ float3 __ovld __cnfn bitselect(float3 a, float3 b, float3 c); float4 __ovld __cnfn bitselect(float4 a, float4 b, float4 c); float8 __ovld __cnfn bitselect(float8 a, float8 b, float8 c); float16 __ovld __cnfn bitselect(float16 a, float16 b, float16 c); @@ -3341,7 +2833,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn bitselect(half a, half b, half c); half2 __ovld __cnfn bitselect(half2 a, half2 b, half2 c); -@@ -11508,7 +11522,7 @@ ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ulong8 c); +@@ -11508,7 +11584,7 @@ ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ulong8 c); long16 __ovld __cnfn select(long16 a, long16 b, ulong16 c); ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ulong16 c); @@ -3350,7 +2842,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld __cnfn select(double a, double b, long c); double2 __ovld __cnfn select(double2 a, double2 b, long2 c); double3 __ovld __cnfn select(double3 a, double3 b, long3 c); -@@ -11521,7 +11535,7 @@ double3 __ovld __cnfn select(double3 a, double3 b, ulong3 c); +@@ -11521,7 +11597,7 @@ double3 __ovld __cnfn select(double3 a, double3 b, ulong3 c); double4 __ovld __cnfn select(double4 a, double4 b, ulong4 c); double8 __ovld __cnfn select(double8 a, double8 b, ulong8 c); double16 __ovld __cnfn select(double16 a, double16 b, ulong16 c); @@ -3359,7 +2851,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __cnfn select(half a, half b, short c); half2 __ovld __cnfn select(half2 a, half2 b, short2 c); -@@ -11600,13 +11614,13 @@ uint16 __ovld vload16(size_t offset, const __constant uint *p); +@@ -11600,13 +11676,13 @@ uint16 __ovld vload16(size_t offset, const __constant uint *p); long16 __ovld vload16(size_t offset, const __constant long *p); ulong16 __ovld vload16(size_t offset, const __constant ulong *p); float16 __ovld vload16(size_t offset, const __constant float *p); @@ -3375,7 +2867,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld vload(size_t offset, const __constant half *p); -@@ -11617,7 +11631,7 @@ half8 __ovld vload8(size_t offset, const __constant half *p); +@@ -11617,7 +11693,7 @@ half8 __ovld vload8(size_t offset, const __constant half *p); half16 __ovld vload16(size_t offset, const __constant half *p); #endif //cl_khr_fp16 @@ -3384,7 +2876,7 @@ index 514c710c11..9dcd10d54f 100644 char2 __ovld vload2(size_t offset, const char *p); uchar2 __ovld vload2(size_t offset, const uchar *p); short2 __ovld vload2(size_t offset, const short *p); -@@ -11664,13 +11678,13 @@ long16 __ovld vload16(size_t offset, const long *p); +@@ -11664,13 +11740,13 @@ long16 __ovld vload16(size_t offset, const long *p); ulong16 __ovld vload16(size_t offset, const ulong *p); float16 __ovld vload16(size_t offset, const float *p); @@ -3400,7 +2892,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld vload(size_t offset, const half *p); -@@ -11680,7 +11694,7 @@ half4 __ovld vload4(size_t offset, const half *p); +@@ -11680,7 +11756,7 @@ half4 __ovld vload4(size_t offset, const half *p); half8 __ovld vload8(size_t offset, const half *p); half16 __ovld vload16(size_t offset, const half *p); #endif //cl_khr_fp16 @@ -3409,7 +2901,7 @@ index 514c710c11..9dcd10d54f 100644 char2 __ovld vload2(size_t offset, const __global char *p); uchar2 __ovld vload2(size_t offset, const __global uchar *p); short2 __ovld vload2(size_t offset, const __global short *p); -@@ -11817,7 +11831,7 @@ long16 __ovld vload16(size_t offset, const __private long *p); +@@ -11817,7 +11893,7 @@ long16 __ovld vload16(size_t offset, const __private long *p); ulong16 __ovld vload16(size_t offset, const __private ulong *p); float16 __ovld vload16(size_t offset, const __private float *p); @@ -3418,7 +2910,7 @@ index 514c710c11..9dcd10d54f 100644 double2 __ovld vload2(size_t offset, const __global double *p); double3 __ovld vload3(size_t offset, const __global double *p); double4 __ovld vload4(size_t offset, const __global double *p); -@@ -11833,7 +11847,7 @@ double3 __ovld vload3(size_t offset, const __private double *p); +@@ -11833,7 +11909,7 @@ double3 __ovld vload3(size_t offset, const __private double *p); double4 __ovld vload4(size_t offset, const __private double *p); double8 __ovld vload8(size_t offset, const __private double *p); double16 __ovld vload16(size_t offset, const __private double *p); @@ -3427,7 +2919,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld vload(size_t offset, const __global half *p); -@@ -11855,9 +11869,8 @@ half4 __ovld vload4(size_t offset, const __private half *p); +@@ -11855,9 +11931,8 @@ half4 __ovld vload4(size_t offset, const __private half *p); half8 __ovld vload8(size_t offset, const __private half *p); half16 __ovld vload16(size_t offset, const __private half *p); #endif //cl_khr_fp16 @@ -3438,7 +2930,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld vstore2(char2 data, size_t offset, char *p); void __ovld vstore2(uchar2 data, size_t offset, uchar *p); void __ovld vstore2(short2 data, size_t offset, short *p); -@@ -11903,13 +11916,13 @@ void __ovld vstore16(uint16 data, size_t offset, uint *p); +@@ -11903,13 +11978,13 @@ void __ovld vstore16(uint16 data, size_t offset, uint *p); void __ovld vstore16(long16 data, size_t offset, long *p); void __ovld vstore16(ulong16 data, size_t offset, ulong *p); void __ovld vstore16(float16 data, size_t offset, float *p); @@ -3454,7 +2946,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 void __ovld vstore(half data, size_t offset, half *p); void __ovld vstore2(half2 data, size_t offset, half *p); -@@ -11918,7 +11931,7 @@ void __ovld vstore4(half4 data, size_t offset, half *p); +@@ -11918,7 +11993,7 @@ void __ovld vstore4(half4 data, size_t offset, half *p); void __ovld vstore8(half8 data, size_t offset, half *p); void __ovld vstore16(half16 data, size_t offset, half *p); #endif //cl_khr_fp16 @@ -3463,7 +2955,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld vstore2(char2 data, size_t offset, __global char *p); void __ovld vstore2(uchar2 data, size_t offset, __global uchar *p); void __ovld vstore2(short2 data, size_t offset, __global short *p); -@@ -12054,7 +12067,7 @@ void __ovld vstore16(uint16 data, size_t offset, __private uint *p); +@@ -12054,7 +12129,7 @@ void __ovld vstore16(uint16 data, size_t offset, __private uint *p); void __ovld vstore16(long16 data, size_t offset, __private long *p); void __ovld vstore16(ulong16 data, size_t offset, __private ulong *p); void __ovld vstore16(float16 data, size_t offset, __private float *p); @@ -3472,7 +2964,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld vstore2(double2 data, size_t offset, __global double *p); void __ovld vstore3(double3 data, size_t offset, __global double *p); void __ovld vstore4(double4 data, size_t offset, __global double *p); -@@ -12070,7 +12083,7 @@ void __ovld vstore3(double3 data, size_t offset, __private double *p); +@@ -12070,7 +12145,7 @@ void __ovld vstore3(double3 data, size_t offset, __private double *p); void __ovld vstore4(double4 data, size_t offset, __private double *p); void __ovld vstore8(double8 data, size_t offset, __private double *p); void __ovld vstore16(double16 data, size_t offset, __private double *p); @@ -3481,7 +2973,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 void __ovld vstore(half data, size_t offset, __global half *p); void __ovld vstore2(half2 data, size_t offset, __global half *p); -@@ -12091,8 +12104,6 @@ void __ovld vstore4(half4 data, size_t offset, __private half *p); +@@ -12091,8 +12166,6 @@ void __ovld vstore4(half4 data, size_t offset, __private half *p); void __ovld vstore8(half8 data, size_t offset, __private half *p); void __ovld vstore16(half16 data, size_t offset, __private half *p); #endif //cl_khr_fp16 @@ -3490,7 +2982,7 @@ index 514c710c11..9dcd10d54f 100644 /** * Read sizeof (half) bytes of data from address * (p + offset). The data read is interpreted as a -@@ -12102,14 +12113,12 @@ void __ovld vstore16(half16 data, size_t offset, __private half *p); +@@ -12102,14 +12175,12 @@ void __ovld vstore16(half16 data, size_t offset, __private half *p); * must be 16-bit aligned. */ float __ovld vload_half(size_t offset, const __constant half *p); @@ -3507,7 +2999,7 @@ index 514c710c11..9dcd10d54f 100644 /** * Read sizeof (halfn) bytes of data from address * (p + (offset * n)). The data read is interpreted -@@ -12123,13 +12132,14 @@ float3 __ovld vload_half3(size_t offset, const __constant half *p); +@@ -12123,13 +12194,14 @@ float3 __ovld vload_half3(size_t offset, const __constant half *p); float4 __ovld vload_half4(size_t offset, const __constant half *p); float8 __ovld vload_half8(size_t offset, const __constant half *p); float16 __ovld vload_half16(size_t offset, const __constant half *p); @@ -3524,7 +3016,7 @@ index 514c710c11..9dcd10d54f 100644 float2 __ovld vload_half2(size_t offset, const __global half *p); float3 __ovld vload_half3(size_t offset, const __global half *p); float4 __ovld vload_half4(size_t offset, const __global half *p); -@@ -12145,7 +12155,6 @@ float3 __ovld vload_half3(size_t offset, const __private half *p); +@@ -12145,7 +12217,6 @@ float3 __ovld vload_half3(size_t offset, const __private half *p); float4 __ovld vload_half4(size_t offset, const __private half *p); float8 __ovld vload_half8(size_t offset, const __private half *p); float16 __ovld vload_half16(size_t offset, const __private half *p); @@ -3532,7 +3024,7 @@ index 514c710c11..9dcd10d54f 100644 /** * The float value given by data is first -@@ -12158,20 +12167,20 @@ float16 __ovld vload_half16(size_t offset, const __private half *p); +@@ -12158,20 +12229,20 @@ float16 __ovld vload_half16(size_t offset, const __private half *p); * The default current rounding mode is round to * nearest even. */ @@ -3557,7 +3049,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld vstore_half(float data, size_t offset, __global half *p); void __ovld vstore_half_rte(float data, size_t offset, __global half *p); void __ovld vstore_half_rtz(float data, size_t offset, __global half *p); -@@ -12187,7 +12196,7 @@ void __ovld vstore_half_rte(float data, size_t offset, __private half *p); +@@ -12187,7 +12258,7 @@ void __ovld vstore_half_rte(float data, size_t offset, __private half *p); void __ovld vstore_half_rtz(float data, size_t offset, __private half *p); void __ovld vstore_half_rtp(float data, size_t offset, __private half *p); void __ovld vstore_half_rtn(float data, size_t offset, __private half *p); @@ -3566,7 +3058,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld vstore_half(double data, size_t offset, __global half *p); void __ovld vstore_half_rte(double data, size_t offset, __global half *p); void __ovld vstore_half_rtz(double data, size_t offset, __global half *p); -@@ -12203,8 +12212,7 @@ void __ovld vstore_half_rte(double data, size_t offset, __private half *p); +@@ -12203,8 +12274,7 @@ void __ovld vstore_half_rte(double data, size_t offset, __private half *p); void __ovld vstore_half_rtz(double data, size_t offset, __private half *p); void __ovld vstore_half_rtp(double data, size_t offset, __private half *p); void __ovld vstore_half_rtn(double data, size_t offset, __private half *p); @@ -3576,7 +3068,7 @@ index 514c710c11..9dcd10d54f 100644 /** * The floatn value given by data is converted to -@@ -12217,7 +12225,7 @@ void __ovld vstore_half_rtn(double data, size_t offset, __private half *p); +@@ -12217,7 +12287,7 @@ void __ovld vstore_half_rtn(double data, size_t offset, __private half *p); * The default current rounding mode is round to * nearest even. */ @@ -3585,7 +3077,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld vstore_half2(float2 data, size_t offset, half *p); void __ovld vstore_half3(float3 data, size_t offset, half *p); void __ovld vstore_half4(float4 data, size_t offset, half *p); -@@ -12243,7 +12251,7 @@ void __ovld vstore_half3_rtn(float3 data, size_t offset, half *p); +@@ -12243,7 +12313,7 @@ void __ovld vstore_half3_rtn(float3 data, size_t offset, half *p); void __ovld vstore_half4_rtn(float4 data, size_t offset, half *p); void __ovld vstore_half8_rtn(float8 data, size_t offset, half *p); void __ovld vstore_half16_rtn(float16 data, size_t offset, half *p); @@ -3594,7 +3086,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld vstore_half2(double2 data, size_t offset, half *p); void __ovld vstore_half3(double3 data, size_t offset, half *p); void __ovld vstore_half4(double4 data, size_t offset, half *p); -@@ -12269,8 +12277,8 @@ void __ovld vstore_half3_rtn(double3 data, size_t offset, half *p); +@@ -12269,8 +12339,8 @@ void __ovld vstore_half3_rtn(double3 data, size_t offset, half *p); void __ovld vstore_half4_rtn(double4 data, size_t offset, half *p); void __ovld vstore_half8_rtn(double8 data, size_t offset, half *p); void __ovld vstore_half16_rtn(double16 data, size_t offset, half *p); @@ -3605,7 +3097,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld vstore_half2(float2 data, size_t offset, __global half *p); void __ovld vstore_half3(float3 data, size_t offset, __global half *p); void __ovld vstore_half4(float4 data, size_t offset, __global half *p); -@@ -12346,7 +12354,7 @@ void __ovld vstore_half3_rtn(float3 data, size_t offset, __private half *p); +@@ -12346,7 +12416,7 @@ void __ovld vstore_half3_rtn(float3 data, size_t offset, __private half *p); void __ovld vstore_half4_rtn(float4 data, size_t offset, __private half *p); void __ovld vstore_half8_rtn(float8 data, size_t offset, __private half *p); void __ovld vstore_half16_rtn(float16 data, size_t offset, __private half *p); @@ -3614,7 +3106,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld vstore_half2(double2 data, size_t offset, __global half *p); void __ovld vstore_half3(double3 data, size_t offset, __global half *p); void __ovld vstore_half4(double4 data, size_t offset, __global half *p); -@@ -12422,8 +12430,7 @@ void __ovld vstore_half3_rtn(double3 data, size_t offset, __private half *p); +@@ -12422,8 +12492,7 @@ void __ovld vstore_half3_rtn(double3 data, size_t offset, __private half *p); void __ovld vstore_half4_rtn(double4 data, size_t offset, __private half *p); void __ovld vstore_half8_rtn(double8 data, size_t offset, __private half *p); void __ovld vstore_half16_rtn(double16 data, size_t offset, __private half *p); @@ -3624,7 +3116,7 @@ index 514c710c11..9dcd10d54f 100644 /** * For n = 1, 2, 4, 8 and 16 read sizeof (halfn) -@@ -12444,14 +12451,14 @@ float3 __ovld vloada_half3(size_t offset, const __constant half *p); +@@ -12444,14 +12513,14 @@ float3 __ovld vloada_half3(size_t offset, const __constant half *p); float4 __ovld vloada_half4(size_t offset, const __constant half *p); float8 __ovld vloada_half8(size_t offset, const __constant half *p); float16 __ovld vloada_half16(size_t offset, const __constant half *p); @@ -3641,7 +3133,7 @@ index 514c710c11..9dcd10d54f 100644 float __ovld vloada_half(size_t offset, const __global half *p); float2 __ovld vloada_half2(size_t offset, const __global half *p); float3 __ovld vloada_half3(size_t offset, const __global half *p); -@@ -12470,8 +12477,6 @@ float3 __ovld vloada_half3(size_t offset, const __private half *p); +@@ -12470,8 +12539,6 @@ float3 __ovld vloada_half3(size_t offset, const __private half *p); float4 __ovld vloada_half4(size_t offset, const __private half *p); float8 __ovld vloada_half8(size_t offset, const __private half *p); float16 __ovld vloada_half16(size_t offset, const __private half *p); @@ -3650,7 +3142,7 @@ index 514c710c11..9dcd10d54f 100644 /** * The floatn value given by data is converted to * a halfn value using the appropriate rounding -@@ -12488,7 +12493,7 @@ float16 __ovld vloada_half16(size_t offset, const __private half *p); +@@ -12488,7 +12555,7 @@ float16 __ovld vloada_half16(size_t offset, const __private half *p); * mode. The default current rounding mode is * round to nearest even. */ @@ -3659,7 +3151,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld vstorea_half(float data, size_t offset, half *p); void __ovld vstorea_half2(float2 data, size_t offset, half *p); void __ovld vstorea_half3(float3 data, size_t offset, half *p); -@@ -12524,7 +12529,7 @@ void __ovld vstorea_half4_rtn(float4 data, size_t offset, half *p); +@@ -12524,7 +12591,7 @@ void __ovld vstorea_half4_rtn(float4 data, size_t offset, half *p); void __ovld vstorea_half8_rtn(float8 data, size_t offset, half *p); void __ovld vstorea_half16_rtn(float16 data, size_t offset, half *p); @@ -3668,7 +3160,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld vstorea_half(double data, size_t offset, half *p); void __ovld vstorea_half2(double2 data, size_t offset, half *p); void __ovld vstorea_half3(double3 data, size_t offset, half *p); -@@ -12559,9 +12564,9 @@ void __ovld vstorea_half3_rtn(double3 data, size_t offset, half *p); +@@ -12559,9 +12626,9 @@ void __ovld vstorea_half3_rtn(double3 data, size_t offset, half *p); void __ovld vstorea_half4_rtn(double4 data, size_t offset, half *p); void __ovld vstorea_half8_rtn(double8 data, size_t offset, half *p); void __ovld vstorea_half16_rtn(double16 data, size_t offset, half *p); @@ -3680,7 +3172,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld vstorea_half(float data, size_t offset, __global half *p); void __ovld vstorea_half2(float2 data, size_t offset, __global half *p); void __ovld vstorea_half3(float3 data, size_t offset, __global half *p); -@@ -12667,7 +12672,7 @@ void __ovld vstorea_half4_rtn(float4 data, size_t offset, __private half *p); +@@ -12667,7 +12734,7 @@ void __ovld vstorea_half4_rtn(float4 data, size_t offset, __private half *p); void __ovld vstorea_half8_rtn(float8 data, size_t offset, __private half *p); void __ovld vstorea_half16_rtn(float16 data, size_t offset, __private half *p); @@ -3689,7 +3181,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld vstorea_half(double data, size_t offset, __global half *p); void __ovld vstorea_half2(double2 data, size_t offset, __global half *p); void __ovld vstorea_half3(double3 data, size_t offset, __global half *p); -@@ -12772,8 +12777,7 @@ void __ovld vstorea_half3_rtn(double3 data,size_t offset, __private half *p); +@@ -12772,8 +12839,7 @@ void __ovld vstorea_half3_rtn(double3 data,size_t offset, __private half *p); void __ovld vstorea_half4_rtn(double4 data,size_t offset, __private half *p); void __ovld vstorea_half8_rtn(double8 data,size_t offset, __private half *p); void __ovld vstorea_half16_rtn(double16 data,size_t offset, __private half *p); @@ -3699,7 +3191,7 @@ index 514c710c11..9dcd10d54f 100644 // OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8 - Synchronization Functions -@@ -12838,9 +12842,15 @@ void __ovld __conv barrier(cl_mem_fence_flags flags); +@@ -12838,9 +12904,15 @@ void __ovld __conv barrier(cl_mem_fence_flags flags); typedef enum memory_scope { memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, @@ -3716,7 +3208,7 @@ index 514c710c11..9dcd10d54f 100644 memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP #endif } memory_scope; -@@ -12892,7 +12902,7 @@ void __ovld write_mem_fence(cl_mem_fence_flags flags); +@@ -12892,7 +12964,7 @@ void __ovld write_mem_fence(cl_mem_fence_flags flags); // OpenCL v2.0 s6.13.9 - Address Space Qualifier Functions @@ -3725,7 +3217,7 @@ index 514c710c11..9dcd10d54f 100644 cl_mem_fence_flags __ovld get_fence(const void *ptr); cl_mem_fence_flags __ovld get_fence(void *ptr); -@@ -12903,7 +12913,7 @@ cl_mem_fence_flags __ovld get_fence(void *ptr); +@@ -12903,7 +12975,7 @@ cl_mem_fence_flags __ovld get_fence(void *ptr); * where gentype is builtin type or user defined type. */ @@ -3734,7 +3226,7 @@ index 514c710c11..9dcd10d54f 100644 // OpenCL v1.1 s6.11.10, v1.2 s6.12.10, v2.0 s6.13.10 - Async Copies from Global to Local Memory, Local to Global Memory, and Prefetch -@@ -13042,7 +13052,7 @@ event_t __ovld async_work_group_copy(__global uint16 *dst, const __local uint16 +@@ -13042,7 +13114,7 @@ event_t __ovld async_work_group_copy(__global uint16 *dst, const __local uint16 event_t __ovld async_work_group_copy(__global long16 *dst, const __local long16 *src, size_t num_elements, event_t event); event_t __ovld async_work_group_copy(__global ulong16 *dst, const __local ulong16 *src, size_t num_elements, event_t event); event_t __ovld async_work_group_copy(__global float16 *dst, const __local float16 *src, size_t num_elements, event_t event); @@ -3743,7 +3235,7 @@ index 514c710c11..9dcd10d54f 100644 event_t __ovld async_work_group_copy(__local double *dst, const __global double *src, size_t num_elements, event_t event); event_t __ovld async_work_group_copy(__local double2 *dst, const __global double2 *src, size_t num_elements, event_t event); event_t __ovld async_work_group_copy(__local double3 *dst, const __global double3 *src, size_t num_elements, event_t event); -@@ -13055,7 +13065,7 @@ event_t __ovld async_work_group_copy(__global double3 *dst, const __local double +@@ -13055,7 +13127,7 @@ event_t __ovld async_work_group_copy(__global double3 *dst, const __local double event_t __ovld async_work_group_copy(__global double4 *dst, const __local double4 *src, size_t num_elements, event_t event); event_t __ovld async_work_group_copy(__global double8 *dst, const __local double8 *src, size_t num_elements, event_t event); event_t __ovld async_work_group_copy(__global double16 *dst, const __local double16 *src, size_t num_elements, event_t event); @@ -3752,7 +3244,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 event_t __ovld async_work_group_copy(__local half *dst, const __global half *src, size_t num_elements, event_t event); event_t __ovld async_work_group_copy(__local half2 *dst, const __global half2 *src, size_t num_elements, event_t event); -@@ -13205,7 +13215,7 @@ event_t __ovld async_work_group_strided_copy(__global uint16 *dst, const __local +@@ -13205,7 +13277,7 @@ event_t __ovld async_work_group_strided_copy(__global uint16 *dst, const __local event_t __ovld async_work_group_strided_copy(__global long16 *dst, const __local long16 *src, size_t num_elements, size_t dst_stride, event_t event); event_t __ovld async_work_group_strided_copy(__global ulong16 *dst, const __local ulong16 *src, size_t num_elements, size_t dst_stride, event_t event); event_t __ovld async_work_group_strided_copy(__global float16 *dst, const __local float16 *src, size_t num_elements, size_t dst_stride, event_t event); @@ -3761,7 +3253,7 @@ index 514c710c11..9dcd10d54f 100644 event_t __ovld async_work_group_strided_copy(__local double *dst, const __global double *src, size_t num_elements, size_t src_stride, event_t event); event_t __ovld async_work_group_strided_copy(__local double2 *dst, const __global double2 *src, size_t num_elements, size_t src_stride, event_t event); event_t __ovld async_work_group_strided_copy(__local double3 *dst, const __global double3 *src, size_t num_elements, size_t src_stride, event_t event); -@@ -13218,7 +13228,7 @@ event_t __ovld async_work_group_strided_copy(__global double3 *dst, const __loca +@@ -13218,7 +13290,7 @@ event_t __ovld async_work_group_strided_copy(__global double3 *dst, const __loca event_t __ovld async_work_group_strided_copy(__global double4 *dst, const __local double4 *src, size_t num_elements, size_t dst_stride, event_t event); event_t __ovld async_work_group_strided_copy(__global double8 *dst, const __local double8 *src, size_t num_elements, size_t dst_stride, event_t event); event_t __ovld async_work_group_strided_copy(__global double16 *dst, const __local double16 *src, size_t num_elements, size_t dst_stride, event_t event); @@ -3770,7 +3262,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 event_t __ovld async_work_group_strided_copy(__local half *dst, const __global half *src, size_t num_elements, size_t src_stride, event_t event); event_t __ovld async_work_group_strided_copy(__local half2 *dst, const __global half2 *src, size_t num_elements, size_t src_stride, event_t event); -@@ -13308,14 +13318,14 @@ void __ovld prefetch(const __global uint16 *p, size_t num_elements); +@@ -13308,14 +13380,14 @@ void __ovld prefetch(const __global uint16 *p, size_t num_elements); void __ovld prefetch(const __global long16 *p, size_t num_elements); void __ovld prefetch(const __global ulong16 *p, size_t num_elements); void __ovld prefetch(const __global float16 *p, size_t num_elements); @@ -3787,7 +3279,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 void __ovld prefetch(const __global half *p, size_t num_elements); void __ovld prefetch(const __global half2 *p, size_t num_elements); -@@ -13338,9 +13348,15 @@ void __ovld prefetch(const __global half16 *p, size_t num_elements); +@@ -13338,9 +13410,15 @@ void __ovld prefetch(const __global half16 *p, size_t num_elements); * pointed by p. The function returns old. */ int __ovld atomic_add(volatile __global int *p, int val); @@ -3805,7 +3297,7 @@ index 514c710c11..9dcd10d54f 100644 #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_add(volatile __global int *p, int val); -@@ -13364,9 +13380,15 @@ unsigned long __ovld atom_add(volatile __local unsigned long *p, unsigned long v +@@ -13364,9 +13442,15 @@ unsigned long __ovld atom_add(volatile __local unsigned long *p, unsigned long v * returns old. */ int __ovld atomic_sub(volatile __global int *p, int val); @@ -3823,7 +3315,7 @@ index 514c710c11..9dcd10d54f 100644 #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_sub(volatile __global int *p, int val); -@@ -13390,9 +13412,11 @@ unsigned long __ovld atom_sub(volatile __local unsigned long *p, unsigned long v +@@ -13390,9 +13474,11 @@ unsigned long __ovld atom_sub(volatile __local unsigned long *p, unsigned long v * value. */ int __ovld atomic_xchg(volatile __global int *p, int val); @@ -3837,7 +3329,7 @@ index 514c710c11..9dcd10d54f 100644 float __ovld atomic_xchg(volatile __global float *p, float val); float __ovld atomic_xchg(volatile __local float *p, float val); -@@ -13474,9 +13498,16 @@ unsigned long __ovld atom_dec(volatile __local unsigned long *p); +@@ -13474,9 +13560,16 @@ unsigned long __ovld atom_dec(volatile __local unsigned long *p); * returns old. */ int __ovld atomic_cmpxchg(volatile __global int *p, int cmp, int val); @@ -3856,7 +3348,7 @@ index 514c710c11..9dcd10d54f 100644 #if defined(cl_khr_global_int32_base_atomics) int __ovld atom_cmpxchg(volatile __global int *p, int cmp, int val); -@@ -13502,9 +13533,15 @@ unsigned long __ovld atom_cmpxchg(volatile __local unsigned long *p, unsigned lo +@@ -13502,9 +13595,15 @@ unsigned long __ovld atom_cmpxchg(volatile __local unsigned long *p, unsigned lo * returns old. */ int __ovld atomic_min(volatile __global int *p, int val); @@ -3874,7 +3366,7 @@ index 514c710c11..9dcd10d54f 100644 #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_min(volatile __global int *p, int val); -@@ -13530,9 +13567,15 @@ unsigned long __ovld atom_min(volatile __local unsigned long *p, unsigned long v +@@ -13530,9 +13629,15 @@ unsigned long __ovld atom_min(volatile __local unsigned long *p, unsigned long v * returns old. */ int __ovld atomic_max(volatile __global int *p, int val); @@ -3892,7 +3384,7 @@ index 514c710c11..9dcd10d54f 100644 #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_max(volatile __global int *p, int val); -@@ -13557,9 +13600,15 @@ unsigned long __ovld atom_max(volatile __local unsigned long *p, unsigned long v +@@ -13557,9 +13662,15 @@ unsigned long __ovld atom_max(volatile __local unsigned long *p, unsigned long v * pointed by p. The function returns old. */ int __ovld atomic_and(volatile __global int *p, int val); @@ -3910,7 +3402,7 @@ index 514c710c11..9dcd10d54f 100644 #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_and(volatile __global int *p, int val); -@@ -13584,9 +13633,15 @@ unsigned long __ovld atom_and(volatile __local unsigned long *p, unsigned long v +@@ -13584,9 +13695,15 @@ unsigned long __ovld atom_and(volatile __local unsigned long *p, unsigned long v * pointed by p. The function returns old. */ int __ovld atomic_or(volatile __global int *p, int val); @@ -3928,7 +3420,7 @@ index 514c710c11..9dcd10d54f 100644 #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_or(volatile __global int *p, int val); -@@ -13611,9 +13666,15 @@ unsigned long __ovld atom_or(volatile __local unsigned long *p, unsigned long va +@@ -13611,9 +13728,15 @@ unsigned long __ovld atom_or(volatile __local unsigned long *p, unsigned long va * pointed by p. The function returns old. */ int __ovld atomic_xor(volatile __global int *p, int val); @@ -3946,7 +3438,7 @@ index 514c710c11..9dcd10d54f 100644 #if defined(cl_khr_global_int32_extended_atomics) int __ovld atom_xor(volatile __global int *p, int val); -@@ -13661,120 +13722,78 @@ typedef enum memory_order +@@ -13661,120 +13784,78 @@ typedef enum memory_order #endif // atomic_init() @@ -4098,7 +3590,7 @@ index 514c710c11..9dcd10d54f 100644 #endif //defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) // OpenCL v2.0 s6.13.11.7.5: -@@ -13782,196 +13801,2236 @@ ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long opera +@@ -13782,196 +13863,2236 @@ ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long opera // or/xor/and/min/max: atomic type argument can be intptr_t/uintptr_t, value type argument can be intptr_t/uintptr_t. #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) @@ -5567,45 +5059,13 @@ index 514c710c11..9dcd10d54f 100644 + float *expected, float desired); +bool __ovld atomic_compare_exchange_weak(volatile atomic_float *object, + float *expected, float desired); - #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) --#ifdef cl_khr_fp64 --bool __ovld atomic_compare_exchange_strong(volatile atomic_double *object, double *expected, double desired); --bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *object, double *expected, -- double desired, memory_order success, memory_order failure); --bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *object, double *expected, -- double desired, memory_order success, memory_order failure, memory_scope scope); --bool __ovld atomic_compare_exchange_weak(volatile atomic_double *object, double *expected, double desired); --bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *object, double *expected, -- double desired, memory_order success, memory_order failure); --bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *object, double *expected, -- double desired, memory_order success, memory_order failure, memory_scope scope); --#endif //cl_khr_fp64 --bool __ovld atomic_compare_exchange_strong(volatile atomic_long *object, long *expected, long desired); --bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *object, long *expected, -- long desired, memory_order success, memory_order failure); --bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *object, long *expected, -- long desired, memory_order success, memory_order failure, memory_scope scope); --bool __ovld atomic_compare_exchange_weak(volatile atomic_long *object, long *expected, long desired); --bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *object, long *expected, -- long desired, memory_order success, memory_order failure); --bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *object, long *expected, -- long desired, memory_order success, memory_order failure, memory_scope scope); --bool __ovld atomic_compare_exchange_strong(volatile atomic_ulong *object, ulong *expected, ulong desired); --bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *object, ulong *expected, -- ulong desired, memory_order success, memory_order failure); --bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *object, ulong *expected, -- ulong desired, memory_order success, memory_order failure, memory_scope scope); --bool __ovld atomic_compare_exchange_weak(volatile atomic_ulong *object, ulong *expected, ulong desired); --bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *object, ulong *expected, -- ulong desired, memory_order success, memory_order failure); --bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *object, ulong *expected, -- ulong desired, memory_order success, memory_order failure, memory_scope scope); ++#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) +#if defined(cl_khr_fp64) || defined(__opencl_c_fp64) +bool __ovld atomic_compare_exchange_strong(volatile atomic_double *object, + double *expected, double desired); +bool __ovld atomic_compare_exchange_weak(volatile atomic_double *object, + double *expected, double desired); - #endif ++#endif +bool __ovld atomic_compare_exchange_strong(volatile atomic_long *object, + long *expected, long desired); +bool __ovld atomic_compare_exchange_weak(volatile atomic_long *object, @@ -5717,7 +5177,39 @@ index 514c710c11..9dcd10d54f 100644 +bool __ovld atomic_compare_exchange_weak(volatile atomic_float __local *object, + float __private *expected, + float desired); -+#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) + #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) +-#ifdef cl_khr_fp64 +-bool __ovld atomic_compare_exchange_strong(volatile atomic_double *object, double *expected, double desired); +-bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *object, double *expected, +- double desired, memory_order success, memory_order failure); +-bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_double *object, double *expected, +- double desired, memory_order success, memory_order failure, memory_scope scope); +-bool __ovld atomic_compare_exchange_weak(volatile atomic_double *object, double *expected, double desired); +-bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *object, double *expected, +- double desired, memory_order success, memory_order failure); +-bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_double *object, double *expected, +- double desired, memory_order success, memory_order failure, memory_scope scope); +-#endif //cl_khr_fp64 +-bool __ovld atomic_compare_exchange_strong(volatile atomic_long *object, long *expected, long desired); +-bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *object, long *expected, +- long desired, memory_order success, memory_order failure); +-bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_long *object, long *expected, +- long desired, memory_order success, memory_order failure, memory_scope scope); +-bool __ovld atomic_compare_exchange_weak(volatile atomic_long *object, long *expected, long desired); +-bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *object, long *expected, +- long desired, memory_order success, memory_order failure); +-bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_long *object, long *expected, +- long desired, memory_order success, memory_order failure, memory_scope scope); +-bool __ovld atomic_compare_exchange_strong(volatile atomic_ulong *object, ulong *expected, ulong desired); +-bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *object, ulong *expected, +- ulong desired, memory_order success, memory_order failure); +-bool __ovld atomic_compare_exchange_strong_explicit(volatile atomic_ulong *object, ulong *expected, +- ulong desired, memory_order success, memory_order failure, memory_scope scope); +-bool __ovld atomic_compare_exchange_weak(volatile atomic_ulong *object, ulong *expected, ulong desired); +-bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *object, ulong *expected, +- ulong desired, memory_order success, memory_order failure); +-bool __ovld atomic_compare_exchange_weak_explicit(volatile atomic_ulong *object, ulong *expected, +- ulong desired, memory_order success, memory_order failure, memory_scope scope); +#if defined(cl_khr_fp64) || defined(__opencl_c_fp64) +bool __ovld +atomic_compare_exchange_strong(volatile atomic_double __global *object, @@ -5755,7 +5247,7 @@ index 514c710c11..9dcd10d54f 100644 +bool __ovld atomic_compare_exchange_weak(volatile atomic_double __local *object, + double __private *expected, + double desired); -+#endif + #endif +bool __ovld +atomic_compare_exchange_strong(volatile atomic_long __global *object, + long __global *expected, long desired); @@ -6487,7 +5979,7 @@ index 514c710c11..9dcd10d54f 100644 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 -@@ -14199,7 +16258,7 @@ float16 __ovld __cnfn shuffle(float4 x, uint16 mask); +@@ -14199,7 +16320,7 @@ float16 __ovld __cnfn shuffle(float4 x, uint16 mask); float16 __ovld __cnfn shuffle(float8 x, uint16 mask); float16 __ovld __cnfn shuffle(float16 x, uint16 mask); @@ -6496,7 +5988,7 @@ index 514c710c11..9dcd10d54f 100644 double2 __ovld __cnfn shuffle(double2 x, ulong2 mask); double2 __ovld __cnfn shuffle(double4 x, ulong2 mask); double2 __ovld __cnfn shuffle(double8 x, ulong2 mask); -@@ -14219,7 +16278,7 @@ double16 __ovld __cnfn shuffle(double2 x, ulong16 mask); +@@ -14219,7 +16340,7 @@ double16 __ovld __cnfn shuffle(double2 x, ulong16 mask); double16 __ovld __cnfn shuffle(double4 x, ulong16 mask); double16 __ovld __cnfn shuffle(double8 x, ulong16 mask); double16 __ovld __cnfn shuffle(double16 x, ulong16 mask); @@ -6505,7 +5997,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half2 __ovld __cnfn shuffle(half2 x, ushort2 mask); -@@ -14423,7 +16482,7 @@ float16 __ovld __cnfn shuffle2(float4 x, float4 y, uint16 mask); +@@ -14423,7 +16544,7 @@ float16 __ovld __cnfn shuffle2(float4 x, float4 y, uint16 mask); float16 __ovld __cnfn shuffle2(float8 x, float8 y, uint16 mask); float16 __ovld __cnfn shuffle2(float16 x, float16 y, uint16 mask); @@ -6514,7 +6006,7 @@ index 514c710c11..9dcd10d54f 100644 double2 __ovld __cnfn shuffle2(double2 x, double2 y, ulong2 mask); double2 __ovld __cnfn shuffle2(double4 x, double4 y, ulong2 mask); double2 __ovld __cnfn shuffle2(double8 x, double8 y, ulong2 mask); -@@ -14443,7 +16502,7 @@ double16 __ovld __cnfn shuffle2(double2 x, double2 y, ulong16 mask); +@@ -14443,7 +16564,7 @@ double16 __ovld __cnfn shuffle2(double2 x, double2 y, ulong16 mask); double16 __ovld __cnfn shuffle2(double4 x, double4 y, ulong16 mask); double16 __ovld __cnfn shuffle2(double8 x, double8 y, ulong16 mask); double16 __ovld __cnfn shuffle2(double16 x, double16 y, ulong16 mask); @@ -6523,7 +6015,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half2 __ovld __cnfn shuffle2(half2 x, half2 y, ushort2 mask); -@@ -14501,6 +16560,7 @@ int printf(__constant const char* st, ...); +@@ -14501,6 +16622,7 @@ int printf(__constant const char* st, ...); #pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable #endif //cl_khr_gl_msaa_sharing @@ -6531,7 +6023,7 @@ index 514c710c11..9dcd10d54f 100644 /** * Use the coordinate (coord.xy) to do an element lookup in * the 2D image object specified by image. -@@ -14802,7 +16862,8 @@ half4 __purefn __ovld read_imageh(read_only image1d_buffer_t image, int coord); +@@ -14802,7 +16924,8 @@ half4 __purefn __ovld read_imageh(read_only image1d_buffer_t image, int coord); #endif //cl_khr_fp16 // Image read functions for read_write images @@ -6541,7 +6033,7 @@ index 514c710c11..9dcd10d54f 100644 float4 __purefn __ovld read_imagef(read_write image1d_t image, int coord); int4 __purefn __ovld read_imagei(read_write image1d_t image, int coord); uint4 __purefn __ovld read_imageui(read_write image1d_t image, int coord); -@@ -14845,7 +16906,8 @@ float __purefn __ovld read_imagef(read_write image2d_msaa_depth_t image, int2 co +@@ -14845,7 +16968,8 @@ float __purefn __ovld read_imagef(read_write image2d_msaa_depth_t image, int2 co float __purefn __ovld read_imagef(read_write image2d_array_msaa_depth_t image, int4 coord, int sample); #endif //cl_khr_gl_msaa_sharing @@ -6551,7 +6043,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_mipmap_image float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float lod); int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float lod); -@@ -14919,7 +16981,8 @@ float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler +@@ -14919,7 +17043,8 @@ float4 __purefn __ovld read_imagef(read_write image3d_t image, sampler_t sampler int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, float4 coord, float lod); uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float lod); #endif //cl_khr_mipmap_image @@ -6561,7 +6053,7 @@ index 514c710c11..9dcd10d54f 100644 // Image read functions returning half4 type #ifdef cl_khr_fp16 -@@ -14930,7 +16993,8 @@ half4 __purefn __ovld read_imageh(read_write image1d_array_t image, int2 coord); +@@ -14930,7 +17055,8 @@ half4 __purefn __ovld read_imageh(read_write image1d_array_t image, int2 coord); half4 __purefn __ovld read_imageh(read_write image2d_array_t image, int4 coord); half4 __purefn __ovld read_imageh(read_write image1d_buffer_t image, int coord); #endif //cl_khr_fp16 @@ -6571,7 +6063,7 @@ index 514c710c11..9dcd10d54f 100644 /** * Write color value to location specified by coordinate -@@ -15019,7 +17083,7 @@ void __ovld write_imagef(write_only image1d_array_t image_array, int2 coord, flo +@@ -15019,7 +17145,7 @@ void __ovld write_imagef(write_only image1d_array_t image_array, int2 coord, flo void __ovld write_imagei(write_only image1d_array_t image_array, int2 coord, int4 color); void __ovld write_imageui(write_only image1d_array_t image_array, int2 coord, uint4 color); @@ -6580,7 +6072,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld write_imagef(write_only image3d_t image, int4 coord, float4 color); void __ovld write_imagei(write_only image3d_t image, int4 coord, int4 color); void __ovld write_imageui(write_only image3d_t image, int4 coord, uint4 color); -@@ -15052,7 +17116,7 @@ void __ovld write_imageui(write_only image2d_array_t image_array, int4 coord, in +@@ -15052,7 +17178,7 @@ void __ovld write_imageui(write_only image2d_array_t image_array, int4 coord, in void __ovld write_imagef(write_only image2d_depth_t image, int2 coord, int lod, float depth); void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, int lod, float depth); @@ -6589,7 +6081,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld write_imagef(write_only image3d_t image, int4 coord, int lod, float4 color); void __ovld write_imagei(write_only image3d_t image, int4 coord, int lod, int4 color); void __ovld write_imageui(write_only image3d_t image, int4 coord, int lod, uint4 color); -@@ -15065,7 +17129,7 @@ void __ovld write_imageui(write_only image3d_t image, int4 coord, int lod, uint4 +@@ -15065,7 +17191,7 @@ void __ovld write_imageui(write_only image3d_t image, int4 coord, int lod, uint4 #ifdef cl_khr_fp16 void __ovld write_imageh(write_only image1d_t image, int coord, half4 color); void __ovld write_imageh(write_only image2d_t image, int2 coord, half4 color); @@ -6598,7 +6090,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld write_imageh(write_only image3d_t image, int4 coord, half4 color); #endif void __ovld write_imageh(write_only image1d_array_t image, int2 coord, half4 color); -@@ -15074,7 +17138,8 @@ void __ovld write_imageh(write_only image1d_buffer_t image, int coord, half4 col +@@ -15074,7 +17200,8 @@ void __ovld write_imageh(write_only image1d_buffer_t image, int coord, half4 col #endif //cl_khr_fp16 // Image write functions for read_write images @@ -6608,7 +6100,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld write_imagef(read_write image2d_t image, int2 coord, float4 color); void __ovld write_imagei(read_write image2d_t image, int2 coord, int4 color); void __ovld write_imageui(read_write image2d_t image, int2 coord, uint4 color); -@@ -15095,7 +17160,7 @@ void __ovld write_imagef(read_write image1d_array_t image_array, int2 coord, flo +@@ -15095,7 +17222,7 @@ void __ovld write_imagef(read_write image1d_array_t image_array, int2 coord, flo void __ovld write_imagei(read_write image1d_array_t image_array, int2 coord, int4 color); void __ovld write_imageui(read_write image1d_array_t image_array, int2 coord, uint4 color); @@ -6617,7 +6109,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld write_imagef(read_write image3d_t image, int4 coord, float4 color); void __ovld write_imagei(read_write image3d_t image, int4 coord, int4 color); void __ovld write_imageui(read_write image3d_t image, int4 coord, uint4 color); -@@ -15127,7 +17192,7 @@ void __ovld write_imageui(read_write image2d_array_t image_array, int4 coord, in +@@ -15127,7 +17254,7 @@ void __ovld write_imageui(read_write image2d_array_t image_array, int4 coord, in void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, int lod, float color); void __ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, int lod, float color); @@ -6626,7 +6118,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld write_imagef(read_write image3d_t image, int4 coord, int lod, float4 color); void __ovld write_imagei(read_write image3d_t image, int4 coord, int lod, int4 color); void __ovld write_imageui(read_write image3d_t image, int4 coord, int lod, uint4 color); -@@ -15140,14 +17205,15 @@ void __ovld write_imageui(read_write image3d_t image, int4 coord, int lod, uint4 +@@ -15140,14 +17267,15 @@ void __ovld write_imageui(read_write image3d_t image, int4 coord, int lod, uint4 #ifdef cl_khr_fp16 void __ovld write_imageh(read_write image1d_t image, int coord, half4 color); void __ovld write_imageh(read_write image2d_t image, int2 coord, half4 color); @@ -6644,7 +6136,7 @@ index 514c710c11..9dcd10d54f 100644 // Note: In OpenCL v1.0/1.1/1.2, image argument of image query builtin functions does not have // access qualifier, which by default assume read_only access qualifier. Image query builtin -@@ -15160,7 +17226,7 @@ void __ovld write_imageh(read_write image1d_buffer_t image, int coord, half4 col +@@ -15160,7 +17288,7 @@ void __ovld write_imageh(read_write image1d_buffer_t image, int coord, half4 col int __ovld __cnfn get_image_width(read_only image1d_t image); int __ovld __cnfn get_image_width(read_only image1d_buffer_t image); int __ovld __cnfn get_image_width(read_only image2d_t image); @@ -6653,7 +6145,7 @@ index 514c710c11..9dcd10d54f 100644 int __ovld __cnfn get_image_width(read_only image3d_t image); #endif int __ovld __cnfn get_image_width(read_only image1d_array_t image); -@@ -15179,7 +17245,7 @@ int __ovld __cnfn get_image_width(read_only image2d_array_msaa_depth_t image); +@@ -15179,7 +17307,7 @@ int __ovld __cnfn get_image_width(read_only image2d_array_msaa_depth_t image); int __ovld __cnfn get_image_width(write_only image1d_t image); int __ovld __cnfn get_image_width(write_only image1d_buffer_t image); int __ovld __cnfn get_image_width(write_only image2d_t image); @@ -6662,7 +6154,7 @@ index 514c710c11..9dcd10d54f 100644 int __ovld __cnfn get_image_width(write_only image3d_t image); #endif int __ovld __cnfn get_image_width(write_only image1d_array_t image); -@@ -15195,7 +17261,8 @@ int __ovld __cnfn get_image_width(write_only image2d_array_msaa_t image); +@@ -15195,7 +17323,8 @@ int __ovld __cnfn get_image_width(write_only image2d_array_msaa_t image); int __ovld __cnfn get_image_width(write_only image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing @@ -6672,7 +6164,7 @@ index 514c710c11..9dcd10d54f 100644 int __ovld __cnfn get_image_width(read_write image1d_t image); int __ovld __cnfn get_image_width(read_write image1d_buffer_t image); int __ovld __cnfn get_image_width(read_write image2d_t image); -@@ -15212,7 +17279,8 @@ int __ovld __cnfn get_image_width(read_write image2d_msaa_depth_t image); +@@ -15212,7 +17341,8 @@ int __ovld __cnfn get_image_width(read_write image2d_msaa_depth_t image); int __ovld __cnfn get_image_width(read_write image2d_array_msaa_t image); int __ovld __cnfn get_image_width(read_write image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing @@ -6682,7 +6174,7 @@ index 514c710c11..9dcd10d54f 100644 /** * Return the image height in pixels. -@@ -15232,7 +17300,7 @@ int __ovld __cnfn get_image_height(read_only image2d_array_msaa_depth_t image); +@@ -15232,7 +17362,7 @@ int __ovld __cnfn get_image_height(read_only image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing int __ovld __cnfn get_image_height(write_only image2d_t image); @@ -6691,7 +6183,7 @@ index 514c710c11..9dcd10d54f 100644 int __ovld __cnfn get_image_height(write_only image3d_t image); #endif int __ovld __cnfn get_image_height(write_only image2d_array_t image); -@@ -15247,7 +17315,8 @@ int __ovld __cnfn get_image_height(write_only image2d_array_msaa_t image); +@@ -15247,7 +17377,8 @@ int __ovld __cnfn get_image_height(write_only image2d_array_msaa_t image); int __ovld __cnfn get_image_height(write_only image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing @@ -6701,7 +6193,7 @@ index 514c710c11..9dcd10d54f 100644 int __ovld __cnfn get_image_height(read_write image2d_t image); int __ovld __cnfn get_image_height(read_write image3d_t image); int __ovld __cnfn get_image_height(read_write image2d_array_t image); -@@ -15261,20 +17330,23 @@ int __ovld __cnfn get_image_height(read_write image2d_msaa_depth_t image); +@@ -15261,20 +17392,23 @@ int __ovld __cnfn get_image_height(read_write image2d_msaa_depth_t image); int __ovld __cnfn get_image_height(read_write image2d_array_msaa_t image); int __ovld __cnfn get_image_height(read_write image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing @@ -6729,7 +6221,7 @@ index 514c710c11..9dcd10d54f 100644 // OpenCL Extension v2.0 s9.18 - Mipmaps #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 -@@ -15289,13 +17361,15 @@ int __ovld get_image_num_mip_levels(read_only image3d_t image); +@@ -15289,13 +17423,15 @@ int __ovld get_image_num_mip_levels(read_only image3d_t image); int __ovld get_image_num_mip_levels(write_only image1d_t image); int __ovld get_image_num_mip_levels(write_only image2d_t image); @@ -6746,7 +6238,7 @@ index 514c710c11..9dcd10d54f 100644 int __ovld get_image_num_mip_levels(read_only image1d_array_t image); int __ovld get_image_num_mip_levels(read_only image2d_array_t image); -@@ -15307,10 +17381,12 @@ int __ovld get_image_num_mip_levels(write_only image2d_array_t image); +@@ -15307,10 +17443,12 @@ int __ovld get_image_num_mip_levels(write_only image2d_array_t image); int __ovld get_image_num_mip_levels(write_only image2d_array_depth_t image); int __ovld get_image_num_mip_levels(write_only image2d_depth_t image); @@ -6759,7 +6251,7 @@ index 514c710c11..9dcd10d54f 100644 #endif //cl_khr_mipmap_image #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 -@@ -15374,7 +17450,7 @@ int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_msaa_depth +@@ -15374,7 +17512,7 @@ int __ovld __cnfn get_image_channel_data_type(read_only image2d_array_msaa_depth int __ovld __cnfn get_image_channel_data_type(write_only image1d_t image); int __ovld __cnfn get_image_channel_data_type(write_only image1d_buffer_t image); int __ovld __cnfn get_image_channel_data_type(write_only image2d_t image); @@ -6768,7 +6260,7 @@ index 514c710c11..9dcd10d54f 100644 int __ovld __cnfn get_image_channel_data_type(write_only image3d_t image); #endif int __ovld __cnfn get_image_channel_data_type(write_only image1d_array_t image); -@@ -15390,7 +17466,8 @@ int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_t im +@@ -15390,7 +17528,8 @@ int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_t im int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing @@ -6778,7 +6270,7 @@ index 514c710c11..9dcd10d54f 100644 int __ovld __cnfn get_image_channel_data_type(read_write image1d_t image); int __ovld __cnfn get_image_channel_data_type(read_write image1d_buffer_t image); int __ovld __cnfn get_image_channel_data_type(read_write image2d_t image); -@@ -15407,7 +17484,8 @@ int __ovld __cnfn get_image_channel_data_type(read_write image2d_msaa_depth_t im +@@ -15407,7 +17546,8 @@ int __ovld __cnfn get_image_channel_data_type(read_write image2d_msaa_depth_t im int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_t image); int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing @@ -6788,7 +6280,7 @@ index 514c710c11..9dcd10d54f 100644 /** * Return the image channel order. Valid values are: -@@ -15470,7 +17548,7 @@ int __ovld __cnfn get_image_channel_order(read_only image2d_array_msaa_depth_t i +@@ -15470,7 +17610,7 @@ int __ovld __cnfn get_image_channel_order(read_only image2d_array_msaa_depth_t i int __ovld __cnfn get_image_channel_order(write_only image1d_t image); int __ovld __cnfn get_image_channel_order(write_only image1d_buffer_t image); int __ovld __cnfn get_image_channel_order(write_only image2d_t image); @@ -6797,7 +6289,7 @@ index 514c710c11..9dcd10d54f 100644 int __ovld __cnfn get_image_channel_order(write_only image3d_t image); #endif int __ovld __cnfn get_image_channel_order(write_only image1d_array_t image); -@@ -15486,7 +17564,8 @@ int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_t image) +@@ -15486,7 +17626,8 @@ int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_t image) int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing @@ -6807,7 +6299,7 @@ index 514c710c11..9dcd10d54f 100644 int __ovld __cnfn get_image_channel_order(read_write image1d_t image); int __ovld __cnfn get_image_channel_order(read_write image1d_buffer_t image); int __ovld __cnfn get_image_channel_order(read_write image2d_t image); -@@ -15503,7 +17582,8 @@ int __ovld __cnfn get_image_channel_order(read_write image2d_msaa_depth_t image) +@@ -15503,7 +17644,8 @@ int __ovld __cnfn get_image_channel_order(read_write image2d_msaa_depth_t image) int __ovld __cnfn get_image_channel_order(read_write image2d_array_msaa_t image); int __ovld __cnfn get_image_channel_order(read_write image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing @@ -6817,7 +6309,7 @@ index 514c710c11..9dcd10d54f 100644 /** * Return the 2D image width and height as an int2 -@@ -15536,7 +17616,8 @@ int2 __ovld __cnfn get_image_dim(write_only image2d_array_msaa_t image); +@@ -15536,7 +17678,8 @@ int2 __ovld __cnfn get_image_dim(write_only image2d_array_msaa_t image); int2 __ovld __cnfn get_image_dim(write_only image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing @@ -6827,7 +6319,7 @@ index 514c710c11..9dcd10d54f 100644 int2 __ovld __cnfn get_image_dim(read_write image2d_t image); int2 __ovld __cnfn get_image_dim(read_write image2d_array_t image); #ifdef cl_khr_depth_images -@@ -15549,7 +17630,8 @@ int2 __ovld __cnfn get_image_dim(read_write image2d_msaa_depth_t image); +@@ -15549,7 +17692,8 @@ int2 __ovld __cnfn get_image_dim(read_write image2d_msaa_depth_t image); int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_t image); int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing @@ -6837,7 +6329,7 @@ index 514c710c11..9dcd10d54f 100644 /** * Return the 3D image width, height, and depth as an -@@ -15558,12 +17640,14 @@ int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_depth_t image); +@@ -15558,12 +17702,14 @@ int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_depth_t image); * component and the w component is 0. */ int4 __ovld __cnfn get_image_dim(read_only image3d_t image); @@ -6855,7 +6347,7 @@ index 514c710c11..9dcd10d54f 100644 /** * Return the image array size. -@@ -15589,7 +17673,8 @@ size_t __ovld __cnfn get_image_array_size(write_only image2d_array_msaa_t image_ +@@ -15589,7 +17735,8 @@ size_t __ovld __cnfn get_image_array_size(write_only image2d_array_msaa_t image_ size_t __ovld __cnfn get_image_array_size(write_only image2d_array_msaa_depth_t image_array); #endif //cl_khr_gl_msaa_sharing @@ -6865,7 +6357,7 @@ index 514c710c11..9dcd10d54f 100644 size_t __ovld __cnfn get_image_array_size(read_write image1d_array_t image_array); size_t __ovld __cnfn get_image_array_size(read_write image2d_array_t image_array); #ifdef cl_khr_depth_images -@@ -15599,7 +17684,8 @@ size_t __ovld __cnfn get_image_array_size(read_write image2d_array_depth_t image +@@ -15599,7 +17746,8 @@ size_t __ovld __cnfn get_image_array_size(read_write image2d_array_depth_t image size_t __ovld __cnfn get_image_array_size(read_write image2d_array_msaa_t image_array); size_t __ovld __cnfn get_image_array_size(read_write image2d_array_msaa_depth_t image_array); #endif //cl_khr_gl_msaa_sharing @@ -6875,7 +6367,7 @@ index 514c710c11..9dcd10d54f 100644 /** * Return the number of samples associated with image -@@ -15617,18 +17703,23 @@ int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image); +@@ -15617,18 +17765,23 @@ int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image); int __ovld get_image_num_samples(write_only image2d_array_msaa_t image); int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image); @@ -6902,7 +6394,7 @@ index 514c710c11..9dcd10d54f 100644 int __ovld __conv work_group_all(int predicate); int __ovld __conv work_group_any(int predicate); -@@ -15652,11 +17743,11 @@ ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y, size_t z); +@@ -15652,11 +17805,11 @@ ulong __ovld __conv work_group_broadcast(ulong a, size_t x, size_t y, size_t z); float __ovld __conv work_group_broadcast(float a, size_t local_id); float __ovld __conv work_group_broadcast(float a, size_t x, size_t y); float __ovld __conv work_group_broadcast(float a, size_t x, size_t y, size_t z); @@ -6916,7 +6408,7 @@ index 514c710c11..9dcd10d54f 100644 #ifdef cl_khr_fp16 half __ovld __conv work_group_reduce_add(half x); -@@ -15714,7 +17805,7 @@ float __ovld __conv work_group_scan_exclusive_max(float x); +@@ -15714,7 +17867,7 @@ float __ovld __conv work_group_scan_exclusive_max(float x); float __ovld __conv work_group_scan_inclusive_add(float x); float __ovld __conv work_group_scan_inclusive_min(float x); float __ovld __conv work_group_scan_inclusive_max(float x); @@ -6925,7 +6417,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld __conv work_group_reduce_add(double x); double __ovld __conv work_group_reduce_min(double x); double __ovld __conv work_group_reduce_max(double x); -@@ -15724,19 +17815,12 @@ double __ovld __conv work_group_scan_exclusive_max(double x); +@@ -15724,19 +17877,12 @@ double __ovld __conv work_group_scan_exclusive_max(double x); double __ovld __conv work_group_scan_inclusive_add(double x); double __ovld __conv work_group_scan_inclusive_min(double x); double __ovld __conv work_group_scan_inclusive_max(double x); @@ -6949,7 +6441,7 @@ index 514c710c11..9dcd10d54f 100644 #define CL_COMPLETE 0x0 #define CL_RUNNING 0x1 -@@ -15775,7 +17859,17 @@ typedef struct { +@@ -15775,7 +17921,17 @@ typedef struct { size_t globalWorkSize[MAX_WORK_DIM]; size_t localWorkSize[MAX_WORK_DIM]; } ndrange_t; @@ -6967,7 +6459,7 @@ index 514c710c11..9dcd10d54f 100644 ndrange_t __ovld ndrange_1D(size_t); ndrange_t __ovld ndrange_1D(size_t, size_t); ndrange_t __ovld ndrange_1D(size_t, size_t, size_t); -@@ -15803,11 +17897,13 @@ bool __ovld is_valid_event (clk_event_t event); +@@ -15803,11 +17959,13 @@ bool __ovld is_valid_event (clk_event_t event); void __ovld capture_event_profiling_info(clk_event_t, clk_profiling_info, __global void* value); queue_t __ovld get_default_queue(void); @@ -6983,7 +6475,7 @@ index 514c710c11..9dcd10d54f 100644 // Shared Sub Group Functions uint __ovld get_sub_group_size(void); uint __ovld get_max_sub_group_size(void); -@@ -15893,7 +17989,7 @@ half __ovld __conv sub_group_scan_inclusive_min(half x); +@@ -15893,7 +18051,7 @@ half __ovld __conv sub_group_scan_inclusive_min(half x); half __ovld __conv sub_group_scan_inclusive_max(half x); #endif //cl_khr_fp16 @@ -6992,7 +6484,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld __conv sub_group_broadcast(double x, uint sub_group_local_id); double __ovld __conv sub_group_reduce_add(double x); double __ovld __conv sub_group_reduce_min(double x); -@@ -15904,7 +18000,7 @@ double __ovld __conv sub_group_scan_exclusive_max(double x); +@@ -15904,7 +18062,7 @@ double __ovld __conv sub_group_scan_exclusive_max(double x); double __ovld __conv sub_group_scan_inclusive_add(double x); double __ovld __conv sub_group_scan_inclusive_min(double x); double __ovld __conv sub_group_scan_inclusive_max(double x); @@ -7001,7 +6493,7 @@ index 514c710c11..9dcd10d54f 100644 #endif //cl_khr_subgroups cl_intel_subgroups -@@ -16006,34 +18102,46 @@ uint16 __ovld __conv intel_sub_group_shuffle_xor( uint16 x, uint c ); +@@ -16006,34 +18164,46 @@ uint16 __ovld __conv intel_sub_group_shuffle_xor( uint16 x, uint c ); long __ovld __conv intel_sub_group_shuffle_xor( long x, uint c ); ulong __ovld __conv intel_sub_group_shuffle_xor( ulong x, uint c ); @@ -7052,7 +6544,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld __conv intel_sub_group_block_write( __global uint* p, uint data ); void __ovld __conv intel_sub_group_block_write2( __global uint* p, uint2 data ); -@@ -16047,7 +18155,7 @@ half __ovld __conv intel_sub_group_shuffle_up( half prev, half cur, uint c ); +@@ -16047,7 +18217,7 @@ half __ovld __conv intel_sub_group_shuffle_up( half prev, half cur, uint c ); half __ovld __conv intel_sub_group_shuffle_xor( half x, uint c ); #endif @@ -7061,7 +6553,7 @@ index 514c710c11..9dcd10d54f 100644 double __ovld __conv intel_sub_group_shuffle( double x, uint c ); double __ovld __conv intel_sub_group_shuffle_down( double prev, double cur, uint c ); double __ovld __conv intel_sub_group_shuffle_up( double prev, double cur, uint c ); -@@ -16146,68 +18254,92 @@ ushort __ovld __conv intel_sub_group_scan_inclusive_min( ushort x ); +@@ -16146,68 +18316,92 @@ ushort __ovld __conv intel_sub_group_scan_inclusive_min( ushort x ); short __ovld __conv intel_sub_group_scan_inclusive_max( short x ); ushort __ovld __conv intel_sub_group_scan_inclusive_max( ushort x ); @@ -7162,7 +6654,7 @@ index 514c710c11..9dcd10d54f 100644 void __ovld __conv intel_sub_group_block_write_us( __global ushort* p, ushort data ); void __ovld __conv intel_sub_group_block_write_us2( __global ushort* p, ushort2 data ); -@@ -16457,6 +18589,7 @@ short2 __ovld intel_sub_group_avc_ime_adjust_ref_offset( +@@ -16457,6 +18651,7 @@ short2 __ovld intel_sub_group_avc_ime_adjust_ref_offset( short2 ref_offset, ushort2 src_coord, ushort2 ref_window_size, ushort2 image_size); @@ -7170,7 +6662,7 @@ index 514c710c11..9dcd10d54f 100644 intel_sub_group_avc_ime_result_t __ovld intel_sub_group_avc_ime_evaluate_with_single_reference( read_only image2d_t src_image, read_only image2d_t ref_image, -@@ -16497,6 +18630,7 @@ intel_sub_group_avc_ime_evaluate_with_dual_reference_streaminout( +@@ -16497,6 +18692,7 @@ intel_sub_group_avc_ime_evaluate_with_dual_reference_streaminout( read_only image2d_t bwd_ref_image, sampler_t vme_media_sampler, intel_sub_group_avc_ime_payload_t payload, intel_sub_group_avc_ime_dual_reference_streamin_t streamin_components); @@ -7178,7 +6670,7 @@ index 514c710c11..9dcd10d54f 100644 intel_sub_group_avc_ime_single_reference_streamin_t __ovld intel_sub_group_avc_ime_get_single_reference_streamin( -@@ -16561,6 +18695,7 @@ intel_sub_group_avc_ref_payload_t __ovld +@@ -16561,6 +18757,7 @@ intel_sub_group_avc_ref_payload_t __ovld intel_sub_group_avc_ref_set_bilinear_filter_enable( intel_sub_group_avc_ref_payload_t payload); @@ -7186,7 +6678,7 @@ index 514c710c11..9dcd10d54f 100644 intel_sub_group_avc_ref_result_t __ovld intel_sub_group_avc_ref_evaluate_with_single_reference( read_only image2d_t src_image, read_only image2d_t ref_image, -@@ -16579,6 +18714,7 @@ intel_sub_group_avc_ref_evaluate_with_multi_reference( +@@ -16579,6 +18776,7 @@ intel_sub_group_avc_ref_evaluate_with_multi_reference( read_only image2d_t src_image, uint packed_reference_ids, uchar packed_reference_field_polarities, sampler_t vme_media_sampler, intel_sub_group_avc_ref_payload_t payload); @@ -7194,7 +6686,7 @@ index 514c710c11..9dcd10d54f 100644 // SIC built-in functions intel_sub_group_avc_sic_payload_t __ovld -@@ -16629,6 +18765,7 @@ intel_sub_group_avc_sic_set_block_based_raw_skip_sad( +@@ -16629,6 +18827,7 @@ intel_sub_group_avc_sic_set_block_based_raw_skip_sad( uchar block_based_skip_type, intel_sub_group_avc_sic_payload_t payload); @@ -7202,7 +6694,7 @@ index 514c710c11..9dcd10d54f 100644 intel_sub_group_avc_sic_result_t __ovld intel_sub_group_avc_sic_evaluate_ipe( read_only image2d_t src_image, sampler_t vme_media_sampler, -@@ -16651,6 +18788,7 @@ intel_sub_group_avc_sic_evaluate_with_multi_reference( +@@ -16651,6 +18850,7 @@ intel_sub_group_avc_sic_evaluate_with_multi_reference( read_only image2d_t src_image, uint packed_reference_ids, uchar packed_reference_field_polarities, sampler_t vme_media_sampler, intel_sub_group_avc_sic_payload_t payload); diff --git a/patches/clang/0007-OpenCL-Add-cl_khr_extended_subgroup-extensions.patch b/patches/clang/0007-OpenCL-Add-cl_khr_extended_subgroup-extensions.patch index 11a26f6d..2be8dd21 100644 --- a/patches/clang/0007-OpenCL-Add-cl_khr_extended_subgroup-extensions.patch +++ b/patches/clang/0007-OpenCL-Add-cl_khr_extended_subgroup-extensions.patch @@ -1,7 +1,7 @@ -From fe6f30499053cf9bd2c5c4acc82e06947af1eff2 Mon Sep 17 00:00:00 2001 +From bac6a22b16e23f6845c2852035aa880f0fee35d7 Mon Sep 17 00:00:00 2001 From: Anastasia Stulova -Date: Thu, 24 Sep 2020 12:08:28 +0300 -Subject: [PATCH] [PATCH] [OpenCL] Add cl_khr_extended_subgroup extensions. +Date: Fri, 25 Sep 2020 21:11:01 +0300 +Subject: [PATCH] [OpenCL] Add cl_khr_extended_subgroup extensions. Added extensions and their function declarations into the standard header. @@ -36,10 +36,10 @@ index 77c905ac6c..92959e2b28 100644 // Clang Extensions. OPENCLEXT_INTERNAL(cl_clang_storage_class_specifiers, 100, ~0U) diff --git a/lib/Headers/opencl-c.h b/lib/Headers/opencl-c.h -index 9dcd10d54f..812d7ccf85 100644 +index 7def52945a..db14ce75a7 100644 --- a/lib/Headers/opencl-c.h +++ b/lib/Headers/opencl-c.h -@@ -18004,6 +18004,674 @@ double __ovld __conv sub_group_scan_inclusive_max(double x); +@@ -18066,6 +18066,674 @@ double __ovld __conv sub_group_scan_inclusive_max(double x); #endif //cl_khr_subgroups cl_intel_subgroups