Add neon intrinsics for aarch64

Related-To: NEO-6452 Signed-off-by: Sebastian Luzynski <sebastian.jozef.luzynski@intel.com>
intel · Mar 29, 2022 · cf90603 · cf90603
1 parent c7d8915
commit cf90603
Show file tree

Hide file tree

Showing 16 changed files with 445 additions and 27 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -618,6 +618,7 @@ else()
   endif()
   check_cxx_compiler_flag(-msse4.2 COMPILER_SUPPORTS_SSE42)
   check_cxx_compiler_flag(-mavx2 COMPILER_SUPPORTS_AVX2)
+  check_cxx_compiler_flag(-march=armv8-a+simd COMPILER_SUPPORTS_NEON)
 endif()
 
 if(NOT MSVC)

diff --git a/shared/source/helpers/aarch64/CMakeLists.txt b/shared/source/helpers/aarch64/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2019-2021 Intel Corporation
+# Copyright (C) 2019-2022 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 #
@@ -10,5 +10,12 @@ if(${NEO_TARGET_PROCESSOR} STREQUAL "aarch64")
        ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
   )
 
+  if(COMPILER_SUPPORTS_NEON)
+    list(APPEND NEO_CORE_HELPERS
+         ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_neon.cpp
+         ${CMAKE_CURRENT_SOURCE_DIR}/uint16_neon.h
+    )
+  endif()
+
   set_property(GLOBAL PROPERTY NEO_CORE_HELPERS ${NEO_CORE_HELPERS})
 endif()
diff --git a/shared/source/helpers/aarch64/local_id_gen.cpp b/shared/source/helpers/aarch64/local_id_gen.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -9,10 +9,12 @@
 
 #include "shared/source/helpers/aligned_memory.h"
 #include "shared/source/helpers/local_id_gen_special.inl"
+#include "shared/source/utilities/cpu_info.h"
 
 namespace NEO {
 
 struct uint16x8_t;
+struct uint16x16_t;
 
 // This is the initial value of SIMD for local ID
 // computation.  It correlates to the SIMD lane.
@@ -27,6 +29,18 @@ void (*LocalIDHelper::generateSimd8)(void *buffer, const std::array<uint16_t, 3>
 void (*LocalIDHelper::generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 16>;
 void (*LocalIDHelper::generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 32>;
 
+// Initialize the lookup table based on CPU capabilities
+LocalIDHelper::LocalIDHelper() {
+    bool supportsNEON = CpuInfo::getInstance().isFeatureSupported(CpuInfo::featureNeon);
+    if (supportsNEON) {
+        LocalIDHelper::generateSimd8 = generateLocalIDsSimd<uint16x8_t, 8>;
+        LocalIDHelper::generateSimd16 = generateLocalIDsSimd<uint16x16_t, 16>;
+        LocalIDHelper::generateSimd32 = generateLocalIDsSimd<uint16x16_t, 32>;
+    }
+}
+
+LocalIDHelper LocalIDHelper::initializer;
+
 void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
     auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
     bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);

diff --git a/shared/source/helpers/aarch64/local_id_gen_neon.cpp b/shared/source/helpers/aarch64/local_id_gen_neon.cpp
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) 2022 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/helpers/aarch64/uint16_neon.h"
+#include "shared/source/helpers/local_id_gen.inl"
+
+#include <array>
+
+namespace NEO {
+template void generateLocalIDsSimd<uint16x16_t, 8>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
+template void generateLocalIDsSimd<uint16x16_t, 16>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
+template void generateLocalIDsSimd<uint16x16_t, 32>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
+} // namespace NEO
diff --git a/shared/source/helpers/aarch64/uint16_neon.h b/shared/source/helpers/aarch64/uint16_neon.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright (C) 2022 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#pragma once
+#include "shared/source/helpers/aligned_memory.h"
+#include "shared/source/helpers/debug_helpers.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+namespace NEO {
+
+struct uint16x16_t {
+    enum { numChannels = 16 };
+
+    uint16x8x2_t value;
+
+    uint16x16_t() {
+        value.val[0] = vdupq_n_u16(0);
+        value.val[1] = vdupq_n_u16(0);
+    }
+
+    uint16x16_t(uint16x8_t lo, uint16x8_t hi) {
+        value.val[0] = lo;
+        value.val[1] = hi;
+    }
+
+    uint16x16_t(uint16_t a) {
+        value.val[0] = vdupq_n_u16(a);
+        value.val[1] = vdupq_n_u16(a);
+    }
+
+    explicit uint16x16_t(const void *alignedPtr) {
+        load(alignedPtr);
+    }
+
+    inline uint16_t get(unsigned int element) {
+        DEBUG_BREAK_IF(element >= numChannels);
+        uint16_t result;
+        // vgetq_lane requires constant immediate
+        switch (element) {
+        case 0:
+            result = vgetq_lane_u16(value.val[0], 0);
+            break;
+        case 1:
+            result = vgetq_lane_u16(value.val[0], 1);
+            break;
+        case 2:
+            result = vgetq_lane_u16(value.val[0], 2);
+            break;
+        case 3:
+            result = vgetq_lane_u16(value.val[0], 3);
+            break;
+        case 4:
+            result = vgetq_lane_u16(value.val[0], 4);
+            break;
+        case 5:
+            result = vgetq_lane_u16(value.val[0], 5);
+            break;
+        case 6:
+            result = vgetq_lane_u16(value.val[0], 6);
+            break;
+        case 7:
+            result = vgetq_lane_u16(value.val[0], 7);
+            break;
+        case 8:
+            result = vgetq_lane_u16(value.val[1], 0);
+            break;
+        case 9:
+            result = vgetq_lane_u16(value.val[1], 1);
+            break;
+        case 10:
+            result = vgetq_lane_u16(value.val[1], 2);
+            break;
+        case 11:
+            result = vgetq_lane_u16(value.val[1], 3);
+            break;
+        case 12:
+            result = vgetq_lane_u16(value.val[1], 4);
+            break;
+        case 13:
+            result = vgetq_lane_u16(value.val[1], 5);
+            break;
+        case 14:
+            result = vgetq_lane_u16(value.val[1], 6);
+            break;
+        case 15:
+            result = vgetq_lane_u16(value.val[1], 7);
+            break;
+        }
+
+        return result;
+    }
+
+    static inline uint16x16_t zero() {
+        return uint16x16_t(static_cast<uint16_t>(0u));
+    }
+
+    static inline uint16x16_t one() {
+        return uint16x16_t(static_cast<uint16_t>(1u));
+    }
+
+    static inline uint16x16_t mask() {
+        return uint16x16_t(static_cast<uint16_t>(0xffffu));
+    }
+
+    inline void load(const void *alignedPtr) {
+        DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
+        value = vld1q_u16_x2(reinterpret_cast<const uint16_t *>(alignedPtr));
+    }
+
+    inline void store(void *alignedPtr) {
+        DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
+        vst1q_u16_x2(reinterpret_cast<uint16_t *>(alignedPtr), value);
+    }
+
+    inline operator bool() const {
+        uint64x2_t hi = vreinterpretq_u64_u16(value.val[0]);
+        uint64x2_t lo = vreinterpretq_u64_u16(value.val[1]);
+        uint64x2_t tmp = vorrq_u64(hi, lo);
+        uint64_t result = vget_lane_u64(vorr_u64(vget_high_u64(tmp), vget_low_u64(tmp)), 0);
+
+        return result;
+    }
+
+    inline uint16x16_t &operator-=(const uint16x16_t &a) {
+        value.val[0] = vsubq_u16(value.val[0], a.value.val[0]);
+        value.val[1] = vsubq_u16(value.val[1], a.value.val[1]);
+
+        return *this;
+    }
+
+    inline uint16x16_t &operator+=(const uint16x16_t &a) {
+        value.val[0] = vaddq_u16(value.val[0], a.value.val[0]);
+        value.val[1] = vaddq_u16(value.val[1], a.value.val[1]);
+
+        return *this;
+    }
+
+    inline friend uint16x16_t operator>=(const uint16x16_t &a, const uint16x16_t &b) {
+        uint16x16_t result;
+
+        result.value.val[0] = veorq_u16(mask().value.val[0],
+                                        vcgtq_u16(b.value.val[0], a.value.val[0]));
+        result.value.val[1] = veorq_u16(mask().value.val[1],
+                                        vcgtq_u16(b.value.val[1], a.value.val[1]));
+        return result;
+    }
+
+    inline friend uint16x16_t operator&&(const uint16x16_t &a, const uint16x16_t &b) {
+        uint16x16_t result;
+
+        result.value.val[0] = vandq_u16(a.value.val[0], b.value.val[0]);
+        result.value.val[1] = vandq_u16(a.value.val[1], b.value.val[1]);
+
+        return result;
+    }
+
+    // NOTE: uint16x16_t::blend behaves like mask ? a : b
+    inline friend uint16x16_t blend(const uint16x16_t &a, const uint16x16_t &b, const uint16x16_t &mask) {
+        uint16x16_t result;
+
+        result.value.val[0] = vbslq_u16(mask.value.val[0], a.value.val[0], b.value.val[0]);
+        result.value.val[1] = vbslq_u16(mask.value.val[1], a.value.val[1], b.value.val[1]);
+
+        return result;
+    }
+};
+} // namespace NEO
diff --git a/shared/source/utilities/aarch64/cpu_info_aarch64.cpp b/shared/source/utilities/aarch64/cpu_info_aarch64.cpp
@@ -1,13 +1,18 @@
 /*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
  */
 
 #include "shared/source/utilities/cpu_info.h"
 
+#include <asm/hwcap.h>
+
 namespace NEO {
 void CpuInfo::detect() const {
+    uint32_t cpuInfo[4] = {};
+    cpuid(cpuInfo, 0u);
+    features |= cpuInfo[0] & HWCAP_ASIMD ? featureNeon : featureNone;
 }
 } // namespace NEO
diff --git a/shared/source/utilities/cpu_info.h b/shared/source/utilities/cpu_info.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,6 +42,7 @@ struct CpuInfo {
     static const uint64_t featureHle = 0x000200000ULL;
     static const uint64_t featureRtm = 0x000400000ULL;
     static const uint64_t featureAvX2 = 0x000800000ULL;
+    static const uint64_t featureNeon = 0x001000000ULL;
     static const uint64_t featureKncni = 0x004000000ULL;
     static const uint64_t featureAvX512F = 0x008000000ULL;
     static const uint64_t featureAdx = 0x010000000ULL;

diff --git a/shared/source/utilities/linux/aarch64/cpu_info.cpp b/shared/source/utilities/linux/aarch64/cpu_info.cpp
@@ -11,10 +11,12 @@
 
 #include <cstdint>
 #include <fstream>
+#include <sys/auxv.h>
 
 namespace NEO {
 
 void cpuid_linux_wrapper(int cpuInfo[4], int functionId) {
+    cpuInfo[0] = getauxval(AT_HWCAP);
 }
 
 void cpuidex_linux_wrapper(int *cpuInfo, int functionId, int subfunctionId) {
@@ -24,7 +26,7 @@ void get_cpu_flags_linux(std::string &cpuFlags) {
     std::ifstream cpuinfo(std::string(Os::sysFsProcPathPrefix) + "/cpuinfo");
     std::string line;
     while (std::getline(cpuinfo, line)) {
-        if (line.substr(0, 5) == "flags") {
+        if (line.substr(0, 8) == "Features") {
             cpuFlags = line;
             break;
         }

diff --git a/shared/test/unit_test/helpers/CMakeLists.txt b/shared/test/unit_test/helpers/CMakeLists.txt
@@ -25,5 +25,11 @@ set(IGDRCL_SRCS_tests_helpers
     ${CMAKE_CURRENT_SOURCE_DIR}/test_hw_info_config.cpp
 )
 
+if(COMPILER_SUPPORTS_NEON)
+  list(APPEND IGDRCL_SRCS_tests_helpers
+       ${CMAKE_CURRENT_SOURCE_DIR}/uint16_neon_tests.cpp
+  )
+endif()
+
 target_sources(${TARGET_NAME} PRIVATE ${IGDRCL_SRCS_tests_helpers})
 add_subdirectories()