Skip to content

Commit

Permalink
Add neon intrinsics for aarch64
Browse files Browse the repository at this point in the history
Related-To: NEO-6452

Signed-off-by: Sebastian Luzynski <sebastian.jozef.luzynski@intel.com>
  • Loading branch information
SebastianLuzynski authored and Compute-Runtime-Automation committed Mar 29, 2022
1 parent c7d8915 commit cf90603
Show file tree
Hide file tree
Showing 16 changed files with 445 additions and 27 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Expand Up @@ -618,6 +618,7 @@ else()
endif()
check_cxx_compiler_flag(-msse4.2 COMPILER_SUPPORTS_SSE42)
check_cxx_compiler_flag(-mavx2 COMPILER_SUPPORTS_AVX2)
check_cxx_compiler_flag(-march=armv8-a+simd COMPILER_SUPPORTS_NEON)
endif()

if(NOT MSVC)
Expand Down
9 changes: 8 additions & 1 deletion shared/source/helpers/aarch64/CMakeLists.txt
@@ -1,5 +1,5 @@
#
# Copyright (C) 2019-2021 Intel Corporation
# Copyright (C) 2019-2022 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
Expand All @@ -10,5 +10,12 @@ if(${NEO_TARGET_PROCESSOR} STREQUAL "aarch64")
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
)

if(COMPILER_SUPPORTS_NEON)
list(APPEND NEO_CORE_HELPERS
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_neon.cpp
${CMAKE_CURRENT_SOURCE_DIR}/uint16_neon.h
)
endif()

set_property(GLOBAL PROPERTY NEO_CORE_HELPERS ${NEO_CORE_HELPERS})
endif()
16 changes: 15 additions & 1 deletion shared/source/helpers/aarch64/local_id_gen.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
* Copyright (C) 2018-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
Expand All @@ -9,10 +9,12 @@

#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/local_id_gen_special.inl"
#include "shared/source/utilities/cpu_info.h"

namespace NEO {

struct uint16x8_t;
struct uint16x16_t;

// This is the initial value of SIMD for local ID
// computation. It correlates to the SIMD lane.
Expand All @@ -27,6 +29,18 @@ void (*LocalIDHelper::generateSimd8)(void *buffer, const std::array<uint16_t, 3>
void (*LocalIDHelper::generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 16>;
void (*LocalIDHelper::generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 32>;

// Initialize the lookup table based on CPU capabilities
LocalIDHelper::LocalIDHelper() {
bool supportsNEON = CpuInfo::getInstance().isFeatureSupported(CpuInfo::featureNeon);
if (supportsNEON) {
LocalIDHelper::generateSimd8 = generateLocalIDsSimd<uint16x8_t, 8>;
LocalIDHelper::generateSimd16 = generateLocalIDsSimd<uint16x16_t, 16>;
LocalIDHelper::generateSimd32 = generateLocalIDsSimd<uint16x16_t, 32>;
}
}

LocalIDHelper LocalIDHelper::initializer;

void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2]));
bool useLayoutForImages = isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd);
Expand Down
17 changes: 17 additions & 0 deletions shared/source/helpers/aarch64/local_id_gen_neon.cpp
@@ -0,0 +1,17 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/

#include "shared/source/helpers/aarch64/uint16_neon.h"
#include "shared/source/helpers/local_id_gen.inl"

#include <array>

namespace NEO {
template void generateLocalIDsSimd<uint16x16_t, 8>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
template void generateLocalIDsSimd<uint16x16_t, 16>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
template void generateLocalIDsSimd<uint16x16_t, 32>(void *b, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize);
} // namespace NEO
173 changes: 173 additions & 0 deletions shared/source/helpers/aarch64/uint16_neon.h
@@ -0,0 +1,173 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/

#pragma once
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/debug_helpers.h"

#include <arm_neon.h>
#include <cstdint>

namespace NEO {

struct uint16x16_t {
enum { numChannels = 16 };

uint16x8x2_t value;

uint16x16_t() {
value.val[0] = vdupq_n_u16(0);
value.val[1] = vdupq_n_u16(0);
}

uint16x16_t(uint16x8_t lo, uint16x8_t hi) {
value.val[0] = lo;
value.val[1] = hi;
}

uint16x16_t(uint16_t a) {
value.val[0] = vdupq_n_u16(a);
value.val[1] = vdupq_n_u16(a);
}

explicit uint16x16_t(const void *alignedPtr) {
load(alignedPtr);
}

inline uint16_t get(unsigned int element) {
DEBUG_BREAK_IF(element >= numChannels);
uint16_t result;
// vgetq_lane requires constant immediate
switch (element) {
case 0:
result = vgetq_lane_u16(value.val[0], 0);
break;
case 1:
result = vgetq_lane_u16(value.val[0], 1);
break;
case 2:
result = vgetq_lane_u16(value.val[0], 2);
break;
case 3:
result = vgetq_lane_u16(value.val[0], 3);
break;
case 4:
result = vgetq_lane_u16(value.val[0], 4);
break;
case 5:
result = vgetq_lane_u16(value.val[0], 5);
break;
case 6:
result = vgetq_lane_u16(value.val[0], 6);
break;
case 7:
result = vgetq_lane_u16(value.val[0], 7);
break;
case 8:
result = vgetq_lane_u16(value.val[1], 0);
break;
case 9:
result = vgetq_lane_u16(value.val[1], 1);
break;
case 10:
result = vgetq_lane_u16(value.val[1], 2);
break;
case 11:
result = vgetq_lane_u16(value.val[1], 3);
break;
case 12:
result = vgetq_lane_u16(value.val[1], 4);
break;
case 13:
result = vgetq_lane_u16(value.val[1], 5);
break;
case 14:
result = vgetq_lane_u16(value.val[1], 6);
break;
case 15:
result = vgetq_lane_u16(value.val[1], 7);
break;
}

return result;
}

static inline uint16x16_t zero() {
return uint16x16_t(static_cast<uint16_t>(0u));
}

static inline uint16x16_t one() {
return uint16x16_t(static_cast<uint16_t>(1u));
}

static inline uint16x16_t mask() {
return uint16x16_t(static_cast<uint16_t>(0xffffu));
}

inline void load(const void *alignedPtr) {
DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
value = vld1q_u16_x2(reinterpret_cast<const uint16_t *>(alignedPtr));
}

inline void store(void *alignedPtr) {
DEBUG_BREAK_IF(!isAligned<32>(alignedPtr));
vst1q_u16_x2(reinterpret_cast<uint16_t *>(alignedPtr), value);
}

inline operator bool() const {
uint64x2_t hi = vreinterpretq_u64_u16(value.val[0]);
uint64x2_t lo = vreinterpretq_u64_u16(value.val[1]);
uint64x2_t tmp = vorrq_u64(hi, lo);
uint64_t result = vget_lane_u64(vorr_u64(vget_high_u64(tmp), vget_low_u64(tmp)), 0);

return result;
}

inline uint16x16_t &operator-=(const uint16x16_t &a) {
value.val[0] = vsubq_u16(value.val[0], a.value.val[0]);
value.val[1] = vsubq_u16(value.val[1], a.value.val[1]);

return *this;
}

inline uint16x16_t &operator+=(const uint16x16_t &a) {
value.val[0] = vaddq_u16(value.val[0], a.value.val[0]);
value.val[1] = vaddq_u16(value.val[1], a.value.val[1]);

return *this;
}

inline friend uint16x16_t operator>=(const uint16x16_t &a, const uint16x16_t &b) {
uint16x16_t result;

result.value.val[0] = veorq_u16(mask().value.val[0],
vcgtq_u16(b.value.val[0], a.value.val[0]));
result.value.val[1] = veorq_u16(mask().value.val[1],
vcgtq_u16(b.value.val[1], a.value.val[1]));
return result;
}

inline friend uint16x16_t operator&&(const uint16x16_t &a, const uint16x16_t &b) {
uint16x16_t result;

result.value.val[0] = vandq_u16(a.value.val[0], b.value.val[0]);
result.value.val[1] = vandq_u16(a.value.val[1], b.value.val[1]);

return result;
}

// NOTE: uint16x16_t::blend behaves like mask ? a : b
inline friend uint16x16_t blend(const uint16x16_t &a, const uint16x16_t &b, const uint16x16_t &mask) {
uint16x16_t result;

result.value.val[0] = vbslq_u16(mask.value.val[0], a.value.val[0], b.value.val[0]);
result.value.val[1] = vbslq_u16(mask.value.val[1], a.value.val[1], b.value.val[1]);

return result;
}
};
} // namespace NEO
7 changes: 6 additions & 1 deletion shared/source/utilities/aarch64/cpu_info_aarch64.cpp
@@ -1,13 +1,18 @@
/*
* Copyright (C) 2021 Intel Corporation
* Copyright (C) 2021-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/

#include "shared/source/utilities/cpu_info.h"

#include <asm/hwcap.h>

namespace NEO {
void CpuInfo::detect() const {
uint32_t cpuInfo[4] = {};
cpuid(cpuInfo, 0u);
features |= cpuInfo[0] & HWCAP_ASIMD ? featureNeon : featureNone;
}
} // namespace NEO
3 changes: 2 additions & 1 deletion shared/source/utilities/cpu_info.h
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
* Copyright (C) 2018-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
Expand Down Expand Up @@ -42,6 +42,7 @@ struct CpuInfo {
static const uint64_t featureHle = 0x000200000ULL;
static const uint64_t featureRtm = 0x000400000ULL;
static const uint64_t featureAvX2 = 0x000800000ULL;
static const uint64_t featureNeon = 0x001000000ULL;
static const uint64_t featureKncni = 0x004000000ULL;
static const uint64_t featureAvX512F = 0x008000000ULL;
static const uint64_t featureAdx = 0x010000000ULL;
Expand Down
4 changes: 3 additions & 1 deletion shared/source/utilities/linux/aarch64/cpu_info.cpp
Expand Up @@ -11,10 +11,12 @@

#include <cstdint>
#include <fstream>
#include <sys/auxv.h>

namespace NEO {

void cpuid_linux_wrapper(int cpuInfo[4], int functionId) {
cpuInfo[0] = getauxval(AT_HWCAP);
}

void cpuidex_linux_wrapper(int *cpuInfo, int functionId, int subfunctionId) {
Expand All @@ -24,7 +26,7 @@ void get_cpu_flags_linux(std::string &cpuFlags) {
std::ifstream cpuinfo(std::string(Os::sysFsProcPathPrefix) + "/cpuinfo");
std::string line;
while (std::getline(cpuinfo, line)) {
if (line.substr(0, 5) == "flags") {
if (line.substr(0, 8) == "Features") {
cpuFlags = line;
break;
}
Expand Down
6 changes: 6 additions & 0 deletions shared/test/unit_test/helpers/CMakeLists.txt
Expand Up @@ -25,5 +25,11 @@ set(IGDRCL_SRCS_tests_helpers
${CMAKE_CURRENT_SOURCE_DIR}/test_hw_info_config.cpp
)

if(COMPILER_SUPPORTS_NEON)
list(APPEND IGDRCL_SRCS_tests_helpers
${CMAKE_CURRENT_SOURCE_DIR}/uint16_neon_tests.cpp
)
endif()

target_sources(${TARGET_NAME} PRIVATE ${IGDRCL_SRCS_tests_helpers})
add_subdirectories()

0 comments on commit cf90603

Please sign in to comment.