Skip to content

Commit

Permalink
initial Neo enabling on architectures other than x86
Browse files Browse the repository at this point in the history
Related-To: NEO-6011
Signed-off-by: Artur Harasimiuk <artur.harasimiuk@intel.com>
  • Loading branch information
ArturHarasimiuk authored and Compute-Runtime-Automation committed Sep 14, 2021
1 parent f958b05 commit 895e9e5
Show file tree
Hide file tree
Showing 28 changed files with 8,801 additions and 100 deletions.
16 changes: 16 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,20 @@ else()
set(NEO_ARCH "x86")
endif()

# Map the CMake-reported processor onto the canonical NEO_TARGET_PROCESSOR name.
# The expansions are quoted so that an empty CMAKE_SYSTEM_PROCESSOR falls
# through to the FATAL_ERROR below instead of producing a malformed if().
if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64")
  set(NEO_TARGET_PROCESSOR "x86_64")
elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "AMD64")
  # AMD64 is mapped onto the same target name as x86_64.
  set(NEO_TARGET_PROCESSOR "x86_64")
elseif("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64")
  set(NEO_TARGET_PROCESSOR "aarch64")
  # aarch64 builds get the bundled sse2neon directory on the include path.
  include_directories(${CMAKE_CURRENT_SOURCE_DIR}/third_party/sse2neon)
endif()

# Reject unknown processors before reporting, so the status line is only
# printed for a value that was actually resolved.
if(NOT DEFINED NEO_TARGET_PROCESSOR)
  message(FATAL_ERROR "Unsupported target processor: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
message(STATUS "Target processor: ${NEO_TARGET_PROCESSOR}")

if(NOT DEFINED BUILD_WITH_L0)
if("${NEO_BITS}" STREQUAL "64")
set(BUILD_WITH_L0 TRUE)
Expand Down Expand Up @@ -847,6 +861,8 @@ else()
else()
message(WARNING "Spectre mitigation DISABLED")
endif()
check_cxx_compiler_flag(-msse4.2 COMPILER_SUPPORTS_SSE42)
check_cxx_compiler_flag(-mavx2 COMPILER_SUPPORTS_AVX2)
endif()

if(NOT MSVC)
Expand Down
4 changes: 3 additions & 1 deletion level_zero/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,9 @@ if(BUILD_WITH_L0)
)

if(UNIX)
target_link_libraries(${TARGET_NAME_L0} ${GMM_LINK_NAME})
# Link GMM only when targeting x86_64. The expansion is quoted so the
# comparison stays well-formed even if NEO_TARGET_PROCESSOR is empty.
if("${NEO_TARGET_PROCESSOR}" STREQUAL "x86_64")
  target_link_libraries(${TARGET_NAME_L0} ${GMM_LINK_NAME})
endif()

set_property(TARGET ${TARGET_NAME_L0}
APPEND_STRING PROPERTY LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/core/source/dll/linux/ze.exports"
Expand Down
2 changes: 1 addition & 1 deletion manifests/manifest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ components:
infra:
branch: master
dest_dir: infra
revision: a6b4272e6e2ebd1965b656a0d247038a1111cc58
revision: 6f8216baa8dbd1c185c7dcd5349a8aa7ae0e5591
type: git
internal:
branch: master
Expand Down
4 changes: 3 additions & 1 deletion opencl/source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,9 @@ if(${GENERATE_EXECUTABLE})
${NEO_SHARED_DIRECTORY}/os_interface/windows/gmm_interface_win.cpp
)
else()
target_link_libraries(${NEO_DYNAMIC_LIB_NAME} ${GMM_LINK_NAME})
# Link GMM only when targeting x86_64. The expansion is quoted so the
# comparison stays well-formed even if NEO_TARGET_PROCESSOR is empty.
if("${NEO_TARGET_PROCESSOR}" STREQUAL "x86_64")
  target_link_libraries(${NEO_DYNAMIC_LIB_NAME} ${GMM_LINK_NAME})
endif()
target_include_directories(${NEO_DYNAMIC_LIB_NAME} PRIVATE
${NEO_SHARED_DIRECTORY}/dll/devices${BRANCH_DIR_SUFFIX}
)
Expand Down
2 changes: 1 addition & 1 deletion opencl/test/unit_test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ if(USE_ASAN)
set(GTEST_ENV "LSAN_OPTIONS=suppressions=${CMAKE_CURRENT_SOURCE_DIR}/lsan_suppressions.txt")
endif()

if(NOT MSVC)
if(COMPILER_SUPPORTS_SSE42)
set_source_files_properties(helpers/uint16_sse4_tests.cpp PROPERTIES COMPILE_FLAGS -msse4.2)
endif()

Expand Down
7 changes: 6 additions & 1 deletion opencl/test/unit_test/helpers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,19 @@ set(IGDRCL_SRCS_tests_helpers
${CMAKE_CURRENT_SOURCE_DIR}/timestamp_packet_tests.h
${CMAKE_CURRENT_SOURCE_DIR}/transfer_properties_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ult_limits.h
${CMAKE_CURRENT_SOURCE_DIR}/uint16_sse4_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/validator_tests.cpp
${NEO_SHARED_TEST_DIRECTORY}/common/helpers/aligned_memory_tests.cpp
${NEO_SHARED_TEST_DIRECTORY}/common/helpers/debug_manager_state_restore.h
${NEO_SHARED_TEST_DIRECTORY}/common/helpers/unit_test_helper.h
${NEO_SHARED_TEST_DIRECTORY}/common/helpers/unit_test_helper.inl
)

# Append the SSE4 uint16 test source only when building with MSVC or when
# COMPILER_SUPPORTS_SSE42 is set.
# NOTE(review): the MSVC branch is unconditional; confirm this is intended for
# MSVC builds that do not target x86.
if(MSVC OR COMPILER_SUPPORTS_SSE42)
list(APPEND IGDRCL_SRCS_tests_helpers
${CMAKE_CURRENT_SOURCE_DIR}/uint16_sse4_tests.cpp
)
endif()

if(TESTS_XEHP_AND_LATER)
list(APPEND IGDRCL_SRCS_tests_helpers
${CMAKE_CURRENT_SOURCE_DIR}/aub_helper_hw_tests_xehp_and_later.cpp
Expand Down
68 changes: 34 additions & 34 deletions opencl/test/unit_test/helpers/uint16_sse4_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,29 +13,29 @@
using namespace NEO;

TEST(Uint16Sse4, GivenMaskWhenCastingToBoolThenTrueIsReturned) {
EXPECT_TRUE(static_cast<bool>(uint16x8_t::mask()));
EXPECT_TRUE(static_cast<bool>(NEO::uint16x8_t::mask()));
}

TEST(Uint16Sse4, GivenZeroWhenCastingToBoolThenFalseIsReturned) {
EXPECT_FALSE(static_cast<bool>(uint16x8_t::zero()));
EXPECT_FALSE(static_cast<bool>(NEO::uint16x8_t::zero()));
}

TEST(Uint16Sse4, WhenConjoiningMaskAndZeroThenBooleanResultIsCorrect) {
EXPECT_TRUE(uint16x8_t::mask() && uint16x8_t::mask());
EXPECT_FALSE(uint16x8_t::mask() && uint16x8_t::zero());
EXPECT_FALSE(uint16x8_t::zero() && uint16x8_t::mask());
EXPECT_FALSE(uint16x8_t::zero() && uint16x8_t::zero());
EXPECT_TRUE(NEO::uint16x8_t::mask() && NEO::uint16x8_t::mask());
EXPECT_FALSE(NEO::uint16x8_t::mask() && NEO::uint16x8_t::zero());
EXPECT_FALSE(NEO::uint16x8_t::zero() && NEO::uint16x8_t::mask());
EXPECT_FALSE(NEO::uint16x8_t::zero() && NEO::uint16x8_t::zero());
}

TEST(Uint16Sse4, GivenOneWhenCreatingThenInstancesAreSame) {
auto one = uint16x8_t::one();
uint16x8_t alsoOne(one.value);
EXPECT_EQ(0, memcmp(&alsoOne, &one, sizeof(uint16x8_t)));
auto one = NEO::uint16x8_t::one();
NEO::uint16x8_t alsoOne(one.value);
EXPECT_EQ(0, memcmp(&alsoOne, &one, sizeof(NEO::uint16x8_t)));
}

TEST(Uint16Sse4, GivenValueWhenCreatingThenConstructorIsReplicated) {
uint16x8_t allSevens(7u);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
NEO::uint16x8_t allSevens(7u);
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(7u, allSevens.get(i));
}
}
Expand All @@ -46,34 +46,34 @@ static const uint16_t laneValues[] = {
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};

TEST(Uint16Sse4, GivenArrayWhenCreatingThenConstructorIsReplicated) {
uint16x8_t lanes(laneValues);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
NEO::uint16x8_t lanes(laneValues);
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i), lanes.get(i));
}
}

TEST(Uint16Sse4, WhenLoadingThenValuesAreSetCorrectly) {
uint16x8_t lanes;
NEO::uint16x8_t lanes;
lanes.load(laneValues);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i), lanes.get(i));
}
}

TEST(Uint16Sse4, WhenLoadingUnalignedThenValuesAreSetCorrectly) {
uint16x8_t lanes;
NEO::uint16x8_t lanes;
lanes.loadUnaligned(laneValues + 1);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i + 1), lanes.get(i));
}
}

TEST(Uint16Sse4, WhenStoringThenValuesAreSetCorrectly) {
uint16_t *alignedMemory = reinterpret_cast<uint16_t *>(alignedMalloc(1024, 32));

uint16x8_t lanes(laneValues);
NEO::uint16x8_t lanes(laneValues);
lanes.store(alignedMemory);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i), alignedMemory[i]);
}

Expand All @@ -83,49 +83,49 @@ TEST(Uint16Sse4, WhenStoringThenValuesAreSetCorrectly) {
TEST(Uint16Sse4, WhenStoringUnalignedThenValuesAreSetCorrectly) {
uint16_t *alignedMemory = reinterpret_cast<uint16_t *>(alignedMalloc(1024, 32));

uint16x8_t lanes(laneValues);
NEO::uint16x8_t lanes(laneValues);
lanes.storeUnaligned(alignedMemory + 1);
for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i), (alignedMemory + 1)[i]);
}

alignedFree(alignedMemory);
}

TEST(Uint16Sse4, WhenDecrementingThenValuesAreSetCorrectly) {
uint16x8_t result(laneValues);
result -= uint16x8_t::one();
NEO::uint16x8_t result(laneValues);
result -= NEO::uint16x8_t::one();

for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i - 1), result.get(i));
}
}

TEST(Uint16Sse4, WhenIncrementingThenValuesAreSetCorrectly) {
uint16x8_t result(laneValues);
result += uint16x8_t::one();
NEO::uint16x8_t result(laneValues);
result += NEO::uint16x8_t::one();

for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(static_cast<uint16_t>(i + 1), result.get(i));
}
}

TEST(Uint16Sse4, WhenBlendingThenValuesAreSetCorrectly) {
uint16x8_t a(uint16x8_t::one());
uint16x8_t b(uint16x8_t::zero());
uint16x8_t c;
NEO::uint16x8_t a(NEO::uint16x8_t::one());
NEO::uint16x8_t b(NEO::uint16x8_t::zero());
NEO::uint16x8_t c;

// c = mask ? a : b
c = blend(a, b, uint16x8_t::mask());
c = blend(a, b, NEO::uint16x8_t::mask());

for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(a.get(i), c.get(i));
}

// c = mask ? a : b
c = blend(a, b, uint16x8_t::zero());
c = blend(a, b, NEO::uint16x8_t::zero());

for (int i = 0; i < uint16x8_t::numChannels; ++i) {
for (int i = 0; i < NEO::uint16x8_t::numChannels; ++i) {
EXPECT_EQ(b.get(i), c.get(i));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ exposing hardware capabilities to applications.
-DRELEASE_WITH_REGKEYS=TRUE \
-DL0_INSTALL_UDEV_RULES=1 \
-DUDEV_RULES_DIR=/etc/udev/rules.d/ \
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF \
-Wno-dev
%make_build

Expand Down
1 change: 1 addition & 0 deletions scripts/packaging/opencl/sles_15.2/SPECS/opencl.spec
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ Summary: ocloc package for opencl
-DCMAKE_INSTALL_PREFIX=/usr \
-DSKIP_UNIT_TESTS=1 \
-DRELEASE_WITH_REGKEYS=1 \
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF \
-Wno-dev
%make_build

Expand Down
10 changes: 7 additions & 3 deletions shared/source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,14 @@ function(generate_shared_lib LIB_NAME MOCKABLE)

# Enable SSE4/AVX2 options for files that need them
if(MSVC)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS /arch:AVX2)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/${NEO_TARGET_PROCESSOR}/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS /arch:AVX2)
else()
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/local_id_gen_sse4.cpp PROPERTIES COMPILE_FLAGS -msse4.2)
if(COMPILER_SUPPORTS_AVX2)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/${NEO_TARGET_PROCESSOR}/local_id_gen_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
endif()
if(COMPILER_SUPPORTS_SSE42)
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/helpers/local_id_gen_sse4.cpp PROPERTIES COMPILE_FLAGS -msse4.2)
endif()
endif()

endfunction()
Expand Down
3 changes: 1 addition & 2 deletions shared/source/helpers/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,9 @@ set(NEO_CORE_HELPERS
${CMAKE_CURRENT_SOURCE_DIR}/kmd_notify_properties.cpp
${CMAKE_CURRENT_SOURCE_DIR}/kmd_notify_properties.h
${CMAKE_CURRENT_SOURCE_DIR}/l3_range.h
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.h
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.inl
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_avx2.cpp
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_special.inl
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen_sse4.cpp
${CMAKE_CURRENT_SOURCE_DIR}/non_copyable_or_moveable.h
${CMAKE_CURRENT_SOURCE_DIR}/options.h
Expand Down
14 changes: 14 additions & 0 deletions shared/source/helpers/aarch64/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#
# Copyright (C) 2019-2021 Intel Corporation
#
# SPDX-License-Identifier: MIT
#

# Contribute the aarch64-specific helper sources to NEO_CORE_HELPERS.
# The expansion is quoted so the comparison stays well-formed even if
# NEO_TARGET_PROCESSOR is empty.
if("${NEO_TARGET_PROCESSOR}" STREQUAL "aarch64")
  list(APPEND NEO_CORE_HELPERS
       ${CMAKE_CURRENT_SOURCE_DIR}/CMakeLists.txt
       ${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
  )

  # Publish the extended list through the global property of the same name.
  set_property(GLOBAL PROPERTY NEO_CORE_HELPERS ${NEO_CORE_HELPERS})
endif()
46 changes: 46 additions & 0 deletions shared/source/helpers/aarch64/local_id_gen.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* Copyright (C) 2018-2021 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/

#include "shared/source/helpers/local_id_gen.h"

#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/local_id_gen_special.inl"

namespace NEO {

struct uint16x8_t;

// Initial per-lane values (0..31) used as the starting SIMD vector for
// local ID computation; each entry correlates to a SIMD lane.
// Declared with 32-byte alignment.
// NOTE(review): the original comment gave AVX2 usage as the reason for the
// alignment; this is the aarch64 variant, so confirm whether the alignment is
// still required here or only kept for parity with the x86 implementation.
ALIGNAS(32)
const uint16_t initialLocalID[] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};

// Lookup table (function pointers) for generating LocalIDs based on the SIMD of the kernel.
// All three entries are bound to generateLocalIDsSimd instantiated with
// uint16x8_t and a second template argument of 8, 16 and 32 respectively.
// NOTE(review): uint16x8_t is only forward-declared in this file; presumably
// its definition arrives through one of the includes — confirm.
void (*LocalIDHelper::generateSimd8)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 8>;
void (*LocalIDHelper::generateSimd16)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 16>;
void (*LocalIDHelper::generateSimd32)(void *buffer, const std::array<uint16_t, 3> &localWorkgroupSize, uint16_t threadsPerWorkGroup, const std::array<uint8_t, 3> &dimensionsOrder, bool chooseMaxRowSize) = generateLocalIDsSimd<uint16x8_t, 32>;

// Writes local IDs for one work-group into `buffer`, dispatching on `simd`.
// Image-only kernels that pass isCompatibleWithLayoutForImages use the
// image-specific layout. Otherwise SIMD 32/16/8 are routed through the
// LocalIDHelper function pointers, and any other SIMD value goes to
// generateLocalIDsForSimdOne.
void generateLocalIDs(void *buffer, uint16_t simd, const std::array<uint16_t, 3> &localWorkgroupSize, const std::array<uint8_t, 3> &dimensionsOrder, bool isImageOnlyKernel, uint32_t grfSize) {
    const auto workItemCount = localWorkgroupSize[0] * localWorkgroupSize[1] * localWorkgroupSize[2];
    const auto threadsPerWorkGroup = static_cast<uint16_t>(getThreadsPerWG(simd, workItemCount));

    // The image layout takes precedence over every SIMD-specific generator.
    if (isImageOnlyKernel && isCompatibleWithLayoutForImages(localWorkgroupSize, dimensionsOrder, simd)) {
        generateLocalIDsWithLayoutForImages(buffer, localWorkgroupSize, simd);
        return;
    }

    // Forwarded as the chooseMaxRowSize argument of the SIMD generators.
    const bool chooseMaxRowSize = grfSize != 32;

    switch (simd) {
    case 32:
        LocalIDHelper::generateSimd32(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, chooseMaxRowSize);
        break;
    case 16:
        LocalIDHelper::generateSimd16(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, chooseMaxRowSize);
        break;
    case 8:
        LocalIDHelper::generateSimd8(buffer, localWorkgroupSize, threadsPerWorkGroup, dimensionsOrder, chooseMaxRowSize);
        break;
    default:
        generateLocalIDsForSimdOne(buffer, localWorkgroupSize, dimensionsOrder, grfSize);
        break;
    }
}

} // namespace NEO
Loading

0 comments on commit 895e9e5

Please sign in to comment.