diff --git a/.github/workflows/sycl-linux-build.yml b/.github/workflows/sycl-linux-build.yml index 33c2269fb360b..fb593ee00b3eb 100644 --- a/.github/workflows/sycl-linux-build.yml +++ b/.github/workflows/sycl-linux-build.yml @@ -202,8 +202,7 @@ jobs: --ci-defaults ${{ inputs.build_configure_extra_args }} \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DLLVM_INSTALL_UTILS=ON \ - -DNATIVECPU_USE_OCK=Off + -DLLVM_INSTALL_UTILS=ON - name: Compile id: build # Emulate default value for manual dispatch as we've run out of available arguments. diff --git a/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt index fb2bf7703ab10..dce987133970b 100644 --- a/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt +++ b/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt @@ -1,3 +1,16 @@ +set(OCK_LIBS) +option(NATIVECPU_USE_OCK "Use the oneAPI Construction Kit for Native CPU" ON) + +# Don't use OCK compiler_passes if Native CPU is not enabled. +if(NOT "native_cpu" IN_LIST SYCL_ENABLE_BACKENDS) + set(NATIVECPU_USE_OCK Off CACHE BOOL "Use the oneAPI Construction Kit for Native CPU" FORCE) +endif() + +if(NATIVECPU_USE_OCK) + add_subdirectory(compiler_passes EXCLUDE_FROM_ALL) + set(OCK_LIBS NativeCPUPipeline NativeCPUVecz) +endif() + add_llvm_component_library(LLVMSYCLNativeCPUUtils PipelineSYCLNativeCPU.cpp PrepareSYCLNativeCPU.cpp @@ -17,80 +30,13 @@ add_llvm_component_library(LLVMSYCLNativeCPUUtils TargetParser TransformUtils ipo - ) + ${OCK_LIBS} +) -option(NATIVECPU_USE_OCK "Use the oneAPI Construction Kit for Native CPU" ON) - -# Don't fetch OCK if Native CPU is not enabled. -if(NOT "native_cpu" IN_LIST SYCL_ENABLE_BACKENDS) - set(NATIVECPU_USE_OCK Off CACHE BOOL "Use the oneAPI Construction Kit for Native CPU" FORCE) -endif() if(NATIVECPU_USE_OCK) - set(OCK_SEARCH_LOC "oneapi-construction-kit/compiler_passes") - if(NOT FETCHCONTENT_SOURCE_DIR_ONEAPI-CK) - find_path(OCK_SOURCE_DIR ${OCK_SEARCH_LOC} PATHS ${CMAKE_PREFIX_PATH}) - endif() - if(OCK_SOURCE_DIR) - message(STATUS "Found system source location of oneAPI Construction Kit in ${OCK_SOURCE_DIR}") - set(OCK_SOURCE_DIR "${OCK_SOURCE_DIR}/${OCK_SEARCH_LOC}") - set(OCK_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/oneapi-construction-kit") - else() - set(OCK_GIT_REPO "https://github.com/uxlfoundation/oneapi-construction-kit.git") - # commit d0a32d701e34b3285de7ce776ea36abfec673df7 - # Merge: a9f848e0e8 56473a8c25 - # Author: Harald van Dijk - # Date: Mon Jun 30 12:24:46 2025 +0100 - # - # Merge pull request #878 from hvdijk/specify-fuse-ld-lld - # - # [RefSi] Explicitly specify -fuse-ld=lld. 
-    set(OCK_GIT_TAG d0a32d701e34b3285de7ce776ea36abfec673df7)
-
-    include(FetchContent)
-    FetchContent_Declare(oneapi-ck
-      GIT_REPOSITORY "${OCK_GIT_REPO}"
-      GIT_TAG "${OCK_GIT_TAG}"
-    )
-    FetchContent_GetProperties(oneapi-ck)
-    if(NOT oneapi-ck_POPULATED)
-      if(FETCHCONTENT_SOURCE_DIR_ONEAPI-CK)
-        message(STATUS "Using specified oneAPI Construction Kit repo location at ${FETCHCONTENT_SOURCE_DIR_ONEAPI-CK}")
-      else()
-        message(STATUS "Cloning oneAPI Construction Kit from ${OCK_GIT_REPO}, tag ${OCK_GIT_TAG}")
-      endif()
-      FetchContent_Populate(oneapi-ck)
-      message(STATUS "oneAPI Construction Kit cloned in ${oneapi-ck_SOURCE_DIR}")
-      set(OCK_SOURCE_DIR ${oneapi-ck_SOURCE_DIR}/compiler_passes)
-      set(OCK_BINARY_DIR ${oneapi-ck_BINARY_DIR})
-    endif()
-  endif()
-
-  set(CA_ENABLE_API "cl" CACHE STRING "" FORCE)
-  add_subdirectory(
-    ${OCK_SOURCE_DIR}
-    ${OCK_BINARY_DIR} EXCLUDE_FROM_ALL)
-
-  install(TARGETS compiler-pipeline
-    EXPORT LLVMExports
-    LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT compiler-pipeline
-    ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT compiler-pipeline
-    RUNTIME DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT compiler-pipeline)
-  set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS compiler-pipeline)
-  install(TARGETS vecz
-    EXPORT LLVMExports
-    LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT vecz
-    ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT vecz
-    RUNTIME DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT vecz)
-  set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS vecz)
-  install(TARGETS multi_llvm EXPORT LLVMExports)
-  set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS multi_llvm)
   target_compile_definitions(LLVMSYCLNativeCPUUtils PRIVATE  NATIVECPU_USE_OCK)
   target_include_directories(LLVMSYCLNativeCPUUtils PRIVATE
-    ${oneapi-ck_SOURCE_DIR}/modules/compiler/multi_llvm/include
-    ${oneapi-ck_SOURCE_DIR}/modules/cargo/include
-    ${oneapi-ck_SOURCE_DIR}/modules/compiler/vecz/include
-    ${oneapi-ck_SOURCE_DIR}/modules/compiler/utils/include)
-  target_link_libraries(LLVMSYCLNativeCPUUtils PRIVATE compiler-pipeline vecz)
-
+    ${CMAKE_CURRENT_SOURCE_DIR}/compiler_passes/compiler_pipeline/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/compiler_passes/vecz/include)
 endif()
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/CMakeLists.txt
new file mode 100644
index 0000000000000..de47b25e03a30
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/compiler_pipeline)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/vecz)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_passes.rst b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_passes.rst
new file mode 100644
index 0000000000000..cdfe3a9c79034
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_passes.rst
@@ -0,0 +1,63 @@
+Compiler passes
+===============
+
+Introduction
+------------
+
+Files under this directory are integrated from the `oneAPI Construction Kit`_
+using `git-filter-repo`. They are used by Native CPU to help create a pipeline
+that turns a base kernel into something which can be executed across multiple
+work items, including auto-vectorization.
+
+These files are largely from the sub-directories
+**modules/compiler/compiler_pipeline**, **modules/compiler/vecz** and
+**modules/compiler/multi_llvm**. Only files that are used have been integrated,
+and the **CMake** files have been updated to fit in with LLVM components.
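+
+As a rough sketch of how such a pipeline can be composed: the pass class
+names below are inferred from the integrated source file names (for example
+**work_item_loops_pass.cpp**), and the header paths and constructors are
+assumptions; the authoritative wiring lives in **PipelineSYCLNativeCPU.cpp**::
+
+  #include "compiler/utils/builtin_info.h"          // BuiltinInfoAnalysis
+  #include "compiler/utils/prepare_barriers_pass.h" // assumed header name
+  #include "compiler/utils/work_item_loops_pass.h"  // assumed header name
+
+  #include "llvm/IR/PassManager.h"
+  #include "llvm/Passes/PassBuilder.h"
+
+  void runWorkItemPipeline(llvm::Module &M) {
+    llvm::LoopAnalysisManager LAM;
+    llvm::FunctionAnalysisManager FAM;
+    llvm::CGSCCAnalysisManager CGAM;
+    llvm::ModuleAnalysisManager MAM;
+    // The BuiltinInfo result is queried by the barrier/work-item machinery.
+    MAM.registerPass([] { return compiler::utils::BuiltinInfoAnalysis(); });
+
+    llvm::PassBuilder PB;
+    PB.registerModuleAnalyses(MAM);
+    PB.registerCGSCCAnalyses(CGAM);
+    PB.registerFunctionAnalyses(FAM);
+    PB.registerLoopAnalyses(LAM);
+    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+    llvm::ModulePassManager MPM;
+    // Assumed to be default-constructible: make barriers explicit, then
+    // split kernels at barriers and wrap each region in work-item loops.
+    MPM.addPass(compiler::utils::PrepareBarriersPass());
+    MPM.addPass(compiler::utils::WorkItemLoopsPass());
+    MPM.run(M, MAM);
+  }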
+
+These sub-directories are used as follows:
+
+* **compiler_pipeline** provides the passes to build a pipeline from the
+  initial kernel, including generating work-item loops, handling local memory,
+  handling metadata and calling the vectorizer, **vecz**.
+
+* **vecz** provides a full-function vectorizer, which generates a copy of the
+  original function vectorized across the work group, taking subgroups into
+  account.
+
+* **multi_llvm** provides support for these passes to work across multiple
+  LLVM versions. Although this is not strictly needed inside LLVM, it has been
+  kept so that the integration could proceed smoothly, without changing files
+  directly. Note that it is header-only and lives under
+  **compiler_pipeline/include/multi_llvm**.
+
+**compiler_pipeline** and **vecz** will be documented under `sycl/docs`. Note
+that there are several limitations in the code that are a result of the
+initial integration. These should be addressed over time for maintainability;
+they do not affect correctness or performance.
+
+General limitations
+-------------------
+
+To simplify the integration and reduce risk, most of the files were integrated
+with no changes at all. This means there are currently the following
+limitations:
+
+* The namespace in **compiler_pipeline** is **compiler::utils**, the namespace
+  in **multi_llvm** is **multi_llvm** and the namespace in **vecz** is
+  **vecz**. These should be updated to reflect being under **LLVM**.
+* Include files should ideally be moved under **llvm/include**, but remain
+  under these directories after the integration.
+* **vecz** has a test tool, **veczc**, and associated **lit** tests. If
+  required, this tool should be moved under **llvm/tools** or **llvm/test**.
+  Building it also requires the `NATIVE_CPU_BUILD_VECZ_TEST_TOOLS` **CMake**
+  option; the tests can be run using the target `check-sycl-vecz`.
+* **compiler_pipeline** has **lit** tests for its passes which have not been
+  integrated, because they use a tool called **muxc**; these passes should,
+  however, be testable using **opt**. The tests can be found in the
+  `pipeline pass tests`_.
+* Many integrated files are unlikely to have any code coverage, but exist here
+  because they are referred to by other files which we do need. These should
+  be pruned over time as a better understanding develops of what is essential.
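+
+As an illustration of the attribute helpers this code is built around, the
+following sketch uses only functions declared in the integrated
+**compiler/utils/attributes.h** (the include path is an assumption based on
+this directory's layout)::
+
+  #include "compiler/utils/attributes.h"
+
+  #include "llvm/IR/Function.h"
+
+  void tagKernel(llvm::Function &F) {
+    // Mark F as a kernel and as an entry point; later passes query these
+    // function attributes rather than metadata or calling conventions.
+    compiler::utils::setIsKernel(F);
+    compiler::utils::setIsKernelEntryPt(F);
+    // Record an (estimated) local-memory footprint of 1 KiB.
+    compiler::utils::setLocalMemoryUsage(F, 1024);
+  }
+
+  bool usesLocalMemory(const llvm::Function &F) {
+    // Returns an empty optional when the attribute is absent, so default to 0.
+    return compiler::utils::getLocalMemoryUsage(F).value_or(0) > 0;
+  }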
+.. _oneAPI Construction Kit: https://github.com/uxlfoundation/oneapi-construction-kit
+.. _pipeline pass tests: https://github.com/uxlfoundation/oneapi-construction-kit/tree/main/modules/compiler/test/lit/passes
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/CMakeLists.txt
new file mode 100644
index 0000000000000..90981a1718dac
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/CMakeLists.txt
@@ -0,0 +1,32 @@
+add_llvm_component_library(LLVMNativeCPUPipeline
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/attributes.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/barrier_regions.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/builtin_info.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/cl_builtin_info.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/define_mux_builtins_pass.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/dma.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/encode_kernel_metadata_pass.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/group_collective_helpers.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/mangling.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/metadata.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/mux_builtin_info.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/pass_functions.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/optimal_builtin_replacement_pass.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/pass_machinery.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/prepare_barriers_pass.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/replace_local_module_scope_variables_pass.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/scheduling.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/sub_group_analysis.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/target_extension_types.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/work_item_loops_pass.cpp
+
+  LINK_COMPONENTS
+  Passes
+  Core
+  )
+
+# TODO: Move this under the LLVM include tree and work out why
+# ADDITIONAL_HEADER_DIRS does not capture it.
+target_include_directories(LLVMNativeCPUPipeline PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  )
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h
new file mode 100644
index 0000000000000..228097d1434d8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h
@@ -0,0 +1,38 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// LLVM address space identifiers.
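+///
+/// For example, a pointer into work-group local memory appears in LLVM IR as
+/// `ptr addrspace(3)`, matching AddressSpace::Local below.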
+ +#ifndef COMPILER_UTILS_ADDRESS_SPACES_H_INCLUDED +#define COMPILER_UTILS_ADDRESS_SPACES_H_INCLUDED + +namespace compiler { +namespace utils { +namespace AddressSpace { +enum { + Private = 0, + Global = 1, + Constant = 2, + Local = 3, + Generic = 4, +}; +} +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_ADDRESS_SPACES_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h new file mode 100644 index 0000000000000..3ea0a5fad08ca --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h @@ -0,0 +1,186 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef COMPILER_UTILS_ATTRIBUTES_H_INCLUDED +#define COMPILER_UTILS_ATTRIBUTES_H_INCLUDED + +#include + +#include + +namespace llvm { +class CallInst; +class Function; +} // namespace llvm + +namespace compiler { +namespace utils { + +/// @brief Encodes information that a function is a kernel +/// +/// @param[in] F Function in which to encode the information. +void setIsKernel(llvm::Function &F); + +/// @brief Encodes information that a function is a kernel entry point +/// +/// @param[in] F Function in which to encode the information. +void setIsKernelEntryPt(llvm::Function &F); + +/// @brief Returns whether the function is a kernel under compilation. +/// +/// @param[in] F Function to check. +bool isKernel(const llvm::Function &F); + +/// @brief Returns whether the function is a kernel entry point under +/// compilation. +/// +/// @param[in] F Function to check. +bool isKernelEntryPt(const llvm::Function &F); + +/// @brief Drops any information about whether a function is a kernel. +/// +/// @param[in] F Function to drop information from. +void dropIsKernel(llvm::Function &F); + +/// @brief Takes information about kernels from one function to another. +/// +/// Removes information from the old function, and overwrites any such +/// information in the new function. +/// +/// @param[in] ToF Function to copy to. +/// @param[in] FromF Function to copy from. +void takeIsKernel(llvm::Function &ToF, llvm::Function &FromF); + +/// @brief Sets the original function name as an attribute. +void setOrigFnName(llvm::Function &F); + +/// @brief Retrieves the original function name from the given Function. +/// +/// @return The original function name (via function attributes) or an empty +/// string if none is found. +llvm::StringRef getOrigFnName(const llvm::Function &F); + +/// @brief Retrieves the original function name from the given Function, or the +/// Function's name. +/// +/// @return The original function name (via function attributes) or the +/// function's name if none is found. 
+llvm::StringRef getOrigFnNameOrFnName(const llvm::Function &F);
+
+/// @brief Sets the base function name as an attribute.
+void setBaseFnName(llvm::Function &F, llvm::StringRef N);
+
+/// @brief Retrieves the base function name component from the given Function.
+///
+/// @return The base function name (via function attributes) or an empty string
+/// if none is found.
+llvm::StringRef getBaseFnName(const llvm::Function &F);
+
+/// @brief Retrieves the base function name component from the given Function,
+/// or the Function's name.
+///
+/// @return The base function name (via function attributes) or the function's
+/// name if none is found.
+llvm::StringRef getBaseFnNameOrFnName(const llvm::Function &F);
+
+/// @brief Retrieves the base function name from the given Function and
+/// sets it if none is found.
+/// @param F The function to read "base function name" attributes from
+/// @param SetFromF The function whose name is set as F's base function
+/// name if none is found in F.
+llvm::StringRef getOrSetBaseFnName(llvm::Function &F,
+                                   const llvm::Function &SetFromF);
+
+/// @brief Sets the local memory usage estimation for the given function.
+///
+/// @param[in] F the function in which to add the attribute
+/// @param[in] LocalMemUsage the (estimated) local memory usage in bytes
+void setLocalMemoryUsage(llvm::Function &F, uint64_t LocalMemUsage);
+
+/// @brief Gets the local memory usage estimation for the given function.
+///
+/// @param[in] F Function from which to pull the attribute
+/// @return the (estimated) local memory usage in bytes if present,
+/// std::nullopt otherwise.
+std::optional<uint64_t> getLocalMemoryUsage(const llvm::Function &F);
+
+/// @brief Sets information about a function's required DMA size as an
+/// attribute.
+///
+/// @param[in] F Function in which to add the attribute.
+/// @param[in] DMASizeBytes DMA size in bytes.
+void setDMAReqdSizeBytes(llvm::Function &F, uint32_t DMASizeBytes);
+
+/// @brief Retrieves information about a function's required DMA size as an
+/// attribute.
+///
+/// @param[in] F Function from which to pull the attribute
+/// @return The required DMA size in bytes if present, else `std::nullopt`
+std::optional<uint32_t> getDMAReqdSizeBytes(const llvm::Function &F);
+
+/// @brief Determines the ordering of work item execution after a barrier.
+enum class BarrierSchedule {
+  /// @brief The barrier pass is free to schedule work items in any order.
+  Unordered = 0,
+  /// @brief The barrier region is entirely uniform (no dependence on work item
+  /// ID) such that execution of multiple work items is redundant and we are
+  /// free to execute the region for only a single work item. Additionally,
+  /// such a region is not allowed to read from or write to the barrier struct
+  /// (the region cannot use any variables defined outwith it, nor define any
+  /// variables used outwith it). Used by work group collectives to initialize
+  /// their accumulators.
+  Once,
+  /// @brief The barrier region should execute all vectorized work items first,
+  /// followed by the scalar tail.
+  ScalarTail,
+  /// @brief The barrier region must be executed in Local Linear ID order.
+  Linear,
+};
+
+/// @brief Sets the work item execution schedule for the given barrier.
+///
+/// @param[in] CI the barrier call instruction
+/// @param[in] Sched the execution schedule to set
+void setBarrierSchedule(llvm::CallInst &CI, BarrierSchedule Sched);
+
+/// @brief Gets the work item execution schedule for the given barrier.
+/// +/// @param[in] CI the barrier call instruction +/// @return the execution schedule for this barrier +BarrierSchedule getBarrierSchedule(const llvm::CallInst &CI); + +/// @brief Marks a function as not explicitly using subgroups +/// +/// May be set even with unresolved external functions, assuming those don't +/// explicitly use subgroups. +/// +/// @param[in] F Function in which to encode the information. +void setHasNoExplicitSubgroups(llvm::Function &F); + +/// @brief Returns whether the kernel does not explicitly use subgroups +/// +/// @param[in] F Function to check. +bool hasNoExplicitSubgroups(const llvm::Function &F); + +/// @brief Returns the mux subgroup size for the current function. +/// +/// Currently always returns 1! +unsigned getMuxSubgroupSize(const llvm::Function &F); + +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_ATTRIBUTES_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h new file mode 100644 index 0000000000000..701ac4d0f3102 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h @@ -0,0 +1,365 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// Barrier regions, used by the WorkItemLoopsPass. + +#ifndef COMPILER_UTILS_BARRIER_REGIONS_H_INCLUDED +#define COMPILER_UTILS_BARRIER_REGIONS_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "pass_functions.h" + +namespace llvm { +class BasicBlock; +class CallInst; +class FenceInst; +class Function; +class Instruction; +class Module; +class StructType; +class Type; +class Value; +} // namespace llvm + +namespace compiler { +namespace utils { + +enum { kBarrier_EndID = 0, kBarrier_FirstID, kBarrier_StartNewID }; + +class Barrier; +class BuiltinInfo; + +template +using OrderedSet = + llvm::SetVector, llvm::SmallPtrSet>; + +/// @brief Struct to store information about an inter-barrier region. 
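+///
+/// An inter-barrier region is the set of blocks reached by traversing the
+/// CFG from the region's entry until a barrier is hit; the WorkItemLoopsPass
+/// wraps each such region in loops over the work-items.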
+struct BarrierRegion { + /// @brief the barrier id of this region + unsigned id = 0; + /// @brief the barrier call instruction for this region + llvm::Instruction *barrier_inst = nullptr; + /// @brief the entry block of this region + llvm::BasicBlock *entry = nullptr; + + llvm::DenseSet defs; + /// @brief barrier crossing uses that are defined in this region + OrderedSet uses_int; + /// @brief barrier crossing uses that are defined in another region + OrderedSet uses_ext; + /// @brief the blocks in this region + std::vector blocks; + /// @brief the exit blocks of this region + llvm::SmallPtrSet barrier_blocks; + /// @brief the barrier ids of the successor regions + llvm::SmallVector successor_ids; + /// @brief the work item execution schedule for this region + BarrierSchedule schedule = BarrierSchedule::Unordered; +}; + +class Barrier { +public: + /// @brief Type for ids of new kernel functions + using kernel_id_map_t = std::map; + + Barrier(llvm::Module &m, llvm::Function &f, bool IsDebug) + : live_var_mem_ty_(nullptr), + size_t_bytes(compiler::utils::getSizeTypeBytes(m)), module_(m), + func_(f), is_debug_(IsDebug), max_live_var_alignment(0) {} + + /// @brief perform the Barrier Region analysis and kernel splitting + void Run(llvm::ModuleAnalysisManager &mam); + + /// @brief return whether the barrier struct needs to contain anything + bool hasLiveVars() const { return !whole_live_variables_set_.empty(); } + + /// @brief returns the StructType of the barrier struct + llvm::StructType *getLiveVarsType() const { return live_var_mem_ty_; } + + /// @brief returns the maximum alignment of the barrier struct + unsigned getLiveVarMaxAlignment() const { return max_live_var_alignment; } + + /// @brief gets the split subkernels + const kernel_id_map_t &getSubkernels() const { return kernel_id_map_; } + + /// @brief gets the split subkernel for the given barrier id + llvm::Function *getSubkernel(unsigned id) const { + return kernel_id_map_.find(id)->second; + } + + /// @brief gets the number of regions/subkernels + size_t getNumSubkernels() const { return kernel_id_map_.size(); } + + llvm::CallInst *getBarrierCall(unsigned id) const { + return llvm::dyn_cast_or_null( + barrier_region_id_map_.find(id)->second.barrier_inst); + } + + /// @brief gets the size of the fixed sized part of the barrier struct + size_t getLiveVarMemSizeFixed() const { return live_var_mem_size_fixed; } + + /// @brief gets the minimum size of the scalable part of the barrier struct + size_t getLiveVarMemSizeScalable() const { + return live_var_mem_size_scalable; + } + + /// @brief gets the element index of the first scalable member of the barrier + /// struct + size_t getLiveVarMemScalablesIndex() const { + return live_var_mem_scalables_index; + } + + /// @brief gets the barrier IDs of the successors of the given barrier region + const llvm::SmallVectorImpl &getSuccessorIds(unsigned id) const { + return barrier_region_id_map_.find(id)->second.successor_ids; + } + + /// @brief gets the barrier IDs of the successors of the given barrier region + BarrierSchedule getSchedule(unsigned id) const { + return barrier_region_id_map_.find(id)->second.schedule; + } + + /// @brief replaces a subkernel with a given function + void replaceSubkernel(llvm::Function *from, llvm::Function *to); + + using debug_variable_records_t = + llvm::SmallVector, 4>; + const debug_variable_records_t &getDebugDbgVariableRecords() const { + return debug_variable_records_; + } + + /// @brief gets the original function + llvm::Function &getFunc() { return 
func_; } + const llvm::Function &getFunc() const { return func_; } + + /// @brief struct to help retrieval of values from the barrier struct + struct LiveValuesHelper { + const Barrier &barrier; + /// @brief A cache of queried live-values addresses (inside the live + /// variables struct), stored by the pair (value, member_idx). + llvm::DenseMap, llvm::Value *> + live_GEPs; + llvm::DenseMap reloads; + llvm::IRBuilder<> gepBuilder; + llvm::Value *barrier_struct = nullptr; + llvm::Value *vscale = nullptr; + + LiveValuesHelper(const Barrier &b, llvm::Instruction *i, llvm::Value *s) + : barrier(b), gepBuilder(i), barrier_struct(s) {} + + LiveValuesHelper(const Barrier &b, llvm::BasicBlock *bb, llvm::Value *s) + : barrier(b), gepBuilder(bb), barrier_struct(s) {} + + /// @brief Return a GEP instruction pointing to the given value/idx pair in + /// the barrier struct. + /// + /// @return The GEP corresponding to the address of the value in the + /// struct, or nullptr if the value could not be found in the struct. + llvm::Value *getGEP(const llvm::Value *live, unsigned member_idx = 0); + + /// @brief Return a GEP instruction corresponding to the address of + /// the given ExtractValueInst in the barriers struct. + /// + /// @return The GEP corresponding to the address of the value in the + /// struct, or nullptr if the value is not an ExtractValueInst. + llvm::Value *getExtractValueGEP(const llvm::Value *live); + + /// @brief get a value reloaded from the barrier struct. + /// + /// @param[in] live the live value to retrieve from the barrier + /// @param[in] ir where to insert new instructions + /// @param[in] name a postfix to append to new value names + /// @param[in] reuse whether to generate the load for a given value only + /// once, returning the previously cached value on further requests. + llvm::Value *getReload(llvm::Value *live, llvm::IRBuilderBase &ir, + const char *name, bool reuse = false); + }; + +private: + /// @brief The first is set for livein and the second is set for liveout + using live_in_out_t = + std::pair, llvm::DenseSet>; + /// @brief Type for memory allocation of live variables at all of barriers + using live_variable_mem_t = OrderedSet; + /// @brief Type for index of live variables on live variable information + /// Indexed by the pair (value, member_idx) + using live_variable_index_map_t = + llvm::DenseMap, unsigned>; + /// @brief Type for index of live variables on live variable information + /// Indexed by the pair (value, member_idx) + using live_variable_scalables_map_t = live_variable_index_map_t; + /// @brief Type for ids of barriers + using barrier_id_map_t = llvm::DenseMap; + /// @brief Type for ids of barrier regions + using barrier_region_id_map_t = std::map; + /// @brief Type for map from ids to fence instructions + using fence_id_map_t = llvm::DenseMap; + /// @brief Type between block and instruction for barrier. + using barrier_block_inst_map_t = + llvm::DenseMap; + /// @brief Type between block and block for barrier. + using barrier_block_block_set_t = llvm::DenseSet; + /// @brief Type between barrier id and stub call instructions. First + /// component of the pair is invoked before the barrier, the second after. + using debug_stub_map_t = + llvm::DenseMap>; + + /// @brief Keep whole live variables at all of barriers. + live_variable_mem_t whole_live_variables_set_; + /// @brief Keep index of live variables on live variable information. + live_variable_index_map_t live_variable_index_map_; + /// @brief Keep offsets of scalable live variables. 
+  live_variable_scalables_map_t live_variable_scalables_map_;
+  /// @brief Keep ids of barriers.
+  barrier_id_map_t barrier_id_map_;
+  /// @brief Look up a barrier region by its id.
+  barrier_region_id_map_t barrier_region_id_map_;
+  /// @brief Keep the split subkernels, keyed by barrier region id.
+  kernel_id_map_t kernel_id_map_;
+  /// @brief Keep struct types for live variables' memory layout.
+  llvm::StructType *live_var_mem_ty_;
+  /// @brief The total size of the non-scalable barrier struct
+  size_t live_var_mem_size_fixed = 0;
+  /// @brief The total unscaled size of the scalable barrier struct
+  size_t live_var_mem_size_scalable = 0;
+  /// @brief The index of the scalables buffer array in the barrier struct.
+  size_t live_var_mem_scalables_index = 0;
+  /// @brief Keep barriers.
+  llvm::SmallVector barriers_;
+  /// @brief Set of basic blocks that have a barrier as their successor
+  barrier_block_block_set_t barrier_successor_set_;
+  /// @brief Map between barrier ids and call instructions invoking stubs
+  debug_stub_map_t barrier_stub_call_map_;
+  /// @brief List of debug DbgVariableRecords and byte offsets into live
+  /// variable struct
+  debug_variable_records_t debug_variable_records_;
+
+  size_t size_t_bytes;
+
+  llvm::Module &module_;
+  llvm::Function &func_;
+
+  BuiltinInfo *bi_ = nullptr;
+
+  /// @brief Set to true if we want to debug the kernel. This involves adding
+  /// debug stub functions and an extra alloca to aid debugging.
+  const bool is_debug_;
+
+  /// @brief max alignment required for the live variables.
+  unsigned max_live_var_alignment;
+
+  /// @brief Find Barriers.
+  void FindBarriers();
+
+  /// @brief Split block with barrier.
+  void SplitBlockwithBarrier();
+
+  /// @brief Generate an empty kernel that only duplicates the source kernel's
+  /// CFG
+  ///
+  /// This is used to do a "dry run" of kernel splitting in order to obtain the
+  /// dominator tree, which is needed for correct identification of values that
+  /// cross the barrier.
+  ///
+  /// @param[in] region the region to clone into the new kernel.
+  /// @param[out] bbmap a mapping of original blocks onto the empty clones.
+  /// @return the fake kernel
+  llvm::Function *GenerateFakeKernel(
+      BarrierRegion &region,
+      llvm::DenseMap &bbmap);
+
+  /// @brief Obtain a set of Basic Blocks for an inter-barrier region
+  ///
+  /// It traverses the CFG, following successors, until it hits a barrier,
+  /// building the region's internal data.
+  ///
+  /// @param[out] region the region to process
+  void GatherBarrierRegionBlocks(BarrierRegion &region);
+
+  /// @brief Obtain a set of Values used in a region that cross a barrier
+  ///
+  /// A value use crosses a barrier in the following cases:
+  ///  * Its use is not in the same region as the definition
+  ///  * Its definition does not dominate the use
+  ///
+  /// @param[in] region The inter-barrier region
+  /// @param[in] ignore set of values to ignore
+  void GatherBarrierRegionUses(BarrierRegion &region,
+                               llvm::DenseSet &ignore);
+
+  /// @brief Find livein and liveout variables for each basic block.
+  void FindLiveVariables();
+
+  /// @brief Remove variables that are better recalculated than stored in the
+  /// barrier, for instance casts and vector splats.
+  void TidyLiveVariables();
+
+  /// @brief Pad the field types to an alignment by adding an int array if
+  /// needed
+  /// @param field_tys The vector of types representing the final structure
+  /// @param offset The current offset in the structure
+  /// @param alignment The required alignment
+  /// @return The new offset (or original offset if no padding needed)
+  unsigned PadTypeToAlignment(llvm::SmallVectorImpl<llvm::Type *> &field_tys,
+                              unsigned offset, unsigned alignment);
+
+  /// @brief Make type for whole live variables.
+  void MakeLiveVariableMemType();
+
+  /// @brief Generate a new kernel from an inter-barrier region such that no
+  /// call to a barrier occurs within it.
+  ///
+  /// @param[in] region the inter-barrier region to create the kernel from
+  /// @return the new kernel
+  llvm::Function *GenerateNewKernel(BarrierRegion &region);
+
+  /// @brief This function is a copy of llvm::CloneBasicBlock; some code has
+  /// been added in order to update live variable information.
+  ///
+  /// @param[in] bb Basic block to copy.
+  /// @param[out] vmap Value map for cloning.
+  /// @param[in] name_suffix Name suffix for the clone.
+  /// @param[out] live_defs_info Live definitions' info for the current basic
+  /// block.
+  /// @param[in] F Current function.
+  ///
+  /// @return The cloned basic block.
+  llvm::BasicBlock *CloneBasicBlock(llvm::BasicBlock *bb,
+                                    llvm::ValueToValueMapTy &vmap,
+                                    const llvm::Twine &name_suffix,
+                                    live_variable_mem_t &live_defs_info,
+                                    llvm::Function *F);
+
+  /// @brief Separate the kernel function at barrier boundaries.
+  void SeperateKernelWithBarrier();
+};
+
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_BARRIER_REGIONS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
new file mode 100644
index 0000000000000..b88b82aab6123
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
@@ -0,0 +1,860 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Information about compiler builtins.
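+///
+/// Illustrative use, assuming a pass holding a BuiltinInfo `BI`, a module
+/// `M` and an IRBuilder `Builder` (and assuming `__mux_get_local_id` takes a
+/// 32-bit dimension index):
+///
+///     // Declare (or fetch) the mux builtin for the local work-item id,
+///     // then emit a call to it for dimension 0.
+///     llvm::Function *GetLID =
+///         BI.getOrDeclareMuxBuiltin(compiler::utils::eMuxBuiltinGetLocalId, M);
+///     llvm::Value *LID = Builder.CreateCall(GetLID, {Builder.getInt32(0)});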
+ +#ifndef COMPILER_UTILS_BUILTIN_INFO_H_INCLUDED +#define COMPILER_UTILS_BUILTIN_INFO_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace compiler { +namespace utils { +/// @addtogroup utils +/// @{ + +using BuiltinID = int32_t; + +enum BaseBuiltinID { + eBuiltinUnknown, + + // Mux builtins + eMuxBuiltinIsFTZ, + eMuxBuiltinUseFast, + eMuxBuiltinIsEmbeddedProfile, + eMuxBuiltinGetGlobalSize, + eMuxBuiltinGetGlobalId, + eMuxBuiltinGetGlobalOffset, + eMuxBuiltinGetLocalSize, + eMuxBuiltinGetLocalId, + eMuxBuiltinSetLocalId, + eMuxBuiltinGetSubGroupId, + eMuxBuiltinSetSubGroupId, + eMuxBuiltinGetNumGroups, + eMuxBuiltinGetNumSubGroups, + eMuxBuiltinSetNumSubGroups, + eMuxBuiltinGetMaxSubGroupSize, + eMuxBuiltinSetMaxSubGroupSize, + eMuxBuiltinGetGroupId, + eMuxBuiltinGetWorkDim, + eMuxBuiltinDMARead1D, + eMuxBuiltinDMARead2D, + eMuxBuiltinDMARead3D, + eMuxBuiltinDMAWrite1D, + eMuxBuiltinDMAWrite2D, + eMuxBuiltinDMAWrite3D, + eMuxBuiltinDMAWait, + eMuxBuiltinGetGlobalLinearId, + eMuxBuiltinGetLocalLinearId, + eMuxBuiltinGetEnqueuedLocalSize, + eMuxBuiltinGetSubGroupSize, + eMuxBuiltinGetSubGroupLocalId, + // Synchronization builtins + eMuxBuiltinMemBarrier, + eMuxBuiltinSubGroupBarrier, + eMuxBuiltinWorkGroupBarrier, +#define GROUP_BUILTINS(SCOPE) \ + eFirstMux##SCOPE##groupCollectiveBuiltin, \ + eMuxBuiltin##SCOPE##groupAll = eFirstMux##SCOPE##groupCollectiveBuiltin, \ + eMuxBuiltin##SCOPE##groupAny, eMuxBuiltin##SCOPE##groupBroadcast, \ + eMuxBuiltin##SCOPE##groupReduceAdd, eMuxBuiltin##SCOPE##groupReduceFAdd, \ + eMuxBuiltin##SCOPE##groupReduceSMin, \ + eMuxBuiltin##SCOPE##groupReduceUMin, \ + eMuxBuiltin##SCOPE##groupReduceFMin, \ + eMuxBuiltin##SCOPE##groupReduceSMax, \ + eMuxBuiltin##SCOPE##groupReduceUMax, \ + eMuxBuiltin##SCOPE##groupReduceFMax, eMuxBuiltin##SCOPE##groupReduceMul, \ + eMuxBuiltin##SCOPE##groupReduceFMul, eMuxBuiltin##SCOPE##groupReduceAnd, \ + eMuxBuiltin##SCOPE##groupReduceOr, eMuxBuiltin##SCOPE##groupReduceXor, \ + eMuxBuiltin##SCOPE##groupReduceLogicalAnd, \ + eMuxBuiltin##SCOPE##groupReduceLogicalOr, \ + eMuxBuiltin##SCOPE##groupReduceLogicalXor, \ + eMuxBuiltin##SCOPE##groupScanAddInclusive, \ + eMuxBuiltin##SCOPE##groupScanFAddInclusive, \ + eMuxBuiltin##SCOPE##groupScanAddExclusive, \ + eMuxBuiltin##SCOPE##groupScanFAddExclusive, \ + eMuxBuiltin##SCOPE##groupScanSMinInclusive, \ + eMuxBuiltin##SCOPE##groupScanUMinInclusive, \ + eMuxBuiltin##SCOPE##groupScanFMinInclusive, \ + eMuxBuiltin##SCOPE##groupScanSMinExclusive, \ + eMuxBuiltin##SCOPE##groupScanUMinExclusive, \ + eMuxBuiltin##SCOPE##groupScanFMinExclusive, \ + eMuxBuiltin##SCOPE##groupScanSMaxInclusive, \ + eMuxBuiltin##SCOPE##groupScanUMaxInclusive, \ + eMuxBuiltin##SCOPE##groupScanFMaxInclusive, \ + eMuxBuiltin##SCOPE##groupScanSMaxExclusive, \ + eMuxBuiltin##SCOPE##groupScanUMaxExclusive, \ + eMuxBuiltin##SCOPE##groupScanFMaxExclusive, \ + eMuxBuiltin##SCOPE##groupScanMulInclusive, \ + eMuxBuiltin##SCOPE##groupScanFMulInclusive, \ + eMuxBuiltin##SCOPE##groupScanMulExclusive, \ + eMuxBuiltin##SCOPE##groupScanFMulExclusive, \ + eMuxBuiltin##SCOPE##groupScanAndInclusive, \ + eMuxBuiltin##SCOPE##groupScanAndExclusive, \ + eMuxBuiltin##SCOPE##groupScanOrInclusive, \ + eMuxBuiltin##SCOPE##groupScanOrExclusive, \ + eMuxBuiltin##SCOPE##groupScanXorInclusive, \ + eMuxBuiltin##SCOPE##groupScanXorExclusive, \ + eMuxBuiltin##SCOPE##groupScanLogicalAndInclusive, \ + eMuxBuiltin##SCOPE##groupScanLogicalAndExclusive, \ + 
eMuxBuiltin##SCOPE##groupScanLogicalOrInclusive,            \
+      eMuxBuiltin##SCOPE##groupScanLogicalOrExclusive,          \
+      eMuxBuiltin##SCOPE##groupScanLogicalXorInclusive,         \
+      eMuxBuiltin##SCOPE##groupScanLogicalXorExclusive
+  GROUP_BUILTINS(Work),
+  eLastMuxWorkgroupCollectiveBuiltin =
+      eMuxBuiltinWorkgroupScanLogicalXorExclusive,
+  GROUP_BUILTINS(Sub),
+  // Extra subgroup shuffle operations
+  eMuxBuiltinSubgroupShuffle,
+  eMuxBuiltinSubgroupShuffleUp,
+  eMuxBuiltinSubgroupShuffleDown,
+  eMuxBuiltinSubgroupShuffleXor,
+  eLastMuxSubgroupCollectiveBuiltin = eMuxBuiltinSubgroupShuffleXor,
+  GROUP_BUILTINS(Vec),
+  eLastMuxVecgroupCollectiveBuiltin =
+      eMuxBuiltinVecgroupScanLogicalXorExclusive,
+
+  // Marker - target builtins should start from here.
+  eFirstTargetBuiltin,
+};
+
+/// @brief Describes the uniformity of a builtin's return values. A uniform
+/// value is the same for all instances (e.g. SIMD lanes).
+enum BuiltinUniformity : int32_t {
+  /// @brief The uniformity of the builtin's return value cannot be determined.
+  eBuiltinUniformityUnknown,
+  /// @brief The builtin never returns uniform values.
+  eBuiltinUniformityNever,
+  /// @brief The builtin always returns uniform values.
+  eBuiltinUniformityAlways,
+  /// @brief The builtin returns uniform values if its inputs are uniform.
+  eBuiltinUniformityLikeInputs,
+  /// @brief The builtin returns a sequential instance ID value
+  /// (e.g. get_local_id in OpenCL).
+  eBuiltinUniformityInstanceID,
+  /// @brief The builtin might return a sequential instance ID value,
+  /// if its argument can be zero (e.g. get_local_id(x)).
+  eBuiltinUniformityMaybeInstanceID
+};
+
+/// @brief Describes certain properties of builtin functions that the
+/// vectorizer needs to know about.
+enum BuiltinProperties : int32_t {
+  /// @brief The builtin has no special property.
+  eBuiltinPropertyNone = 0,
+  /// @brief The builtin returns a value related to the geometry of the work
+  /// space, such as its dimension or an index into those dimensions.
+  eBuiltinPropertyWorkItem = (1 << 0),
+  /// @brief The builtin can affect the execution flow (e.g. barrier).
+  eBuiltinPropertyExecutionFlow = (1 << 1),
+  /// @brief The builtin implements a reduction, that is, it takes vector
+  /// arguments and returns a scalar value.
+  eBuiltinPropertyReduction = (1 << 2),
+  /// @brief The builtin has known side-effects.
+  eBuiltinPropertySideEffects = (1 << 3),
+  /// @brief The builtin is known to have no runtime side-effects. This is
+  /// equivalent to 'readonly' or 'readnone' in IR. The return value depends
+  /// only on the values of the arguments.
+  eBuiltinPropertyNoSideEffects = (1 << 4),
+  /// @brief The builtin can be instantiated, even if it has side-effects.
+  /// Builtins with 'NoSideEffects' should not be instantiated unless they
+  /// also have this flag, because of the 'noduplicate' IR attribute.
+  eBuiltinPropertySupportsInstantiation = (1 << 5),
+  /// @brief The builtin has no vector equivalent. There may be functions that
+  /// have the same signature that a vector equivalent function would have,
+  /// but these functions should not be used for that purpose. This can also
+  /// mean that a vector builtin has no scalar equivalent.
+  eBuiltinPropertyNoVectorEquivalent = (1 << 6),
+  /// @brief The builtin has a vector equivalent. This is used for the LLVM
+  /// intrinsics, since for the OpenCL builtins we can determine that
+  /// programmatically. It can also mean that a builtin has a scalar
+  /// equivalent.
+  eBuiltinPropertyVectorEquivalent = (1 << 7),
+  /// @brief The builtin can be emitted inline.
+  eBuiltinPropertyCanEmitInline = (1 << 8),
+  /// @brief The builtin returns a value through its pointer argument. The
+  /// returned type is equal to the function return type.
+  eBuiltinPropertyPointerReturnEqualRetTy = (1 << 9),
+  /// @brief The builtin wants to be inlined post vectorization
+  eBuiltinPropertyInlinePostVectorization = (1 << 10),
+  /// @brief The builtin returns a value through its pointer argument. The
+  /// returned value is an i32 scalar or vector, matching the function return
+  /// type: float -> i32, <4 x float> -> <4 x i32>, etc
+  eBuiltinPropertyPointerReturnEqualIntRetTy = (1 << 11),
+  /// @brief The builtin returns local work item ID.
+  eBuiltinPropertyLocalID = (1 << 12),
+  /// @brief The builtin is atomic
+  eBuiltinPropertyAtomic = (1 << 13),
+  /// @brief The builtin is rematerializable on the other side of a barrier
+  ///
+  /// The WorkItemLoopsPass queries this property to prune the number of live
+  /// variables that are stored and passed between barrier regions. Calls to
+  /// rematerializable builtins are removed from the live variable structure,
+  /// and are re-inserted into each barrier region that requires their results.
+  eBuiltinPropertyRematerializable = (1 << 14),
+  /// @brief The builtin should be lowered to a mux builtin.
+  ///
+  /// This mapping takes place in BuiltinInfo::lowerBuiltinToMuxBuiltin.
+  eBuiltinPropertyLowerToMuxBuiltin = (1 << 15),
+  /// @brief The builtin is known not to be convergent, i.e., it does not
+  /// depend on any other work-item in any way.
+  eBuiltinPropertyKnownNonConvergent = (1 << 16),
+};
+
+/// @brief struct to hold information about a builtin function
+struct Builtin {
+  /// @brief the builtin Function
+  const llvm::Function &function;
+  /// @brief ID for internal use
+  const BuiltinID ID;
+  /// @brief the Builtin Properties
+  const BuiltinProperties properties;
+  /// @brief list of types used in overloading this builtin (only relevant for
+  /// overloadable mux builtins)
+  std::vector<llvm::Type *> mux_overload_info = {};
+
+  /// @brief returns whether the builtin is unknown
+  bool isUnknown() const { return ID == eBuiltinUnknown; }
+};
+
+/// @brief struct to hold information about a builtin function call
+struct BuiltinCall : public Builtin {
+  /// @brief the call instruction
+  const llvm::CallInst &call;
+  /// @brief the uniformity of the builtin call
+  const BuiltinUniformity uniformity;
+
+  /// @brief constructor
+  BuiltinCall(const Builtin &B, const llvm::CallInst &CI, BuiltinUniformity U)
+      : Builtin(B), call(CI), uniformity(U) {}
+};
+
+namespace MuxBuiltins {
+constexpr const char isftz[] = "__mux_isftz";
+constexpr const char usefast[] = "__mux_usefast";
+constexpr const char isembeddedprofile[] = "__mux_isembeddedprofile";
+constexpr const char get_global_size[] = "__mux_get_global_size";
+constexpr const char get_global_id[] = "__mux_get_global_id";
+constexpr const char get_global_offset[] = "__mux_get_global_offset";
+constexpr const char get_local_size[] = "__mux_get_local_size";
+constexpr const char get_local_id[] = "__mux_get_local_id";
+constexpr const char get_sub_group_id[] = "__mux_get_sub_group_id";
+constexpr const char get_num_groups[] = "__mux_get_num_groups";
+constexpr const char get_num_sub_groups[] = "__mux_get_num_sub_groups";
+constexpr const char get_max_sub_group_size[] = "__mux_get_max_sub_group_size";
+constexpr const char get_group_id[] = "__mux_get_group_id";
+constexpr const char
get_work_dim[] = "__mux_get_work_dim"; +constexpr const char dma_read_1d[] = "__mux_dma_read_1D"; +constexpr const char dma_read_2d[] = "__mux_dma_read_2D"; +constexpr const char dma_read_3d[] = "__mux_dma_read_3D"; +constexpr const char dma_write_1d[] = "__mux_dma_write_1D"; +constexpr const char dma_write_2d[] = "__mux_dma_write_2D"; +constexpr const char dma_write_3d[] = "__mux_dma_write_3D"; +constexpr const char dma_wait[] = "__mux_dma_wait"; +constexpr const char get_global_linear_id[] = "__mux_get_global_linear_id"; +constexpr const char get_local_linear_id[] = "__mux_get_local_linear_id"; +constexpr const char get_enqueued_local_size[] = + "__mux_get_enqueued_local_size"; +constexpr const char get_sub_group_size[] = "__mux_get_sub_group_size"; +constexpr const char get_sub_group_local_id[] = "__mux_get_sub_group_local_id"; + +// Barriers +constexpr const char mem_barrier[] = "__mux_mem_barrier"; +constexpr const char sub_group_barrier[] = "__mux_sub_group_barrier"; +constexpr const char work_group_barrier[] = "__mux_work_group_barrier"; + +// DMA Event Type +constexpr const char dma_event_type[] = "__mux_dma_event_t"; + +// Internal Mux Functions +constexpr const char set_local_id[] = "__mux_set_local_id"; +constexpr const char set_sub_group_id[] = "__mux_set_sub_group_id"; +constexpr const char set_num_sub_groups[] = "__mux_set_num_sub_groups"; +constexpr const char set_max_sub_group_size[] = "__mux_set_max_sub_group_size"; +} // namespace MuxBuiltins + +static inline llvm::Type *getPointerReturnPointeeTy(const llvm::Function &F, + BuiltinProperties Props) { + if (Props & eBuiltinPropertyPointerReturnEqualRetTy) { + return F.getReturnType(); + } + if (Props & eBuiltinPropertyPointerReturnEqualIntRetTy) { + llvm::Type *I32Ty = llvm::IntegerType::getInt32Ty(F.getContext()); + if (auto *VTy = llvm::dyn_cast(F.getReturnType())) { + return llvm::VectorType::get(I32Ty, + multi_llvm::getVectorElementCount(VTy)); + } + return I32Ty; + } + return nullptr; +} + +/// @brief Describes how builtins should be materialized. +enum BuiltinMatFlags : int32_t { + /// @brief Use default materialization options. + eBuiltinMatDefault = 0, + /// @brief The body of the builtin should be materialized. + eBuiltinMatDefinition = (1 << 0) +}; + +class BIMuxInfoConcept; +class BILangInfoConcept; + +/// @brief A class that encapsulates information and transformations concerning +/// compiler builtin functions. +/// +/// It provides methods for querying data about builtin functions, methods for +/// emitting bodies of builtins "inline", and methods for materializing +/// builtins from an external source. +/// +/// It contains a BIMuxInfoConcept implementation to provide mux builtin +/// information on a target-by-target basis. +/// +/// It contains an optional BILangInfoConcept implementation to provide builtin +/// information on a target-by-target basis. +class BuiltinInfo { +public: + // Default-construct a BuiltinInfo without a concrete set of language-level + // builtins. + BuiltinInfo() : MuxImpl(std::make_unique()) {} + + BuiltinInfo(std::unique_ptr &&LangImpl) + : MuxImpl(std::make_unique()), + LangImpl(std::move(LangImpl)) {} + + BuiltinInfo(std::unique_ptr &&MuxImpl, + std::unique_ptr &&LangImpl) + : MuxImpl(std::move(MuxImpl)), LangImpl(std::move(LangImpl)) {} + + BuiltinInfo(BuiltinInfo &&) = default; + BuiltinInfo &operator=(BuiltinInfo &&RHS) = default; + + /// @brief Retrieves the optional module containing builtin definitions. 
+  llvm::Module *getBuiltinsModule();
+
+  /// @brief Determine general properties for the given builtin function.
+  /// @param[in] F Function to analyze.
+  /// @return Analyzed properties for the builtin.
+  std::optional<Builtin> analyzeBuiltin(const llvm::Function &F) const;
+
+  /// @brief Determine general properties for the given builtin function call.
+  /// @param[in] CI Call instruction to analyze.
+  /// @param[in] SimdDimIdx Index of the current vectorization dimension.
+  /// @return Analyzed properties for the builtin call.
+  std::optional<BuiltinCall> analyzeBuiltinCall(const llvm::CallInst &CI,
+                                                unsigned SimdDimIdx) const;
+
+  /// @brief Try to find a builtin function that is a vector equivalent of the
+  /// given function with the given vector width, if it exists.
+  /// @param[in] B Builtin to query for a vector equivalent.
+  /// @param[in] Width Vector width.
+  /// @param[in] M Optional module where the vector equivalent should be
+  /// declared.
+  /// @return Equivalent vector builtin function on success.
+  llvm::Function *getVectorEquivalent(const Builtin &B, unsigned Width,
+                                      llvm::Module *M = nullptr);
+
+  /// @brief Try to find a builtin function that is a scalar equivalent of the
+  /// given function, if it exists.
+  /// @param[in] B Builtin to query for a scalar equivalent.
+  /// @param[in] M Optional module where the scalar equivalent should be
+  /// declared.
+  /// @return Equivalent scalar builtin function on success.
+  llvm::Function *getScalarEquivalent(const Builtin &B, llvm::Module *M);
+
+  /// @brief Emit an inline implementation of the builtin function F.
+  /// @param[in] Builtin Builtin function to emit an implementation for.
+  /// @param[in] B Insertion point for the implementation.
+  /// @param[in] Args Arguments to the builtin function.
+  /// @return A value that implements the builtin function or null.
+  llvm::Value *emitBuiltinInline(llvm::Function *Builtin, llvm::IRBuilder<> &B,
+                                 llvm::ArrayRef<llvm::Value *> Args);
+
+  /// @brief Return a known range of values this call may return.
+  /// @param[in] CI Call instruction to analyze.
+  /// @param[in] MaxLocalSizes The maximum local work-group sizes in each of
+  /// the 3 dimensions that this target supports.
+  /// @param[in] MaxGlobalSizes The maximum global sizes in each of
+  /// the 3 dimensions that this target supports.
+  std::optional<llvm::ConstantRange>
+  getBuiltinRange(llvm::CallInst &CI,
+                  std::array<std::optional<uint64_t>, 3> MaxLocalSizes,
+                  std::array<std::optional<uint64_t>, 3> MaxGlobalSizes) const;
+
+  /// @brief Lowers a call to a language-level builtin to an instruction
+  /// sequence calling a mux builtin.
+  ///
+  /// For a call to a builtin for which the property
+  /// eBuiltinPropertyLowerToMuxBuiltin is set, the target must re-express
+  /// the call as a new sequence, usually involving mux builtins.
+  llvm::Instruction *lowerBuiltinToMuxBuiltin(llvm::CallInst &CI);
+
+  /// @brief Get a builtin for printf.
+  /// @return An identifier for the builtin, or `std::nullopt` if there is
+  /// none. This builtin is expected to have a `printf`-like signature: an
+  /// integer return type, a pointer to the format string as the first
+  /// argument, and varargs.
+  std::optional<BuiltinID> getPrintfBuiltin() const;
+
+  /// @brief Returns true if the given ID is a ComputeMux builtin ID.
+  static bool isMuxBuiltinID(BuiltinID ID) {
+    return ID > eBuiltinUnknown && ID < eFirstTargetBuiltin;
+  }
+
+  /// @brief Returns true if the given ID is an overloadable ComputeMux builtin
+  /// ID.
+  ///
+  /// These builtins *require* extra overloading info when declaring or
+  /// defining.
+  static bool isOverloadableMuxBuiltinID(BuiltinID ID);
+
+  /// @brief Returns true if the given ID is a ComputeMux barrier builtin ID.
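+  /// (namely `__mux_sub_group_barrier` or `__mux_work_group_barrier`).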
+  static bool isMuxControlBarrierID(BuiltinID ID) {
+    return ID == eMuxBuiltinSubGroupBarrier ||
+           ID == eMuxBuiltinWorkGroupBarrier;
+  }
+
+  /// @brief Returns true if the given ID is a ComputeMux DMA builtin ID.
+  static bool isMuxDmaBuiltinID(BuiltinID ID) {
+    return ID == eMuxBuiltinDMAWait || ID == eMuxBuiltinDMARead1D ||
+           ID == eMuxBuiltinDMARead2D || ID == eMuxBuiltinDMARead3D ||
+           ID == eMuxBuiltinDMAWrite1D || ID == eMuxBuiltinDMAWrite2D ||
+           ID == eMuxBuiltinDMAWrite3D;
+  }
+
+  /// @brief Gets information about a mux group operation builtin
+  static std::optional<GroupCollective> isMuxGroupCollective(BuiltinID ID);
+
+  /// @brief Returns the mux builtin ID matching the group collective, or
+  /// `std::nullopt` if there is none.
+  static std::optional<BuiltinID>
+  getMuxGroupCollective(const GroupCollective &Group);
+
+  /// @brief Returns true if the mux builtin has a barrier ID as its first
+  /// operand.
+  static bool isMuxBuiltinWithBarrierID(BuiltinID ID) {
+    if (isMuxControlBarrierID(ID)) {
+      return true;
+    }
+    auto Info = isMuxGroupCollective(ID);
+    return Info && Info->isWorkGroupScope();
+  }
+
+  /// @brief Returns true if the mux builtin has a barrier ID as its first
+  /// operand, and applies at Work Group scope.
+  static bool isMuxBuiltinWithWGBarrierID(BuiltinID ID) {
+    if (ID == eMuxBuiltinWorkGroupBarrier) {
+      return true;
+    }
+    auto Info = isMuxGroupCollective(ID);
+    return Info && Info->isWorkGroupScope();
+  }
+
+  /// @brief Maps a ComputeMux builtin ID to its function name.
+  ///
+  /// @param OverloadInfo An array of types required to resolve certain
+  /// overloadable builtins, e.g., group builtins.
+  static std::string
+  getMuxBuiltinName(BuiltinID ID,
+                    llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+
+  /// @brief Mangles a type using the LLVM intrinsic scheme
+  ///
+  /// This is an extremely simple mangling scheme matching LLVM's intrinsic
+  /// mangling system. It is only designed to be used with a specific set of
+  /// types and is not a general-purpose mangler.
+  ///
+  /// * iXXX -> iXXX
+  /// * half -> f16
+  /// * float -> f32
+  /// * double -> f64
+  /// * <N x Ty> -> vNTy
+  /// * <vscale x N x Ty> -> nxvNTy
+  static std::string getMangledTypeStr(llvm::Type *Ty);
+
+  /// @brief Demangles a type using the LLVM intrinsic scheme - returns nullptr
+  /// if it was unable to demangle a type.
+  ///
+  /// @see getMangledTypeStr
+  static std::pair<llvm::Type *, llvm::StringRef>
+  getDemangledTypeFromStr(llvm::StringRef TyStr, llvm::LLVMContext &Ctx);
+
+  /// @brief Defines the body of a ComputeMux builtin declaration
+  ///
+  /// If the Module already has a function definition with the corresponding
+  /// function name, it is left alone and returned.
+  ///
+  /// Will declare any builtins it requires as transitive dependencies.
+  ///
+  /// @param OverloadInfo An array of types required to resolve certain
+  /// overloadable builtins, e.g., group builtins.
+  llvm::Function *
+  defineMuxBuiltin(BuiltinID, llvm::Module &M,
+                   llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+
+  /// @brief Gets a ComputeMux builtin from the module, or declares it
+  ///
+  /// @param OverloadInfo An array of types required to resolve certain
+  /// overloadable builtins, e.g., group builtins.
+  llvm::Function *
+  getOrDeclareMuxBuiltin(BuiltinID, llvm::Module &M,
+                         llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+
+  struct SchedParamInfo {
+    /// @brief An identifier providing resolution for targets to identify
+    /// specific scheduling parameters.
+    ///
+    /// By default, will be the index into the list returned by
+    /// getMuxSchedulingParameters.
+    unsigned ID;
+    /// @brief The parameter type
+    llvm::Type *ParamTy;
+    /// @brief A (possibly empty) set of parameter attributes to apply to all
+    /// functions featuring this parameter.
+    llvm::AttributeSet ParamAttrs;
+    /// @brief The name of the parameter, to aid debugging. May be empty.
+    std::string ParamName;
+    /// @brief A human-readable name to be emitted in !mux-scheduling-params
+    std::string ParamDebugName;
+    /// @brief True if the parameter is passed externally by the driver to the
+    /// kernel entry point; false if this parameter is initialized by the
+    /// kernel at the top level.
+    ///
+    /// This provides an interface to passes such as AddKernelWrapperPass.
+    ///
+    /// If true, the parameter is passed through every layer of kernels. If
+    /// false, the parameter must be initialized by
+    /// initializeSchedulingParamForWrappedKernel.
+    bool PassedExternally;
+    /// @brief An optional type to aid targets in remembering the underlying
+    /// parameter type, if the parameter is a pointer.
+    llvm::Type *ParamPointeeTy = nullptr;
+    /// @brief An optional value specifying the concrete function argument.
+    llvm::Argument *ArgVal = nullptr;
+  };
+
+  /// @brief Returns a target-specific list of scheduling parameters to be
+  /// applied to all builtins for which requiresSchedulingParameters returns
+  /// true.
+  ///
+  /// This list dictates the order of parameters added to each builtin. As
+  /// such it must be constant and immutable for each Module.
+  ///
+  /// This list is emitted into the module as metadata by the
+  /// AddSchedulingParametersPass for user reference.
+  ///
+  /// This function does not have to fill in SchedParamInfo::ArgVal, as this
+  /// query is not specific to one function.
+  llvm::SmallVector<SchedParamInfo>
+  getMuxSchedulingParameters(llvm::Module &);
+
+  /// @brief Returns target-specific scheduling parameters from a concrete
+  /// function.
+  ///
+  /// Uses metadata returned via
+  /// compiler::utils::getSchedulingParameterFunctionMetadata to determine
+  /// whether the function contains scheduling parameters.
+  ///
+  /// If set, this function should return the same result as
+  /// getMuxSchedulingParameters, but with SchedParamInfo::ArgVal filled in to
+  /// correspond to the actual concrete llvm::Argument values of the given
+  /// function. Note that not all ArgVals are guaranteed to be populated, as a
+  /// function may contain only a subset of the target's list of scheduling
+  /// parameters.
+  ///
+  /// If not set, this function returns an empty list.
+  llvm::SmallVector<SchedParamInfo>
+  getFunctionSchedulingParameters(llvm::Function &);
+
+  /// @brief Responsible for initializing a scheduling parameter for which
+  /// PassedExternally is 'false'.
+  ///
+  /// This is conceptually used to initialize scheduling parameters which are
+  /// used for scheduling "internally" and do not make up the driver-facing
+  /// kernel ABI.
+  ///
+  /// @param Info The SchedParamInfo dictating which kind of scheduling
+  /// parameter to initialize.
+  /// @param B An IRBuilder providing the insertion point at which to insert
+  /// initialization instructions.
+  /// @param IntoF The function into which initialization instructions are to
+  /// be inserted.
+  /// @param CalleeF The function for which the initialization is taking place.
+  /// CalleeF will be called by IntoF.
+  llvm::Value *initializeSchedulingParamForWrappedKernel(
+      const SchedParamInfo &Info, llvm::IRBuilder<> &B, llvm::Function &IntoF,
+      llvm::Function &CalleeF);
+
+  /// @brief Returns true if the builtin ID requires extra scheduling
+  /// parameters to function.
+  ///
+  /// This function only handles mux builtins, and does not defer to any of
+  /// BuiltinInfo's implementation instances.
+  ///
+  /// These parameters will be added to the function (and its callers) by
+  /// the AddSchedulingParametersPass.
+  bool requiresSchedulingParameters(BuiltinID ID);
+
+  /// @brief Returns the remapped type for a target extension type
+  ///
+  /// This method is intended for target implementations to be able to signal
+  /// to the DefineTargetExtTysPass how LLVM's target extension types should be
+  /// remapped across the module. There is a default implementation: see
+  /// BIMuxInfoConcept::getRemappedTargetExtTy
+  ///
+  /// This method is safe to call before LLVM 17 but will do nothing (there are
+  /// no target extension types before LLVM 17). Otherwise this method asserts
+  /// that the type is a target extension type.
+  ///
+  /// @param Ty The target extension type to remap
+  /// @param M The Module in which to replace the type
+  /// @return The remapped type, or nullptr if the type does not require
+  /// remapping
+  llvm::Type *getRemappedTargetExtTy(llvm::Type *Ty, llvm::Module &M);
+
+  /// Handle the invalidation of this information.
+  ///
+  /// When used as a result of BuiltinInfoAnalysis this method will be called
+  /// when the function this was computed for changes. When it returns false,
+  /// the information is preserved across those changes.
+  bool invalidate(llvm::Module &, const llvm::PreservedAnalyses &,
+                  llvm::ModuleAnalysisManager::Invalidator &) {
+    return false;
+  }
+
+private:
+  /// @brief Try to identify a builtin function.
+  /// @param[in] F The function to identify.
+  /// @return Valid builtin ID if the name was identified, as well as any types
+  /// required to overload the builtin ID.
+  std::optional>>
+  identifyMuxBuiltin(const llvm::Function &F) const;
+
+  /// @brief Determine whether the given builtin function returns uniform values
+  /// or not. An optional call instruction can be passed for more accuracy.
+  /// @param[in] B the builtin to analyze uniformity.
+  /// @param[in] CI Optional argument list from a call instruction.
+  /// @param[in] SimdDimIdx Index of current vectorization dimension.
+  /// @return Uniformity value for the builtin.
+  BuiltinUniformity isBuiltinUniform(const Builtin &B, const llvm::CallInst *CI,
+                                     unsigned SimdDimIdx) const;
+
+  std::unique_ptr MuxImpl;
+  std::unique_ptr LangImpl;
+};
+
+/// @brief An interface class that provides mux- and target-specific
+/// information and transformations to an instance of BuiltinInfo. All methods
+/// are to be called through from the equivalent methods in BuiltinInfo.
+class BIMuxInfoConcept {
+public:
+  virtual ~BIMuxInfoConcept() = default;
+
+  /// @brief See BuiltinInfo::defineMuxBuiltin.
+  virtual llvm::Function *
+  defineMuxBuiltin(BuiltinID, llvm::Module &M,
+                   llvm::ArrayRef OverloadInfo = {});
+
+  /// @brief See BuiltinInfo::getOrDeclareMuxBuiltin.
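+  ///
+  /// For illustration only (a sketch; BIMux names an instance of this
+  /// concept), declaring the work-group barrier builtin might look like:
+  /// @code
+  /// llvm::Function *Barrier =
+  ///     BIMux.getOrDeclareMuxBuiltin(eMuxBuiltinWorkGroupBarrier, M);
+  /// @endcode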
+ virtual llvm::Function * + getOrDeclareMuxBuiltin(BuiltinID, llvm::Module &M, + llvm::ArrayRef OverloadInfo = {}); + + /// @brief See BuiltinInfo::getMuxSchedulingParameters + virtual llvm::SmallVector + getMuxSchedulingParameters(llvm::Module &); + + /// @brief See BuiltinInfo::getFunctionSchedulingParameters + virtual llvm::SmallVector + getFunctionSchedulingParameters(llvm::Function &); + + /// @brief See BuiltinInfo::initializeSchedulingParamForWrappedKernel + virtual llvm::Value *initializeSchedulingParamForWrappedKernel( + const BuiltinInfo::SchedParamInfo &Info, llvm::IRBuilder<> &B, + llvm::Function &IntoF, llvm::Function &CalleeF); + + /// @brief Sets default builtin attributes on the given function. + static void setDefaultBuiltinAttributes(llvm::Function &F, + bool AlwaysInline = true); + + /// @brief Returns true if the mux builtin requires scheduling parameters to + /// function. + virtual bool requiresSchedulingParameters(BuiltinID); + + /// @brief See BuiltinInfo::getRemappedTargetExtTy + /// + /// This method is overridable but the default implementation provides the + /// following mappings: + /// * spirv.Event -> i32 + /// * spirv.Sampler -> i32 + /// * spirv.Image -> MuxImage* (regardless of image parameters) + virtual llvm::Type *getRemappedTargetExtTy(llvm::Type *Ty, llvm::Module &M); + + /// @see BuiltinInfo::getBuiltinRange + virtual std::optional + getBuiltinRange(llvm::CallInst &, BuiltinID ID, + std::array, 3>, + std::array, 3>) const; + + enum MemScope : uint32_t { + MemScopeCrossDevice = 0, + MemScopeDevice = 1, + MemScopeWorkGroup = 2, + MemScopeSubGroup = 3, + MemScopeWorkItem = 4, + }; + + enum MemSemantics : uint32_t { + // Only set one of the following bits at a time: + MemSemanticsRelaxed = 0x0, + MemSemanticsAcquire = 0x2, + MemSemanticsRelease = 0x4, + MemSemanticsAcquireRelease = 0x8, + MemSemanticsSequentiallyConsistent = 0x10, + MemSemanticsMask = 0x1F, + // What kind of memory is controlled by a barrier + MemSemanticsSubGroupMemory = 0x80, + MemSemanticsWorkGroupMemory = 0x100, + MemSemanticsCrossWorkGroupMemory = 0x200, + }; + +protected: + llvm::Function *defineGetGlobalId(llvm::Module &M); + llvm::Function *defineGetGlobalSize(llvm::Module &M); + llvm::Function *defineGetLocalLinearId(llvm::Module &M); + llvm::Function *defineGetGlobalLinearId(llvm::Module &M); + llvm::Function *defineGetEnqueuedLocalSize(llvm::Module &M); + llvm::Function *defineMemBarrier(llvm::Function &F, unsigned ScopeIdx, + unsigned SemanticsIdx); + llvm::Function *defineGetSubGroupSize(llvm::Function &F); + llvm::Function *defineGetSubGroupLocalId(llvm::Function &F); + /// @brief Provides a default implementation for `__mux_dma_read_1D` and + /// `__mux_dma_write_1D`. + /// + /// These routines are not intended to be efficient for a + /// particular architecture and are really a placeholder for customers until + /// they are ready to define these functions with DMA calls. They are + /// essentially a memcpy. + llvm::Function *defineDMA1D(llvm::Function &F); + /// @brief Provides a default implementation for `__mux_dma_read_2D` + /// and `__mux_dma_write_2D`. + /// + /// These routines are not intended to be efficient for a + /// particular architecture and are really a placeholder for customers until + /// they are ready to define these functions with DMA calls. They are + /// essentially a memcpy. + llvm::Function *defineDMA2D(llvm::Function &F); + /// @brief Provides a default implementation for `__mux_dma_read_3D` + /// and `__mux_dma_write_3D`. 
+  ///
+  /// These routines are not intended to be efficient for a
+  /// particular architecture and are really a placeholder for customers until
+  /// they are ready to define these functions with DMA calls. They are
+  /// essentially a memcpy.
+  llvm::Function *defineDMA3D(llvm::Function &F);
+  /// @brief Provides a default implementation for `__mux_dma_wait`.
+  ///
+  /// This routine is not intended to be efficient for a
+  /// particular architecture and is really a placeholder for customers until
+  /// they are ready to define this function with DMA calls. This
+  /// implementation does nothing and simply returns.
+  llvm::Function *defineDMAWait(llvm::Function &F);
+};
+
+/// @brief An interface class that provides language-specific information and
+/// transformations to an instance of BuiltinInfo. All methods are to be called
+/// through from the equivalent methods in BuiltinInfo.
+class BILangInfoConcept {
+public:
+  virtual ~BILangInfoConcept() = default;
+
+  /// @see BuiltinInfo::getBuiltinsModule
+  virtual llvm::Module *getBuiltinsModule() { return nullptr; }
+  /// @see BuiltinInfo::analyzeBuiltin
+  virtual std::optional
+  analyzeBuiltin(const llvm::Function &F) const = 0;
+  /// @see BuiltinInfo::isBuiltinUniform
+  virtual BuiltinUniformity isBuiltinUniform(const Builtin &B,
+                                             const llvm::CallInst *,
+                                             unsigned) const = 0;
+  /// @see BuiltinInfo::getVectorEquivalent
+  virtual llvm::Function *getVectorEquivalent(const Builtin &B, unsigned Width,
+                                              llvm::Module *M = nullptr) = 0;
+  /// @see BuiltinInfo::getScalarEquivalent
+  virtual llvm::Function *getScalarEquivalent(const Builtin &B,
+                                              llvm::Module *M) = 0;
+  /// @see BuiltinInfo::emitBuiltinInline
+  virtual llvm::Value *
+  emitBuiltinInline(llvm::Function *Builtin, llvm::IRBuilder<> &B,
+                    llvm::ArrayRef Args) = 0;
+  /// @see BuiltinInfo::getBuiltinRange
+  virtual std::optional
+  getBuiltinRange(llvm::CallInst &, std::array, 3>,
+                  std::array, 3>) const {
+    return std::nullopt;
+  }
+
+  /// @see BuiltinInfo::lowerBuiltinToMuxBuiltin
+  virtual llvm::Instruction *lowerBuiltinToMuxBuiltin(llvm::CallInst &,
+                                                      BIMuxInfoConcept &) {
+    return nullptr;
+  }
+  /// @see BuiltinInfo::getPrintfBuiltin
+  virtual std::optional getPrintfBuiltin() const = 0;
+};
+
+/// @brief Caches and returns the BuiltinInfo for a Module.
+class BuiltinInfoAnalysis
+    : public llvm::AnalysisInfoMixin {
+  friend AnalysisInfoMixin;
+
+public:
+  using Result = BuiltinInfo;
+  using CallbackFn = std::function;
+
+  BuiltinInfoAnalysis();
+
+  BuiltinInfoAnalysis(CallbackFn BICallback) : BICallback(BICallback) {}
+
+  /// @brief Retrieve the BuiltinInfo for the requested module.
+  Result run(llvm::Module &M, llvm::ModuleAnalysisManager &) {
+    return BICallback(M);
+  }
+
+  /// @brief Return the name of the pass.
+  static llvm::StringRef name() { return "BuiltinInfo analysis"; }
+
+private:
+  /// @brief Unique pass identifier.
+  static llvm::AnalysisKey Key;
+
+  /// @brief Callback function producing a BuiltinInfo on demand.
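+  ///
+  /// For example (a sketch only; the BuiltinInfo constructor arguments are
+  /// target-specific and assumed here):
+  /// @code
+  /// BuiltinInfoAnalysis BIA([](llvm::Module &M) {
+  ///   return BuiltinInfo(/* target-specific mux and language impls */);
+  /// });
+  /// @endcode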
+ CallbackFn BICallback; +}; + +/// @} +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_BUILTIN_INFO_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h new file mode 100644 index 0000000000000..16be8450d5124 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h @@ -0,0 +1,216 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief OpenCL's BuiltinInfo implementation. + +#ifndef COMPILER_UTILS_CL_BUILTIN_INFO_H_INCLUDED +#define COMPILER_UTILS_CL_BUILTIN_INFO_H_INCLUDED + +#include +#include + +namespace compiler { +namespace utils { +/// @addtogroup utils +/// @{ + +/// @brief Convenience function for constructing a CLBuiltinInfo as a unique_ptr +/// @param[in] builtins the Builtin module +/// @return a std::unique_ptr to a new CLBuiltinInfo +std::unique_ptr createCLBuiltinInfo(llvm::Module *builtins); + +/// @brief Builtin loader base class. +class CLBuiltinLoader { +protected: + CLBuiltinLoader() = default; + +public: + virtual ~CLBuiltinLoader() = default; + + /// @brief Load a builtin function. + /// @param[in] BuiltinName Name of the builtin function to materialize. + /// @param[in] DestM Optional module in which to load the builtin function. + /// @param[in] Flags Materialization flags to use. + /// @return Pointer to the materialized builtin function on success. + /// If a module is passed, the returned builtin function must live in + /// that module. + virtual llvm::Function *materializeBuiltin(llvm::StringRef BuiltinName, + llvm::Module *DestM, + BuiltinMatFlags Flags); + + /// @brief Expose any builtins Module + virtual llvm::Module *getBuiltinsModule() { return nullptr; } +}; + +/// @brief Simple Builtin loader wrapping a given builtins module. +class SimpleCLBuiltinLoader final : public CLBuiltinLoader { +public: + SimpleCLBuiltinLoader(llvm::Module *builtins) : BuiltinModule(builtins) {} + + ~SimpleCLBuiltinLoader() = default; + + /// @brief Expose any builtins Module + virtual llvm::Module *getBuiltinsModule() override { return BuiltinModule; } + +private: + /// @brief Loaded builtins module. + llvm::Module *BuiltinModule; +}; + +/// @brief A class that encapsulates information and transformations concerning +/// compiler OpenCL builtin functions. 
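+///
+/// A typical way to obtain one is via createCLBuiltinInfo above (a sketch; a
+/// null builtins module is assumed acceptable when nothing needs
+/// materializing):
+/// @code
+/// auto CLInfo = createCLBuiltinInfo(/*builtins*/ nullptr);
+/// @endcode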
+class CLBuiltinInfo : public BILangInfoConcept { +public: + /// @brief Constructs a CLBuiltinInfo from a given Builtins module + CLBuiltinInfo(llvm::Module *Builtins); + + /// @brief Constructs a CLBuiltinInfo with a user-provided loader + CLBuiltinInfo(std::unique_ptr L) : Loader(std::move(L)) {} + + ~CLBuiltinInfo(); + + llvm::Module *getBuiltinsModule() override; + + /// @see BuiltinInfo::isBuiltinUniform + BuiltinUniformity isBuiltinUniform(const Builtin &B, const llvm::CallInst *CI, + unsigned SimdDimIdx) const override; + + /// @see BuiltinInfo::analyzeBuiltin + std::optional analyzeBuiltin(const llvm::Function &F) const override; + /// @see BuiltinInfo::getVectorEquivalent + llvm::Function *getVectorEquivalent(const Builtin &B, unsigned Width, + llvm::Module *M = nullptr) override; + /// @see BuiltinInfo::getScalarEquivalent + llvm::Function *getScalarEquivalent(const Builtin &B, + llvm::Module *M) override; + /// @see BuiltinInfo::emitBuiltinInline + llvm::Value *emitBuiltinInline(llvm::Function *Builtin, llvm::IRBuilder<> &B, + llvm::ArrayRef Args) override; + + /// @see BuiltinInfo::lowerBuiltinToMuxBuiltin + llvm::Instruction *lowerBuiltinToMuxBuiltin(llvm::CallInst &, + BIMuxInfoConcept &) override; + /// @see BuiltinInfo::getPrintfBuiltin + std::optional getPrintfBuiltin() const override; + +private: + std::optional identifyBuiltin(const llvm::Function &) const; + + llvm::Function * + materializeBuiltin(llvm::StringRef BuiltinName, llvm::Module *DestM = nullptr, + BuiltinMatFlags Flags = eBuiltinMatDefault); + + llvm::Instruction *lowerGroupBuiltinToMuxBuiltin(llvm::CallInst &CI, + BuiltinID ID, + BIMuxInfoConcept &BIMuxImpl); + llvm::Instruction *lowerAsyncBuiltinToMuxBuiltin(llvm::CallInst &CI, + BuiltinID ID, + BIMuxInfoConcept &BIMuxImpl); + + llvm::Value *emitBuiltinInline(BuiltinID ID, llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineAsLLVMBinaryIntrinsic(llvm::IRBuilder<> &B, + llvm::Value *LHS, + llvm::Value *RHS, + llvm::Intrinsic::ID ID); + // 6.2 Conversions & Type Casting + llvm::Value *emitBuiltinInlineAs(llvm::Function *F, llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineConvert(llvm::Function *F, BuiltinID ID, + llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + + // 6.11.5 Geometric Built-in Functions + llvm::Value *emitBuiltinInlineGeometrics(BuiltinID builtinID, + llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineDot(llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineCross(llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineLength(llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineNormalize(llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + + // 6.11.6 Relational Built-in Functions + llvm::Value *emitBuiltinInlineRelationalsWithTwoArguments( + BuiltinID BuiltinID, llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineRelationalsWithOneArgument(BuiltinID BuiltinID, + llvm::IRBuilder<> &B, + llvm::Value *Arg); + llvm::Value *emitBuiltinInlineAll(llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineAny(llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineSelect(llvm::Function *F, llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + + // 6.11.7 Vector Data Load/Store Functions + llvm::Value *emitBuiltinInlineVLoad(llvm::Function *F, unsigned Width, + llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value 
*emitBuiltinInlineVStore(llvm::Function *F, unsigned Width,
+                                       llvm::IRBuilder<> &B,
+                                       llvm::ArrayRef Args);
+  llvm::Value *emitBuiltinInlineVLoadHalf(llvm::Function *F,
+                                          llvm::IRBuilder<> &B,
+                                          llvm::ArrayRef Args);
+  llvm::Value *emitBuiltinInlineVStoreHalf(llvm::Function *F,
+                                           llvm::StringRef Mode,
+                                           llvm::IRBuilder<> &B,
+                                           llvm::ArrayRef Args);
+
+  // 6.11.12 Miscellaneous Vector Functions
+  llvm::Value *emitBuiltinInlineShuffle(BuiltinID BuiltinID,
+                                        llvm::IRBuilder<> &B,
+                                        llvm::ArrayRef Args);
+
+  llvm::Value *emitBuiltinInlinePrintf(BuiltinID BuiltinID,
+                                       llvm::IRBuilder<> &B,
+                                       llvm::ArrayRef Args);
+
+  /// @brief Return the name of the builtin with the given identifier.
+  /// @param[in] ID Identifier of the builtin whose name to return.
+  /// @return Name of the builtin.
+  llvm::StringRef getBuiltinName(BuiltinID ID) const;
+
+  /// @brief Declare the specified OpenCL builtin in the given module.
+  /// @param[in] M Module in which to declare the builtin.
+  /// @param[in] ID Builtin identifier.
+  /// @param[in] RetTy Return type for the builtin.
+  /// @param[in] ArgTys List of argument types.
+  /// @param[in] ArgQuals List of argument qualifiers.
+  /// @param[in] Suffix Optional builtin name suffix.
+  /// @return Builtin function declaration.
+  llvm::Function *declareBuiltin(llvm::Module *M, BuiltinID ID,
+                                 llvm::Type *RetTy,
+                                 llvm::ArrayRef ArgTys,
+                                 llvm::ArrayRef ArgQuals,
+                                 llvm::Twine Suffix = "");
+
+  /// @brief BuiltinLoader used to load builtins.
+  std::unique_ptr Loader;
+};
+
+/// @}
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_CL_BUILTIN_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h
new file mode 100644
index 0000000000000..af33fbce17788
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h
@@ -0,0 +1,36 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+
+#ifndef COMPILER_UTILS_DEFINE_MUX_BUILTINS_PASS_H_INCLUDED
+#define COMPILER_UTILS_DEFINE_MUX_BUILTINS_PASS_H_INCLUDED
+
+#include 
+
+namespace compiler {
+namespace utils {
+
+class DefineMuxBuiltinsPass final
+    : public llvm::PassInfoMixin {
+public:
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+};
+
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_DEFINE_MUX_BUILTINS_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
new file mode 100644
index 0000000000000..c1002430aadc1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
@@ -0,0 +1,125 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Device information used by the compiler.
+
+#ifndef COMPILER_UTILS_DEVICE_INFO_H_INCLUDED
+#define COMPILER_UTILS_DEVICE_INFO_H_INCLUDED
+
+#include 
+
+#include 
+
+namespace compiler {
+namespace utils {
+
+/// @brief Bitfield of all possible floating point capabilities.
+///
+/// Each Mux device struct has a member which denotes the floating point
+/// capabilities of that device, as a bitfield of the following enum.
+///
+/// NOTE: Must be kept in sync with mux_floating_point_capabilities_e in
+/// mux/include/mux/mux.h! This should probably be placed in an intermediary
+/// mux/compiler library and shared.
+enum device_floating_point_capabilities_e {
+  /// @brief Denormals supported.
+  device_floating_point_capabilities_denorm = 0x1,
+  /// @brief INF and NaN are supported.
+  device_floating_point_capabilities_inf_nan = 0x2,
+  /// @brief Round to nearest even supported.
+  device_floating_point_capabilities_rte = 0x4,
+  /// @brief Round to zero supported.
+  device_floating_point_capabilities_rtz = 0x8,
+  /// @brief Round to positive infinity supported.
+  device_floating_point_capabilities_rtp = 0x10,
+  /// @brief Round to negative infinity supported.
+  device_floating_point_capabilities_rtn = 0x20,
+  /// @brief Fused multiply add supported.
+  device_floating_point_capabilities_fma = 0x40,
+  /// @brief Floating point operations are written in software.
+  device_floating_point_capabilities_soft = 0x80,
+  /// @brief Binary format conforms to the IEEE-754 specification.
+ device_floating_point_capabilities_full = 0x100 +}; + +struct DeviceInfo { + DeviceInfo() = default; + + /// @brief Construct a DeviceInfo from individual properties + /// + /// @param h Enumeration of half-precision floating-point capabilities + /// @param f Enumeration of single-precision floating-point capabilities + /// @param d Enumeration of double-precision floating-point capabilities + /// @param max_work_width The maximum number of work-items of a work-group + /// allowed to execute in one invocation of a kernel. + DeviceInfo(uint32_t h, uint32_t f, uint32_t d, uint32_t max_work_width) + : half_capabilities(h), float_capabilities(f), double_capabilities(d), + max_work_width(max_work_width) {} + + uint32_t half_capabilities = 0; + uint32_t float_capabilities = 0; + uint32_t double_capabilities = 0; + uint32_t max_work_width = 0; + + /// @brief List of supported 'required' sub-group sizes reported by this + /// device. + /// + /// These are only the sub-group sizes that can be requested as 'required' for + /// a kernel; the compiler may produce a wide range of other sub-group sizes + /// on undecorated kernels, assuming sub-groups are supported by the device. + std::vector reqd_sub_group_sizes; + + /// @brief Handle invalidation events from the new pass manager. + /// + /// @return false, as this analysis can never be invalidated. + bool invalidate(llvm::Module &, const llvm::PreservedAnalyses &, + llvm::ModuleAnalysisManager::Invalidator &) { + return false; + } +}; + +/// @brief Caches and returns the device information for a Module. +class DeviceInfoAnalysis : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + using Result = DeviceInfo; + + DeviceInfoAnalysis() = default; + DeviceInfoAnalysis(Result res) : Info(res) {} + + /// @brief Retrieve the DeviceInfo for the requested module. + Result run(llvm::Module &, llvm::ModuleAnalysisManager &) { + return Info ? *Info : Result(); + } + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Device info analysis"; } + +private: + /// @brief Optional device information + std::optional Info; + + /// @brief Unique pass identifier. + static llvm::AnalysisKey Key; +}; + +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_DEVICE_INFO_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h new file mode 100644 index 0000000000000..815188761f272 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h @@ -0,0 +1,91 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// LLVM DMA pass utility functions. 
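+///
+/// As a usage sketch (block and function names here are illustrative, not
+/// part of this header), a default DMA implementation might guard the actual
+/// transfer so that it runs once per work-group:
+/// @code
+/// // Branch to copyBlock on work-item (0, 0, 0), otherwise to exitBlock.
+/// buildThreadCheck(entryBlock, copyBlock, exitBlock, *getLocalIDFn);
+/// @endcode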
+
+#ifndef COMPILER_UTILS_DMA_H_INCLUDED
+#define COMPILER_UTILS_DMA_H_INCLUDED
+
+#include 
+#include 
+
+#include 
+
+namespace llvm {
+class BasicBlock;
+class Module;
+class Value;
+} // namespace llvm
+
+namespace compiler {
+namespace utils {
+
+class BIMuxInfoConcept;
+
+/// @addtogroup utils
+/// @{
+
+/// @brief Helper function to check the local ID of the current thread.
+///
+/// @param[in] bb Basic block to generate the check in.
+/// @param[in] x The local id in the x dimension to compare against.
+/// @param[in] y The local id in the y dimension to compare against.
+/// @param[in] z The local id in the z dimension to compare against.
+/// @param[in] GetLocalIDFn Function used to get the local work-item ID
+///
+/// @return A true Value if the local ID equals that passed via the index
+/// arguments, false otherwise.
+llvm::Value *isThreadEQ(llvm::BasicBlock *bb, unsigned x, unsigned y,
+                        unsigned z, llvm::Function &GetLocalIDFn);
+
+/// @brief Helper function to check if the local ID of the current thread is
+/// {0, 0, 0}.
+///
+/// @param[in] bb Basic block to generate the check in.
+/// @param[in] GetLocalIDFn Function used to get the local work-item ID
+///
+/// @return A true Value if the local ID is {0, 0, 0}, false otherwise.
+llvm::Value *isThreadZero(llvm::BasicBlock *bb, llvm::Function &GetLocalIDFn);
+
+/// @brief Insert 'thread-checking' logic in the entry block, so that control
+/// branches to the 'true' block when the current work-item is the first in the
+/// work-group (i.e. ID zero in all dimensions), or to the 'false' block for
+/// all other work-items.
+///
+/// @param[in] entryBlock Block to insert the 'thread-checking' logic
+/// @param[in] trueBlock Block to execute only on the first work-item
+/// @param[in] falseBlock Block to execute on all other work-items
+/// @param[in] GetLocalIDFn Function used to get the local work-item ID
+void buildThreadCheck(llvm::BasicBlock *entryBlock, llvm::BasicBlock *trueBlock,
+                      llvm::BasicBlock *falseBlock,
+                      llvm::Function &GetLocalIDFn);
+
+/// @brief Gets or creates the __mux_dma_event_t type.
+///
+/// This type may be declared by other passes hence we "get or create it".
+///
+/// @param[in] m LLVM Module to get or create the type in.
+///
+/// @return The opaque struct declaration of the __mux_dma_event_t type.
+llvm::StructType *getOrCreateMuxDMAEventType(llvm::Module &m);
+
+/// @}
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_DMA_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h
new file mode 100644
index 0000000000000..261a5bbc7d4f8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h
@@ -0,0 +1,60 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// EncodeKernelMetadataPass pass. + +#ifndef COMPILER_UTILS_ENCODE_KERNEL_METADATA_PASS_H_INCLUDED +#define COMPILER_UTILS_ENCODE_KERNEL_METADATA_PASS_H_INCLUDED + +#include +#include + +#include + +namespace compiler { +namespace utils { + +/// @brief Sets up the per-function mux metadata used by later passes. +/// Transfers per-module !opencl.kernel metadata to mux kernel metadata. +struct TransferKernelMetadataPass + : public llvm::PassInfoMixin { + explicit TransferKernelMetadataPass() {} + + llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM); +}; + +struct EncodeKernelMetadataPassOptions { + std::string KernelName; + std::optional> LocalSizes = std::nullopt; +}; + +struct EncodeKernelMetadataPass + : public llvm::PassInfoMixin { + EncodeKernelMetadataPass(EncodeKernelMetadataPassOptions Options) + : KernelName(Options.KernelName), LocalSizes(Options.LocalSizes) {} + + llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM); + +private: + std::string KernelName; + std::optional> LocalSizes; +}; +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_ENCODE_KERNEL_METADATA_PASS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h new file mode 100644 index 0000000000000..fcbd07825fb22 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h @@ -0,0 +1,112 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// Helper functions for working with sub_group and work_group functions. + +#ifndef COMPILER_UTILS_GROUP_COLLECTIVE_HELPERS_H_INCLUDED +#define COMPILER_UTILS_GROUP_COLLECTIVE_HELPERS_H_INCLUDED + +#include + +namespace llvm { +class Constant; +class Function; +class Type; +} // namespace llvm + +namespace compiler { +namespace utils { +/// @brief Utility function for retrieving the neutral value of a +/// reduction/scan operation. A neutral value is one that does not affect the +/// result of a given operation, e.g., adding 0 or multiplying by 1. +/// +/// @param[in] Kind The kind of scan/reduction operation +/// @param[in] Ty The type of the returned neutral value. Must match the type +/// assumed by @a Kind, e.g., a floating-point type for floating-point +/// operations. +/// +/// @return The neutral value, or nullptr if unhandled. 
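+///
+/// For example, the neutral value of an integer add reduction is 0, of a
+/// multiply is 1, and of a floating-point add is -0.0 (contrast with
+/// getIdentityVal below). A sketch, where Int32Ty is an assumed i32 type:
+/// @code
+/// llvm::Constant *Zero = getNeutralVal(llvm::RecurKind::Add, Int32Ty);
+/// @endcode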
+llvm::Constant *getNeutralVal(llvm::RecurKind Kind, llvm::Type *Ty);
+
+/// @brief Utility function for retrieving the identity value of a
+/// reduction/scan operation. The identity value is one that is expected to be
+/// found in the first element of an exclusive scan. It is equal to the neutral
+/// value (see @ref getNeutralVal) in all cases except in floating-point
+/// min/max, where -INF/+INF is the expected identity, and in floating-point
+/// addition, where 0.0 (not -0.0, which is the neutral value) is the expected
+/// identity.
+///
+/// @param[in] Kind The kind of scan/reduction operation
+/// @param[in] Ty The type of the returned neutral value. Must match the type
+/// assumed by @a Kind, e.g., a floating-point type for floating-point
+/// operations.
+///
+/// @return The identity value, or nullptr if unhandled.
+llvm::Constant *getIdentityVal(llvm::RecurKind Kind, llvm::Type *Ty);
+
+/// @brief Represents a work-group or sub-group collective operation.
+struct GroupCollective {
+  /// @brief The different operation types a group collective can represent.
+  enum class OpKind {
+    All,
+    Any,
+    Reduction,
+    ScanInclusive,
+    ScanExclusive,
+    Broadcast,
+    Shuffle,
+    ShuffleUp,
+    ShuffleDown,
+    ShuffleXor,
+  };
+
+  /// @brief The possible scopes of a group collective.
+  enum class ScopeKind { WorkGroup, SubGroup, VectorGroup };
+
+  /// @brief The operation type of the group collective.
+  OpKind Op = OpKind::All;
+  /// @brief The scope of the group collective operation.
+  ScopeKind Scope = ScopeKind::WorkGroup;
+  /// @brief The llvm recurrence operation this can be mapped to. For broadcasts
+  /// this will be None.
+  llvm::RecurKind Recurrence = llvm::RecurKind::None;
+  /// @brief True if the operation is logical, rather than bitwise.
+  bool IsLogical = false;
+  /// @brief Returns true for Any/All type collective operations.
+  bool isAnyAll() const { return Op == OpKind::Any || Op == OpKind::All; }
+  /// @brief Returns true for inclusive/exclusive scan collective operations.
+  bool isScan() const {
+    return Op == OpKind::ScanExclusive || Op == OpKind::ScanInclusive;
+  }
+  /// @brief Returns true for reduction collective operations.
+  bool isReduction() const { return Op == OpKind::Reduction; }
+  /// @brief Returns true for broadcast collective operations.
+  bool isBroadcast() const { return Op == OpKind::Broadcast; }
+  /// @brief Returns true for shuffle-like collective operations.
+  bool isShuffleLike() const {
+    return Op == OpKind::Shuffle || Op == OpKind::ShuffleUp ||
+           Op == OpKind::ShuffleDown || Op == OpKind::ShuffleXor;
+  }
+  /// @brief Returns true for sub-group collective operations.
+  bool isSubGroupScope() const { return Scope == ScopeKind::SubGroup; }
+  /// @brief Returns true for work-group collective operations.
+  bool isWorkGroupScope() const { return Scope == ScopeKind::WorkGroup; }
+};
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_GROUP_COLLECTIVE_HELPERS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
new file mode 100644
index 0000000000000..66e6a89bd5d43
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
@@ -0,0 +1,408 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Simple function mangling framework.
+
+#ifndef COMPILER_UTILS_MANGLING_H_INCLUDED
+#define COMPILER_UTILS_MANGLING_H_INCLUDED
+
+#include 
+#include 
+#include 
+
+#include 
+
+namespace llvm {
+class LLVMContext;
+class Type;
+class raw_ostream;
+} // namespace llvm
+
+namespace compiler {
+namespace utils {
+/// @brief Describes type qualifiers, which are aspects that need to be taken
+/// into account when mangling function names. Such aspects are not represented
+/// in the LLVM type. This is why such qualifiers need to be used alongside
+/// types.
enum TypeQualifier : int32_t {
+  /// @brief The type has no special qualifier.
+  eTypeQualNone = 0,
+  /// @brief The type is a signed integer.
+  eTypeQualSignedInt = 1,
+  /// @brief The type is a constant pointer.
+  eTypeQualPointerConst = 2,
+  /// @brief The type is a volatile pointer.
+  eTypeQualPointerVolatile = 4,
+  /// @brief The type is a restrict pointer.
+  eTypeQualPointerRestrict = 8
+};
+
+/// @brief Contains a small hierarchical list of TypeQualifier.
+///
+/// This hierarchy maps to derived types such as pointers or vectors:
+/// * First qualifier for the pointer type.
+/// * Second qualifier for the pointed-to type.
+class TypeQualifiers final {
+  using StorageT = uint64_t;
+
+public:
+  /// @brief Create a type qualifier list with no qualifiers.
+  TypeQualifiers();
+  /// @brief Create a type qualifier list with one qualifier.
+  ///
+  /// @param[in] Qual First qualifier.
+  TypeQualifiers(TypeQualifier Qual);
+  /// @brief Create a type qualifier list with two qualifiers.
+  ///
+  /// @param[in] Qual1 First qualifier.
+  /// @param[in] Qual2 Second qualifier.
+  TypeQualifiers(TypeQualifier Qual1, TypeQualifier Qual2);
+
+  /// @brief Create a type qualifier list with one qualifier.
+  /// @note Convenience function that allows bit manipulation of qualifiers.
+  ///
+  /// @param[in] Qual First qualifier.
+  TypeQualifiers(unsigned Qual);
+  /// @brief Create a type qualifier list with two qualifiers.
+  /// @note Convenience function that allows bit manipulation of qualifiers.
+  ///
+  /// @param[in] Qual1 First qualifier.
+  /// @param[in] Qual2 Second qualifier.
+  TypeQualifiers(unsigned Qual1, unsigned Qual2);
+
+  /// @brief Number of type qualifiers contained in the list.
+  StorageT getCount() const;
+
+  /// @brief Top-most qualifier from the list.
+  TypeQualifier front() const;
+
+  /// @brief Remove the top-most qualifier from the list and return it.
+  TypeQualifier pop_front();
+
+  /// @brief Return the qualifier at the given index.
+  TypeQualifier at(unsigned Idx) const;
+
+  /// @brief Add a qualifier to the list, making it bottom-most.
+  ///
+  /// @param[in] Qual Qualifier to add to the list.
+  ///
+  /// @return true if there was enough space to add the qualifier, or false.
+  bool push_back(TypeQualifier Qual);
+  /// @brief Add a qualifier to the list, making it bottom-most.
+  /// @note Convenience function that allows bit manipulation of qualifiers.
+ /// + /// @param[in] Qual Qualifier to add to the list. + /// + /// @return true if there was enough space to add the qualifier, or false. + bool push_back(unsigned Qual); + /// @brief Add qualifiers to the end of the list. + /// + /// @param[in] Quals Qualifiers to add to the list. + /// + /// @return true if there was enough space to add the qualifiers, or false. + bool push_back(TypeQualifiers Quals); + + /// @brief Determine whether two qualifier lists are equal. + bool operator==(const TypeQualifiers &other) { + return storage_ == other.storage_; + } + + /// @brief Determine whether two qualifier lists are different. + bool operator!=(const TypeQualifiers &other) { return !(*this == other); } + +private: + /// @brief Set the number of type qualifiers contained in the list. + void setCount(StorageT newCount); + + /// @brief Bits that make up the list. Deliberately small to pass by value. + StorageT storage_; + + /// @brief Number of bits used to encode the size of the list. + static const unsigned NumCountBits = 4; + + /// @brief Number of bits used to encode one qualifier in the list. + static const unsigned NumQualBits = 10; + + /// @brief Number of bits that can be used to store the list. + static const unsigned NumStorageBits = sizeof(StorageT) * 8; + + /// @brief Maximum size of the list. + static const unsigned MaxSize = (NumStorageBits - NumCountBits) / NumQualBits; + + static_assert(MaxSize < (1 << NumCountBits) - 1, "MaxSize cannot be encoded"); +}; + +/// @brief Helps with light parsing such as demangling function names. +class Lexer final { +public: + /// @brief Create a new lexer with the given text. + /// + /// @param[in] text Text to lex. + Lexer(llvm::StringRef text); + + /// @brief Number of characters left to lex. + unsigned Left() const; + /// @brief Current lexing position in the text. + unsigned CurrentPos() const; + /// @brief String containing the text remaining to be lexed. + llvm::StringRef TextLeft() const; + /// @brief Current character. + /// @return Character or negative value if no text is left. + int Current() const; + + /// @brief Consume one character, advancing to the next character in the + /// string. + /// @return true if a character was consumed, false if no text left. + bool Consume(); + /// @brief Consume several characters, advancing through the string. + /// + /// @param[in] Size Number of characters to consume. + /// + /// @return true if Size characters were consumed, false otherwise. + bool Consume(unsigned Size); + /// @brief Consume a string, and skip past it. + /// + /// @param[in] Pattern String to consume. + /// + /// @return true if Pattern was found and consumed, false otherwise. + bool Consume(llvm::StringRef Pattern); + /// @brief Consume an unsigned integer, and skip past it. + /// + /// @param[out] Result Consumed unsigned integer. + /// + /// @return true if an unsigned integer was consumed, false otherwise. + bool ConsumeInteger(unsigned &Result); + /// @brief Consume a signed integer, and skip past it. + /// + /// @param[out] Result Consumed signed integer. + /// + /// @return true if a signed integer was consumed, false otherwise. + bool ConsumeSignedInteger(int &Result); + /// @brief Consume consecutive alphabetic characters and skip past them. + /// + /// @param[out] Result Consumed string. + /// + /// @return true if an alphabetic string was consumed, false otherwise. + bool ConsumeAlpha(llvm::StringRef &Result); + /// @brief Consume consecutive alphanumeric characters and skip past them. 
+  ///
+  /// @param[out] Result Consumed string.
+  ///
+  /// @return true if an alphanumeric string was consumed, false otherwise.
+  bool ConsumeAlphanumeric(llvm::StringRef &Result);
+  /// @brief Consume all characters until C is found. C is not consumed.
+  ///
+  /// @param[in] C Delimiter character.
+  /// @param[out] Result Consumed string.
+  ///
+  /// @return true if C was found, false otherwise.
+  bool ConsumeUntil(char C, llvm::StringRef &Result);
+  /// @brief Consume all whitespace characters.
+  ///
+  /// @return true if any whitespace was consumed, false otherwise.
+  bool ConsumeWhitespace();
+
+private:
+  /// @brief Text to lex.
+  llvm::StringRef Text;
+  /// @brief Current lexing position into the text.
+  unsigned Pos;
+};
+
+/// @brief Converts between mangled and non-mangled function names.
+class NameMangler final {
+public:
+  /// @brief Create a new name mangler.
+  ///
+  /// @param[in] context LLVM context to use.
+  NameMangler(llvm::LLVMContext *context);
+
+  /// @brief Determine the mangled name of a function.
+  ///
+  /// @param[in] Name Non-mangled name of the function.
+  /// @param[in] Tys List of types, one for each function argument.
+  /// @param[in] Quals Qualifiers, one for each type in Tys.
+  ///
+  /// @return The mangled name of the function.
+  std::string mangleName(llvm::StringRef Name, llvm::ArrayRef Tys,
+                         llvm::ArrayRef Quals);
+
+  /// @brief Try to mangle the given qualified type.
+  ///
+  /// @param[in] O Output stream to write the mangled name to.
+  /// @param[in] Type Type to mangle.
+  /// @param[in] Quals Type qualifiers.
+  ///
+  /// @return true if the type name could be mangled.
+  bool mangleType(llvm::raw_ostream &O, llvm::Type *Type, TypeQualifiers Quals);
+
+  /// @brief Try to mangle the given qualified type, taking substitutions into
+  /// account.
+  ///
+  /// @param[in] O Output stream to write the mangled name to.
+  /// @param[in] Type Type to mangle.
+  /// @param[in] Quals Type qualifiers.
+  /// @param[in] PrevTys Previously mangled types.
+  /// @param[in] PrevQuals Qualifiers for previously mangled types.
+  ///
+  /// @return true if the type name could be mangled.
+  bool mangleType(llvm::raw_ostream &O, llvm::Type *Type, TypeQualifiers Quals,
+                  llvm::ArrayRef PrevTys,
+                  llvm::ArrayRef PrevQuals);
+
+  /// @brief Remove the mangling of a function name, retrieving argument types
+  /// and qualifiers in the process.
+  ///
+  /// @param[in] Name Mangled function name to demangle.
+  /// @param[out] Types Vector that will receive LLVM types for the arguments.
+  /// @param[out] Quals Vector that will receive type qualifiers for the
+  /// arguments.
+  ///
+  /// @return Demangled name or an empty string on failure
+  llvm::StringRef demangleName(llvm::StringRef Name,
+                               llvm::SmallVectorImpl &Types,
+                               llvm::SmallVectorImpl &Quals);
+
+  /// @brief Remove the mangling of a function name, retrieving argument types
+  /// and qualifiers in the process.
+  ///
+  /// @param[in] Name Mangled function name to demangle.
+  /// @param[out] Types Vector that will receive LLVM types for the arguments.
+  /// @param[out] PointerElementTypes Vector that will receive LLVM types for
+  /// the *first level* of pointer element types.
+  /// @param[out] Quals Vector that will receive type qualifiers for the
+  /// arguments.
+  ///
+  /// For example:
+  /// _Z3fooPii
+  /// Types[0] = PointerType
+  /// PointerElementTypes[0] = i32
+  /// Quals[0] = (PointerQual, SignedIntQual)
+  ///
+  /// Types[1] = i32
+  /// PointerElementTypes[1] = nullptr
+  /// Quals[1] = (SignedIntQual)
+  ///
+  /// @return Demangled name or an empty string on failure
+  llvm::StringRef
+  demangleName(llvm::StringRef Name, llvm::SmallVectorImpl &Types,
+               llvm::SmallVectorImpl &PointerElementTypes,
+               llvm::SmallVectorImpl &Quals);
+
+  /// @brief Remove the mangling of a function name.
+  ///
+  /// @param[in] Name Mangled function name to demangle.
+  ///
+  /// @return Demangled name or original name if not mangled.
+  llvm::StringRef demangleName(llvm::StringRef Name);
+
+private:
+  /// @brief Try to mangle the given qualified type. This only works for simple
+  /// types that do not require string manipulation.
+  ///
+  /// @param[in] Ty Type to mangle.
+  /// @param[in] Qual Type qualifier.
+  ///
+  /// @return Mangled name of the type or nullptr.
+  const char *mangleSimpleType(llvm::Type *Ty, TypeQualifier Qual);
+  /// @brief Try to mangle the given builtin type name. This only works for
+  /// 'spirv' target extension types (LLVM 17+).
+  ///
+  /// @param[in] Ty type to mangle.
+  ///
+  /// @return string if builtin type could be mangled otherwise empty string.
+  std::optional mangleBuiltinType(llvm::Type *Ty);
+  /// @brief Try to demangle the given type name. This only works for simple
+  /// types that do not require string manipulation.
+  ///
+  /// @param[in,out] L Lexer for the mangled type name.
+  /// @param[out] Ty Demangled type.
+  /// @param[out] Qual Demangled type qualifier.
+  ///
+  /// @return true if the type name could be demangled.
+  bool demangleSimpleType(Lexer &L, llvm::Type *&Ty, TypeQualifier &Qual);
+  /// @brief Try to demangle the given type name. This only works for OpenCL
+  /// builtin types.
+  ///
+  /// @param[in,out] L Lexer for the mangled type name.
+  /// @param[out] Ty Demangled type.
+  ///
+  /// @return true if the type name could be demangled.
+  bool demangleOpenCLBuiltinType(Lexer &L, llvm::Type *&Ty);
+  /// @brief Try to demangle the given type.
+  ///
+  /// @param[in] L Lexer currently pointing at a type.
+  /// @param[out] Ty Demangled type.
+  /// @param[out] PointerEltTy If null, unchanged. Else, set to the demangled
+  /// pointer element type, if Ty is a non-opaque pointer type. Else set to
+  /// nullptr.
+  /// @param[out] Quals Demangled type qualifiers.
+  /// @param[in] CtxTypes Previously demangled types, used for substitutions.
+  /// @param[in] CtxQuals Previously demangled qualifiers.
+  ///
+  /// @return true if the type could be demangled, false otherwise.
+  bool demangleType(Lexer &L, llvm::Type *&Ty, llvm::Type **PointerEltTy,
+                    TypeQualifiers &Quals,
+                    llvm::SmallVectorImpl &CtxTypes,
+                    llvm::SmallVectorImpl &CtxQuals);
+
+  /// @brief Demangle a name.
+  ///
+  /// @param[in] L Lexer currently pointing at a mangled name.
+  ///
+  /// @return Demangled name or an empty string.
+  llvm::StringRef demangleName(Lexer &L);
+  /// @brief Determine the type 'index' the substitution refers to.
+  ///
+  /// @param[in] SubID Substitution ID.
+  /// @param[in] Tys List of types.
+  /// @param[in] Quals Qualifiers for the types.
+  ///
+  /// @return Resolved type index or negative value.
+  int resolveSubstitution(unsigned SubID,
+                          llvm::SmallVectorImpl &Tys,
+                          llvm::SmallVectorImpl &Quals);
+  /// @brief Try to emit a substitution for the given type instead of mangling
+  /// it.
+  ///
+  /// @param[in,out] O Stream to write the substitution to.
+  /// @param[in] Ty Type to mangle
+  /// @param[in] Quals Qualifiers for the type.
+  /// @param[in] PrevTys Types that have previously been mangled.
+  /// @param[in] PrevQuals Qualifiers for the previously mangled types.
+  ///
+  /// @return true if a substitution was emitted, false otherwise.
+  bool emitSubstitution(llvm::raw_ostream &O, llvm::Type *Ty,
+                        TypeQualifiers Quals,
+                        llvm::ArrayRef PrevTys,
+                        llvm::ArrayRef PrevQuals);
+  /// @brief Determine whether the type is a builtin type or not. Builtin types
+  /// are not considered for substitutions.
+  ///
+  /// @param[in] Ty Type to analyze.
+  /// @param[in] Quals Type qualifiers.
+  ///
+  /// @return true if the type is a builtin type, or false.
+  bool isTypeBuiltin(llvm::Type *Ty, TypeQualifiers &Quals);
+
+  /// @brief LLVM context used to access LLVM types.
+  llvm::LLVMContext *Context;
+};
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_MANGLING_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
new file mode 100644
index 0000000000000..eda860477aaee
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
@@ -0,0 +1,296 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef COMPILER_UTILS_METADATA_H_INCLUDED
+#define COMPILER_UTILS_METADATA_H_INCLUDED
+
+#include 
+#include 
+#include 
+
+#include 
+
+namespace llvm {
+class Function;
+class Module;
+} // namespace llvm
+
+namespace compiler {
+namespace utils {
+
+/// @brief OpenCL C standard to target.
+enum OpenCLCVer {
+  /// @brief OpenCL C 1.0
+  OpenCLC10 = (1 * 100 + 0) * 1000,
+  /// @brief OpenCL C 1.1
+  OpenCLC11 = (1 * 100 + 1) * 1000,
+  /// @brief OpenCL C 1.2
+  OpenCLC12 = (1 * 100 + 2) * 1000,
+  /// @brief OpenCL C 2.0
+  OpenCLC20 = (2 * 100 + 0) * 1000,
+  /// @brief OpenCL C 3.0
+  OpenCLC30 = (3 * 100 + 0) * 1000,
+};
+
+/// @brief Returns the OpenCL version, encoded as (Major*100 + Minor)*1000.
+///
+/// If the Module does not contain any information, then OpenCLC12 is returned.
+uint32_t getOpenCLVersion(const llvm::Module &m);
+
+/// @brief Describes the state of vectorization on a function/loop.
+struct VectorizationInfo {
+  /// @brief The VectorizationFactor. A scalar value if unvectorized.
+  llvm::ElementCount vf;
+  /// @brief The dimension along which vectorization took place.
+  unsigned simdDimIdx;
+  /// @brief Whether or not the function/loop was vector-predicated.
+  bool IsVectorPredicated;
+};
+
+/// @brief Encodes metadata indicating vectorization failure to a kernel, along
+/// with the vectorization factor and dimension that failed.
+///
+/// @param[in] f Function in which to encode the link.
+/// @param[in] info Vectorization info serving as the key.
+void encodeVectorizationFailedMetadata(llvm::Function &f,
+                                       const VectorizationInfo &info);
+
+/// @brief Encodes the vectorization metadata linking the original kernel to a
+/// vectorized one, using the vectorization factor and dimension as the key.
+///
+/// @param[in] origF Original function in which to encode the link.
+/// @param[in] vectorizedF Vectorized function to link.
+/// @param[in] info Vectorization factor serving as the key.
+void linkOrigToVeczFnMetadata(llvm::Function &origF,
+                              llvm::Function &vectorizedF,
+                              const VectorizationInfo &info);
+
+/// @brief Encodes the vectorization metadata linking a vectorized kernel back
+/// to its original one, using the vectorization factor and dimension as the
+/// key.
+///
+/// @param[in] vectorizedF Vectorized function in which to encode the link.
+/// @param[in] origF Original function to link.
+/// @param[in] info Vectorization factor serving as the key.
+void linkVeczToOrigFnMetadata(llvm::Function &vectorizedF,
+                              llvm::Function &origF,
+                              const VectorizationInfo &info);
+
+using LinkMetadataResult = std::pair;
+
+/// @brief Decodes the metadata linking a kernel to its vectorized variant.
+///
+/// @param[in] f Function for which to decode the metadata.
+/// @param[out] factors unordered vector of recovered vectorization links.
+///
+/// @return true on success, false if there is no vectorization metadata for the
+/// function.
+bool parseOrigToVeczFnLinkMetadata(
+    llvm::Function &f, llvm::SmallVectorImpl &factors);
+
+/// @brief Decodes the metadata linking a vectorized kernel back to its
+/// original one.
+///
+/// @param[in] f Function for which to decode the metadata.
+///
+/// @return On success, a pair containing a pointer to the original kernel
+/// function and the vectorization factor used as the key. The original
+/// function may be null. On decoding failure, std::nullopt.
+std::optional
+parseVeczToOrigFnLinkMetadata(llvm::Function &f);
+
+/// @brief Drops "base" vectorization metadata from a function, if present.
+///
+/// @param[in] f Function to drop metadata from.
+void dropVeczOrigMetadata(llvm::Function &f);
+
+/// @brief Drops "derived" vectorization metadata from a function, if present.
+///
+/// @param[in] f Function to drop metadata from.
+void dropVeczDerivedMetadata(llvm::Function &f);
+
+/// @brief Encodes metadata indicating the various components that constitute a
+/// kernel function wrapped with the WorkItemLoopsPass.
+///
+/// @param[in] f Function in which to encode the metadata.
+/// @param[in] mainInfo VectorizationInfo used on the 'main' work-item
+/// iterations.
+/// @param[in] tailInfo VectorizationInfo used on the tail iterations, if
+/// applicable.
+///
+/// Note that a 'tail' is defined as the work done to execute work-items not
+/// covered by the 'main' body. Therefore an unvectorized kernel should expect
+/// a scalar 'main' vectorization factor and no 'tail' (rather than the other
+/// way round).
+///
+/// Some examples of *typical* usage:
+/// 1. An unvectorized kernel will encode a scalar VF for the main iterations
+/// and nothing for the tail ones.
+/// 2. A vectorized kernel will encode a vectorization factor for its main
+/// iterations.
If it handles the case in which the local work-group size does +/// not evenly divide the vectorization factor, it will encode how it manages +/// the tail iterations. This is *typically* with a series of scalar +/// iterations, encoded in tailVF. +/// 3. Vector-predicated kernels with no tails will encode the *maximum* VF used +/// for the main loop, with no tail iterations. +/// +/// This metadata is encoded as: +/// define void @foo() !codeplay_ca_wrapper !X +/// !X = { !Main, !Tail } +/// !Main = { i32 mKnownMin, i32 mIsScalable, i32 simdDimIdx, i32 mIsVP } +/// if tailVF is None: +/// !Tail = {} +/// else +/// !Tail = { i32 tKnownMin, i32 tIsScalable, i32 simdDimIdx, i32 tIsVP } +void encodeWrapperFnMetadata(llvm::Function &f, + const VectorizationInfo &mainInfo, + std::optional tailInfo); + +/// @brief Decodes the metadata describing a wrapped kernel's loop structure. +/// +/// @param[in] f Function for which to decode the metadata. +/// +/// @return On success, a pair containing the VectorizationInfo for the main +/// loop(s) and the (optional) VectorizationInfo info for the tail loop(s). On +/// decoding failure, std::nullopt. +std::optional>> +parseWrapperFnMetadata(llvm::Function &f); + +/// @brief Copies function metadata from one function to another. +/// +/// @param[in] fromF Function from which to copy the metadata. +/// @param[in] toF Function onto which to copy the metadata. +/// @param[in] includeDebug Whether or not to copy debug function metadata. +void copyFunctionMetadata(llvm::Function &fromF, llvm::Function &toF, + bool includeDebug = false); + +/// @brief Encodes information about a function's local work group size as +/// metadata. +/// +/// @param[in] f Function in which to encode the metadata. +/// @param[in] localSizes array of size information to encode. +void encodeLocalSizeMetadata(llvm::Function &f, + const std::array &localSizes); + +/// @brief Retrieves information about a function's local sizes via metadata. +/// +/// @param[in] f Function from which to decode the metadata +/// @returns The local size array if present, else `std::nullopt` +std::optional> +getLocalSizeMetadata(const llvm::Function &f); + +/// @brief Drops all !mux_scheduled_fn metadata from a function. +void dropSchedulingParameterMetadata(llvm::Function &f); + +/// @brief Retrieves the indices of scheduling parameters from the function. +llvm::SmallVector +getSchedulingParameterFunctionMetadata(const llvm::Function &f); + +/// @brief Sets scheduling-parameter metadata on the given function +void setSchedulingParameterFunctionMetadata(llvm::Function &f, + llvm::ArrayRef idxs); + +/// @brief Sets module-level metadata describing the set of scheduling +/// parameters. +void setSchedulingParameterModuleMetadata(llvm::Module &m, + llvm::ArrayRef names); + +/// @brief Retrieves module-level metadata describing the set of scheduling +/// parameters or nullptr. +llvm::NamedMDNode *getSchedulingParameterModuleMetadata(const llvm::Module &m); + +/// @brief If the given function parameter index is considered a scheduling +/// parameter, it returns the corresponding index into the target's list of +/// scheduling parameters. +/// +/// It uses !mux_scheduled_fn metadata for this check. +std::optional isSchedulingParameter(const llvm::Function &f, + unsigned idx); + +/// @brief Extracts the required work group size from a kernel's function +/// metadata. +/// +/// @param[in] f Kernel for extraction. +/// +/// @return The work group size or std::nullopt if there is no such metadata. 
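+///
+/// A minimal usage sketch (the kernel function F and the specialization step
+/// are hypothetical; the return value is the three-element work-group size
+/// described above):
+///
+///   if (auto WGS = parseRequiredWGSMetadata(F)) {
+///     const uint64_t Total = (*WGS)[0] * (*WGS)[1] * (*WGS)[2];
+///     // e.g. specialize or validate the kernel for the known group size.
+///   }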
+std::optional>
+parseRequiredWGSMetadata(const llvm::Function &f);
+
+/// @brief Extracts the required work group size from an opencl.kernels subnode,
+/// which is similar to the function metadata, but the size is stored under
+/// different indices than on a function.
+///
+/// @param[in] node Kernel's subnode for extraction.
+///
+/// @return The work group size or std::nullopt if there is no such metadata.
+std::optional>
+parseRequiredWGSMetadata(const llvm::MDNode &node);
+
+/// @brief Extracts the maximum work dimension from a kernel's function
+/// metadata
+///
+/// @param[in] f Kernel for extraction.
+///
+/// @return The maximum work dimension or std::nullopt if there is no such
+/// metadata.
+std::optional parseMaxWorkDimMetadata(const llvm::Function &f);
+
+/// @brief Describes a kernel function and the associated information parsed
+/// from its metadata.
+struct KernelInfo {
+  explicit KernelInfo(llvm::StringRef name) : Name(name) {}
+  /// @brief The function name
+  std::string Name;
+  /// @brief The required work-group size. Optional.
+  std::optional> ReqdWGSize;
+};
+
+/// @brief Helper function to populate a list of kernels and associated
+/// information from a module.
+///
+/// @param m Module to retrieve kernels from
+/// @param results List of kernel info parsed from metadata or taken from the
+/// module.
+void populateKernelList(llvm::Module &m,
+                        llvm::SmallVectorImpl &results);
+
+/// @brief Replaces instances of kernel fromF with toF in module-level
+/// !opencl.kernels metadata.
+/// @param fromF Function to replace with toF in metadata
+/// @param toF Function with which to replace references to fromF
+/// @param M Module in which to find the metadata
+void replaceKernelInOpenCLKernelsMetadata(llvm::Function &fromF,
+                                          llvm::Function &toF, llvm::Module &M);
+
+/// @brief Encodes information about a function's required sub-group size as
+/// metadata.
+///
+/// @param[in] f Function in which to encode the metadata.
+/// @param[in] size sub-group size information to encode.
+void encodeReqdSubgroupSizeMetadata(llvm::Function &f, uint32_t size);
+
+/// @brief Retrieves information about a function's required sub-group size via
+/// metadata.
+///
+/// @param[in] f Function from which to decode the metadata
+/// @returns The required sub-group size if present, else `std::nullopt`
+std::optional getReqdSubgroupSize(const llvm::Function &f);
+
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_METADATA_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h
new file mode 100644
index 0000000000000..678b753b98a7e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h
@@ -0,0 +1,115 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Optimal builtin replacement pass.
+
+#ifndef COMPILER_UTILS_OPTIMAL_BUILTIN_REPLACEMENT_PASS_H_INCLUDED
+#define COMPILER_UTILS_OPTIMAL_BUILTIN_REPLACEMENT_PASS_H_INCLUDED
+
+#include
+#include
+#include
+#include
+
+namespace compiler {
+namespace utils {
+
+/// @brief A Callgraph optimization pass which replaces calls to builtin
+/// functions with more optimal versions, either via inlined code, or calls to
+/// suitable llvm intrinsics which will later be lowered to optimal machine
+/// code. When run with a non-null BuiltinInfo analysis, the builtin info is
+/// queried to determine the properties of each call in the graph.
+///
+/// A set of replacement functions with identical signatures is kept by this
+/// pass. These are invoked in order one after another on each call instruction
+/// in the call graph. If any replacement returns a non-null `Value*` it is
+/// used to replace the call and no further replacements are attempted on that
+/// call. It is assumed that no replacement introduces new calls to the graph.
+/// The set of replacements can be modified by users by setting
+/// `adjustReplacements`.
+///
+/// The default set of replacement functions, in order, is:
+/// * replaceAbacusCLZ
+/// * replaceAbacusMulhi
+/// * replaceAbacusFMinFMax
+/// * Invoking emitBuiltinInline from BuiltinInfo analysis
+class OptimalBuiltinReplacementPass
+    : public llvm::PassInfoMixin {
+public:
+  using ReplacementFnTy = std::function &,
+      const llvm::SmallVectorImpl &)>;
+
+  /// @brief Constructor. Sets up default builtin replacements.
+  OptimalBuiltinReplacementPass();
+
+  llvm::PreservedAnalyses run(llvm::LazyCallGraph::SCC &C,
+                              llvm::CGSCCAnalysisManager &AM,
+                              llvm::LazyCallGraph &CG,
+                              llvm::CGSCCUpdateResult &UR);
+
+  /// @brief A callback invoked per-SCC before any replacements are performed,
+  /// allowing customization of the replacements to be performed. The default
+  /// set of replacements are passed in and may be modified in any way.
+  std::function &)> adjustReplacements;
+
+  /// @brief Replaces calls __abacus_clz(ty) with @llvm.ctlz(ty, i1 false)
+  /// indicating that zero does not produce a poison result.
+  /// Note: This replacement is not performed on 64-bit scalar or vectors of
+  /// 64-bit scalar types.
+  static llvm::Value *replaceAbacusCLZ(
+      llvm::CallBase &CB, llvm::StringRef,
+      const llvm::SmallVectorImpl &,
+      const llvm::SmallVectorImpl &);
+
+  /// @brief Replaces __abacus_mul_hi(ty lhs, ty rhs) with a sequence:
+  /// %lhs.ext = ext ty %lhs to x2bw(ty)
+  /// %rhs.ext = ext ty %rhs to x2bw(ty)
+  /// %mul.ext = mul x2bw(ty) %lhs.ext, %rhs.ext
+  /// %lo.part = ashr x2bw(ty) %mul.ext, bw(ty)
+  /// %res = trunc x2bw(ty) %lo.part to ty
+  /// Where x2bw(ty) returns a type with twice the (element) bit-width, and
+  /// bw(ty) returns the bit-width of a (element) type as an integer.
+  /// This pattern is better matched by LLVM and target backends often produce
+  /// "mul_hi" instructions as a result.
+  static llvm::Value *replaceAbacusMulhi(
+      llvm::CallBase &, llvm::StringRef,
+      const llvm::SmallVectorImpl &,
+      const llvm::SmallVectorImpl &);
+
+  /// @brief Replaces __abacus_(fmin|fmax)(ty1 lhs, ty2 rhs) with
+  /// @llvm.(minnum|maxnum)(ty1 lhs, ty1 rhs), where ty2 may be a scalar type
+  /// which is splatted to a vector of ty1, where appropriate.
+ /// Note: This replacement is not performed on ARM or AArch64 targets, due to + /// LLVM backend bugs (https://llvm.org/PR27363). + static llvm::Value *replaceAbacusFMinFMax( + llvm::CallBase &, llvm::StringRef, + const llvm::SmallVectorImpl &, + const llvm::SmallVectorImpl &); + +private: + std::vector replacements; + + llvm::Value *replaceBuiltinWithInlineIR(llvm::CallBase &CB) const; +}; + +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_OPTIMAL_BUILTIN_REPLACEMENT_PASS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h new file mode 100644 index 0000000000000..b60847eb53f1f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h @@ -0,0 +1,319 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// LLVM pass utility functions. + +#ifndef COMPILER_UTILS_PASS_FUNCTIONS_H_INCLUDED +#define COMPILER_UTILS_PASS_FUNCTIONS_H_INCLUDED + +#include +#include +#include +#include +#include + +#include +#include + +namespace llvm { +class Argument; +class BasicBlock; +class Constant; +class ConstantExpr; +class Function; +class IntegerType; +class LLVMContext; +class Module; +class ModulePass; +class Type; +class Value; +class IRBuilderBase; +} // namespace llvm + +namespace compiler { +namespace utils { + +/// @addtogroup utils +/// @{ + +/// @brief Calculate (approximately) the amount of private memory used by a +/// kernel. +/// +/// @param fn The kernel function +/// +/// @return uint64_t The private memory used by the kernel function in bytes. 
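+///
+/// A hedged usage sketch (Fn and the budget check are illustrative only; the
+/// result is an approximation, best used for heuristics rather than hard
+/// resource limits):
+///
+///   const uint64_t Bytes = computeApproximatePrivateMemoryUsage(Fn);
+///   if (Bytes > Budget) {
+///     // e.g. fall back to a smaller work-group size.
+///   }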
+uint64_t computeApproximatePrivateMemoryUsage(const llvm::Function &fn);
+
+/// @brief Forces a constant expression or constant vector back to a normal
+/// instruction
+///
+/// @param[in] constant to be replaced
+void replaceConstantExpressionWithInstruction(llvm::Constant *const constant);
+
+/// @brief remap operands of a constant expression
+///
+/// @note This will create a new constant expression and replace references to
+/// the original constant with the new one
+///
+/// @param[in] expr Constant expression to be remapped
+/// @param[in] from Constant which if found in expression will be
+/// replaced
+/// @param[in] to Constant which will replace any operands which are `from`
+void remapConstantExpr(llvm::ConstantExpr *expr, llvm::Constant *from,
+                       llvm::Constant *to);
+
+/// @brief remap operands of a constant array
+///
+/// @note This will create a new constant array and replace references to
+/// the original constant with the new one
+///
+/// @param[in] arr Constant array to be remapped
+/// @param[in] from Constant which if found in array will be
+/// replaced
+/// @param[in] to Constant which will replace any operands which are `from`
+void remapConstantArray(llvm::ConstantArray *arr, llvm::Constant *from,
+                        llvm::Constant *to);
+
+/// @brief Discover if input function references debug info metadata nodes
+///
+/// @param[in] func Function to check
+/// @param[in,out] vmap Value map updated with identity mappings of any debug
+/// info metadata found
+///
+/// @return bool True if function contains debug info, false otherwise
+bool funcContainsDebugMetadata(const llvm::Function &func,
+                               llvm::ValueToValueMapTy &vmap);
+
+/// @brief Return a copy of a function's function, return, and parameter
+/// attributes.
+///
+/// Only parameter attributes from indices 0 to numParams are copied. If
+/// numParams is negative, all parameter attributes are copied.
+llvm::AttributeList getCopiedFunctionAttrs(const llvm::Function &oldFn,
+                                           int numParams = -1);
+
+/// @brief Copy a function's attributes to a new function.
+///
+/// @param[in] oldFn Function to copy function attributes from.
+/// @param[in] newFn Function to copy function attributes to.
+/// @param[in] numParams number of parameters to copy attributes from, starting
+/// from the first parameter. If set to a negative number, will copy all
+/// parameter attributes.
+void copyFunctionAttrs(const llvm::Function &oldFn, llvm::Function &newFn,
+                       int numParams = -1);
+
+using ParamTypeAttrsPair = std::pair;
+
+using UpdateMDCallbackFn =
+    std::function;
+
+/// @brief Clone functions in a module and add an argument to them
+///
+/// @param module LLVM module containing the functions
+/// @param paramTypeFunc Additional parameter to be added, defined as a function
+/// returning the type and set of attributes.
+/// This function takes a module, primarily to access DataLayout
+/// @param toBeClonedFunc function which dictates whether each function is
+/// cloned
+/// @param updateMetaDataCallback if set, is invoked with the old function, new
+/// function and new argument index.
+///
+/// @return bool if the module has changed (currently always true)
+///
+/// This iterates through all the functions in a module but only clones and adds
+/// the extra param for those that meet the following criteria, after setting
+/// `ClonedNoBody` and `ClonedWithBody` from the `toBeClonedFunc` callback:
+///
+/// 1. Is not a function declaration or `ClonedNoBody` is set, _or_ is a
+///    function declaration and `ClonedWithBody` is set
+/// 2. Not already processed
+bool cloneFunctionsAddArg(
+    llvm::Module &module,
+    std::function paramTypeFunc,
+    std::function
+        toBeClonedFunc,
+    const UpdateMDCallbackFn &updateMetaDataCallback = nullptr);
+
+/// @brief Updates call instructions after a function clone to point to
+/// `newFunc` instead of `oldFunc`; old call instructions are deleted.
+///
+/// @param[in] oldFunc Function which has been cloned
+/// @param[in] newFunc Cloned function to point callsites to
+/// @param[in] extraArg Whether the cloned callee has an extra argument added
+void remapClonedCallsites(llvm::Function &oldFunc, llvm::Function &newFunc,
+                          bool extraArg);
+
+using CreateLoopBodyFn = std::function,
+                                       llvm::MutableArrayRef)>;
+
+struct CreateLoopOpts {
+  /// @brief indexInc Value by which to increment the loop counter. If nullptr,
+  /// then it is created as the constant 1, based on type of `indexStart`,
+  /// which is a parameter to compiler::utils::createLoop proper.
+  llvm::Value *indexInc = nullptr;
+  /// @brief disableVectorize Sets loop metadata disabling further
+  /// vectorization.
+  bool disableVectorize = false;
+  /// @brief headerName Optional name for the loop header block. Defaults to:
+  /// "loopIR".
+  llvm::StringRef headerName = "loopIR";
+  /// @brief An optional list of incoming IV values.
+  ///
+  /// Each of these is used as the incoming value to a PHI created by
+  /// createLoop. These PHIs are provided to the 'body' function of createLoop,
+  /// which should in turn set the 'next' version of the IV.
+  std::vector IVs;
+  /// @brief An optional list of IV names, to be set on the PHIs provided by
+  /// 'IVs' field/parameter.
+  ///
+  /// If set, the names are assumed to correlate 1:1 with those IVs. The list
+  /// may be shorter than the list of IVs, in which case the trailing IVs are
+  /// not named.
+  std::vector loopIVNames;
+};
+
+/// @brief Create a loop around a body, creating an implicit induction variable
+/// (IV) between specified start and end values, and incremented by a
+/// user-specified amount. The loop thus has a trip count equal to the
+/// following C-style loop: `for (auto i = start; i < end; i += incr)`.
+///
+/// Note that this helper always creates a CFG loop, even if the loop bounds
+/// are known not to produce a loop at compile time. Users can use stock LLVM
+/// optimizations to eliminate/simplify the loop in such a case.
+///
+/// @param entry Loop pre-header block. This block will be rewired to jump into
+/// the new loop.
+/// @param exit Loop exit block. The new loop will jump to this once it exits.
+/// @param indexStart The start index
+/// @param indexEnd The end index (we compare for <)
+/// @param opts Set of options configuring the generation of this loop.
+/// @param body Body of code to insert into loop.
+///
+/// The parameters of the `body` function are as follows: the loop body
+/// BasicBlock; the Value corresponding to the IV beginning at `indexStart` and
+/// incremented each iteration by `indexInc` while less than `indexEnd`; the
+/// list of IVs for this iteration of the loop (may or may not be PHIs,
+/// depending on the loop bounds); the list of IVs for the next iteration of
+/// the loop (the function is required to fill these in). Both these sets of
+/// IVs will be arrays of equal length to the original list of IVs, in the same
+/// order. The function returns the loop latch/exiting block: this block will
+/// be given the branch that decides between continuing the loop and exiting
+/// from it.
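+///
+/// A minimal sketch (all names hypothetical) building the equivalent of
+/// `for (i = 0; i < n; ++i) acc += i;` with one extra IV for the accumulator:
+///
+///   CreateLoopOpts Opts;
+///   Opts.IVs = {Builder.getInt64(0)}; // initial accumulator value
+///   Opts.loopIVNames = {"acc"};
+///   createLoop(Entry, Exit, Builder.getInt64(0), N, Opts,
+///              [&](llvm::BasicBlock *BB, llvm::Value *I,
+///                  llvm::ArrayRef<llvm::Value *> IVs,
+///                  llvm::MutableArrayRef<llvm::Value *> NextIVs) {
+///                llvm::IRBuilder<> IRB(BB);
+///                NextIVs[0] = IRB.CreateAdd(IVs[0], I); // acc += i
+///                return BB; // single-block body doubles as the latch
+///              });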
+///
+/// @return llvm::BasicBlock* The exit block
+llvm::BasicBlock *createLoop(llvm::BasicBlock *entry, llvm::BasicBlock *exit,
+                             llvm::Value *indexStart, llvm::Value *indexEnd,
+                             const CreateLoopOpts &opts, CreateLoopBodyFn body);
+
+/// @brief Get the last argument of a function.
+///
+/// @param f An LLVM function to get an argument from.
+///
+/// @return An LLVM argument.
+llvm::Argument *getLastArgument(llvm::Function *f);
+
+/// @brief get the device-side size of size_t type in bytes.
+unsigned getSizeTypeBytes(const llvm::Module &m);
+
+/// @brief get a size_t type.
+/// @return a LLVM IntegerType representing size_t.
+llvm::IntegerType *getSizeType(const llvm::Module &m);
+
+/// @brief Creates a wrapper function (without body), intended for calling @p F
+/// @param M Containing module
+/// @param F Kernel function which is being replaced
+/// @param ArgTypes List of types to be used for the new function
+/// @param Suffix String to append to the new function's name
+/// @param OldSuffix String to append to the old function's name
+/// @note This takes the metadata and debug from the original function.
+/// This is intended to be used for creating a function which replaces
+/// the original function but calls the original.
+///
+/// @note The name of the wrapper function is computed as the original name of
+/// F followed by the Suffix. The original name of F is taken from F's
+/// 'mux-base-fn-name' attribute, if set, else it is F's name:
+///
+/// declare void @foo()
+/// ; Function attrs "mux-base-fn-name"="baz"
+/// declare void @bar()
+///
+/// With suffix '.wrapper', this function will produce:
+///
+/// declare void @foo.wrapper()
+/// declare void @baz.wrapper()
+///
+/// With suffix '.new' and old suffix '.old', this function will produce:
+///
+/// declare void @foo.old()
+/// ; Function attrs "mux-base-fn-name"="baz"
+/// declare void @bar.old()
+///
+/// declare void @foo.new()
+/// declare void @baz.new()
+///
+/// It is advised that the suffix begins with a character that may not
+/// occur in the original source language, to avoid clashes with user
+/// functions.
+llvm::Function *createKernelWrapperFunction(
+    llvm::Module &M, llvm::Function &F, llvm::ArrayRef ArgTypes,
+    llvm::StringRef Suffix, llvm::StringRef OldSuffix = "");
+
+/// @brief As above, but creating a wrapper with the exact function signature
+/// of @p F.
+///
+/// Copies over all parameter names and attributes.
+llvm::Function *createKernelWrapperFunction(llvm::Function &F,
+                                            llvm::StringRef Suffix,
+                                            llvm::StringRef OldSuffix = "");
+
+/// @brief Creates a call to a wrapped function
+///
+/// Sets the calling convention and call-site attributes to match the wrapped
+/// function.
+///
+/// @param WrappedF the function to call
+/// @param Args the list of arguments to pass to the call
+/// @param BB the basic block into which to insert the call. May be null, in
+/// which case the call is not inserted anywhere.
+/// @param InsertPt the point in BB at which to insert the call
+/// @param Name the name of the call instruction. May be empty.
+/// @return The call instruction
+llvm::CallInst *createCallToWrappedFunction(
+    llvm::Function &WrappedF, const llvm::SmallVectorImpl &Args,
+    llvm::BasicBlock *BB, llvm::BasicBlock::iterator InsertPt,
+    llvm::StringRef Name = "");
+
+/// @brief Create a binary operation corresponding to the given
+/// `llvm::RecurKind` with the two provided arguments. It may not
+/// necessarily return one of LLVM's in-built `BinaryOperator`s, or even one
+/// operation: integer min/max operations may defer to multiple instructions or
+/// intrinsics depending on the LLVM version.
+///
+/// @param[in] B the IRBuilder to build new instructions
+/// @param[in] LHS the left-hand value for the operation
+/// @param[in] RHS the right-hand value for the operation
+/// @param[in] Kind the kind of operation to create
+/// @return The binary operation.
+llvm::Value *createBinOpForRecurKind(llvm::IRBuilderBase &B, llvm::Value *LHS,
+                                     llvm::Value *RHS, llvm::RecurKind Kind);
+/// @}
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_PASS_FUNCTIONS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
new file mode 100644
index 0000000000000..671cc9baf7051
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
@@ -0,0 +1,148 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Hold global state and objects used for managing a pass pipeline.
+
+#ifndef COMPILER_UTILS_PASS_MACHINERY_H_INCLUDED
+#define COMPILER_UTILS_PASS_MACHINERY_H_INCLUDED
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace llvm {
+class TargetMachine;
+}
+
+namespace compiler {
+namespace utils {
+extern bool VerifyEachIsEnabled;
+
+/// @brief Mirrors LLVM's DebugLogging options in its `opt` tool. Clang has
+/// a boolean on/off version.
+enum class DebugLogging { None, Normal, Verbose, Quiet };
+
+extern DebugLogging DebugPasses;
+
+/// @brief A class that manages the lifetime and initialization of all
+/// components required to set up a new-style LLVM pass manager.
+class PassMachinery {
+public:
+  PassMachinery(llvm::LLVMContext &Ctx, llvm::TargetMachine *TM,
+                bool VerifyEach = false,
+                DebugLogging debugLogLevel = DebugLogging::None);
+
+  virtual ~PassMachinery();
+
+  /// @brief Initializes the PassBuilder and calls registerPasses.
+  void initializeStart(
+      llvm::PipelineTuningOptions PTO = llvm::PipelineTuningOptions());
+
+  /// @brief Cross-registers analysis managers, adds callbacks and
+  /// instrumentation support. Calls addClassToPassNames and
+  /// registerPassCallbacks.
+  void initializeFinish();
+
+  /// @brief Calls buildDefaultAAPipeline and registerLLVMAnalyses.
+  virtual void registerPasses();
+
+  /// @brief Helper method to register the standard LLVM AA pipeline.
+  ///
+  /// Registers:
+  /// * llvm::PassBuilder::buildDefaultAAPipeline
+  void buildDefaultAAPipeline();
+
+  /// @brief Helper method to register the standard LLVM analyses.
+  ///
+  /// Calls:
+  /// * llvm::PassBuilder::registerModuleAnalyses
+  /// * llvm::PassBuilder::registerCGSCCAnalyses
+  /// * llvm::PassBuilder::registerFunctionAnalyses
+  /// * llvm::PassBuilder::registerLoopAnalyses
+  void registerLLVMAnalyses();
+
+  /// @brief Method to allow customization of class-to-pass-names for
+  /// instrumentation purposes. By default, none are set up by
+  /// PassMachinery::initialize.
+  virtual void addClassToPassNames() {}
+
+  /// @brief Method to allow customization of pass callbacks via
+  /// llvm::PassBuilder. By default, no callbacks are set up by
+  /// PassMachinery::initialize.
+  virtual void registerPassCallbacks() {}
+
+  /// @brief print pass names in style of opt --print-passes
+  /// @note This should print parameters too
+  virtual void printPassNames(llvm::raw_ostream &) {}
+
+  llvm::ModuleAnalysisManager &getMAM() { return MAM; }
+  const llvm::ModuleAnalysisManager &getMAM() const { return MAM; }
+
+  llvm::FunctionAnalysisManager &getFAM() { return FAM; }
+  const llvm::FunctionAnalysisManager &getFAM() const { return FAM; }
+
+  llvm::PassBuilder &getPB() { return PB; }
+  const llvm::PassBuilder &getPB() const { return PB; }
+
+  llvm::TargetMachine *getTM() { return TM; }
+  const llvm::TargetMachine *getTM() const { return TM; }
+
+protected:
+  /// @brief TargetMachine to be used for passes. May be nullptr.
+  llvm::TargetMachine *TM;
+  // Note: the order here is important! They must be destructed in this order.
+  /// @brief Holds state for Loop analyses.
+  llvm::LoopAnalysisManager LAM;
+  /// @brief Holds state for Function analyses.
+  llvm::FunctionAnalysisManager FAM;
+  /// @brief Holds state for CGSCC analyses.
+  llvm::CGSCCAnalysisManager CGAM;
+  /// @brief Holds state for Module analyses.
+  llvm::ModuleAnalysisManager MAM;
+  /// @brief Manages the state for any instrumentation callbacks.
+  std::unique_ptr SI;
+  /// @brief Provides an interface to register callbacks.
+  llvm::PassInstrumentationCallbacks PIC;
+  /// @brief Helper to build and parse pass pipelines.
+  llvm::PassBuilder PB;
+};
+
+/// Helper functions for pass printing.
+
+/// @brief Helper function for printing a pass name, to be used by
+/// printPassNames.
+/// @param PassName Name of pass from a debug/parsing perspective.
+/// @param OS stream to write to.
+/// @note This is a direct copy from PassBuilder.cpp.
+void printPassName(llvm::StringRef PassName, llvm::raw_ostream &OS);
+
+/// @brief Helper function for printing a pass name with parameters, to be
+/// used by printPassNames.
+/// @param PassName Name of pass from a debug/parsing perspective.
+/// @param Params Textual representation of the parameters.
+/// @param OS stream to write to.
+/// @note This is a direct copy from PassBuilder.cpp.
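+///
+/// For example (the pass name, parameter string, and output format shown here
+/// are illustrative only):
+///
+///   printPassName("work-item-loops", "debug", OS);
+///   // prints: work-item-loops<debug>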
+void printPassName(llvm::StringRef PassName, llvm::StringRef Params,
+                   llvm::raw_ostream &OS);
+
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_PASS_MACHINERY_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h
new file mode 100644
index 0000000000000..4bdcb2da83969
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h
@@ -0,0 +1,45 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Prepare barriers pass.
+
+#ifndef COMPILER_UTILS_PREPARE_BARRIERS_PASS_H_INCLUDED
+#define COMPILER_UTILS_PREPARE_BARRIERS_PASS_H_INCLUDED
+
+#include
+
+namespace compiler {
+namespace utils {
+
+/// @brief Pass for ensuring consistent barrier handling.
+///
+/// It inlines functions that contain barriers and gives each barrier call a
+/// unique ID as metadata to ensure consistent handling of barriers in
+/// different versions of the kernel (i.e. Scalar vs Vector). It must be run
+/// before Vecz for mixed wrapper kernels made up of multiple kernels to work.
+///
+/// Runs over all kernels with "kernel entry point" metadata.
+class PrepareBarriersPass final
+    : public llvm::PassInfoMixin {
+public:
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+};
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_PREPARE_BARRIERS_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h
new file mode 100644
index 0000000000000..bde53d712aab7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h
@@ -0,0 +1,44 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Replace local module-scope variables pass.
+
+#ifndef COMPILER_UTILS_REPLACE_LOCAL_MODULE_SCOPE_VARIABLES_PASS_H_INCLUDED
+#define COMPILER_UTILS_REPLACE_LOCAL_MODULE_SCOPE_VARIABLES_PASS_H_INCLUDED
+
+#include
+
+namespace compiler {
+namespace utils {
+
+/// @brief __local address space automatic variables are represented in the
+/// LLVM module as global variables with address space 3. This pass identifies
+/// these variables and places them into a struct allocated (via alloca) in a
+/// newly created wrapper function. A pointer to the struct is then passed
+/// via a parameter to the original kernel.
+///
+/// Runs over all kernels with "kernel" metadata.
+class ReplaceLocalModuleScopeVariablesPass final
+    : public llvm::PassInfoMixin {
+public:
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+};
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_REPLACE_LOCAL_MODULE_SCOPE_VARIABLES_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h
new file mode 100644
index 0000000000000..08c923b5e56f8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h
@@ -0,0 +1,143 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Various utilities to help with work-item and work-group scheduling.
+
+#ifndef COMPILER_UTILS_SCHEDULING_H_INCLUDED
+#define COMPILER_UTILS_SCHEDULING_H_INCLUDED
+
+#include
+#include
+
+namespace llvm {
+class Function;
+class Module;
+class StructType;
+class Argument;
+} // namespace llvm
+
+namespace compiler {
+namespace utils {
+
+namespace WorkItemInfoStructField {
+enum Type : uint32_t {
+  local_id,
+  sub_group_id,
+  num_sub_groups,
+  max_sub_group_size,
+  total
+};
+}
+
+namespace WorkGroupInfoStructField {
+enum Type : uint32_t {
+  group_id = 0,
+  num_groups,
+  global_offset,
+  local_size,
+  work_dim,
+  total
+};
+}
+
+/// @brief Computes the work item info structure type for the given module.
+llvm::StructType *getWorkItemInfoStructTy(llvm::Module &M);
+
+/// @brief Computes the work group info structure type for the given module.
+llvm::StructType *getWorkGroupInfoStructTy(llvm::Module &M);
+
+/// @brief Populates an empty function with code to look up and return a value
+/// from a pointer-to-struct argument.
+///
+/// The function may optionally have a 'rank', in which case the struct field
+/// index is expected to be a 3D array of values. Ranked functions must have an
+/// integer index as their first parameter. Any integer type is supported. The
+/// generated code for ranked functions is given a bounds check to ensure the
+/// index is less than 3. If the index is out of bounds, the default value is
+/// returned.
+///
+/// The pointer-to-struct may be any parameter other than the index, which
+/// comes first.
+///
+/// if !hasRankArg:
+/// ; where structFieldIdx identifies the field.
+/// %struct = type { ..., i64, ... }
+/// declare i64 @foo(ptr %struct-ptr)
+///
+/// if hasRankArg:
+/// ; where structFieldIdx identifies the field and the %idx parameter
+/// ; identifies the sub-field.
+/// %struct = type { ..., [i64, i64, i64], ... }
+/// declare i64 @foo(i32 %idx, ptr %struct-ptr)
+///
+/// @param[in,out] F The function to define
+/// @param[in] structPtrArg The pointer-to-struct argument
+/// @param[in] structTy The underlying type of the pointer-to-struct argument,
+/// used for offset calculations
+/// @param[in] structFieldIdx The struct type's field index to load from
+/// @param[in] hasRankArg True if the struct type's field index is a 3D array,
+/// and thus the function's first parameter is an index parameter.
+/// @param[in] defaultValue The default value returned if the index is out of
+/// bounds. Only valid for ranked functions.
+void populateStructGetterFunction(llvm::Function &F,
+                                  llvm::Argument &structPtrArg,
+                                  llvm::StructType *const structTy,
+                                  uint32_t structFieldIdx, bool hasRankArg,
+                                  size_t defaultValue = 0);
+
+/// @brief Populates an empty function with code to store a value into a
+/// pointer-to-struct argument.
+///
+/// The function may optionally have a 'rank', in which case the struct field
+/// index is expected to be a 3D array of values. Ranked functions must have an
+/// integer index as their first parameter. Any integer type is supported.
+///
+/// The value to store is the next parameter (either first or second) and the
+/// pointer-to-struct may be any other unoccupied parameter.
+///
+/// if !hasRankArg:
+/// ; where structFieldIdx identifies the field.
+/// %struct = type { ..., i64, ... }
+/// declare void @foo(i64 %val, ptr %struct-ptr)
+///
+/// if hasRankArg:
+/// ; where structFieldIdx identifies the field and the %idx parameter
+/// ; identifies the sub-field.
+/// %struct = type { ..., [i64, i64, i64], ... }
+/// declare void @foo(i32 %idx, i64 %val, ptr %struct-ptr)
+///
+/// Note that unlike populateStructGetterFunction, no bounds check is
+/// performed. The setter functions are only available internally to the
+/// compiler, and thus the indices are assumed to be within bounds.
+///
+/// @param[in,out] F The function to define
+/// @param[in] structPtrArg The pointer-to-struct argument
+/// @param[in] structTy The underlying type of the pointer-to-struct argument,
+/// used for offset calculations
+/// @param[in] structFieldIdx The struct type's field index to store to
+/// @param[in] hasRankArg True if the struct type's field index is a 3D array,
+/// and thus the function's first parameter is an index parameter.
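+///
+/// A hedged C++ sketch (assumes F is an empty ranked local_id setter whose
+/// struct pointer is its final parameter; all names are illustrative):
+///
+///   auto *InfoTy = getWorkItemInfoStructTy(M);
+///   populateStructSetterFunction(F, *getLastArgument(&F), InfoTy,
+///                                WorkItemInfoStructField::local_id,
+///                                /*hasRankArg=*/true);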
+void populateStructSetterFunction(llvm::Function &F,
+                                  llvm::Argument &structPtrArg,
+                                  llvm::StructType *const structTy,
+                                  uint32_t structFieldIdx, bool hasRankArg);
+
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_SCHEDULING_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
new file mode 100644
index 0000000000000..af615f3a6f4bf
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
@@ -0,0 +1,115 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef COMPILER_UTILS_SUB_GROUP_ANALYSIS_H_INCLUDED
+#define COMPILER_UTILS_SUB_GROUP_ANALYSIS_H_INCLUDED
+
+#include
+#include
+#include
+
+#include
+#include
+
+namespace compiler {
+namespace utils {
+
+/// @brief Provides module-level information about the sub-group usage of each
+/// function contained within.
+///
+/// The results for each function are cached in a map. Declarations are not
+/// processed. Thus an external function declaration that uses sub-group
+/// builtins will be missed.
+///
+/// Internal mux sub-group 'setter' functions are not counted. This is because
+/// they are only used internally by the oneAPI Construction Kit as scaffolding
+/// for the sub-group support that the user can observe.
+///
+/// Each function contains the set of mux sub-group builtins it (transitively)
+/// calls.
+class GlobalSubgroupInfo {
+  struct SubgroupInfo {
+    std::set UsedSubgroupBuiltins;
+  };
+
+  using FunctionMapTy =
+      std::map>;
+
+  FunctionMapTy FunctionMap;
+
+  compiler::utils::BuiltinInfo &BI;
+
+public:
+  GlobalSubgroupInfo(llvm::Module &M, BuiltinInfo &);
+
+  compiler::utils::BuiltinInfo &getBuiltinInfo() { return BI; }
+
+  using iterator = FunctionMapTy::iterator;
+  using const_iterator = FunctionMapTy::const_iterator;
+
+  /// @brief Returns the SubgroupInfo for the provided function.
+  ///
+  /// The function must already exist in the map.
+  const SubgroupInfo *operator[](const llvm::Function *F) const {
+    const const_iterator I = FunctionMap.find(F);
+    assert(I != FunctionMap.end() && "Function not in sub-group info!");
+    return I->second.get();
+  }
+
+  bool usesSubgroups(const llvm::Function &F) const;
+
+  /// @brief Returns true if the provided function is a mux sub-group
+  /// collective builtin or sub-group barrier.
+  std::optional
+  isMuxSubgroupBuiltin(const llvm::Function *F) const;
+};
+
+/// @brief Computes and returns the GlobalSubgroupInfo for a Module.
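+///
+/// A usage sketch (assumes a configured llvm::ModuleAnalysisManager MAM):
+///
+///   auto &SGI = MAM.getResult<SubgroupAnalysis>(M);
+///   if (SGI.usesSubgroups(F)) {
+///     // F (transitively) calls mux sub-group builtins.
+///   }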
+class SubgroupAnalysis : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + using Result = GlobalSubgroupInfo; + + explicit SubgroupAnalysis() {} + + /// @brief Retrieve the GlobalSubgroupInfo for the module. + Result run(llvm::Module &M, llvm::ModuleAnalysisManager &); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Sub-group analysis"; } + +private: + /// @brief Unique pass identifier. + static llvm::AnalysisKey Key; +}; + +/// @brief Helper pass to print out the contents of the SubgroupAnalysis +/// analysis. +class SubgroupAnalysisPrinterPass + : public llvm::PassInfoMixin { + llvm::raw_ostream &OS; + +public: + explicit SubgroupAnalysisPrinterPass(llvm::raw_ostream &OS) : OS(OS) {} + + llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM); +}; + +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_SUB_GROUP_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h new file mode 100644 index 0000000000000..c8c97f7848a2e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h @@ -0,0 +1,144 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef COMPILER_UTILS_TARGET_EXTENSION_TYPES_H_INCLUDED +#define COMPILER_UTILS_TARGET_EXTENSION_TYPES_H_INCLUDED + +namespace llvm { +class Type; +class LLVMContext; +} // namespace llvm + +namespace compiler { +namespace utils { +namespace tgtext { + +/// @brief The indices of the *integer* parameters of a "spirv.Image" type. +enum ImageTyIntParamIdx { + ImageTyDimensionalityIdx = 0, + ImageTyDepthIdx, + ImageTyArrayedIdx, + ImageTyMSIdx, + ImageTySampledIdx, + ImageTyFormatIdx, + ImageTyAccessQualIdx, +}; + +/// @brief Values the 'dimensionality' parameter of a "spirv.Image" type may +/// hold. +/// +/// Note that not all of these are supported by the compiler. +enum ImageTyDimensionalityParam { + ImageDim1D = 0, + ImageDim2D, + ImageDim3D, + ImageDimCube, + ImageDimRect, + ImageDimBuffer, + ImageDimSubpassData, +}; + +/// @brief Values the 'depth' parameter of a "spirv.Image" type may hold. +enum ImageTyDepthParam { + ImageDepthNone = 0, // Not a depth image + ImageDepth, // A depth image + ImageDepthUnknown, // No indication as to whether this is a depth or + // non-depth image +}; + +/// @brief Values the 'arrayed' parameter of a "spirv.Image" type may hold. +enum ImageTyArrayedParam { + ImageNonArrayed = 0, + ImageArrayed, +}; + +/// @brief Values the 'MS' parameter of a "spirv.Image" type may hold. 
+enum ImageTyMSParam {
+  ImageMSSingleSampled = 0,
+  ImageMSMultiSampled,
+};
+
+/// @brief Values the 'Sampled' parameter of a "spirv.Image" type may hold.
+enum ImageTySampledParam {
+  ImageSampledRuntime = 0,     // only known at run time
+  ImageSampledCompat,          // compatible with sampling operations
+  ImageSampledReadWriteCompat, // compatible with read/write operations (a
+                               // storage or subpass data image)
+};
+
+enum ImageTyAccessQualParam {
+  ImageAccessQualReadOnly = 0,
+  ImageAccessQualWriteOnly,
+  ImageAccessQualReadWrite,
+};
+
+/// @brief Returns the TargetExtType representing an 'event' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *getEventTy(llvm::LLVMContext &Ctx);
+
+/// @brief Returns the TargetExtType representing a 'sampler' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *getSamplerTy(llvm::LLVMContext &Ctx);
+
+/// @brief Returns the TargetExtType representing an 'image1d_t' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *
+getImage1DTy(llvm::LLVMContext &Ctx,
+             ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+/// @brief Returns the TargetExtType representing an 'image1d_array_t' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *
+getImage1DArrayTy(llvm::LLVMContext &Ctx,
+                  ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+/// @brief Returns the TargetExtType representing an 'image1d_buffer_t' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *
+getImage1DBufferTy(llvm::LLVMContext &Ctx,
+                   ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+/// @brief Returns the TargetExtType representing an 'image2d_t' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *
+getImage2DTy(llvm::LLVMContext &Ctx, bool Depth = false, bool MS = false,
+             ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+/// @brief Returns the TargetExtType representing an 'image2d_array_t' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *
+getImage2DArrayTy(llvm::LLVMContext &Ctx, bool Depth = false, bool MS = false,
+                  ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+/// @brief Returns the TargetExtType representing an 'image3d_t' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *
+getImage3DTy(llvm::LLVMContext &Ctx,
+             ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+} // namespace tgtext
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_TARGET_EXTENSION_TYPES_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h
new file mode 100644
index 0000000000000..88dd7a6fb0c50
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h
@@ -0,0 +1,55 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Make opaque structure types unique.
+
+#ifndef COMPILER_UTILS_UNIQUE_OPAQUE_STRUCTS_PASS_H_INCLUDED
+#define COMPILER_UTILS_UNIQUE_OPAQUE_STRUCTS_PASS_H_INCLUDED
+
+#include
+
+namespace compiler {
+namespace utils {
+
+/// @addtogroup utils
+/// @{
+
+/// @brief This pass replaces instances of suffixed opaque structure types
+/// with unsuffixed versions if an unsuffixed version exists in the context.
+///
+/// When linking together two modules that declare the same opaque struct
+/// type, or deserializing a module referencing an opaque struct type in a
+/// context that already contains an opaque type with the same name, LLVM
+/// will attempt to resolve the clash by appending a suffix to the name in
+/// the module. For example, deserializing a module referencing the
+/// opencl.event_t type in a context that already has this type will result
+/// in the references all being renamed to opencl.event_t.0. This is
+/// problematic if passes rely on the names of structs to identify them.
+/// This pass can be used to resolve this issue by searching for
+/// problematic types and replacing them with their unsuffixed version.
+class UniqueOpaqueStructsPass
+    : public llvm::PassInfoMixin {
+public:
+  UniqueOpaqueStructsPass() = default;
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+};
+
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_UNIQUE_OPAQUE_STRUCTS_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
new file mode 100644
index 0000000000000..21c7b62dff496
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
@@ -0,0 +1,116 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Work-item loops pass, splitting into "barrier regions"
+
+#ifndef COMPILER_UTILS_WORK_ITEM_LOOPS_PASS_H_INCLUDED
+#define COMPILER_UTILS_WORK_ITEM_LOOPS_PASS_H_INCLUDED
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace llvm {
+class DominatorTree;
+}
+
+namespace compiler {
+namespace utils {
+
+class BuiltinInfo;
+class BarrierWithLiveVars;
+
+struct WorkItemLoopsPassOptions {
+  /// @brief Set to true if the pass should add extra alloca
+  /// instructions to preserve the values of variables between barriers.
+  bool IsDebug = false;
+  /// @brief Set to true if the pass should forcibly omit scalar
+  /// tail loops from wrapped vector kernels, even if the local work-group size
+  /// is not known to be a multiple of the vectorization factor.
+  bool ForceNoTail = false;
+};
+
+/// @brief The "work-item loops" pass.
+///
+/// This pass adds loops around implicitly SIMT kernels such that the original
+/// kernel is wrapped in a new function that runs over each work-item in the
+/// work-group and calls the original kernel: the scheduling model thus becomes
+/// explicit.
+///
+/// The work-item loops pass assumes that:
+///
+/// * Any functions containing barrier-like functions have already been inlined
+/// into the kernel entry points
+/// * the IDs of pairs of barrier-like functions align between 'main' and
+/// 'tail' kernels.
+///
+/// Both of these can be achieved by first running the PrepareBarriersPass.
+///
+/// The pass will query a kernel function for the `reqd_work_group_size`
+/// metadata and optimize accordingly in the presence of it.
+///
+/// Runs over all kernels with "kernel entry point" metadata. Work-item orders
+/// are sourced from the "work item order" function metadata on each kernel.
+class WorkItemLoopsPass final : public llvm::PassInfoMixin {
+public:
+  /// @brief Constructor.
+  WorkItemLoopsPass(const WorkItemLoopsPassOptions &Options)
+      : IsDebug(Options.IsDebug), ForceNoTail(Options.ForceNoTail) {}
+
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+
+private:
+  /// @brief Make the work-item-loop wrapper function.
+  /// This creates a wrapper function that iterates over a work group, calling
+  /// the kernel for each work item, respecting the semantics of any barriers
+  /// present. The wrapped kernel may be a scalar kernel, a vectorized kernel,
+  /// or both. When the wrapped kernel wraps both a vector and scalar kernel,
+  /// all vectorized work items will be executed first, and the scalar tail
+  /// last.
+  ///
+  /// The wrapper function is created as a new function suffixed by
+  /// ".mux-barrier-wrapper". The original unwrapped kernel(s) will be left in
+  /// the Module, but given internal linkage so later passes can remove
+  /// them, if uncalled, once inlined into the wrapper function.
+  ///
+  /// When wrapping only a scalar kernel, or only a vector kernel, pass the
+  /// same Barrier object as both Barrier input parameters.
+  ///
+  /// @param[in] barrierMain the Barrier object of the main kernel function
+  /// @param[in] barrierTail the Barrier object of the tail kernel function
+  /// (may be nullptr).
+ /// @param[in] baseName the base name to use on the new wrapper function + /// @param[in] M the module the kernels live in + /// @param[in] BI BuiltinInfo providing builtin information + /// @return The new wrapper function + llvm::Function *makeWrapperFunction(BarrierWithLiveVars &barrierMain, + BarrierWithLiveVars *barrierTail, + llvm::StringRef baseName, llvm::Module &M, + BuiltinInfo &BI); + + const bool IsDebug; + const bool ForceNoTail; +}; +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_WORK_ITEM_LOOPS_PASS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h new file mode 100644 index 0000000000000..8da2fdcae20dd --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h @@ -0,0 +1,54 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef MULTI_LLVM_MULTI_INSTRUCTIONS_H_INCLUDED +#define MULTI_LLVM_MULTI_INSTRUCTIONS_H_INCLUDED + +#include +#include + +namespace multi_llvm { + +namespace detail { + +template +struct BinOpHelper; + +// TODO Make this entirely version-based once we no longer have to account for +// older LLVM 21 snapshots that use the LLVM 20 definition of +// llvm::AtomicRMWInst::BinOp. +#define LLVM 21 +#include +#undef LLVM +#define LLVM 20 +#include +#undef LLVM + +} // namespace detail + +static std::optional +consume_binop_with_underscore(llvm::StringRef &String) { + return multi_llvm::detail::BinOpHelper<>::consume_front_with_underscore( + String); +} + +static llvm::StringRef to_string(llvm::AtomicRMWInst::BinOp BinOp) { + return multi_llvm::detail::BinOpHelper<>::to_string(BinOp); +} + +} // namespace multi_llvm + +#endif // MULTI_LLVM_MULTI_INSTRUCTIONS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc new file mode 100644 index 0000000000000..787822d16859b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc @@ -0,0 +1,76 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc
new file mode 100644
index 0000000000000..787822d16859b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc
@@ -0,0 +1,76 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#if LLVM == 20
+template
+struct BinOpHelper>
+#define BINOP_LLVM21(OP, STR)
+#elif LLVM == 21
+template
+struct BinOpHelper
+#define BINOP_LLVM21(OP, STR) BINOP(OP, STR)
+#endif
+{
+#define BINOPS()                         \
+  BINOP(Xchg, "xchg")                    \
+  BINOP(Add, "add")                      \
+  BINOP(Sub, "sub")                      \
+  BINOP(And, "and")                      \
+  BINOP(Nand, "nand")                    \
+  BINOP(Or, "or")                        \
+  BINOP(Xor, "xor")                      \
+  BINOP(Max, "max")                      \
+  BINOP(Min, "min")                      \
+  BINOP(UMax, "umax")                    \
+  BINOP(UMin, "umin")                    \
+  BINOP(FAdd, "fadd")                    \
+  BINOP(FSub, "fsub")                    \
+  BINOP(FMax, "fmax")                    \
+  BINOP(FMin, "fmin")                    \
+  BINOP_LLVM21(FMaximum, "fmaximum")     \
+  BINOP_LLVM21(FMinimum, "fminimum")     \
+  BINOP(UIncWrap, "uincwrap")            \
+  BINOP(UDecWrap, "udecwrap")            \
+  BINOP(USubCond, "usubcond")            \
+  BINOP(USubSat, "usubsat")
+
+  static std::optional<T>
+  consume_front_with_underscore(llvm::StringRef &String) {
+#define BINOP(BINOP, STR)               \
+  if (String.consume_front(STR "_")) { \
+    return T::BINOP;                    \
+  }
+    BINOPS()
+#undef BINOP
+    return std::nullopt;
+  }
+
+  static llvm::StringRef to_string(T BinOp) {
+    switch (BinOp) {
+#define BINOP(BINOP, STR) \
+  case T::BINOP:          \
+    return STR;
+      BINOPS()
+#undef BINOP
+      case T::BAD_BINOP:
+        break;
+    }
+    llvm_unreachable("Unexpected BinOp");
+  }
+
+#undef BINOPS
+#undef BINOP_LLVM21
+};
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
new file mode 100644
index 0000000000000..cecbb7f02ddae
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
@@ -0,0 +1,49 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef MULTI_LLVM_MULTI_INTRINSIC_H_INCLUDED
+#define MULTI_LLVM_MULTI_INTRINSIC_H_INCLUDED
+
+#include
+#include
+
+namespace multi_llvm {
+
+// Drop getAttributes workaround when LLVM 21 is minimum version
+namespace detail {
+template <typename... T>
+auto getAttributes(T... args)
+    -> decltype(llvm::Intrinsic::getAttributes(args...)) {
+  return llvm::Intrinsic::getAttributes(args...);
+}
+template <typename... T>
+auto getAttributes(T... args, llvm::FunctionType *)
+    -> decltype(llvm::Intrinsic::getAttributes(args...)) {
+  return llvm::Intrinsic::getAttributes(args...);
+}
+} // namespace detail
+
+namespace Intrinsic {
+static inline auto getAttributes(llvm::LLVMContext &C, llvm::Intrinsic::ID ID,
+                                 llvm::FunctionType *FT) {
+  return detail::getAttributes<llvm::LLVMContext &, llvm::Intrinsic::ID>(C, ID,
+                                                                         FT);
+}
+} // namespace Intrinsic
+
+} // namespace multi_llvm
+
+#endif // MULTI_LLVM_MULTI_INTRINSIC_H_INCLUDED
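The wrapper above always accepts the newer three-argument form; on older LLVM the detail overload drops the trailing FunctionType. A minimal sketch of a call site (the intrinsic chosen here is an arbitrary example):

llvm::AttributeList getIntrinsicAttrs(llvm::LLVMContext &Ctx,
                                      llvm::FunctionType *FT) {
  return multi_llvm::Intrinsic::getAttributes(Ctx, llvm::Intrinsic::vscale,
                                              FT);
}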
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h
new file mode 100644
index 0000000000000..802471f4562cc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h
@@ -0,0 +1,38 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#ifndef MULTI_LLVM_LLVM_VERSION_H_INCLUDED
+#define MULTI_LLVM_LLVM_VERSION_H_INCLUDED
+
+#include <llvm/Config/llvm-config.h>
+
+#define LLVM_VERSION_EQUAL(MAJOR, MINOR) \
+  (LLVM_VERSION_MAJOR == (MAJOR) && LLVM_VERSION_MINOR == (MINOR))
+
+#define LLVM_VERSION_LESS(MAJOR, MINOR) \
+  ((LLVM_VERSION_MAJOR < (MAJOR)) ||    \
+   (LLVM_VERSION_MAJOR == (MAJOR) && LLVM_VERSION_MINOR < (MINOR)))
+
+#define LLVM_VERSION_LESS_EQUAL(MAJOR, MINOR) \
+  (LLVM_VERSION_EQUAL(MAJOR, MINOR) || LLVM_VERSION_LESS(MAJOR, MINOR))
+
+#define LLVM_VERSION_GREATER(MAJOR, MINOR) \
+  ((LLVM_VERSION_MAJOR > (MAJOR)) ||       \
+   (LLVM_VERSION_MAJOR == (MAJOR) && LLVM_VERSION_MINOR > (MINOR)))
+
+#define LLVM_VERSION_GREATER_EQUAL(MAJOR, MINOR) \
+  (LLVM_VERSION_EQUAL(MAJOR, MINOR) || LLVM_VERSION_GREATER(MAJOR, MINOR))
+
+#endif // MULTI_LLVM_LLVM_VERSION_H_INCLUDED
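Editor's sketch of how these macros are meant to be used to guard API differences; the branch bodies are placeholders:

#include <multi_llvm/llvm_version.h>

#if LLVM_VERSION_GREATER_EQUAL(21, 0)
// ... call the LLVM 21+ form of an interface ...
#else
// ... fall back to the older form ...
#endif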
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
new file mode 100644
index 0000000000000..ea350fd4bdec2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -0,0 +1,22 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef MULTI_LLVM_MULTI_LLVM_H_INCLUDED
+#define MULTI_LLVM_MULTI_LLVM_H_INCLUDED
+
+#include
+
+#endif // MULTI_LLVM_MULTI_LLVM_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h
new file mode 100644
index 0000000000000..6d8e608b860bd
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h
@@ -0,0 +1,74 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef MULTI_LLVM_TARGET_TRANSFORM_INFO_H_INCLUDED
+#define MULTI_LLVM_TARGET_TRANSFORM_INFO_H_INCLUDED
+
+#include
+#include
+
+namespace multi_llvm {
+
+namespace detail {
+
+template <typename TargetTransformInfo>
+auto isLegalMaskedLoadImpl(const TargetTransformInfo &TTI, llvm::Type *Ty,
+                           llvm::Align Alignment, unsigned)
+    -> decltype(TTI.isLegalMaskedLoad(Ty, Alignment)) {
+  return TTI.isLegalMaskedLoad(Ty, Alignment);
+}
+
+template <typename TargetTransformInfo>
+auto isLegalMaskedStoreImpl(const TargetTransformInfo &TTI, llvm::Type *Ty,
+                            llvm::Align Alignment, unsigned)
+    -> decltype(TTI.isLegalMaskedStore(Ty, Alignment)) {
+  return TTI.isLegalMaskedStore(Ty, Alignment);
+}
+
+#if LLVM_VERSION_GREATER_EQUAL(21, 0)
+// TODO: Make this depend only on LLVM version once we do not have to remain
+// compatible with slightly older LLVM 21 snapshots.
+
+template <typename TargetTransformInfo>
+auto isLegalMaskedLoadImpl(const TargetTransformInfo &TTI, llvm::Type *Ty,
+                           llvm::Align Alignment, unsigned AddrSpace)
+    -> decltype(TTI.isLegalMaskedLoad(Ty, Alignment, AddrSpace)) {
+  return TTI.isLegalMaskedLoad(Ty, Alignment, AddrSpace);
+}
+
+template <typename TargetTransformInfo>
+auto isLegalMaskedStoreImpl(const TargetTransformInfo &TTI, llvm::Type *Ty,
+                            llvm::Align Alignment, unsigned AddrSpace)
+    -> decltype(TTI.isLegalMaskedStore(Ty, Alignment, AddrSpace)) {
+  return TTI.isLegalMaskedStore(Ty, Alignment, AddrSpace);
+}
+#endif
+
+} // namespace detail
+
+inline bool isLegalMaskedLoad(const llvm::TargetTransformInfo &TTI,
+                              llvm::Type *Ty, llvm::Align Alignment,
+                              unsigned AddrSpace) {
+  return detail::isLegalMaskedLoadImpl(TTI, Ty, Alignment, AddrSpace);
+}
+
+inline bool isLegalMaskedStore(const llvm::TargetTransformInfo &TTI,
+                               llvm::Type *Ty, llvm::Align Alignment,
+                               unsigned AddrSpace) {
+  return detail::isLegalMaskedStoreImpl(TTI, Ty, Alignment, AddrSpace);
+}
+
+} // namespace multi_llvm
+
+#endif // MULTI_LLVM_TARGET_TRANSFORM_INFO_H_INCLUDED
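Editor's sketch of a call site: the address space is always passed, and the detail overloads above select whichever TTI signature the LLVM in use provides. The alignment value here is an arbitrary example:

bool canUseMaskedLoad(const llvm::TargetTransformInfo &TTI,
                      llvm::Type *VecTy, unsigned AddrSpace) {
  return multi_llvm::isLegalMaskedLoad(TTI, VecTy, llvm::Align(16),
                                       AddrSpace);
}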
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h
new file mode 100644
index 0000000000000..576b04f284d8e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h
@@ -0,0 +1,58 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef MULTI_LLVM_TARGET_TARGETINFO_H_INCLUDED
+#define MULTI_LLVM_TARGET_TARGETINFO_H_INCLUDED
+
+#include
+#include
+
+namespace multi_llvm {
+
+namespace detail {
+
+#if LLVM_VERSION_GREATER_EQUAL(21, 0)
+
+template <typename TargetInfo = clang::TargetInfo>
+auto createTargetInfo(clang::DiagnosticsEngine &Diags,
+                      clang::TargetOptions &Opts)
+    -> decltype(TargetInfo::CreateTargetInfo(Diags, Opts)) {
+  return TargetInfo::CreateTargetInfo(Diags, Opts);
+}
+
+#endif
+
+template <typename TargetInfo = clang::TargetInfo>
+auto createTargetInfo(clang::DiagnosticsEngine &Diags,
+                      clang::TargetOptions &Opts)
+    -> decltype(TargetInfo::CreateTargetInfo(
+        Diags, std::make_shared<clang::TargetOptions>(Opts))) {
+  return TargetInfo::CreateTargetInfo(
+      Diags, std::make_shared<clang::TargetOptions>(Opts));
+}
+
+} // namespace detail
+
+struct TargetInfo {
+  static clang::TargetInfo *CreateTargetInfo(clang::DiagnosticsEngine &Diags,
+                                             clang::TargetOptions &Opts) {
+    return multi_llvm::detail::createTargetInfo(Diags, Opts);
+  }
+};
+
+} // namespace multi_llvm
+
+#endif // MULTI_LLVM_TARGET_TARGETINFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h
new file mode 100644
index 0000000000000..d13b9d531b8a9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h
@@ -0,0 +1,69 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#ifndef MULTI_LLVM_VECTOR_TYPE_HELPER_H_INCLUDED
+#define MULTI_LLVM_VECTOR_TYPE_HELPER_H_INCLUDED
+
+#include
+#include
+#include
+
+namespace multi_llvm {
+
+// The functions defined below are common functions to allow us to generically
+// get VectorType information from a base Type class, due to either deprecation
+// or removal of these in LLVM 11 (result of scalable/fixed vectors separation)
+
+inline llvm::Type *getVectorElementType(llvm::Type *ty) {
+  assert(llvm::isa<llvm::VectorType>(ty) && "Not a vector type");
+  return llvm::cast<llvm::VectorType>(ty)->getElementType();
+}
+inline llvm::Type *getVectorElementType(const llvm::Type *ty) {
+  assert(llvm::isa<llvm::VectorType>(ty) && "Not a vector type");
+  return llvm::cast<llvm::VectorType>(ty)->getElementType();
+}
+
+inline uint64_t getVectorNumElements(llvm::Type *ty) {
+  assert(ty->getTypeID() == llvm::Type::FixedVectorTyID &&
+         "Not a fixed vector type");
+  return llvm::cast<llvm::FixedVectorType>(ty)
+      ->getElementCount()
+      .getFixedValue();
+}
+inline uint64_t getVectorNumElements(const llvm::Type *ty) {
+  assert(ty->getTypeID() == llvm::Type::FixedVectorTyID &&
+         "Not a fixed vector type");
+  return llvm::cast<llvm::FixedVectorType>(ty)
+      ->getElementCount()
+      .getFixedValue();
+}
+
+inline llvm::ElementCount getVectorElementCount(llvm::Type *ty) {
+  return llvm::cast<llvm::VectorType>(ty)->getElementCount();
+}
+inline llvm::ElementCount getVectorElementCount(const llvm::Type *ty) {
+  return llvm::cast<llvm::VectorType>(ty)->getElementCount();
+}
+
+inline unsigned getVectorKnownMinNumElements(llvm::Type *ty) {
+  return getVectorElementCount(ty).getKnownMinValue();
+}
+
+inline unsigned getVectorKnownMinNumElements(const llvm::Type *ty) {
+  return getVectorElementCount(ty).getKnownMinValue();
+}
+} // namespace multi_llvm
+
+#endif // MULTI_LLVM_VECTOR_TYPE_HELPER_H_INCLUDED
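Editor's illustration of the helpers above on fixed and scalable vectors (a sketch, not part of the patch):

#include <multi_llvm/vector_type_helper.h>

#include <llvm/IR/DerivedTypes.h>

void vectorQueries(llvm::LLVMContext &Ctx) {
  auto *FloatTy = llvm::Type::getFloatTy(Ctx);
  auto *Fixed = llvm::FixedVectorType::get(FloatTy, 4);        // <4 x float>
  auto *Scalable = llvm::ScalableVectorType::get(FloatTy, 4);  // <vscale x 4 x float>

  // Both have a known-minimum element count of 4 ...
  unsigned MinF = multi_llvm::getVectorKnownMinNumElements(Fixed);     // 4
  unsigned MinS = multi_llvm::getVectorKnownMinNumElements(Scalable);  // 4
  // ... but only the fixed type may be given to getVectorNumElements.
  uint64_t N = multi_llvm::getVectorNumElements(Fixed);  // 4
  (void)MinF;
  (void)MinS;
  (void)N;
}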
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
new file mode 100644
index 0000000000000..98d63c713e0d4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
@@ -0,0 +1,206 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace compiler {
+namespace utils {
+using namespace llvm;
+
+static constexpr const char *MuxKernelAttrName = "mux-kernel";
+
+void setIsKernel(Function &F) { F.addFnAttr(MuxKernelAttrName, ""); }
+
+void setIsKernelEntryPt(Function &F) {
+  F.addFnAttr(MuxKernelAttrName, "entry-point");
+}
+
+bool isKernel(const Function &F) {
+  return F.getFnAttribute(MuxKernelAttrName).isValid();
+}
+
+bool isKernelEntryPt(const Function &F) {
+  const Attribute Attr = F.getFnAttribute(MuxKernelAttrName);
+  if (Attr.isValid()) {
+    return Attr.getValueAsString() == "entry-point";
+  }
+  return false;
+}
+
+void dropIsKernel(Function &F) { F.removeFnAttr(MuxKernelAttrName); }
+
+void takeIsKernel(Function &ToF, Function &FromF) {
+  if (!isKernel(FromF)) {
+    return;
+  }
+  // Check whether we need to add entry-point data.
+  const bool IsEntryPt = isKernelEntryPt(FromF);
+  // Drop all data for simplicity
+  dropIsKernel(ToF);
+  dropIsKernel(FromF);
+  // Add the new data
+  IsEntryPt ? setIsKernelEntryPt(ToF) : setIsKernel(ToF);
+}
+
+static StringRef getFnNameFromAttr(const Function &F, StringRef AttrName) {
+  const Attribute Attr = F.getFnAttribute(AttrName);
+  if (Attr.isValid()) {
+    return Attr.getValueAsString();
+  }
+  return "";
+}
+
+static constexpr const char *OrigFnNameAttr = "mux-orig-fn";
+
+void setOrigFnName(Function &F) { F.addFnAttr(OrigFnNameAttr, F.getName()); }
+
+StringRef getOrigFnName(const Function &F) {
+  return getFnNameFromAttr(F, OrigFnNameAttr);
+}
+
+StringRef getOrigFnNameOrFnName(const Function &F) {
+  auto N = getFnNameFromAttr(F, OrigFnNameAttr);
+  return N.empty() ? F.getName() : N;
+}
+
+static constexpr const char *BaseFnNameAttr = "mux-base-fn-name";
+
+void setBaseFnName(Function &F, StringRef N) { F.addFnAttr(BaseFnNameAttr, N); }
+
+StringRef getBaseFnName(const Function &F) {
+  return getFnNameFromAttr(F, BaseFnNameAttr);
+}
+
+StringRef getBaseFnNameOrFnName(const Function &F) {
+  auto N = getFnNameFromAttr(F, BaseFnNameAttr);
+  return N.empty() ? F.getName() : N;
+}
+
+StringRef getOrSetBaseFnName(Function &F, const Function &SetFromF) {
+  const Attribute Attr = F.getFnAttribute(BaseFnNameAttr);
+  if (Attr.isValid()) {
+    return Attr.getValueAsString();
+  }
+
+  // Try and peer through the original function's name
+  StringRef BaseFnName = getBaseFnNameOrFnName(SetFromF);
+  setBaseFnName(F, BaseFnName);
+  return BaseFnName;
+}
+
+static std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
+  if (Attr.isValid()) {
+    int AttrValue = 0;
+    if (!Attr.getValueAsString().getAsInteger(10, AttrValue)) {
+      return AttrValue;
+    }
+  }
+  return std::nullopt;
+}
+
+static constexpr const char *LocalMemUsageAttrName = "mux-local-mem-usage";
+
+void setLocalMemoryUsage(Function &F, uint64_t LocalMemUsage) {
+  const Attribute Attr = Attribute::get(F.getContext(), LocalMemUsageAttrName,
+                                        itostr(LocalMemUsage));
+  F.addFnAttr(Attr);
+}
+
+std::optional<uint64_t> getLocalMemoryUsage(const Function &F) {
+  const Attribute Attr = F.getFnAttribute(LocalMemUsageAttrName);
+  auto Val = getStringFnAttrAsInt(Attr);
+  // Only return non-negative integers
+  return Val && Val >= 0 ? std::optional<uint64_t>(*Val) : std::nullopt;
+}
+
+static constexpr const char *DMAReqdSizeBytesAttrName = "mux-dma-reqd-size";
+
+void setDMAReqdSizeBytes(Function &F, uint32_t DMASizeBytes) {
+  const Attribute Attr = Attribute::get(
+      F.getContext(), DMAReqdSizeBytesAttrName, itostr(DMASizeBytes));
+  F.addFnAttr(Attr);
+}
+
+std::optional<uint32_t> getDMAReqdSizeBytes(const Function &F) {
+  const Attribute Attr = F.getFnAttribute(DMAReqdSizeBytesAttrName);
+  auto Val = getStringFnAttrAsInt(Attr);
+  // Only return non-negative integers
+  return Val && Val >= 0 ? std::optional<uint32_t>(*Val) : std::nullopt;
+}
+
+static constexpr const char *BarrierScheduleAttrName = "mux-barrier-schedule";
+
+void setBarrierSchedule(CallInst &CI, BarrierSchedule Sched) {
+  StringRef Val = "unknown";
+  switch (Sched) {
+    case BarrierSchedule::Unordered:
+      Val = "unordered";
+      break;
+    case BarrierSchedule::Once:
+      Val = "once";
+      break;
+    case BarrierSchedule::ScalarTail:
+      Val = "scalar-tail";
+      break;
+    case BarrierSchedule::Linear:
+      Val = "linear";
+      break;
+  }
+
+  const Attribute Attr =
+      Attribute::get(CI.getContext(), BarrierScheduleAttrName, Val);
+  CI.addFnAttr(Attr);
+}
+
+BarrierSchedule getBarrierSchedule(const CallInst &CI) {
+  const Attribute Attr = CI.getFnAttr(BarrierScheduleAttrName);
+  if (Attr.isValid()) {
+    return StringSwitch<BarrierSchedule>(Attr.getValueAsString())
+        .Case("once", BarrierSchedule::Once)
+        .Case("scalar-tail", BarrierSchedule::ScalarTail)
+        .Case("linear", BarrierSchedule::Linear)
+        .Default(BarrierSchedule::Unordered);
+  }
+  return BarrierSchedule::Unordered;
+}
+
+static constexpr const char *MuxNoSubgroupsAttrName = "mux-no-subgroups";
+
+void setHasNoExplicitSubgroups(Function &F) {
+  F.addFnAttr(MuxNoSubgroupsAttrName);
+}
+
+bool hasNoExplicitSubgroups(const Function &F) {
+  const Attribute Attr = F.getFnAttribute(MuxNoSubgroupsAttrName);
+  return Attr.isValid();
+}
+
+unsigned getMuxSubgroupSize(const llvm::Function &) {
+  // FIXME: The mux sub-group size is currently assumed to be 1 for all
+  // functions, kernels, and targets. This helper function is just to avoid
+  // hard-coding the constant 1 in places that will eventually need updating.
+  return 1;
+}
+} // namespace utils
+} // namespace compiler
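Editor's sketch of how the string-attribute helpers defined above compose in a pass; the function names are from this file, while the scenario is invented:

void tagKernel(llvm::Function &F, llvm::Function &Wrapper) {
  compiler::utils::setIsKernelEntryPt(F);  // "mux-kernel"="entry-point"
  compiler::utils::setLocalMemoryUsage(F, 1024);
  if (compiler::utils::isKernelEntryPt(F)) {
    // Transfer the kernel marking onto a wrapper, e.g. after outlining;
    // F keeps its code but loses the "mux-kernel" attribute.
    compiler::utils::takeIsKernel(Wrapper, F);
  }
  // getLocalMemoryUsage(F) still returns 1024; that attribute is untouched.
}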
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
new file mode 100644
index 0000000000000..df6cf77da1b8e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -0,0 +1,1467 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+using namespace llvm;
+
+#define NDEBUG_BARRIER
+#define DEBUG_TYPE "barrier-regions"
+
+namespace {
+using AlignIntTy = uint64_t;
+
+/// @brief Returns the work-group collective information if the instruction
+/// is a call to a work-group collective builtin, and std::nullopt otherwise.
+std::optional<compiler::utils::GroupCollective>
+getWorkGroupCollectiveCall(Instruction *inst,
+                           compiler::utils::BuiltinInfo &bi) {
+  auto *const ci = dyn_cast_or_null<CallInst>(inst);
+  if (!ci) {
+    return std::nullopt;
+  }
+
+  if (Function *callee = ci->getCalledFunction()) {
+    if (const auto b = bi.analyzeBuiltin(*callee)) {
+      const auto info = bi.isMuxGroupCollective(b->ID);
+      if (info && info->isWorkGroupScope()) {
+        return info;
+      }
+    }
+  }
+  return std::nullopt;
+}
+
+/// @brief Builds a stub function containing only a void return instruction.
+///
+/// @note This is useful for client debuggers that want to break on a
+/// particular barrier and work item. Customer specific passes can fill the
+/// contents since it may involve inline assembly for breakpoint traps. The
+/// stub function takes a single i32 argument which is an id identifying the
+/// barrier which invoked the stub. A client debugger should be able to read
+/// this argument using the arch calling convention even without debug info
+/// since it's always the first argument, although customer passes may
+/// rearrange parameters later.
+///
+/// @param[in] name What to name the stub function.
+/// @param[in] module Current module.
+/// @param[in] cc Calling convention for function.
+///
+/// @return Return function created.
+Function *MakeStubFunction(StringRef name, Module &module,
+                           CallingConv::ID cc) {
+  // If we've already created a stub, return the existing function
+  if (Function *existing = module.getFunction(name)) {
+    return existing;
+  }
+
+  auto &context = module.getContext();
+  // 32-bit integer parameter
+  IntegerType *int32_type = IntegerType::get(context, 32);
+  // Function returns void
+  FunctionType *func_type =
+      FunctionType::get(Type::getVoidTy(context), {int32_type}, false);
+
+  // Create function in module
+  Function *stub_func =
+      Function::Create(func_type, Function::ExternalLinkage, name, &module);
+
+  // Don't inline the function since we want the debugger to be able to hook it
+  stub_func->addFnAttr(Attribute::NoInline);
+
+  // We don't use exceptions
+  stub_func->addFnAttr(Attribute::NoUnwind);
+  stub_func->setCallingConv(cc);
+
+  // No stub or cloned function should have SPIR_KERNEL calling convention.
+  // Please consider using SPIR_FUNC instead of SPIR_KERNEL. In case the
+  // original code has a different calling convention, we should preserve that
+  // one.
+  assert(cc != CallingConv::SPIR_KERNEL && "calling convention mismatch");
+
+  // Single basic block containing only a return void instruction
+  IRBuilder<> IRBuilder(BasicBlock::Create(context, "entry", stub_func));
+  IRBuilder.CreateRetVoid();
+
+  // Build debug info for function if compiled with -g
+  DIBuilder DIB(module, /*AllowUnresolved*/ false);
+
+  // Find module compile unit
+  auto *cu = DIB.createCompileUnit(
+      dwarf::DW_LANG_OpenCL, DIB.createFile("debug", "/"), "", false, "", 0);
+
+  // Create DISubprogram metadata for function
+  auto type_array =
+      DIB.getOrCreateTypeArray({DIB.createUnspecifiedParameter()});
+  auto subprogram_type = DIB.createSubroutineType(type_array);
+  auto DISubprogram = DIB.createFunction(
+      cu->getFile(), name, name, cu->getFile(), 0, subprogram_type, 0,
+      DINode::FlagZero, DISubprogram::SPFlagDefinition);
+
+  // Set function compile unit
+  DISubprogram->replaceUnit(cu);
+
+  // Assign debug info to function
+  stub_func->setSubprogram(DISubprogram);
+
+  DIB.finalize();
+
+  return stub_func;
+}
+
+/// @brief Check whether this value is valid as def.
+///
+/// @param[in] v Value for checking.
+///
+/// @return True = valid for definition, False = not valid.
+inline bool CheckValidDef(Value *v) {
+  return !(isa(v) || isa(v));
+}
+
+/// @brief Check whether this value is valid as use.
+///
+/// @param[in] v Value for checking.
+///
+/// @return True = valid for use, False = not valid.
+inline bool CheckValidUse(Value *v) {
+  return !(isa(v) || isa(v) || isa(v));
+}
+
+bool IsRematerializableBuiltinCall(Value *v, compiler::utils::BuiltinInfo &bi) {
+  if (auto *call = dyn_cast<CallInst>(v)) {
+    if (auto *F = call->getCalledFunction()) {
+      if (const auto B = bi.analyzeBuiltin(*F)) {
+        if (B->properties & compiler::utils::eBuiltinPropertyRematerializable) {
+          for (auto &op : call->operands()) {
+            if (isa<Instruction>(op.get())) {
+              return false;
+            }
+          }
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+// It traces through instructions with a single Instruction operand, looking
+// for work item functions or function arguments.
+bool IsTrivialValue(Value *v, unsigned depth,
+                    compiler::utils::BuiltinInfo &bi) {
+  while (depth--) {
+    auto *const I = dyn_cast<Instruction>(v);
+    if (!I || IsRematerializableBuiltinCall(v, bi)) {
+      return true;
+    }
+
+    // Pass through a vector splat to the splatted value
+    if (auto *const shuffle = dyn_cast<ShuffleVectorInst>(I)) {
+      if (shuffle->isZeroEltSplat()) {
+        if (auto *const ins =
+                dyn_cast<InsertElementInst>(shuffle->getOperand(0))) {
+          if (auto *const src = dyn_cast<Instruction>(ins->getOperand(1))) {
+            v = src;
+            continue;
+          } else {
+            // Splat of a non-Instruction (i.e. an Argument)
+            return true;
+          }
+        }
+      }
+      return false;
+    }
+
+    // Consider only certain trivial operations
+    if (!I->isBinaryOp() && !I->isCast() && !I->isUnaryOp()) {
+      return false;
+    }
+
+    Value *chain = nullptr;
+    for (auto *op : I->operand_values()) {
+      if (auto *const opI = dyn_cast<Instruction>(op)) {
+        if (!chain) {
+          chain = opI;
+        } else if (chain != op) {
+          // It's non-trivial if it has more than one Instruction operand.
+          return false;
+        }
+      }
+    }
+
+    // It's trivial if it didn't have any operands that were instructions.
+    if (!chain) {
+      return true;
+    }
+
+    v = chain;
+  }
+  return false;
+}
+
+// GEPs typically have a low cost, allow up to 1 non-trivial operand
+// (including the pointer operand as well as the indices).
+bool IsTrivialGEP(Value *v, SmallVectorImpl<Value *> &operands) {
+  auto *const GEP = dyn_cast<GetElementPtrInst>(v);
+  if (!GEP) {
+    return false;
+  }
+
+  unsigned inst_ops = 0;
+  for (auto *op : GEP->operand_values()) {
+    if (isa<Instruction>(op) && ++inst_ops > 1) {
+      return false;
+    }
+  }
+
+  for (auto *op : GEP->operand_values()) {
+    if (isa<Instruction>(op)) {
+      operands.push_back(op);
+    }
+  }
+  return true;
+}
+
+/// @brief Update all basic block edges for PHINodes, and drop edges from
+/// basic blocks that are not in the new function (which only consists of
+/// the subset of blocks that make up one region).
+///
+/// @param[in] BB Basic block to process.
+/// @param[in] vmap Map for value for cloning.
+void UpdateAndTrimPHINodeEdges(BasicBlock *BB, ValueToValueMapTy &vmap) {
+  for (auto &phi : BB->phis()) {
+    for (unsigned i = 0; i < phi.getNumIncomingValues(); i++) {
+      const BasicBlock *incoming_bb = phi.getIncomingBlock(i);
+
+      // If the incoming basic block was processed during cloning then
+      // update the edge, if it wasn't then it is not in the region so
+      // remove it.
+      if (vmap.count(incoming_bb)) {
+        Value *updated_bb = vmap[incoming_bb];
+        phi.setIncomingBlock(i, cast<BasicBlock>(updated_bb));
+      } else {
+        // Note: Updating the loop iterator to reflect the updated
+        // post-deletion indices.
+        phi.removeIncomingValue(i--);
+      }
+    }
+  }
+}
+
+/// @brief Returns true if the type is a struct type containing any scalable
+/// vectors in its list of elements
+bool isStructWithScalables(Type *ty) {
+  if (auto *const struct_ty = dyn_cast<StructType>(ty)) {
+    return any_of(struct_ty->elements(),
+                  [](Type *ty) { return isa<ScalableVectorType>(ty); });
+  }
+  return false;
+}
+
+} // namespace
+
+Value *compiler::utils::Barrier::LiveValuesHelper::getExtractValueGEP(
+    const Value *live) {
+  if (auto *const extract = dyn_cast<ExtractValueInst>(live)) {
+    // We can't handle extracts with multiple indices
+    if (extract->getIndices().size() == 1) {
+      return getGEP(extract->getAggregateOperand(), extract->getIndices()[0]);
+    }
+  }
+  return nullptr;
+}
+
+Value *compiler::utils::Barrier::LiveValuesHelper::getGEP(const Value *live,
+                                                          unsigned member_idx) {
+  auto key = std::make_pair(live, member_idx);
+  if (auto gep_it = live_GEPs.find(key); gep_it != live_GEPs.end()) {
+    return gep_it->second;
+  }
+
+  Value *gep;
+
+  if (auto field_it = barrier.live_variable_index_map_.find(key);
+      field_it != barrier.live_variable_index_map_.end()) {
+    LLVMContext &context = barrier.module_.getContext();
+    const unsigned field_index = field_it->second;
+    Value *live_variable_info_idxs[2] = {
+        ConstantInt::get(Type::getInt32Ty(context), 0),
+        ConstantInt::get(Type::getInt32Ty(context), field_index)};
+
+    gep = gepBuilder.CreateInBoundsGEP(barrier.live_var_mem_ty_, barrier_struct,
+                                       live_variable_info_idxs,
+                                       Twine("live_gep_") + live->getName());
+  } else if (auto field_it = barrier.live_variable_scalables_map_.find(key);
+             field_it != barrier.live_variable_scalables_map_.end()) {
+    const unsigned field_offset = field_it->second;
+    Value *scaled_offset = nullptr;
+
+    LLVMContext &context = barrier.module_.getContext();
+    if (field_offset != 0) {
+      if (!vscale) {
+        Type *size_type = gepBuilder.getIntNTy(barrier.size_t_bytes * 8);
+        vscale = gepBuilder.CreateIntrinsic(Intrinsic::vscale, size_type, {});
+      }
+      scaled_offset = gepBuilder.CreateMul(
+          vscale, gepBuilder.getIntN(barrier.size_t_bytes * 8, field_offset));
+    } else {
+      scaled_offset = ConstantInt::get(Type::getInt32Ty(context), 0);
+    }
+
+    Value *live_variable_info_idxs[3] = {
+        ConstantInt::get(Type::getInt32Ty(context), 0),
+        ConstantInt::get(Type::getInt32Ty(context),
+                         barrier.live_var_mem_scalables_index),
+        scaled_offset,
+    };
+
+    // Gep into the raw byte buffer
+    gep = gepBuilder.CreateInBoundsGEP(
+        barrier.live_var_mem_ty_, barrier_struct, live_variable_info_idxs,
+        Twine("live_gep_scalable_") + live->getName());
+  } else {
+    // Fall back and see if this live variable is actually a decomposed
+    // structure type.
+    return getExtractValueGEP(live);
+  }
+
+  // Cache this GEP for later
+  live_GEPs[key] = gep;
+
+  return gep;
+}
+
+Value *compiler::utils::Barrier::LiveValuesHelper::getReload(Value *live,
+                                                             IRBuilderBase &ir,
+                                                             const char *name,
+                                                             bool reuse) {
+  auto &mapped = reloads[live];
+  if (reuse && mapped) {
+    return mapped;
+  }
+
+  if (Value *v = getGEP(live)) {
+    if (!isa<AllocaInst>(live)) {
+      // If the live variable is not an alloca, insert a load.
+      if (!isStructWithScalables(live->getType())) {
+        v = ir.CreateLoad(live->getType(), v, Twine(live->getName(), name));
+      } else {
+        auto *const struct_ty = cast<StructType>(live->getType());
+        // Start off with a poison value, and build the struct up member by
+        // member, reloading each member at a time from their respective
+        // offsets.
+        v = PoisonValue::get(struct_ty);
+        for (auto [idx, ty] : enumerate(struct_ty->elements())) {
+          auto *const elt_addr = getGEP(live, idx);
+          assert(elt_addr && "Could not get address of struct element");
+          auto *const reload =
+              ir.CreateLoad(ty, elt_addr, Twine(live->getName(), name));
+          v = ir.CreateInsertValue(v, reload, idx);
+        }
+      }
+    }
+    mapped = v;
+    return v;
+  }
+
+  if (auto *I = dyn_cast<Instruction>(live)) {
+    // Save these
+    auto insPoint = ir.GetInsertPoint();
+    auto *const insBB = ir.GetInsertBlock();
+
+    if (!reuse || !mapped) {
+      auto *clone = I->clone();
+      clone->setName(I->getName());
+      clone->setDebugLoc(DebugLoc());
+      ir.Insert(clone);
+      if (gepBuilder.GetInsertPoint() == ir.GetInsertPoint()) {
+        gepBuilder.SetInsertPoint(clone);
+      }
+      ir.SetInsertPoint(clone);
+      mapped = clone;
+      I = clone;
+    } else {
+      return mapped;
+    }
+
+    for (auto op_it = I->op_begin(); op_it != I->op_end();) {
+      auto &op = *op_it++;
+      if (auto *op_inst = dyn_cast<Instruction>(op.get())) {
+        ir.SetInsertPoint(I);
+        op.set(getReload(op_inst, ir, name, reuse));
+      }
+    }
+
+    // Restore the original insert point
+    ir.SetInsertPoint(insBB, insPoint);
+    return I;
+  }
+
+  return live;
+}
+
+void compiler::utils::Barrier::Run(llvm::ModuleAnalysisManager &mam) {
+  bi_ = &mam.getResult<BuiltinInfoAnalysis>(module_);
+  FindBarriers();
+
+  kernel_id_map_[kBarrier_EndID] = nullptr;
+
+  if (barriers_.empty()) {
+    // If there are no barriers, we can use the original function as the
+    // single barrier region.
+    auto &node = barrier_region_id_map_[kBarrier_FirstID];
+    node.entry = &func_.getEntryBlock();
+    node.id = kBarrier_FirstID;
+    node.successor_ids.push_back(kBarrier_EndID);
+    kernel_id_map_[kBarrier_FirstID] = &func_;
+    return;
+  }
+
+  // If we found some barriers, we need to split up our kernel across them!
+  {
+    ModulePassManager pm;
+    // It's convenient to create LCSSA PHI nodes to stop values defined
+    // within a loop being stored to the barrier unnecessarily on every
+    // iteration (if, for instance, the loop is entirely between two
+    // barriers, but the value is used outside of that barrier region).
+    pm.addPass(llvm::createModuleToFunctionPassAdaptor(LCSSAPass()));
+    pm.run(module_, mam);
+    mam.invalidate(module_, PreservedAnalyses::allInSet<CFGAnalyses>());
+  }
+
+  // Do the splitting first in case a value is used on both sides of a barrier
+  // within the same basic block.
+  SplitBlockwithBarrier();
+  FindLiveVariables();
+
+  // Tidy up the barrier struct, removing values that we can
+  // reload/rematerialize on the other side of the barrier.
+  // NB: We don't do this if any of the barriers is a work-group broadcast. In
+  // the case that a broadcasted value is non-uniform (i.e., it depends on
+  // work-item builtins), we must preserve it in the barrier struct! This is
+  // because we can't rematerialize the local ID and broadcast that; we need
+  // to broadcast the specific local ID for the broadcasted work-item.
+  // This is very crude. We could either:
+  // 1. Trace through all candidate values we want to remove and ensure they're
+  //    not being broadcasted.
+  // 2. Add some more advanced rematerialization logic to substitute
+  //    rematerializable work-item functions with values specific to a given
+  //    work-item. Note that the builtins we rematerialize are ultimately up to
+  //    the BuiltinInfo to identify, so we can't assume anything here and would
+  //    have to defer back to the BuiltinInfo to do this correctly.
+  if (llvm::none_of(barriers_, [this](llvm::CallInst *const CI) {
+        auto Info = getWorkGroupCollectiveCall(CI, *bi_);
+        return Info && Info->isBroadcast();
+      })) {
+    TidyLiveVariables();
+  }
+
+  MakeLiveVariableMemType();
+  SeperateKernelWithBarrier();
+}
+
+void compiler::utils::Barrier::replaceSubkernel(Function *from, Function *to) {
+  for (auto &k : kernel_id_map_) {
+    if (k.second == from) {
+      k.second = to;
+    }
+  }
+}
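As a mental model for the splitting machinery that follows (an editor's sketch in plain C++, not the pass output): each inter-barrier region becomes a function returning the ID of the barrier it reached, and the eventual work-item-loop wrapper runs the current region for every work item before advancing to the region named by that ID. All names below are hypothetical.

#include <functional>
#include <map>

// Hypothetical driver loop mirroring the i32 "next barrier ID" protocol
// given to the split kernels created in GenerateNewKernel below.
void runRegions(std::map<unsigned, std::function<unsigned(unsigned)>> &regions,
                unsigned workGroupSize, unsigned firstId, unsigned endId) {
  unsigned current = firstId;
  while (current != endId) {
    unsigned next = endId;
    for (unsigned wi = 0; wi != workGroupSize; ++wi) {
      // Barriers must be reached uniformly, so every work item returns the
      // same successor ID; values live across the boundary travel through a
      // per-work-item live-variables struct (see MakeLiveVariableMemType).
      next = regions[current](wi);
    }
    current = next;
  }
}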
+
+/// @brief Find Barriers.
+void compiler::utils::Barrier::FindBarriers() {
+  SmallVector<std::pair<uint64_t, CallInst *>, 8> orderedBarriers;
+
+  // Check whether current function has barrier or not.
+  for (BasicBlock &b : func_) {
+    for (Instruction &bi : b) {
+      // Check call instructions for barrier.
+      if (CallInst *call_inst = dyn_cast<CallInst>(&bi)) {
+        if (Function *callee = call_inst->getCalledFunction()) {
+          const auto B = bi_->analyzeBuiltin(*callee);
+          if (B && BuiltinInfo::isMuxBuiltinWithWGBarrierID(B->ID)) {
+            auto *const id_param = call_inst->getOperand(0);
+            auto *const id_param_c = cast<ConstantInt>(id_param);
+            const auto id = id_param_c->getZExtValue();
+            orderedBarriers.emplace_back(id, call_inst);
+          }
+        }
+      }
+    }
+  }
+
+  std::sort(orderedBarriers.begin(), orderedBarriers.end());
+  for (const auto &barrier : orderedBarriers) {
+    barriers_.push_back(barrier.second);
+  }
+}
+
+/// @brief Split block with barrier.
+void compiler::utils::Barrier::SplitBlockwithBarrier() {
+  // If debugging, create stub functions in the module which will be invoked
+  // before each barrier, and after each barrier, by every work item.
+  Function *entry_stub = nullptr;
+  Function *exit_stub = nullptr;
+  if (is_debug_) {
+    CallingConv::ID stub_cc;
+    if (func_.getCallingConv() == CallingConv::SPIR_KERNEL) {
+      stub_cc = CallingConv::SPIR_FUNC;
+    } else {
+      stub_cc = func_.getCallingConv();
+    }
+    entry_stub = MakeStubFunction("__barrier_entry", module_, stub_cc);
+    exit_stub = MakeStubFunction("__barrier_exit", module_, stub_cc);
+  }
+
+  auto &node = barrier_region_id_map_[kBarrier_FirstID];
+  node.entry = &func_.getEntryBlock();
+  node.id = kBarrier_FirstID;
+
+  for (CallInst *split_point : barriers_) {
+    // ID identifying which barrier invoked the stub, used as the call
+    // argument.
+    auto *id = cast<ConstantInt>(split_point->getOperand(0));
+    const auto barrier_id = kBarrier_StartNewID + id->getZExtValue();
+
+    if (is_debug_) {
+      assert(entry_stub != nullptr); // Guaranteed as is_debug_ is const.
+      assert(exit_stub != nullptr);  // Guaranteed as is_debug_ is const.
+
+      // Create call instructions invoking debug stubs for every barrier. We
+      // don't insert these into a basic block yet since we want to insert
+      // them at a point where live variables have already been loaded. This
+      // info won't be available till later.
+
+      // Call invoking entry stub
+      auto entry_caller = CallInst::Create(entry_stub, id);
+      entry_caller->setDebugLoc(split_point->getDebugLoc());
+      entry_caller->setCallingConv(entry_stub->getCallingConv());
+
+      // Call invoking exit stub
+      auto exit_caller = CallInst::Create(exit_stub, id);
+      exit_caller->setDebugLoc(split_point->getDebugLoc());
+      exit_caller->setCallingConv(exit_stub->getCallingConv());
+
+      // Store call instructions in map for later insertion
+      barrier_stub_call_map_[barrier_id] =
+          std::make_pair(entry_caller, exit_caller);
+    }
+
+    auto &node = barrier_region_id_map_[barrier_id];
+    node.barrier_inst = split_point;
+    node.id = barrier_id;
+    node.schedule = getBarrierSchedule(*split_point);
+
+    // Our scan implementation requires a linear work-item ordering, to loop
+    // over all of the 'main' and 'tail' work-items in order.
+    if (auto collective = getWorkGroupCollectiveCall(split_point, *bi_)) {
+      if (collective->isScan()) {
+        node.schedule = BarrierSchedule::Linear;
+      }
+    }
+
+    split_point->getParent()->splitBasicBlock(split_point, "barrier");
+  }
+
+  // We have to gather the basic block data after splitting, because we
+  // might not be processing barriers in program order, and things can get
+  // awfully confused.
+  for (auto &[i, node] : barrier_region_id_map_) {
+    if (node.barrier_inst) {
+      auto *const bb = node.barrier_inst->getParent();
+      barrier_id_map_[bb] = node.id;
+      barrier_successor_set_.insert(*predecessors(bb).begin());
+      node.entry = bb;
+    }
+  }
+}
+
+/// @brief Generate an empty kernel that only duplicates the source kernel's
+/// CFG
+///
+/// This is used to do a "dry run" of kernel splitting in order to obtain the
+/// dominator tree, which is needed for correct identification of values that
+/// cross the barrier.
+///
+/// @param[in] region the region to clone into the new kernel.
+/// @param[out] bbmap a mapping of original blocks onto the empty clones.
+/// @return the fake kernel
+Function *compiler::utils::Barrier::GenerateFakeKernel(
+    BarrierRegion &region, DenseMap<BasicBlock *, BasicBlock *> &bbmap) {
+  LLVMContext &context = module_.getContext();
+
+  // Make new kernel function.
+  FunctionType *new_fty = FunctionType::get(Type::getVoidTy(context), false);
+  Function *new_kernel =
+      Function::Create(new_fty, Function::InternalLinkage, "tmp", &module_);
+  ValueToValueMapTy vmap;
+
+  for (auto *bb : region.blocks) {
+    BasicBlock *new_bb = BasicBlock::Create(context, "", new_kernel);
+    if (region.barrier_blocks.contains(bb)) {
+      ReturnInst::Create(context, nullptr, new_bb);
+    } else {
+      bb->getTerminator()->clone()->insertInto(new_bb, new_bb->end());
+    }
+    vmap[bb] = new_bb;
+    bbmap[bb] = new_bb;
+  }
+
+  const RemapFlags remapFlags =
+      RF_IgnoreMissingLocals | llvm::RF_ReuseAndMutateDistinctMDs;
+  for (auto &f : *new_kernel) {
+    auto *term = f.getTerminator();
+    RemapInstruction(term, vmap, remapFlags);
+  }
+
+  return new_kernel;
+}
+
+/// @brief Obtain a set of Basic Blocks for an inter-barrier region
+///
+/// It traverses the CFG, following successors, until it hits a barrier,
+/// building the region's internal data.
+///
+/// @param[out] region the region to process
+void compiler::utils::Barrier::GatherBarrierRegionBlocks(
+    BarrierRegion &region) {
+  DenseSet<BasicBlock *> visited;
+  region.blocks.push_back(region.entry);
+  visited.insert(region.entry);
+  size_t index = 0;
+  while (index < region.blocks.size()) {
+    BasicBlock *BB = region.blocks[index++];
+    if (barrier_successor_set_.contains(BB)) {
+      region.barrier_blocks.insert(BB);
+    } else {
+      for (BasicBlock *succ : successors(BB)) {
+        if (visited.insert(succ).second) {
+          region.blocks.push_back(succ);
+        }
+      }
+    }
+  }
+}
+
+/// @brief Obtain a set of Values used in a region that cross a barrier
+///
+/// A value use crosses a barrier in the following cases:
+/// * Its use is not in the same region as the definition
+/// * Its definition does not dominate the use
+///
+/// @param[in] region The inter-barrier region
+/// @param[in] ignore set of values to ignore
+void compiler::utils::Barrier::GatherBarrierRegionUses(
+    BarrierRegion &region, DenseSet<Value *> &ignore) {
+  DenseMap<BasicBlock *, BasicBlock *> bbmap;
+  Function *fake_func = GenerateFakeKernel(region, bbmap);
+
+  // We should check the dominance relation between the definition bb of live
+  // variables and the user bb. If the def bb does not dominate the user bb,
+  // the user is modified by live variable information.
+  DominatorTree DT;
+  DT.recalculate(*fake_func);
+
+  for (auto *BB : region.blocks) {
+    BasicBlock *BBclone = bbmap[BB];
+    for (auto &I : *BB) {
+      if (PHINode *pn = dyn_cast<PHINode>(&I)) {
+        for (unsigned i = 0, e = pn->getNumIncomingValues(); i != e; i++) {
+          Value *val = pn->getIncomingValue(i);
+          if (CheckValidUse(val) && !ignore.contains(val)) {
+            if (auto *inst = dyn_cast<Instruction>(val)) {
+              BasicBlock *incoming = pn->getIncomingBlock(i);
+              BasicBlock *parent = inst->getParent();
+              // If the incoming edge comes from outside the region, it is
+              // going to get removed anyway, so disregard it
+              if (bbmap.contains(incoming)) {
+                if (!bbmap.contains(parent)) {
+                  region.uses_ext.insert(val);
+                } else if (!DT.dominates(bbmap[parent], bbmap[incoming])) {
+                  region.uses_int.insert(val);
+                }
+              }
+            }
+          }
+        }
+      } else {
+        for (Value *val : I.operands()) {
+          if (CheckValidUse(val) && !ignore.contains(val)) {
+            if (auto *inst = dyn_cast<Instruction>(val)) {
+              BasicBlock *parent = inst->getParent();
+              if (!bbmap.contains(parent)) {
+                region.uses_ext.insert(val);
+              } else if (!DT.dominates(bbmap[parent], BBclone)) {
+                region.uses_int.insert(val);
+              }
+            }
+          }
+        }
+      }
+      if (CheckValidDef(&I) && !I.use_empty()) {
+        region.defs.insert(&I);
+      }
+    }
+  }
+  DT.reset();
+  fake_func->eraseFromParent();
+}
+
+/// @brief Find livein and liveout variables per each basic block.
+void compiler::utils::Barrier::FindLiveVariables() {
+  DenseSet<Value *> func_args;
+  for (Argument &arg : func_.args()) {
+    func_args.insert(&arg);
+  }
+
+#ifndef NDEBUG
+  // Make sure there aren't any stray allocas outside the entry block.
+  for (auto block = func_.begin(); ++block != func_.end();) {
+    for (auto &inst : *block) {
+      assert(!isa<AllocaInst>(inst) && "Alloca found outside entry block!");
+    }
+  }
+#endif // ndef NDEBUG
+
+  // Put all the original allocas into the barrier struct, in case they get
+  // indirectly referenced from the other side of a barrier.
+  for (Instruction &bi : func_.front()) {
+    if (isa<AllocaInst>(&bi)) {
+      whole_live_variables_set_.insert(&bi);
+    }
+  }
+
+  for (auto &[i, region] : barrier_region_id_map_) {
+    GatherBarrierRegionBlocks(region);
+    GatherBarrierRegionUses(region, func_args);
+    whole_live_variables_set_.set_union(region.uses_int);
+    whole_live_variables_set_.set_union(region.uses_ext);
+  }
+}
+
+/// @brief Remove variables that are better recalculated than stored in the
+/// barrier, for instance casts and vector splats.
+void compiler::utils::Barrier::TidyLiveVariables() {
+  const auto &dl = module_.getDataLayout();
+
+  // Start off by doing a simple sweep of stuff that is better off not in the
+  // barrier: vector splats, no-op/widening casts, and single/zero index GEPs,
+  // since we might as well put their source operand in the barrier instead.
+  SmallVector<Value *> removals;
+  SmallVector<Value *> redirects;
+  for (auto v : whole_live_variables_set_) {
+    if (auto *const shuffle = dyn_cast<ShuffleVectorInst>(v)) {
+      if (shuffle->isZeroEltSplat()) {
+        // If we remove a vector splat, we have to make sure the scalar
+        // source operand is in the barrier instead.
+        Value *const op = shuffle->getOperand(0);
+        if (auto *const ins = dyn_cast<InsertElementInst>(op)) {
+          removals.push_back(v);
+
+          Value *const src = ins->getOperand(1);
+          // Put the source instruction in the barrier instead.
+          // If it's not an instruction, it is probably a function argument.
+          if (isa<Instruction>(src) && !IsTrivialGEP(src, redirects)) {
+            redirects.push_back(src);
+          }
+        }
+      }
+    } else if (auto *const cast = dyn_cast<CastInst>(v)) {
+      if (auto *const src = dyn_cast<Instruction>(cast->getOperand(0))) {
+        if (cast->isNoopCast(dl) ||
+            (cast->getSrcTy()->getScalarSizeInBits() <
+             cast->getDestTy()->getScalarSizeInBits())) {
+          removals.push_back(v);
+
+          // Put the source instruction in the barrier instead.
+          if (isa<Instruction>(src) && !IsTrivialGEP(src, redirects)) {
+            redirects.push_back(src);
+          }
+        }
+      } else {
+        // No casts of non-instructions in the barrier, please.
+        removals.push_back(v);
+      }
+    } else if (IsTrivialGEP(v, redirects)) {
+      removals.push_back(v);
+    }
+  }
+
+  // We put the redirects into the barrier first, so that if they in turn
+  // turn out to be redundant, we can remove them again.
+  whole_live_variables_set_.set_union(redirects);
+
+  // Remove work item calls and casts of arguments or other barrier members.
+  for (auto v : whole_live_variables_set_) {
+    if (IsTrivialValue(v, 4u, *bi_)) {
+      removals.push_back(v);
+    } else if (auto *cast = dyn_cast<CastInst>(v)) {
+      Value *op = cast->getOperand(0);
+      if (whole_live_variables_set_.contains(op)) {
+        removals.push_back(v);
+      }
+    }
+  }
+  whole_live_variables_set_.set_subtract(removals);
+}
+
+/// @brief Pad the field types to an alignment by adding a byte array if
+/// needed.
+/// @param field_tys The vector of types representing the final structure
+/// @param offset The current offset in the structure
+/// @param alignment The required alignment
+/// @return The new offset (or the original offset if no padding was needed)
+unsigned compiler::utils::Barrier::PadTypeToAlignment(
+    SmallVectorImpl<Type *> &field_tys, unsigned offset, unsigned alignment) {
+  if (alignment) {
+    // Check if the member is not already aligned
+    const unsigned int remainder = offset % alignment;
+    if (0 != remainder) {
+      // Calculate the number of padding bytes
+      const unsigned int padding = alignment - remainder;
+
+      // Use a byte array to pad the struct rather than trying to create
+      // an arbitrary intNTy, since this may not be supported by the backend.
+      const auto padByteType = Type::getInt8Ty(module_.getContext());
+      const auto padByteArrayType = ArrayType::get(padByteType, padding);
+      field_tys.push_back(padByteArrayType);
+
+      // Bump the offset by the padding size
+      offset += padding;
+    }
+  }
+  return offset;
+}
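A worked instance of the padding rule above (editor's illustration):

// PadTypeToAlignment(field_tys, /*offset*/ 4, /*alignment*/ 8):
//   remainder = 4 % 8 = 4   -> not aligned
//   padding   = 8 - 4 = 4   -> [4 x i8] is appended to field_tys
//   returns 4 + 4 = 8       -> the next field starts 8-byte aligned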
+
+/// @brief Make type for whole live variables.
+void compiler::utils::Barrier::MakeLiveVariableMemType() {
+  SmallVector<Type *> field_tys;
+  max_live_var_alignment = 0;
+
+  const auto &dl = module_.getDataLayout();
+
+  struct member_info {
+    /// @brief The root `value` being stored.
+    Value *value;
+    /// @brief The member index of this member inside `value`, if `value` is a
+    /// decomposed structure type. Zero otherwise.
+    unsigned member_idx;
+    /// @brief The type of `value`, or of the specific member of `value`.
+    Type *type;
+    /// @brief The alignment of the value being stored
+    unsigned alignment;
+    /// @brief The size of the value being stored
+    unsigned size;
+  };
+
+  SmallVector<member_info> barrier_members;
+  barrier_members.reserve(whole_live_variables_set_.size());
+  for (Value *live_var : whole_live_variables_set_) {
+    LLVM_DEBUG(dbgs() << "whole live set:" << *live_var << '\n';
+               dbgs() << "type:" << *(live_var->getType()) << '\n';);
+    Type *field_ty = live_var->getType();
+
+    Type *member_ty = nullptr;
+    unsigned alignment = 0;
+    // If an alloca is a live variable, get the element type of the pointer
+    // type from field_ty and remember the alignment
+    if (const auto *AI = dyn_cast<AllocaInst>(live_var)) {
+      member_ty = AI->getAllocatedType();
+      alignment = AI->getAlign().value();
+    } else {
+      member_ty = field_ty;
+    }
+
+    std::vector<Type *> member_tys = {member_ty};
+    // If this is a struct type containing any scalable members, we must
+    // decompose the value into its individual components.
+    if (isStructWithScalables(member_ty)) {
+      member_tys = cast<StructType>(member_ty)->elements().vec();
+    }
+
+    for (auto [idx, ty] : enumerate(member_tys)) {
+      // For a scalable vector, we need the size of the equivalent fixed vector
+      // based on its known minimum size.
+      auto member_ty_fixed = ty;
+      if (isa<ScalableVectorType>(ty)) {
+        auto *const eltTy = multi_llvm::getVectorElementType(ty);
+        auto n = multi_llvm::getVectorElementCount(ty).getKnownMinValue();
+        member_ty_fixed = VectorType::get(eltTy, ElementCount::getFixed(n));
+      }
+
+      // Need to ensure that alloc alignment or preferred alignment is kept
+      // in the new struct so pad as necessary.
+      const unsigned size = dl.getTypeAllocSize(member_ty_fixed);
+      alignment = std::max(dl.getPrefTypeAlign(ty).value(),
+                           static_cast<AlignIntTy>(alignment));
+      max_live_var_alignment = std::max(alignment, max_live_var_alignment);
+
+      barrier_members.push_back(
+          {live_var, static_cast<unsigned>(idx), ty, alignment, size});
+    }
+  }
+
+  // Sort the barrier members by decreasing alignment to minimise the amount
+  // of padding required (use a stable sort so it's deterministic).
+  std::stable_sort(barrier_members.begin(), barrier_members.end(),
+                   [](const member_info &lhs, const member_info &rhs) -> bool {
+                     return lhs.alignment > rhs.alignment;
+                   });
+
+  // Deal with non-scalable members first
+  unsigned offset = 0;
+  for (auto &member : barrier_members) {
+    if (isa<ScalableVectorType>(member.type)) {
+      continue;
+    }
+
+    offset = PadTypeToAlignment(field_tys, offset, member.alignment);
+
+    // Check if the alloca has a debug info source variable attached. If
+    // so record this and the matching byte offset into the struct.
+    const auto DVRDeclares = findDVRDeclares(member.value);
+    for (auto *const DVRDeclare : DVRDeclares) {
+      debug_variable_records_.push_back(std::make_pair(DVRDeclare, offset));
+    }
+    offset += member.size;
+    live_variable_index_map_[std::make_pair(member.value, member.member_idx)] =
+        field_tys.size();
+    field_tys.push_back(member.type);
+  }
+  // Pad the end of the struct to the max alignment as we are creating an
+  // array
+  offset = PadTypeToAlignment(field_tys, offset, max_live_var_alignment);
+  live_var_mem_size_fixed = offset; // No more offsets required.
+
+  // Now deal with any scalable members. We reset the offset to zero because
+  // scalables are indexed bytewise starting from the beginning of the
+  // variable-sized scalables section at the end of the struct.
+  SmallVector<Type *> field_tys_scalable;
+  offset = 0;
+  for (auto &member : barrier_members) {
+    if (!isa<ScalableVectorType>(member.type)) {
+      continue;
+    }
+
+    offset = PadTypeToAlignment(field_tys_scalable, offset, member.alignment);
+
+    live_variable_scalables_map_[std::make_pair(member.value,
+                                                member.member_idx)] = offset;
+    offset += member.size;
+    field_tys_scalable.push_back(member.type);
+  }
+  // Pad the end of the struct to the max alignment as we are creating an
+  // array
+  offset =
+      PadTypeToAlignment(field_tys_scalable, offset, max_live_var_alignment);
+  live_var_mem_size_scalable = offset; // No more offsets required.
+
+  LLVMContext &context = module_.getContext();
+  // If the barrier contains scalables, add a flexible byte array on the end
+  if (offset != 0) {
+    live_var_mem_scalables_index = field_tys.size();
+    field_tys.push_back(ArrayType::get(IntegerType::getInt8Ty(context), 0));
+  }
+
+  // Create struct type for live variable memory allocation; we create this
+  // even when the type is empty. The big entry point pass depends on this
+  // to detect that the barrier pass has been executed.
+  SmallString<128> name;
+  live_var_mem_ty_ = StructType::create(
+      context, field_tys,
+      (Twine(func_.getName() + "_live_mem_info")).toStringRef(name), false);
+
+  name.clear();
+
+  LLVM_DEBUG(dbgs() << "Barrier size: " << offset << "\n";
+             dbgs() << "whole live set type:" << *(live_var_mem_ty_) << '\n';);
+}
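Editor's illustration of the layout this produces. Suppose the live set holds an i32, a <4 x float>, and an alloca of double; sorting by decreasing alignment and padding to the maximum alignment (16 here) yields something equivalent to:

// %foo_live_mem_info = type { <4 x float>,  ; align 16, offset  0
//                             double,       ; align  8, offset 16
//                             i32,          ; align  4, offset 24
//                             [4 x i8] }    ; tail padding to 32 bytes
// Scalable vectors are not placed here; they are addressed byte-wise via a
// trailing flexible [0 x i8] member (live_var_mem_scalables_index).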
+      ConstantInt *cst_endid =
+          ConstantInt::get(Type::getInt32Ty(context), kBarrier_EndID);
+      ReturnInst *new_ret = ReturnInst::Create(context, cst_endid);
+      new_ret->insertBefore(ret->getIterator());
+      ret->replaceAllUsesWith(new_ret);
+      ret->eraseFromParent();
+
+      // We can have multiple return points, but should only count them once.
+      returns_from_kernel = true;
+    }
+  }
+  if (returns_from_kernel) {
+    region.successor_ids.push_back(kBarrier_EndID);
+  }
+  // Keep things consistent.
+  std::sort(region.successor_ids.begin(), region.successor_ids.end());
+
+  // Update the incoming edges to phi nodes, and drop edges to basic blocks
+  // that are not present in the new function. Note that this must happen
+  // after all the basic blocks have been cloned, so that we know how to
+  // update the incoming edges to phi nodes that represent back edges.
+  for (auto *block : region.blocks) {
+    UpdateAndTrimPHINodeEdges(cast<BasicBlock>(vmap[block]), vmap);
+  }
+
+  BasicBlock *new_kernel_entry_block = &(new_kernel->getEntryBlock());
+  Instruction *insert_point = &*new_kernel_entry_block->getFirstNonPHIOrDbg();
+  auto *const cloned_barrier_call =
+      region.barrier_inst ? insert_point : nullptr;
+
+  // If we have a work group collective call, we need to remap its result from
+  // the arguments list.
+  if (collective) {
+    vmap[insert_point] = &*(new_arg++);
+  }
+
+  // The entry kernel might have allocas in it that don't get removed, so
+  // make sure to insert after them.
+  while (isa<AllocaInst>(insert_point)) {
+    insert_point = insert_point->getNextNode();
+  }
+
+  // LiveValuesHelper puts all the GEPs at the start of the kernel, but
+  // creates each one only once.
+  LiveValuesHelper live_values(
+      *this, insert_point,
+      hasBarrierStruct ? compiler::utils::getLastArgument(new_kernel)
+                       : nullptr);
+
+  // Load live variables and map them.
+  // These variables are defined in a different kernel, so we insert the
+  // relevant load instructions in the entry block of the kernel.
+  {
+    // Note that if our barrier is a work group collective, its operand will
+    // probably still get reloaded here, even though it's going to get
+    // deleted, so we hope that it gets optimized away later, in this case.
+    for (const auto cur_live : region.uses_ext) {
+      IRBuilder<> insertIR(insert_point);
+      vmap[cur_live] = live_values.getReload(cur_live, insertIR, "_load", true);
+    }
+  }
+
+  SmallVector<Instruction *> allocas_and_intrinsics_to_remove;
+
+  // Store only live variables that are defined in this kernel.
+  //
+  // We might like to store the variables at the point we hit the barrier.
+  // However, this is not always possible because the value definition might
+  // not dominate any or all of the exit blocks. Furthermore, if this value
+  // is used again in the same kernel after looping around the barrier, we
+  // have to be aware that the usage might be expecting the updated value.
+  // (This can happen in nested loops, where the outer increment becomes a
+  // conditional block.) Therefore, we put the store right after the
+  // definition instead.
+  for (const auto live_var : live_vars_defs_in_kernel) {
+    // If the live variable is an alloca defined in this function, change the
+    // alloca to a GEP directly into the live variables struct; otherwise we
+    // store the value to the struct. This is needed because it is possible
+    // for one live variable to reference another by pointer; when we then
+    // save them to the live variable struct they will point to the wrong
+    // address.
+    // By GEPping directly into the final live struct we resolve this issue,
+    // as the final address is always used.
+    if (auto *alloca_inst = dyn_cast<AllocaInst>(live_var)) {
+      // Check to see if it is still an alloca after vmap. If not, we may
+      // have processed it before and no work needs doing, as we are using
+      // the live variable struct directly.
+      if (auto *new_alloca_inst = dyn_cast<AllocaInst>(vmap[alloca_inst])) {
+        allocas_and_intrinsics_to_remove.push_back(new_alloca_inst);
+        // Also remove any assume-like intrinsics that are users of this
+        // alloca. These assumptions may not hold. For example, lifetime
+        // intrinsics are definitely dangerous, as by directly replacing their
+        // alloca operands with the address of the live variable struct, we
+        // are telling LLVM that *all* accesses of the live variable struct
+        // also start/end at that point, which is not true.
+        // Similarly, llvm.assume and llvm.experimental.noalias.scope.decl may
+        // hold for the alloca but not the live variables struct.
+        for (auto *const user : alloca_inst->users()) {
+          if (auto *const intrinsic = dyn_cast<IntrinsicInst>(user);
+              intrinsic && intrinsic->isAssumeLikeIntrinsic()) {
+            allocas_and_intrinsics_to_remove.push_back(intrinsic);
+          }
+        }
+        // Change the vmap to point to the GEP instead of the original alloca.
+        vmap[live_var] = live_values.getGEP(live_var);
+      }
+    } else {
+      // Place the new store immediately after the definition, but if it's a
+      // PHI node we have to make sure to put it after any other PHI nodes.
+      Instruction *inst = cast<Instruction>(vmap[live_var]);
+      Instruction *insert_point = inst->getNextNode();
+      while (isa<PHINode>(insert_point)) {
+        insert_point = insert_point->getNextNode();
+      }
+      IRBuilder<> B(insert_point);
+      if (!isStructWithScalables(live_var->getType())) {
+        auto *addr = live_values.getGEP(live_var);
+        B.CreateStore(live_var, addr);
+      } else {
+        // Store this struct containing scalable members piece-wise.
+        auto member_tys = cast<StructType>(live_var->getType())->elements();
+        for (auto [idx, ty] : enumerate(member_tys)) {
+          auto *extract = B.CreateExtractValue(live_var, idx);
+          auto *extract_addr = live_values.getGEP(extract);
+          assert(extract_addr);
+          B.CreateStore(extract, extract_addr);
+        }
+      }
+    }
+  }
+
+  // Remap the instructions in the entry basic block, starting from the
+  // insert point.
+  insert_point = &*new_kernel_entry_block->getFirstNonPHIOrDbg();
+  const RemapFlags remapFlags =
+      RF_IgnoreMissingLocals | llvm::RF_ReuseAndMutateDistinctMDs;
+  BasicBlock::iterator b_iter = insert_point->getIterator();
+  while (b_iter != new_kernel_entry_block->end()) {
+    RemapInstruction(&*b_iter, vmap, remapFlags);
+    b_iter++;
+  }
+
+  // Remove the barrier. We do this after creating the stores so that if it's
+  // a work group collective, it will have been processed as normal above and
+  // written into the barrier struct where needed.
+  if (cloned_barrier_call) {
+    // When debugging, insert a call to the exit debug stub at the insert
+    // point; this location is important since all the live variables will
+    // have been loaded by this point.
+    if (is_debug_) {
+      const unsigned barrier_id = barrier_id_map_[entry_point];
+      // Get the call instruction invoking the exit stub from the map.
+      CallInst *exit_caller = barrier_stub_call_map_[barrier_id].second;
+      exit_caller->insertAfter(cloned_barrier_call);
+      // Use the updated debug info scope, since the call will have had this
+      // set by ModifyDebugInfoScopes().
+      exit_caller->setDebugLoc(cloned_barrier_call->getDebugLoc());
+    }
+    if (collective) {
+      cloned_barrier_call->replaceAllUsesWith(vmap[cloned_barrier_call]);
+    }
+    cloned_barrier_call->eraseFromParent();
+  }
+
+  // Don't remap the first basic block again.
+  Function::iterator cfi = ++(new_kernel->begin());
+  const Function::iterator cfie = new_kernel->end();
+  for (; cfi != cfie; cfi++) {
+    for (Instruction &cbi : *cfi) {
+      RemapInstruction(&cbi, vmap, remapFlags);
+    }
+  }
+
+  // Remove any allocas and their dependent intrinsics that have been
+  // replaced by a GEP instruction.
+  for (auto *inst : allocas_and_intrinsics_to_remove) {
+    inst->eraseFromParent();
+  }
+
+  // This needs resetting for the sake of any further new GEPs created.
+  live_values.gepBuilder.SetInsertPoint(
+      new_kernel_entry_block->getFirstNonPHIOrDbg());
+
+  // If there are definitions of live variables in this function, process
+  // them here. As mentioned above regarding value stores, the user might
+  // want to load the value after it has been updated. Therefore, we place
+  // the new loads right before their uses.
+  //
+  // Potentially, this is not optimal, since it might create multiple loads.
+  // Ideally we should use some kind of reachability query to determine if
+  // the load can be placed before the store, and if not, PHI nodes could
+  // be inserted instead to get the value directly from the new definition.
+  //
+  // It would be nice not to have to build the Dominator Tree here again,
+  // since we already did it when we gathered the barrier crossing values.
+  // The problem is it's a use/user pair that crosses a barrier, not just the
+  // use itself. Some users may be dominated, and others not.
+  //
+  // NOTE it is impossible for any of these to be an Alloca.
+  DominatorTree DT;
+  DT.recalculate(*new_kernel);
+
+  for (auto OldDef : region.uses_int) {
+    Instruction *NewDef = cast<Instruction>(vmap[OldDef]);
+    BasicBlock *DefBB = NewDef->getParent();
+
+    for (auto use_it = NewDef->use_begin(); use_it != NewDef->use_end();) {
+      auto &U = *use_it++;
+      Instruction *UserInst = cast<Instruction>(U.getUser());
+      BasicBlock *UserBB = UserInst->getParent();
+
+      // Check whether the user is in the current function.
+      if (UserBB->getParent() == new_kernel) {
+        Instruction *load_insert = nullptr;
+
+        // Check the dominance relation between the def BB and the user BB.
+        if (auto *PHI = dyn_cast<PHINode>(UserInst)) {
+          BasicBlock *incoming = PHI->getIncomingBlock(U);
+          if (!DT.dominates(DefBB, incoming)) {
+            load_insert = incoming->getTerminator();
+          }
+        } else if (!DT.dominates(DefBB, UserBB)) {
+          load_insert = UserInst;
+        }
+
+        if (load_insert) {
+          IRBuilder<> loadIR(load_insert);
+          U.set(live_values.getReload(OldDef, loadIR, "_reload"));
+        }
+      }
+    }
+  }
+
+  // Removing incoming PHI node edges might have created some redundant ones.
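+  // That is, PHIs whose remaining incoming values are all identical (e.g.
+  // %x = phi i32 [ %v, %a ], [ %v, %b ] folds to %v); hasConstantValue()
+  // detects these so they can be erased below.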
+  for (auto *BB : region.blocks) {
+    BasicBlock *cBB = cast<BasicBlock>(vmap[BB]);
+    for (auto I = cBB->begin(); I != cBB->end();) {
+      if (auto *PHI = dyn_cast<PHINode>(&*(I++))) {
+        if (auto *V = PHI->hasConstantValue()) {
+          PHI->replaceAllUsesWith(V);
+          PHI->eraseFromParent();
+        }
+      } else {
+        break;
+      }
+    }
+  }
+
+  // Remap any remaining unmapped instructions coming from DT-based reloads.
+  for (auto &BB : *new_kernel) {
+    for (Instruction &I : BB) {
+      RemapInstruction(&I, vmap, remapFlags);
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "new kernel function: " << new_kernel->getName()
+                    << "\n";);
+  return new_kernel;
+}
+
+/// @brief This function is a copy of llvm::CloneBasicBlock, with some code
+/// added to update live variable information.
+///
+/// @param[in] bb Basic block to copy.
+/// @param[out] vmap Value map updated while cloning.
+/// @param[in] name_suffix Suffix to append to cloned value names.
+/// @param[out] live_defs_info Live definitions found in this basic block.
+/// @param[in] F Function to clone into.
+///
+/// @return The cloned basic block.
+BasicBlock *compiler::utils::Barrier::CloneBasicBlock(
+    BasicBlock *bb, ValueToValueMapTy &vmap, const Twine &name_suffix,
+    live_variable_mem_t &live_defs_info, Function *F) {
+  BasicBlock *new_bb = BasicBlock::Create(bb->getContext(), "", F);
+  if (bb->hasName())
+    new_bb->setName(bb->getName() + name_suffix);
+
+  // Loop over all instructions, and copy them over.
+  for (Instruction &i : *bb) {
+    Instruction *new_inst = i.clone();
+    if (i.hasName())
+      new_inst->setName(i.getName() + name_suffix);
+    new_inst->insertInto(new_bb, new_bb->end());
+
+    // Record defs of live variables which are in the current kernel.
+    if (whole_live_variables_set_.contains(&i)) {
+      live_defs_info.insert(&i);
+    }
+
+    vmap[&i] = new_inst;
+  }
+  return new_bb;
+}
+
+/// @brief Separate the kernel function at barrier boundaries.
+void compiler::utils::Barrier::SeperateKernelWithBarrier() {
+  if (barriers_.empty())
+    return;
+
+  for (auto &[i, region] : barrier_region_id_map_) {
+    kernel_id_map_[region.id] = GenerateNewKernel(region);
+  }
+
+  // Record barrier information on metadata.
+  SmallString<128> name;
+  LLVMContext &context = module_.getContext();
+  ValueAsMetadata *num_barriers_ = ValueAsMetadata::get(
+      ConstantInt::get(Type::getInt32Ty(context), barriers_.size()));
+  MDNode *num_barriers__md =
+      MDNode::get(context, ArrayRef<Metadata *>(num_barriers_));
+  NamedMDNode *barrier_md = module_.getOrInsertNamedMetadata(
+      Twine(func_.getName() + "_barrier").toStringRef(name));
+  barrier_md->addOperand(num_barriers__md);
+
+  LLVM_DEBUG({
+    for (const auto &Kid : kernel_id_map_) {
+      dbgs() << "kernel_id[" << Kid.first << "] = " << Kid.second->getName()
+             << "\n";
+    }
+
+    dbgs() << "\n\n" << module_ << "\n\n";
+  });
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
new file mode 100644
index 0000000000000..372280d135302
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
@@ -0,0 +1,1270 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +namespace compiler { +namespace utils { + +AnalysisKey BuiltinInfoAnalysis::Key; + +BuiltinInfoAnalysis::BuiltinInfoAnalysis() + : BICallback([](const Module &) -> BuiltinInfo { + return BuiltinInfo(std::make_unique(nullptr)); + }) {} + +Module *BuiltinInfo::getBuiltinsModule() { + if (LangImpl) { + return LangImpl->getBuiltinsModule(); + } + // Mux builtins don't need a module. + return nullptr; +} + +std::optional>> +BuiltinInfo::identifyMuxBuiltin(const Function &F) const { + StringRef Name = F.getName(); + auto ID = + StringSwitch>(Name) + .Case(MuxBuiltins::isftz, eMuxBuiltinIsFTZ) + .Case(MuxBuiltins::usefast, eMuxBuiltinUseFast) + .Case(MuxBuiltins::isembeddedprofile, eMuxBuiltinIsEmbeddedProfile) + .Case(MuxBuiltins::get_global_size, eMuxBuiltinGetGlobalSize) + .Case(MuxBuiltins::get_global_id, eMuxBuiltinGetGlobalId) + .Case(MuxBuiltins::get_global_offset, eMuxBuiltinGetGlobalOffset) + .Case(MuxBuiltins::get_local_size, eMuxBuiltinGetLocalSize) + .Case(MuxBuiltins::get_local_id, eMuxBuiltinGetLocalId) + .Case(MuxBuiltins::set_local_id, eMuxBuiltinSetLocalId) + .Case(MuxBuiltins::get_sub_group_id, eMuxBuiltinGetSubGroupId) + .Case(MuxBuiltins::set_sub_group_id, eMuxBuiltinSetSubGroupId) + .Case(MuxBuiltins::get_num_groups, eMuxBuiltinGetNumGroups) + .Case(MuxBuiltins::get_num_sub_groups, eMuxBuiltinGetNumSubGroups) + .Case(MuxBuiltins::set_num_sub_groups, eMuxBuiltinSetNumSubGroups) + .Case(MuxBuiltins::get_max_sub_group_size, + eMuxBuiltinGetMaxSubGroupSize) + .Case(MuxBuiltins::set_max_sub_group_size, + eMuxBuiltinSetMaxSubGroupSize) + .Case(MuxBuiltins::get_group_id, eMuxBuiltinGetGroupId) + .Case(MuxBuiltins::get_work_dim, eMuxBuiltinGetWorkDim) + .Case(MuxBuiltins::dma_read_1d, eMuxBuiltinDMARead1D) + .Case(MuxBuiltins::dma_read_2d, eMuxBuiltinDMARead2D) + .Case(MuxBuiltins::dma_read_3d, eMuxBuiltinDMARead3D) + .Case(MuxBuiltins::dma_write_1d, eMuxBuiltinDMAWrite1D) + .Case(MuxBuiltins::dma_write_2d, eMuxBuiltinDMAWrite2D) + .Case(MuxBuiltins::dma_write_3d, eMuxBuiltinDMAWrite3D) + .Case(MuxBuiltins::dma_wait, eMuxBuiltinDMAWait) + .Case(MuxBuiltins::get_global_linear_id, eMuxBuiltinGetGlobalLinearId) + .Case(MuxBuiltins::get_local_linear_id, eMuxBuiltinGetLocalLinearId) + .Case(MuxBuiltins::get_enqueued_local_size, + eMuxBuiltinGetEnqueuedLocalSize) + .Case(MuxBuiltins::get_sub_group_size, eMuxBuiltinGetSubGroupSize) + .Case(MuxBuiltins::get_sub_group_local_id, + eMuxBuiltinGetSubGroupLocalId) + .Case(MuxBuiltins::work_group_barrier, eMuxBuiltinWorkGroupBarrier) + .Case(MuxBuiltins::sub_group_barrier, eMuxBuiltinSubGroupBarrier) + .Case(MuxBuiltins::mem_barrier, eMuxBuiltinMemBarrier) + .Default(std::nullopt); + if (ID) { + switch (*ID) { + default: + return {{*ID, {}}}; + case eMuxBuiltinDMARead1D: + case eMuxBuiltinDMARead2D: + case eMuxBuiltinDMARead3D: + case eMuxBuiltinDMAWrite1D: + case eMuxBuiltinDMAWrite2D: + case 
eMuxBuiltinDMAWrite3D: + // Return the event type used by these builtins. The event type is + // required to declare/define these builtins, so return it here for + // the sake of completeness. The event type doesn't change the + // builtins' name (i.e., it's not mangled) as it's required to be + // consistent at any single snapshot of the module, though it may + // change through time. + return {{*ID, {F.getReturnType()}}}; + } + } + + // Now check for group functions, which are a bit more involved as there's + // many of them and they're also mangled. We enforce that the mangling makes + // sense, otherwise the builtin is declared as invalid. + const bool IsSubgroupOp = Name.consume_front("__mux_sub_group_"); + const bool IsVecgroupOp = Name.consume_front("__mux_vec_group_"); + if (!IsSubgroupOp && !IsVecgroupOp && + !Name.consume_front("__mux_work_group_")) { + return std::nullopt; + } + +#define SCOPED_GROUP_OP(OP) \ + (IsSubgroupOp ? eMuxBuiltinSubgroup##OP \ + : IsVecgroupOp ? eMuxBuiltinVecgroup##OP \ + : eMuxBuiltinWorkgroup##OP) + + // Most group operations have one argument, except for broadcasts. Despite + // that, we don't mangle the indices as they're fixed. + const unsigned NumExpectedMangledArgs = 1; + + if (Name.consume_front("any")) { + ID = SCOPED_GROUP_OP(Any); + } else if (Name.consume_front("all")) { + ID = SCOPED_GROUP_OP(All); + } else if (Name.consume_front("broadcast")) { + ID = SCOPED_GROUP_OP(Broadcast); + } else if (Name.consume_front("shuffle_up")) { + if (!IsSubgroupOp) { + return std::nullopt; + } + ID = eMuxBuiltinSubgroupShuffleUp; + } else if (Name.consume_front("shuffle_down")) { + if (!IsSubgroupOp) { + return std::nullopt; + } + ID = eMuxBuiltinSubgroupShuffleDown; + } else if (Name.consume_front("shuffle_xor")) { + if (!IsSubgroupOp) { + return std::nullopt; + } + ID = eMuxBuiltinSubgroupShuffleXor; + } else if (Name.consume_front("shuffle")) { + if (!IsSubgroupOp) { + return std::nullopt; + } + ID = eMuxBuiltinSubgroupShuffle; + } else if (Name.consume_front("reduce_")) { + auto NextIdx = Name.find_first_of('_'); + std::string Group = Name.substr(0, NextIdx).str(); + Name = Name.drop_front(Group.size()); + + if (Group == "logical") { + Name = Name.drop_front(); // Drop the underscore + auto NextIdx = Name.find_first_of('_'); + auto RealGroup = Name.substr(0, NextIdx); + Group += "_" + RealGroup.str(); + Name = Name.drop_front(RealGroup.size()); + } + + ID = StringSwitch>(Group) + .Case("add", SCOPED_GROUP_OP(ReduceAdd)) + .Case("fadd", SCOPED_GROUP_OP(ReduceFAdd)) + .Case("mul", SCOPED_GROUP_OP(ReduceMul)) + .Case("fmul", SCOPED_GROUP_OP(ReduceFMul)) + .Case("smin", SCOPED_GROUP_OP(ReduceSMin)) + .Case("umin", SCOPED_GROUP_OP(ReduceUMin)) + .Case("fmin", SCOPED_GROUP_OP(ReduceFMin)) + .Case("smax", SCOPED_GROUP_OP(ReduceSMax)) + .Case("umax", SCOPED_GROUP_OP(ReduceUMax)) + .Case("fmax", SCOPED_GROUP_OP(ReduceFMax)) + .Case("and", SCOPED_GROUP_OP(ReduceAnd)) + .Case("or", SCOPED_GROUP_OP(ReduceOr)) + .Case("xor", SCOPED_GROUP_OP(ReduceXor)) + .Case("logical_and", SCOPED_GROUP_OP(ReduceLogicalAnd)) + .Case("logical_or", SCOPED_GROUP_OP(ReduceLogicalOr)) + .Case("logical_xor", SCOPED_GROUP_OP(ReduceLogicalXor)) + .Default(std::nullopt); + } else if (Name.consume_front("scan_")) { + const bool IsInclusive = Name.consume_front("inclusive_"); + if (!IsInclusive && !Name.consume_front("exclusive_")) { + return std::nullopt; + } + + auto NextIdx = Name.find_first_of('_'); + std::string Group = Name.substr(0, NextIdx).str(); + Name = 
Name.drop_front(Group.size()); + + if (Group == "logical") { + auto NextIdx = Name.find_first_of('_', /*From*/ 1); + auto RealGroup = Name.substr(0, NextIdx); + Group += RealGroup.str(); + Name = Name.drop_front(RealGroup.size()); + } + + ID = StringSwitch>(Group) + .Case("add", IsInclusive ? SCOPED_GROUP_OP(ScanAddInclusive) + : SCOPED_GROUP_OP(ScanAddExclusive)) + .Case("fadd", IsInclusive ? SCOPED_GROUP_OP(ScanFAddInclusive) + : SCOPED_GROUP_OP(ScanFAddExclusive)) + .Case("mul", IsInclusive ? SCOPED_GROUP_OP(ScanMulInclusive) + : SCOPED_GROUP_OP(ScanMulExclusive)) + .Case("fmul", IsInclusive ? SCOPED_GROUP_OP(ScanFMulInclusive) + : SCOPED_GROUP_OP(ScanFMulExclusive)) + .Case("smin", IsInclusive ? SCOPED_GROUP_OP(ScanSMinInclusive) + : SCOPED_GROUP_OP(ScanSMinExclusive)) + .Case("umin", IsInclusive ? SCOPED_GROUP_OP(ScanUMinInclusive) + : SCOPED_GROUP_OP(ScanUMinExclusive)) + .Case("fmin", IsInclusive ? SCOPED_GROUP_OP(ScanFMinInclusive) + : SCOPED_GROUP_OP(ScanFMinExclusive)) + .Case("smax", IsInclusive ? SCOPED_GROUP_OP(ScanSMaxInclusive) + : SCOPED_GROUP_OP(ScanSMaxExclusive)) + .Case("umax", IsInclusive ? SCOPED_GROUP_OP(ScanUMaxInclusive) + : SCOPED_GROUP_OP(ScanUMaxExclusive)) + .Case("fmax", IsInclusive ? SCOPED_GROUP_OP(ScanFMaxInclusive) + : SCOPED_GROUP_OP(ScanFMaxExclusive)) + .Case("and", IsInclusive ? SCOPED_GROUP_OP(ScanAndInclusive) + : SCOPED_GROUP_OP(ScanAndExclusive)) + .Case("or", IsInclusive ? SCOPED_GROUP_OP(ScanOrInclusive) + : SCOPED_GROUP_OP(ScanOrExclusive)) + .Case("xor", IsInclusive ? SCOPED_GROUP_OP(ScanXorInclusive) + : SCOPED_GROUP_OP(ScanXorExclusive)) + .Case("logical_and", + IsInclusive ? SCOPED_GROUP_OP(ScanLogicalAndInclusive) + : SCOPED_GROUP_OP(ScanLogicalAndExclusive)) + .Case("logical_or", IsInclusive + ? SCOPED_GROUP_OP(ScanLogicalOrInclusive) + : SCOPED_GROUP_OP(ScanLogicalOrExclusive)) + .Case("logical_xor", + IsInclusive ? SCOPED_GROUP_OP(ScanLogicalXorInclusive) + : SCOPED_GROUP_OP(ScanLogicalXorExclusive)) + .Default(std::nullopt); + } + if (!ID) { + return std::nullopt; + } + + std::vector OverloadInfo; + + // Consume the rest of this group Op function name. If we can't identify a + // series of mangled type names, this builtin is invalid. + unsigned NumMangledArgs = 0; + // Work-group builtins have an unmangled 'barrier ID' parameter first, which + // we want to skip. + const unsigned Offset = ID >= eFirstMuxWorkgroupCollectiveBuiltin && + ID <= eLastMuxWorkgroupCollectiveBuiltin; + while (!Name.empty()) { + if (!Name.consume_front("_")) { + return std::nullopt; + } + auto [Ty, NewName] = getDemangledTypeFromStr(Name, F.getContext()); + Name = NewName; + + auto ParamIdx = Offset + NumMangledArgs; + if (ParamIdx >= F.arg_size() || Ty != F.getArg(ParamIdx)->getType()) { + return std::nullopt; + } + + ++NumMangledArgs; + OverloadInfo.push_back(Ty); + } + if (NumMangledArgs != NumExpectedMangledArgs) { + return std::nullopt; + } + + return {{*ID, OverloadInfo}}; +#undef SCOPED_GROUP_OP +} + +BuiltinUniformity BuiltinInfo::isBuiltinUniform(const Builtin &B, + const CallInst *CI, + unsigned SimdDimIdx) const { + switch (B.ID) { + default: + break; + case eMuxBuiltinGetGlobalId: + case eMuxBuiltinGetLocalId: { + // We need to know the dimension requested from these builtins at compile + // time to infer their uniformity. 
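+    // For example (illustrative), when vectorizing along dimension 0,
+    // get_global_id(0) yields the per-work-item instance ID, whereas
+    // get_global_id(1) returns the same value for every work-item in the
+    // packet and is treated as uniform.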
+ if (!CI || CI->arg_empty()) { + return eBuiltinUniformityNever; + } + auto *Rank = dyn_cast(CI->getArgOperand(0)); + if (!Rank) { + // The Rank is some function, which "might" evaluate to zero + // sometimes, so we let the packetizer sort it out with some + // conditional magic. + // TODO Make sure this can never go haywire in weird edge cases. + // Where we have one get_global_id() dependent on another, this is + // not packetized correctly. Doing so is very hard! We should + // probably just fail to packetize in this case. We might also be + // able to return eBuiltinUniformityNever here, in cases where we can + // prove that the value can never be zero. + return eBuiltinUniformityMaybeInstanceID; + } + // Only vectorize on selected dimension. The value of get_global_id with + // other ranks is uniform. + if (Rank->getZExtValue() == SimdDimIdx) { + return eBuiltinUniformityInstanceID; + } + + return eBuiltinUniformityAlways; + } + case eMuxBuiltinGetSubGroupLocalId: + return eBuiltinUniformityInstanceID; + case eMuxBuiltinGetLocalLinearId: + case eMuxBuiltinGetGlobalLinearId: + // TODO: This is fine for vectorizing in the x-axis, but currently we do + // not support vectorizing along y or z. + return SimdDimIdx ? eBuiltinUniformityNever : eBuiltinUniformityInstanceID; + } + + // Reductions and broadcasts are always uniform + if (auto Info = isMuxGroupCollective(B.ID)) { + if (Info->isAnyAll() || Info->isReduction() || Info->isBroadcast()) { + return eBuiltinUniformityAlways; + } + } + + if (LangImpl) { + return LangImpl->isBuiltinUniform(B, CI, SimdDimIdx); + } + return eBuiltinUniformityUnknown; +} + +std::optional BuiltinInfo::analyzeBuiltin(const Function &F) const { + // Handle LLVM intrinsics. + if (F.isIntrinsic()) { + int32_t Properties = eBuiltinPropertyNone; + + const Intrinsic::ID IntrID = (Intrinsic::ID)F.getIntrinsicID(); + const AttributeList AS = multi_llvm::Intrinsic::getAttributes( + F.getContext(), IntrID, F.getFunctionType()); + const bool NoSideEffect = F.onlyReadsMemory(); + bool SafeIntrinsic = false; + switch (IntrID) { + default: + SafeIntrinsic = false; + break; + case Intrinsic::smin: + case Intrinsic::smax: + case Intrinsic::umin: + case Intrinsic::umax: + case Intrinsic::abs: + case Intrinsic::ctlz: + case Intrinsic::cttz: + case Intrinsic::sqrt: + case Intrinsic::sin: + case Intrinsic::cos: + case Intrinsic::pow: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::log: + case Intrinsic::log10: + case Intrinsic::log2: + case Intrinsic::fma: + case Intrinsic::fabs: + case Intrinsic::minnum: + case Intrinsic::maxnum: + case Intrinsic::copysign: + case Intrinsic::floor: + case Intrinsic::ceil: + case Intrinsic::trunc: + case Intrinsic::rint: + case Intrinsic::nearbyint: + case Intrinsic::round: + case Intrinsic::ctpop: + case Intrinsic::fmuladd: + case Intrinsic::fshl: + case Intrinsic::fshr: + case Intrinsic::sadd_sat: + case Intrinsic::uadd_sat: + case Intrinsic::ssub_sat: + case Intrinsic::usub_sat: + case Intrinsic::bitreverse: + // All these function are overloadable and have both scalar and vector + // versions. 
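+      // For instance, llvm.sqrt.f32 has the vector form llvm.sqrt.v4f32, so
+      // such calls can be scalarized or widened freely by the vectorizer.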
+ Properties |= eBuiltinPropertyVectorEquivalent; + SafeIntrinsic = true; + break; + case Intrinsic::assume: + case Intrinsic::dbg_declare: + case Intrinsic::dbg_value: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::objectsize: + case Intrinsic::ptr_annotation: + case Intrinsic::var_annotation: + case Intrinsic::experimental_noalias_scope_decl: + SafeIntrinsic = true; + break; + case Intrinsic::memset: + case Intrinsic::memcpy: + Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertySideEffects; + break; + } + if (NoSideEffect || SafeIntrinsic) { + Properties |= eBuiltinPropertyNoSideEffects; + if (!AS.hasFnAttr(Attribute::NoDuplicate)) { + Properties |= eBuiltinPropertySupportsInstantiation; + } + } + return Builtin{F, eBuiltinUnknown, (BuiltinProperties)Properties}; + } + + auto MB = identifyMuxBuiltin(F); + if (!MB) { + // It's not a Mux builtin, so defer to the language implementation + if (LangImpl) { + return LangImpl->analyzeBuiltin(F); + } + return std::nullopt; + } + + auto [ID, OverloadInfo] = *MB; + + // Check that all overloadable builtins have returned some overloading + // information, for API consistency. + assert((!isOverloadableMuxBuiltinID(ID) || !OverloadInfo.empty()) && + "Inconsistency in overloadable builtin APIs"); + + bool IsConvergent = false; + unsigned Properties = eBuiltinPropertyNone; + switch (ID) { + default: + break; + case eMuxBuiltinMemBarrier: + Properties = eBuiltinPropertySideEffects; + break; + case eMuxBuiltinSubGroupBarrier: + case eMuxBuiltinWorkGroupBarrier: + IsConvergent = true; + Properties = eBuiltinPropertyExecutionFlow | eBuiltinPropertySideEffects; + break; + case eMuxBuiltinDMARead1D: + case eMuxBuiltinDMARead2D: + case eMuxBuiltinDMARead3D: + case eMuxBuiltinDMAWrite1D: + case eMuxBuiltinDMAWrite2D: + case eMuxBuiltinDMAWrite3D: + case eMuxBuiltinDMAWait: + // Our DMA builtins, by default, rely on thread checks against specific + // work-item IDs, so they must be convergent. + IsConvergent = true; + Properties = eBuiltinPropertyNoSideEffects; + break; + case eMuxBuiltinGetWorkDim: + case eMuxBuiltinGetGroupId: + case eMuxBuiltinGetGlobalSize: + case eMuxBuiltinGetGlobalOffset: + case eMuxBuiltinGetLocalSize: + case eMuxBuiltinGetNumGroups: + case eMuxBuiltinGetGlobalLinearId: + case eMuxBuiltinGetLocalLinearId: + case eMuxBuiltinGetGlobalId: + case eMuxBuiltinGetSubGroupLocalId: + Properties = eBuiltinPropertyWorkItem | eBuiltinPropertyRematerializable; + break; + case eMuxBuiltinGetLocalId: + Properties = eBuiltinPropertyWorkItem | eBuiltinPropertyLocalID | + eBuiltinPropertyRematerializable; + break; + case eMuxBuiltinIsFTZ: + case eMuxBuiltinIsEmbeddedProfile: + case eMuxBuiltinUseFast: + Properties = eBuiltinPropertyNoSideEffects; + break; + } + + // Group functions are convergent. 
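+  // (Convergent operations communicate between work-items, so they must not
+  // be duplicated, removed, or moved across divergent control flow.)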
+ if (isMuxGroupCollective(ID)) { + IsConvergent = true; + } + + if (!IsConvergent) { + Properties |= eBuiltinPropertyKnownNonConvergent; + } + + return Builtin{F, ID, (BuiltinProperties)Properties, OverloadInfo}; +} + +std::optional +BuiltinInfo::analyzeBuiltinCall(const CallInst &CI, unsigned SimdDimIdx) const { + if (auto *const callee = dyn_cast(CI.getCalledOperand())) { + if (const auto B = analyzeBuiltin(*callee)) { + const auto U = isBuiltinUniform(*B, &CI, SimdDimIdx); + return BuiltinCall{*B, CI, U}; + } + } + return std::nullopt; +} + +Function *BuiltinInfo::getVectorEquivalent(const Builtin &B, unsigned Width, + Module *M) { + // We don't handle LLVM intrinsics here + if (B.function.isIntrinsic()) { + return nullptr; + } + + if (LangImpl) { + return LangImpl->getVectorEquivalent(B, Width, M); + } + return nullptr; +} + +Function *BuiltinInfo::getScalarEquivalent(const Builtin &B, Module *M) { + // We will first check to see if this is an LLVM intrinsic that has a scalar + // equivalent. + if (B.function.isIntrinsic()) { + // Analyze the builtin. Some functions have no scalar equivalent. + const auto Props = B.properties; + if (!(Props & eBuiltinPropertyVectorEquivalent)) { + return nullptr; + } + + // Check the return type. + auto *VecRetTy = dyn_cast(B.function.getReturnType()); + if (!VecRetTy) { + return nullptr; + } + + auto IntrinsicID = B.function.getIntrinsicID(); + // Currently, we can only handle correctly intrinsics that have one + // overloaded type, used for both the return type and all of the arguments. + // TODO: More generic support for intrinsics with vector equivalents. + for (Type *ArgTy : B.function.getFunctionType()->params()) { + // If the argument isn't a vector, then it isn't going to get scalarized, + // so don't worry about it. + if (ArgTy->isVectorTy() && ArgTy != VecRetTy) { + return nullptr; + } + } + Type *ScalarType = VecRetTy->getElementType(); + // Get the scalar version of the intrinsic + Function *ScalarIntrinsic = + Intrinsic::getOrInsertDeclaration(M, IntrinsicID, ScalarType); + + return ScalarIntrinsic; + } + + if (LangImpl) { + return LangImpl->getScalarEquivalent(B, M); + } + return nullptr; +} + +Value *BuiltinInfo::emitBuiltinInline(Function *Builtin, IRBuilder<> &B, + ArrayRef Args) { + if (LangImpl) { + return LangImpl->emitBuiltinInline(Builtin, B, Args); + } + return nullptr; +} + +std::optional BuiltinInfo::getBuiltinRange( + CallInst &CI, std::array, 3> MaxLocalSizes, + std::array, 3> MaxGlobalSizes) const { + auto *F = CI.getCalledFunction(); + // Ranges only apply to integer types, and ensure that there's a named + // function to analyze. + if (!F || !F->hasName() || !CI.getType()->isIntegerTy()) { + return std::nullopt; + } + + // First, check mux builtins + if (auto MB = identifyMuxBuiltin(*F); MB && isMuxBuiltinID(MB->first)) { + return MuxImpl->getBuiltinRange(CI, MB->first, MaxLocalSizes, + MaxGlobalSizes); + } + + // Next, ask the language builtin info + if (LangImpl) { + return LangImpl->getBuiltinRange(CI, MaxLocalSizes, MaxGlobalSizes); + } + + return std::nullopt; +} + +Instruction *BuiltinInfo::lowerBuiltinToMuxBuiltin(CallInst &CI) { + if (LangImpl) { + return LangImpl->lowerBuiltinToMuxBuiltin(CI, *MuxImpl); + } + // We shouldn't be mapping mux builtins to mux builtins, so we can stop here. 
+ return nullptr; +} + +std::optional BuiltinInfo::getPrintfBuiltin() const { + if (LangImpl) { + return LangImpl->getPrintfBuiltin(); + } + return std::nullopt; +} + +bool BuiltinInfo::requiresSchedulingParameters(BuiltinID ID) { + // Defer to mux for the scheduling parameters. + return MuxImpl->requiresSchedulingParameters(ID); +} + +Type *BuiltinInfo::getRemappedTargetExtTy(Type *Ty, Module &M) { + // Defer to mux for the scheduling parameters. + return MuxImpl->getRemappedTargetExtTy(Ty, M); +} + +SmallVector +BuiltinInfo::getMuxSchedulingParameters(Module &M) { + // Defer to mux for the scheduling parameters. + return MuxImpl->getMuxSchedulingParameters(M); +} + +SmallVector +BuiltinInfo::getFunctionSchedulingParameters(Function &F) { + // Defer to mux for the scheduling parameters. + return MuxImpl->getFunctionSchedulingParameters(F); +} + +Value *BuiltinInfo::initializeSchedulingParamForWrappedKernel( + const SchedParamInfo &Info, IRBuilder<> &B, Function &IntoF, + Function &CalleeF) { + return MuxImpl->initializeSchedulingParamForWrappedKernel(Info, B, IntoF, + CalleeF); +} + +// This provides an extremely simple mangling scheme matching LLVM's intrinsic +// mangling system. It is only designed to be used with a specific set of types +// and is not a general-purpose mangler. +std::string BuiltinInfo::getMangledTypeStr(Type *Ty) { + std::string Result; + if (VectorType *VTy = dyn_cast(Ty)) { + const ElementCount EC = VTy->getElementCount(); + if (EC.isScalable()) { + Result += "nx"; + } + return "v" + utostr(EC.getKnownMinValue()) + + getMangledTypeStr(VTy->getElementType()); + } + + if (Ty) { + switch (Ty->getTypeID()) { + default: + break; + case Type::HalfTyID: + return "f16"; + case Type::BFloatTyID: + return "bf16"; + case Type::FloatTyID: + return "f32"; + case Type::DoubleTyID: + return "f64"; + case Type::IntegerTyID: + return "i" + utostr(cast(Ty)->getBitWidth()); + } + } + llvm_unreachable("Unhandled type"); +} + +std::pair +BuiltinInfo::getDemangledTypeFromStr(StringRef TyStr, LLVMContext &Ctx) { + const bool IsScalable = TyStr.consume_front("nx"); + if (TyStr.consume_front("v")) { + unsigned EC; + if (TyStr.consumeInteger(10, EC)) { + return {nullptr, TyStr}; + } + if (auto [EltTy, NewTyStr] = getDemangledTypeFromStr(TyStr, Ctx); EltTy) { + return {VectorType::get(EltTy, EC, IsScalable), NewTyStr}; + } + return {nullptr, TyStr}; + } + if (TyStr.consume_front("f16")) { + return {Type::getHalfTy(Ctx), TyStr}; + } + if (TyStr.consume_front("bf16")) { + return {Type::getBFloatTy(Ctx), TyStr}; + } + if (TyStr.consume_front("f32")) { + return {Type::getFloatTy(Ctx), TyStr}; + } + if (TyStr.consume_front("f64")) { + return {Type::getDoubleTy(Ctx), TyStr}; + } + unsigned IntBitWidth; + if (TyStr.consume_front("i") && !TyStr.consumeInteger(10, IntBitWidth)) { + return {IntegerType::get(Ctx, IntBitWidth), TyStr}; + } + + return {nullptr, TyStr}; +} + +std::string BuiltinInfo::getMuxBuiltinName(BuiltinID ID, + ArrayRef OverloadInfo) { + assert(isMuxBuiltinID(ID)); + switch (ID) { + default: + break; + case eMuxBuiltinIsFTZ: + return MuxBuiltins::isftz; + case eMuxBuiltinUseFast: + return MuxBuiltins::usefast; + case eMuxBuiltinIsEmbeddedProfile: + return MuxBuiltins::isembeddedprofile; + case eMuxBuiltinGetGlobalSize: + return MuxBuiltins::get_global_size; + case eMuxBuiltinGetGlobalId: + return MuxBuiltins::get_global_id; + case eMuxBuiltinGetGlobalOffset: + return MuxBuiltins::get_global_offset; + case eMuxBuiltinGetLocalSize: + return MuxBuiltins::get_local_size; + case 
eMuxBuiltinGetLocalId: + return MuxBuiltins::get_local_id; + case eMuxBuiltinSetLocalId: + return MuxBuiltins::set_local_id; + case eMuxBuiltinGetSubGroupId: + return MuxBuiltins::get_sub_group_id; + case eMuxBuiltinSetSubGroupId: + return MuxBuiltins::set_sub_group_id; + case eMuxBuiltinGetNumGroups: + return MuxBuiltins::get_num_groups; + case eMuxBuiltinGetNumSubGroups: + return MuxBuiltins::get_num_sub_groups; + case eMuxBuiltinSetNumSubGroups: + return MuxBuiltins::set_num_sub_groups; + case eMuxBuiltinGetMaxSubGroupSize: + return MuxBuiltins::get_max_sub_group_size; + case eMuxBuiltinSetMaxSubGroupSize: + return MuxBuiltins::set_max_sub_group_size; + case eMuxBuiltinGetGroupId: + return MuxBuiltins::get_group_id; + case eMuxBuiltinGetWorkDim: + return MuxBuiltins::get_work_dim; + case eMuxBuiltinDMARead1D: + return MuxBuiltins::dma_read_1d; + case eMuxBuiltinDMARead2D: + return MuxBuiltins::dma_read_2d; + case eMuxBuiltinDMARead3D: + return MuxBuiltins::dma_read_3d; + case eMuxBuiltinDMAWrite1D: + return MuxBuiltins::dma_write_1d; + case eMuxBuiltinDMAWrite2D: + return MuxBuiltins::dma_write_2d; + case eMuxBuiltinDMAWrite3D: + return MuxBuiltins::dma_write_3d; + case eMuxBuiltinDMAWait: + return MuxBuiltins::dma_wait; + case eMuxBuiltinGetGlobalLinearId: + return MuxBuiltins::get_global_linear_id; + case eMuxBuiltinGetLocalLinearId: + return MuxBuiltins::get_local_linear_id; + case eMuxBuiltinGetEnqueuedLocalSize: + return MuxBuiltins::get_enqueued_local_size; + case eMuxBuiltinGetSubGroupSize: + return MuxBuiltins::get_sub_group_size; + case eMuxBuiltinGetSubGroupLocalId: + return MuxBuiltins::get_sub_group_local_id; + case eMuxBuiltinMemBarrier: + return MuxBuiltins::mem_barrier; + case eMuxBuiltinWorkGroupBarrier: + return MuxBuiltins::work_group_barrier; + case eMuxBuiltinSubGroupBarrier: + return MuxBuiltins::sub_group_barrier; + } + + // A sneaky macro to do case statements on all scopes of a group operation. + // Note that it is missing a leading 'case' and a trailing ':' to trick + // clang-format into formatting it like a regular case statement. +#define CASE_GROUP_OP_ALL_SCOPES(OP) \ + eMuxBuiltinVecgroup##OP : case eMuxBuiltinSubgroup##OP: \ + case eMuxBuiltinWorkgroup##OP + + std::string BaseName = [](BuiltinID ID) { + // For simplicity, return all group operations as 'work_group' and replace + // the string with 'sub_group' or 'vec_group' post-hoc. 
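+    // As a worked example (illustrative): eMuxBuiltinSubgroupReduceAdd on a
+    // v4i32 operand first yields "__mux_work_group_reduce_add" here; the
+    // scope fix-up below rewrites it to "__mux_sub_group_reduce_add", and the
+    // mangling suffix then gives "__mux_sub_group_reduce_add_v4i32".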
+ switch (ID) { + default: + return ""; + case CASE_GROUP_OP_ALL_SCOPES(All): + return "__mux_work_group_all"; + case CASE_GROUP_OP_ALL_SCOPES(Any): + return "__mux_work_group_any"; + case CASE_GROUP_OP_ALL_SCOPES(Broadcast): + return "__mux_work_group_broadcast"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceAdd): + return "__mux_work_group_reduce_add"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFAdd): + return "__mux_work_group_reduce_fadd"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceSMin): + return "__mux_work_group_reduce_smin"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceUMin): + return "__mux_work_group_reduce_umin"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMin): + return "__mux_work_group_reduce_fmin"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceSMax): + return "__mux_work_group_reduce_smax"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceUMax): + return "__mux_work_group_reduce_umax"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMax): + return "__mux_work_group_reduce_fmax"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceMul): + return "__mux_work_group_reduce_mul"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMul): + return "__mux_work_group_reduce_fmul"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceAnd): + return "__mux_work_group_reduce_and"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceOr): + return "__mux_work_group_reduce_or"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceXor): + return "__mux_work_group_reduce_xor"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalAnd): + return "__mux_work_group_reduce_logical_and"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalOr): + return "__mux_work_group_reduce_logical_or"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalXor): + return "__mux_work_group_reduce_logical_xor"; + case CASE_GROUP_OP_ALL_SCOPES(ScanAddInclusive): + return "__mux_work_group_scan_inclusive_add"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFAddInclusive): + return "__mux_work_group_scan_inclusive_fadd"; + case CASE_GROUP_OP_ALL_SCOPES(ScanAddExclusive): + return "__mux_work_group_scan_exclusive_add"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFAddExclusive): + return "__mux_work_group_scan_exclusive_fadd"; + case CASE_GROUP_OP_ALL_SCOPES(ScanSMinInclusive): + return "__mux_work_group_scan_inclusive_smin"; + case CASE_GROUP_OP_ALL_SCOPES(ScanUMinInclusive): + return "__mux_work_group_scan_inclusive_umin"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFMinInclusive): + return "__mux_work_group_scan_inclusive_fmin"; + case CASE_GROUP_OP_ALL_SCOPES(ScanSMinExclusive): + return "__mux_work_group_scan_exclusive_smin"; + case CASE_GROUP_OP_ALL_SCOPES(ScanUMinExclusive): + return "__mux_work_group_scan_exclusive_umin"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFMinExclusive): + return "__mux_work_group_scan_exclusive_fmin"; + case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxInclusive): + return "__mux_work_group_scan_inclusive_smax"; + case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxInclusive): + return "__mux_work_group_scan_inclusive_umax"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxInclusive): + return "__mux_work_group_scan_inclusive_fmax"; + case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxExclusive): + return "__mux_work_group_scan_exclusive_smax"; + case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxExclusive): + return "__mux_work_group_scan_exclusive_umax"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxExclusive): + return "__mux_work_group_scan_exclusive_fmax"; + case CASE_GROUP_OP_ALL_SCOPES(ScanMulInclusive): + return "__mux_work_group_scan_inclusive_mul"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFMulInclusive): + return "__mux_work_group_scan_inclusive_fmul"; + case CASE_GROUP_OP_ALL_SCOPES(ScanMulExclusive): + return 
"__mux_work_group_scan_exclusive_mul"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFMulExclusive): + return "__mux_work_group_scan_exclusive_fmul"; + case CASE_GROUP_OP_ALL_SCOPES(ScanAndInclusive): + return "__mux_work_group_scan_inclusive_and"; + case CASE_GROUP_OP_ALL_SCOPES(ScanAndExclusive): + return "__mux_work_group_scan_exclusive_and"; + case CASE_GROUP_OP_ALL_SCOPES(ScanOrInclusive): + return "__mux_work_group_scan_inclusive_or"; + case CASE_GROUP_OP_ALL_SCOPES(ScanOrExclusive): + return "__mux_work_group_scan_exclusive_or"; + case CASE_GROUP_OP_ALL_SCOPES(ScanXorInclusive): + return "__mux_work_group_scan_inclusive_xor"; + case CASE_GROUP_OP_ALL_SCOPES(ScanXorExclusive): + return "__mux_work_group_scan_exclusive_xor"; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndInclusive): + return "__mux_work_group_scan_inclusive_logical_and"; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndExclusive): + return "__mux_work_group_scan_exclusive_logical_and"; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrInclusive): + return "__mux_work_group_scan_inclusive_logical_or"; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrExclusive): + return "__mux_work_group_scan_exclusive_logical_or"; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorInclusive): + return "__mux_work_group_scan_inclusive_logical_xor"; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorExclusive): + return "__mux_work_group_scan_exclusive_logical_xor"; + case eMuxBuiltinSubgroupShuffle: + return "__mux_work_group_shuffle"; + case eMuxBuiltinSubgroupShuffleUp: + return "__mux_work_group_shuffle_up"; + case eMuxBuiltinSubgroupShuffleDown: + return "__mux_work_group_shuffle_down"; + case eMuxBuiltinSubgroupShuffleXor: + return "__mux_work_group_shuffle_xor"; + } + }(ID); + + if (!BaseName.empty()) { + assert(!OverloadInfo.empty() && + "Must know how to overload group operation"); + if (ID >= eFirstMuxSubgroupCollectiveBuiltin && + ID <= eLastMuxSubgroupCollectiveBuiltin) { + // Replace 'work' with 'sub' + BaseName = BaseName.replace(6, 4, "sub"); + } else if (ID >= eFirstMuxVecgroupCollectiveBuiltin && + ID <= eLastMuxVecgroupCollectiveBuiltin) { + // Replace 'work' with 'vec' + BaseName = BaseName.replace(6, 4, "vec"); + } + auto *const Ty = OverloadInfo.front(); + return BaseName + "_" + getMangledTypeStr(Ty); + } + llvm_unreachable("Unhandled mux builtin"); +#undef CASE_GROUP_OP_ALL_SCOPES +} + +Function *BuiltinInfo::defineMuxBuiltin(BuiltinID ID, Module &M, + ArrayRef OverloadInfo) { + assert(isMuxBuiltinID(ID) && "Only handling mux builtins"); + // Check that all overloadable builtins have returned some overloading + // information, for API consistency. + assert((!isOverloadableMuxBuiltinID(ID) || !OverloadInfo.empty()) && + "Inconsistency in overloadable builtin APIs"); + + Function *F = M.getFunction(getMuxBuiltinName(ID, OverloadInfo)); + // FIXME: We'd ideally want to declare it here to reduce pass + // inter-dependencies. + assert(F && "Function should have been pre-declared"); + if (!F->isDeclaration()) { + return F; + } + // Defer to the mux implementation to define this builtin. + return MuxImpl->defineMuxBuiltin(ID, M, OverloadInfo); +} + +Function *BuiltinInfo::getOrDeclareMuxBuiltin(BuiltinID ID, Module &M, + ArrayRef OverloadInfo) { + assert(isMuxBuiltinID(ID) && "Only handling mux builtins"); + // Check that all overloadable builtins have returned some overloading + // information, for API consistency. 
+ assert((!isOverloadableMuxBuiltinID(ID) || !OverloadInfo.empty()) && + "Inconsistency in overloadable builtin APIs"); + // Defer to the mux implementation to get/declare this builtin. + return MuxImpl->getOrDeclareMuxBuiltin(ID, M, OverloadInfo); +} + +std::optional BuiltinInfo::isMuxGroupCollective(BuiltinID ID) { + GroupCollective Collective; + + if (ID >= eFirstMuxSubgroupCollectiveBuiltin && + ID <= eLastMuxSubgroupCollectiveBuiltin) { + Collective.Scope = GroupCollective::ScopeKind::SubGroup; + } else if (ID >= eFirstMuxWorkgroupCollectiveBuiltin && + ID <= eLastMuxWorkgroupCollectiveBuiltin) { + Collective.Scope = GroupCollective::ScopeKind::WorkGroup; + } else if (ID >= eFirstMuxVecgroupCollectiveBuiltin && + ID <= eLastMuxVecgroupCollectiveBuiltin) { + Collective.Scope = GroupCollective::ScopeKind::VectorGroup; + } else { + return std::nullopt; + } + + // A sneaky macro to do case statements on all scopes of a group operation. + // Note that it is missing a leading 'case' and a trailing ':' to trick + // clang-format into formatting it like a regular case statement. +#define CASE_GROUP_OP_ALL_SCOPES(OP) \ + eMuxBuiltinVecgroup##OP : case eMuxBuiltinSubgroup##OP: \ + case eMuxBuiltinWorkgroup##OP + + switch (ID) { + default: + llvm_unreachable("Unhandled mux group builtin"); + case CASE_GROUP_OP_ALL_SCOPES(All): + Collective.Op = GroupCollective::OpKind::All; + break; + case CASE_GROUP_OP_ALL_SCOPES(Any): + Collective.Op = GroupCollective::OpKind::Any; + break; + case CASE_GROUP_OP_ALL_SCOPES(Broadcast): + Collective.Op = GroupCollective::OpKind::Broadcast; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalAnd): + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalOr): + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalXor): + Collective.IsLogical = true; + [[fallthrough]]; + case CASE_GROUP_OP_ALL_SCOPES(ReduceAdd): + case CASE_GROUP_OP_ALL_SCOPES(ReduceFAdd): + case CASE_GROUP_OP_ALL_SCOPES(ReduceMul): + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMul): + case CASE_GROUP_OP_ALL_SCOPES(ReduceSMin): + case CASE_GROUP_OP_ALL_SCOPES(ReduceUMin): + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMin): + case CASE_GROUP_OP_ALL_SCOPES(ReduceSMax): + case CASE_GROUP_OP_ALL_SCOPES(ReduceUMax): + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMax): + case CASE_GROUP_OP_ALL_SCOPES(ReduceAnd): + case CASE_GROUP_OP_ALL_SCOPES(ReduceOr): + case CASE_GROUP_OP_ALL_SCOPES(ReduceXor): + Collective.Op = GroupCollective::OpKind::Reduction; + break; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorInclusive): + Collective.IsLogical = true; + [[fallthrough]]; + case CASE_GROUP_OP_ALL_SCOPES(ScanAddInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFAddInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanMulInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMulInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMinInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMinInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMinInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanAndInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanOrInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanXorInclusive): + Collective.Op = GroupCollective::OpKind::ScanInclusive; + break; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrExclusive): + case 
CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorExclusive): + Collective.IsLogical = true; + [[fallthrough]]; + case CASE_GROUP_OP_ALL_SCOPES(ScanAddExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFAddExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanMulExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMulExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMinExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMinExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMinExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanAndExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanOrExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanXorExclusive): + Collective.Op = GroupCollective::OpKind::ScanExclusive; + break; + case eMuxBuiltinSubgroupShuffle: + Collective.Op = GroupCollective::OpKind::Shuffle; + break; + case eMuxBuiltinSubgroupShuffleUp: + Collective.Op = GroupCollective::OpKind::ShuffleUp; + break; + case eMuxBuiltinSubgroupShuffleDown: + Collective.Op = GroupCollective::OpKind::ShuffleDown; + break; + case eMuxBuiltinSubgroupShuffleXor: + Collective.Op = GroupCollective::OpKind::ShuffleXor; + break; + } + + // Then the recurrence kind. + if (Collective.Op == GroupCollective::OpKind::All) { + Collective.Recurrence = RecurKind::And; + } else if (Collective.Op == GroupCollective::OpKind::Any) { + Collective.Recurrence = RecurKind::Or; + } else if (Collective.Op == GroupCollective::OpKind::Reduction || + Collective.Op == GroupCollective::OpKind::ScanExclusive || + Collective.Op == GroupCollective::OpKind::ScanInclusive) { + switch (ID) { + case CASE_GROUP_OP_ALL_SCOPES(ReduceAdd): + case CASE_GROUP_OP_ALL_SCOPES(ScanAddInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanAddExclusive): + Collective.Recurrence = RecurKind::Add; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFAdd): + case CASE_GROUP_OP_ALL_SCOPES(ScanFAddInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFAddExclusive): + Collective.Recurrence = RecurKind::FAdd; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceMul): + case CASE_GROUP_OP_ALL_SCOPES(ScanMulInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanMulExclusive): + Collective.Recurrence = RecurKind::Mul; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMul): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMulInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMulExclusive): + Collective.Recurrence = RecurKind::FMul; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceSMin): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMinInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMinExclusive): + Collective.Recurrence = RecurKind::SMin; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceUMin): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMinInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMinExclusive): + Collective.Recurrence = RecurKind::UMin; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMin): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMinInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMinExclusive): + Collective.Recurrence = RecurKind::FMin; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceSMax): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxExclusive): + Collective.Recurrence = RecurKind::SMax; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceUMax): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxExclusive): + Collective.Recurrence = RecurKind::UMax; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMax): + case 
CASE_GROUP_OP_ALL_SCOPES(ScanFMaxInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxExclusive): + Collective.Recurrence = RecurKind::FMax; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceAnd): + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalAnd): + case CASE_GROUP_OP_ALL_SCOPES(ScanAndInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanAndExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndExclusive): + Collective.Recurrence = RecurKind::And; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceOr): + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalOr): + case CASE_GROUP_OP_ALL_SCOPES(ScanOrInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanOrExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrExclusive): + Collective.Recurrence = RecurKind::Or; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceXor): + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalXor): + case CASE_GROUP_OP_ALL_SCOPES(ScanXorInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanXorExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorExclusive): + Collective.Recurrence = RecurKind::Xor; + break; + default: + llvm_unreachable("Unhandled mux group operation"); + } + } else if (!Collective.isBroadcast() && !Collective.isShuffleLike()) { + llvm_unreachable("Unhandled mux group operation"); + } + + return Collective; +#undef CASE_GROUP_OP_ALL_SCOPES +} + +std::optional +BuiltinInfo::getMuxGroupCollective(const GroupCollective &Group) { +#define SIMPLE_SCOPE_SWITCH(OP) \ + do { \ + switch (Group.Scope) { \ + case GroupCollective::ScopeKind::SubGroup: \ + return eMuxBuiltinSubgroup##OP; \ + case GroupCollective::ScopeKind::WorkGroup: \ + return eMuxBuiltinWorkgroup##OP; \ + case GroupCollective::ScopeKind::VectorGroup: \ + return eMuxBuiltinVecgroup##OP; \ + } \ + llvm_unreachable("Impossible scope kind"); \ + } while (0) + +#define COMPLEX_SCOPE_SWITCH(OP, SUFFIX) \ + do { \ + switch (Group.Recurrence) { \ + default: \ + llvm_unreachable("Unhandled recursion kind"); \ + case RecurKind::Add: \ + SIMPLE_SCOPE_SWITCH(OP##Add##SUFFIX); \ + case RecurKind::Mul: \ + SIMPLE_SCOPE_SWITCH(OP##Mul##SUFFIX); \ + case RecurKind::FAdd: \ + SIMPLE_SCOPE_SWITCH(OP##FAdd##SUFFIX); \ + case RecurKind::FMul: \ + SIMPLE_SCOPE_SWITCH(OP##FMul##SUFFIX); \ + case RecurKind::SMin: \ + SIMPLE_SCOPE_SWITCH(OP##SMin##SUFFIX); \ + case RecurKind::UMin: \ + SIMPLE_SCOPE_SWITCH(OP##UMin##SUFFIX); \ + case RecurKind::FMin: \ + SIMPLE_SCOPE_SWITCH(OP##FMin##SUFFIX); \ + case RecurKind::SMax: \ + SIMPLE_SCOPE_SWITCH(OP##SMax##SUFFIX); \ + case RecurKind::UMax: \ + SIMPLE_SCOPE_SWITCH(OP##UMax##SUFFIX); \ + case RecurKind::FMax: \ + SIMPLE_SCOPE_SWITCH(OP##FMax##SUFFIX); \ + case RecurKind::And: \ + if (Group.IsLogical) { \ + SIMPLE_SCOPE_SWITCH(OP##LogicalAnd##SUFFIX); \ + } else { \ + SIMPLE_SCOPE_SWITCH(OP##And##SUFFIX); \ + } \ + case RecurKind::Or: \ + if (Group.IsLogical) { \ + SIMPLE_SCOPE_SWITCH(OP##LogicalOr##SUFFIX); \ + } else { \ + SIMPLE_SCOPE_SWITCH(OP##Or##SUFFIX); \ + } \ + case RecurKind::Xor: \ + if (Group.IsLogical) { \ + SIMPLE_SCOPE_SWITCH(OP##LogicalXor##SUFFIX); \ + } else { \ + SIMPLE_SCOPE_SWITCH(OP##Xor##SUFFIX); \ + } \ + } \ + } while (0) + + switch (Group.Op) { + case GroupCollective::OpKind::All: + SIMPLE_SCOPE_SWITCH(All); + case GroupCollective::OpKind::Any: + SIMPLE_SCOPE_SWITCH(Any); + case GroupCollective::OpKind::Broadcast: + SIMPLE_SCOPE_SWITCH(Broadcast); + case 
GroupCollective::OpKind::Reduction: + COMPLEX_SCOPE_SWITCH(Reduce, ); + case GroupCollective::OpKind::ScanExclusive: + COMPLEX_SCOPE_SWITCH(Scan, Exclusive); + case GroupCollective::OpKind::ScanInclusive: + COMPLEX_SCOPE_SWITCH(Scan, Inclusive); + break; + case GroupCollective::OpKind::Shuffle: + case GroupCollective::OpKind::ShuffleUp: + case GroupCollective::OpKind::ShuffleDown: + case GroupCollective::OpKind::ShuffleXor: + if (!Group.isSubGroupScope()) { + break; + } + switch (Group.Op) { + default: + llvm_unreachable("Unhandled op"); + case GroupCollective::OpKind::Shuffle: + return eMuxBuiltinSubgroupShuffle; + case GroupCollective::OpKind::ShuffleUp: + return eMuxBuiltinSubgroupShuffleUp; + case GroupCollective::OpKind::ShuffleDown: + return eMuxBuiltinSubgroupShuffleDown; + case GroupCollective::OpKind::ShuffleXor: + return eMuxBuiltinSubgroupShuffleXor; + } + } + return std::nullopt; +#undef COMPLEX_SCOPE_SWITCH +#undef SCOPE_SWITCH +} + +bool BuiltinInfo::isOverloadableMuxBuiltinID(BuiltinID ID) { + if (!isMuxBuiltinID(ID)) { + return false; + } + switch (ID) { + default: + return isMuxGroupCollective(ID).has_value(); + case eMuxBuiltinDMARead1D: + case eMuxBuiltinDMAWrite1D: + case eMuxBuiltinDMARead2D: + case eMuxBuiltinDMAWrite2D: + case eMuxBuiltinDMARead3D: + case eMuxBuiltinDMAWrite3D: + return true; + } +} + +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp new file mode 100644 index 0000000000000..20b934795d20c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp @@ -0,0 +1,3654 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// For compatibility with the Android NDK, we need to use the C ilogb function. +namespace stdcompat { +#ifdef __ANDROID__ +// Note: This function accepts double only as its argument +using ::ilogb; +#else +using std::ilogb; +#endif // __ANDROID__ +} // namespace stdcompat + +namespace { +/// @brief Identifiers for recognized OpenCL builtins. +enum CLBuiltinID : compiler::utils::BuiltinID { + // Non-standard Builtin Functions + /// @brief Internal builtin 'convert_half_to_float'. + eCLBuiltinConvertHalfToFloat = compiler::utils::eFirstTargetBuiltin, + /// @brief Internal builtin 'convert_float_to_half'. 
+ eCLBuiltinConvertFloatToHalf, + /// @brief Internal builtin 'convert_float_to_half_rte' + eCLBuiltinConvertFloatToHalfRte, + /// @brief Internal builtin 'convert_float_to_half_rtz' + eCLBuiltinConvertFloatToHalfRtz, + /// @brief Internal builtin 'convert_float_to_half_rtp' + eCLBuiltinConvertFloatToHalfRtp, + /// @brief Internal builtin 'convert_float_to_half_rtn' + eCLBuiltinConvertFloatToHalfRtn, + /// @brief Internal builtin 'convert_half_to_double'. + eCLBuiltinConvertHalfToDouble, + /// @brief Internal builtin 'convert_double_to_half'. + eCLBuiltinConvertDoubleToHalf, + /// @brief Internal builtin 'convert_double_to_half_rte' + eCLBuiltinConvertDoubleToHalfRte, + /// @brief Internal builtin 'convert_double_to_half_rtz' + eCLBuiltinConvertDoubleToHalfRtz, + /// @brief Internal builtin 'convert_double_to_half_rtp' + eCLBuiltinConvertDoubleToHalfRtp, + /// @brief Internal builtin 'convert_double_to_half_rtn' + eCLBuiltinConvertDoubleToHalfRtn, + + // 6.2.3 Explicit Conversions + /// @brief OpenCL builtin `convert_char` + eCLBuiltinConvertChar, + /// @brief OpenCL builtin `convert_short` + eCLBuiltinConvertShort, + /// @brief OpenCL builtin `convert_int` + eCLBuiltinConvertInt, + /// @brief OpenCL builtin `convert_long` + eCLBuiltinConvertLong, + /// @brief OpenCL builtin `convert_uchar` + eCLBuiltinConvertUChar, + /// @brief OpenCL builtin `convert_ushort` + eCLBuiltinConvertUShort, + /// @brief OpenCL builtin `convert_uint` + eCLBuiltinConvertUInt, + /// @brief OpenCL builtin `convert_ulong` + eCLBuiltinConvertULong, + + // 6.12.1 Work-Item Functions + /// @brief OpenCL builtin 'get_work_dim'. + eCLBuiltinGetWorkDim, + /// @brief OpenCL builtin 'get_group_id'. + eCLBuiltinGetGroupId, + /// @brief OpenCL builtin 'get_global_size'. + eCLBuiltinGetGlobalSize, + /// @brief OpenCL builtin 'get_global_offset'. + eCLBuiltinGetGlobalOffset, + /// @brief OpenCL builtin 'get_local_id'. + eCLBuiltinGetLocalId, + /// @brief OpenCL builtin 'get_local_size'. + eCLBuiltinGetLocalSize, + /// @brief OpenCL builtin 'get_enqueued_local_size'. + eCLBuiltinGetEnqueuedLocalSize, + /// @brief OpenCL builtin 'get_num_groups'. + eCLBuiltinGetNumGroups, + /// @brief OpenCL builtin 'get_global_id'. + eCLBuiltinGetGlobalId, + /// @brief OpenCL builtin 'get_local_linear_id' (OpenCL >= 2.0). + eCLBuiltinGetLocalLinearId, + /// @brief OpenCL builtin 'get_global_linear_id' (OpenCL >= 2.0). + eCLBuiltinGetGlobalLinearId, + /// @brief OpenCL builtin 'get_sub_group_local_id' (OpenCL >= 3.0). + eCLBuiltinGetSubgroupLocalId, + /// @brief OpenCL builtin 'get_sub_group_size' (OpenCL >= 3.0). + eCLBuiltinGetSubgroupSize, + /// @brief OpenCL builtin 'get_max_sub_group_size' (OpenCL >= 3.0). + eCLBuiltinGetMaxSubgroupSize, + /// @brief OpenCL builtin 'get_num_sub_groups' (OpenCL >= 3.0). + eCLBuiltinGetNumSubgroups, + /// @brief OpenCL builtin 'get_enqueued_num_sub_groups' (OpenCL >= 3.0). + eCLBuiltinGetEnqueuedNumSubgroups, + /// @brief OpenCL builtin 'get_sub_group_id' (OpenCL >= 3.0). + eCLBuiltinGetSubgroupId, + + // 6.12.2 Math Functions + /// @brief OpenCL builtin 'fmax'. + eCLBuiltinFMax, + /// @brief OpenCL builtin 'fmin'. + eCLBuiltinFMin, + /// @brief OpenCL builtin 'fract'. + eCLBuiltinFract, + /// @brief OpenCL builtin 'frexp'. + eCLBuiltinFrexp, + /// @brief OpenCL builtin 'lgamma_r'. + eCLBuiltinLGammaR, + /// @brief OpenCL builtin 'modf'. + eCLBuiltinModF, + /// @brief OpenCL builtin 'sincos'. + eCLBuiltinSinCos, + /// @brief OpenCL builtin 'remquo'. 
+ eCLBuiltinRemquo, + + // 6.12.3 Integer Functions + /// @brief OpenCL builtin 'add_sat'. + eCLBuiltinAddSat, + /// @brief OpenCL builtin 'sub_sat'. + eCLBuiltinSubSat, + + // 6.12.5 Geometric Built-in Functions + /// @brief OpenCL builtin 'dot'. + eCLBuiltinDot, + /// @brief OpenCL builtin 'cross'. + eCLBuiltinCross, + /// @brief OpenCL builtin 'length'. + eCLBuiltinLength, + /// @brief OpenCL builtin 'distance'. + eCLBuiltinDistance, + /// @brief OpenCL builtin 'normalize'. + eCLBuiltinNormalize, + /// @brief OpenCL builtin 'fast_length'. + eCLBuiltinFastLength, + /// @brief OpenCL builtin 'fast_distance'. + eCLBuiltinFastDistance, + /// @brief OpenCL builtin 'fast_normalize'. + eCLBuiltinFastNormalize, + + // 6.12.6 Relational Functions + /// @brief OpenCL builtin 'all'. + eCLBuiltinAll, + /// @brief OpenCL builtin 'any'. + eCLBuiltinAny, + /// @brief OpenCL builtin 'isequal'. + eCLBuiltinIsEqual, + /// @brief OpenCL builtin 'isnotequal'. + eCLBuiltinIsNotEqual, + /// @brief OpenCL builtin 'isgreater'. + eCLBuiltinIsGreater, + /// @brief OpenCL builtin 'isgreaterequal'. + eCLBuiltinIsGreaterEqual, + /// @brief OpenCL builtin 'isless'. + eCLBuiltinIsLess, + /// @brief OpenCL builtin 'islessequal'. + eCLBuiltinIsLessEqual, + /// @brief OpenCL builtin 'islessgreater'. + eCLBuiltinIsLessGreater, + /// @brief OpenCL builtin 'isordered'. + eCLBuiltinIsOrdered, + /// @brief OpenCL builtin 'isunordered'. + eCLBuiltinIsUnordered, + /// @brief OpenCL builtin 'isfinite'. + eCLBuiltinIsFinite, + /// @brief OpenCL builtin 'isinf'. + eCLBuiltinIsInf, + /// @brief OpenCL builtin 'isnan'. + eCLBuiltinIsNan, + /// @brief OpenCL builtin 'isnormal'. + eCLBuiltinIsNormal, + /// @brief OpenCL builtin 'signbit'. + eCLBuiltinSignBit, + /// @brief OpenCL builtin `select`. + eCLBuiltinSelect, + + // 6.12.8 Synchronization Functions + /// @brief OpenCL builtin 'barrier'. + eCLBuiltinBarrier, + /// @brief OpenCL builtin 'mem_fence'. + eCLBuiltinMemFence, + /// @brief OpenCL builtin 'read_mem_fence'. + eCLBuiltinReadMemFence, + /// @brief OpenCL builtin 'write_mem_fence'. + eCLBuiltinWriteMemFence, + /// @brief OpenCL builtin 'atomic_work_item_fence'. + eCLBuiltinAtomicWorkItemFence, + /// @brief OpenCL builtin 'sub_group_barrier'. + eCLBuiltinSubGroupBarrier, + /// @brief OpenCL builtin 'work_group_barrier'. + eCLBuiltinWorkGroupBarrier, + + // 6.12.10 Async Copies and Prefetch Functions + /// @brief OpenCL builtin 'async_work_group_copy'. + eCLBuiltinAsyncWorkGroupCopy, + /// @brief OpenCL builtin 'async_work_group_strided_copy'. + eCLBuiltinAsyncWorkGroupStridedCopy, + /// @brief OpenCL builtin 'wait_group_events'. + eCLBuiltinWaitGroupEvents, + /// @brief OpenCL builtin 'async_work_group_copy_2D2D'. + eCLBuiltinAsyncWorkGroupCopy2D2D, + /// @brief OpenCL builtin 'async_work_group_copy_3D3D'. + eCLBuiltinAsyncWorkGroupCopy3D3D, + + // 6.12.11 Atomic Functions + /// @brief OpenCL builtins 'atomic_add', 'atom_add'. + eCLBuiltinAtomicAdd, + /// @brief OpenCL builtins 'atomic_sub', 'atom_sub'. + eCLBuiltinAtomicSub, + /// @brief OpenCL builtins 'atomic_xchg', 'atom_xchg'. + eCLBuiltinAtomicXchg, + /// @brief OpenCL builtins 'atomic_inc', 'atom_inc'. + eCLBuiltinAtomicInc, + /// @brief OpenCL builtins 'atomic_dec', 'atom_dec'. + eCLBuiltinAtomicDec, + /// @brief OpenCL builtins 'atomic_cmpxchg', 'atom_cmpxchg'. + eCLBuiltinAtomicCmpxchg, + /// @brief OpenCL builtins 'atomic_min', 'atom_min'. + eCLBuiltinAtomicMin, + /// @brief OpenCL builtins 'atomic_max', 'atom_max'. 
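+  // Note that each legacy 'atom_*' name (from the OpenCL 1.0
+  // cl_khr_*_atomics extensions) shares one ID with its core 'atomic_*'
+  // spelling, so a single lowering path serves both; in the name table
+  // further down this appears as, e.g.:
+  //   {eCLBuiltinAtomicAdd, "atom_add"},
+  //   {eCLBuiltinAtomicAdd, "atomic_add"},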
+ eCLBuiltinAtomicMax, + /// @brief OpenCL builtins 'atomic_and', 'atom_and'. + eCLBuiltinAtomicAnd, + /// @brief OpenCL builtins 'atomic_or', 'atom_or'. + eCLBuiltinAtomicOr, + /// @brief OpenCL builtins 'atomic_xor', 'atom_xor'. + eCLBuiltinAtomicXor, + + // 6.12.12 Miscellaneous Vector Functions + eCLBuiltinShuffle, + eCLBuiltinShuffle2, + + // 6.12.13 printf + /// @brief OpenCL builtin 'printf'. + eCLBuiltinPrintf, + + // 6.15.16 Work-group Collective Functions + /// @brief OpenCL builtin 'work_group_all'. + eCLBuiltinWorkgroupAll, + /// @brief OpenCL builtin 'work_group_any'. + eCLBuiltinWorkgroupAny, + /// @brief OpenCL builtin 'work_group_broadcast'. + eCLBuiltinWorkgroupBroadcast, + /// @brief OpenCL builtin 'work_group_reduce_add'. + eCLBuiltinWorkgroupReduceAdd, + /// @brief OpenCL builtin 'work_group_reduce_min'. + eCLBuiltinWorkgroupReduceMin, + /// @brief OpenCL builtin 'work_group_reduce_max'. + eCLBuiltinWorkgroupReduceMax, + /// @brief OpenCL builtin 'work_group_scan_inclusive_add'. + eCLBuiltinWorkgroupScanAddInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_add'. + eCLBuiltinWorkgroupScanAddExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_min'. + eCLBuiltinWorkgroupScanMinInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_min'. + eCLBuiltinWorkgroupScanMinExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_max'. + eCLBuiltinWorkgroupScanMaxInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_max'. + eCLBuiltinWorkgroupScanMaxExclusive, + + /// @brief OpenCL builtin 'work_group_reduce_mul'. + eCLBuiltinWorkgroupReduceMul, + /// @brief OpenCL builtin 'work_group_reduce_and'. + eCLBuiltinWorkgroupReduceAnd, + /// @brief OpenCL builtin 'work_group_reduce_or'. + eCLBuiltinWorkgroupReduceOr, + /// @brief OpenCL builtin 'work_group_reduce_xor'. + eCLBuiltinWorkgroupReduceXor, + /// @brief OpenCL builtin 'work_group_reduce_logical_and'. + eCLBuiltinWorkgroupReduceLogicalAnd, + /// @brief OpenCL builtin 'work_group_reduce_logical_or'. + eCLBuiltinWorkgroupReduceLogicalOr, + /// @brief OpenCL builtin 'work_group_reduce_logical_xor'. + eCLBuiltinWorkgroupReduceLogicalXor, + /// @brief OpenCL builtin 'work_group_scan_inclusive_mul'. + eCLBuiltinWorkgroupScanMulInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_mul'. + eCLBuiltinWorkgroupScanMulExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_and'. + eCLBuiltinWorkgroupScanAndInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_and'. + eCLBuiltinWorkgroupScanAndExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_or'. + eCLBuiltinWorkgroupScanOrInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_or'. + eCLBuiltinWorkgroupScanOrExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_xor'. + eCLBuiltinWorkgroupScanXorInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_xor'. + eCLBuiltinWorkgroupScanXorExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_logical_and'. + eCLBuiltinWorkgroupScanLogicalAndInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_logical_and'. + eCLBuiltinWorkgroupScanLogicalAndExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_logical_or'. + eCLBuiltinWorkgroupScanLogicalOrInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_logical_or'. + eCLBuiltinWorkgroupScanLogicalOrExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_logical_xor'. 
+ eCLBuiltinWorkgroupScanLogicalXorInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_logical_xor'. + eCLBuiltinWorkgroupScanLogicalXorExclusive, + + // 6.15.19 Subgroup Collective Functions + /// @brief OpenCL builtin 'sub_group_all'. + eCLBuiltinSubgroupAll, + /// @brief OpenCL builtin 'sub_group_any'. + eCLBuiltinSubgroupAny, + /// @brief OpenCL builtin 'sub_group_broadcast'. + eCLBuiltinSubgroupBroadcast, + /// @brief OpenCL builtin 'sub_group_reduce_add'. + eCLBuiltinSubgroupReduceAdd, + /// @brief OpenCL builtin 'sub_group_reduce_min'. + eCLBuiltinSubgroupReduceMin, + /// @brief OpenCL builtin 'sub_group_reduce_max'. + eCLBuiltinSubgroupReduceMax, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_add'. + eCLBuiltinSubgroupScanAddInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_add'. + eCLBuiltinSubgroupScanAddExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_min'. + eCLBuiltinSubgroupScanMinInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_min'. + eCLBuiltinSubgroupScanMinExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_max'. + eCLBuiltinSubgroupScanMaxInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_max'. + eCLBuiltinSubgroupScanMaxExclusive, + + /// @brief OpenCL builtin 'sub_group_reduce_mul'. + eCLBuiltinSubgroupReduceMul, + /// @brief OpenCL builtin 'sub_group_reduce_and'. + eCLBuiltinSubgroupReduceAnd, + /// @brief OpenCL builtin 'sub_group_reduce_or'. + eCLBuiltinSubgroupReduceOr, + /// @brief OpenCL builtin 'sub_group_reduce_xor'. + eCLBuiltinSubgroupReduceXor, + /// @brief OpenCL builtin 'sub_group_reduce_logical_and'. + eCLBuiltinSubgroupReduceLogicalAnd, + /// @brief OpenCL builtin 'sub_group_reduce_logical_or'. + eCLBuiltinSubgroupReduceLogicalOr, + /// @brief OpenCL builtin 'sub_group_reduce_logical_xor'. + eCLBuiltinSubgroupReduceLogicalXor, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_mul'. + eCLBuiltinSubgroupScanMulInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_mul'. + eCLBuiltinSubgroupScanMulExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_and'. + eCLBuiltinSubgroupScanAndInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_and'. + eCLBuiltinSubgroupScanAndExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_or'. + eCLBuiltinSubgroupScanOrInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_or'. + eCLBuiltinSubgroupScanOrExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_xor'. + eCLBuiltinSubgroupScanXorInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_xor'. + eCLBuiltinSubgroupScanXorExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_logical_and'. + eCLBuiltinSubgroupScanLogicalAndInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_logical_and'. + eCLBuiltinSubgroupScanLogicalAndExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_logical_or'. + eCLBuiltinSubgroupScanLogicalOrInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_logical_or'. + eCLBuiltinSubgroupScanLogicalOrExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_logical_xor'. + eCLBuiltinSubgroupScanLogicalXorInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_logical_xor'. 
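+  // Inclusive scans include the current work-item's own value, while
+  // exclusive scans start from the operation's identity. For example, for
+  // sub-group values {1, 2, 3, 4}:
+  //   sub_group_scan_inclusive_add -> {1, 3, 6, 10}
+  //   sub_group_scan_exclusive_add -> {0, 1, 3, 6}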
+ eCLBuiltinSubgroupScanLogicalXorExclusive, + + // 6.12.7 Vector Data Load and Store Functions + eCLBuiltinVLoad, + eCLBuiltinVLoadHalf, + eCLBuiltinVStore, + eCLBuiltinVStoreHalf, + + // 6.3 Conversions & Type Casting Examples + eCLBuiltinAs, +}; +} // namespace + +namespace { +using namespace llvm; +using namespace compiler::utils; + +// Returns whether the given integer is a valid vector width in OpenCL. +// Matches 2, 3, 4, 8, 16. +bool isValidVecWidth(unsigned w) { + return (w == 3 || (w >= 2 && w <= 16 && llvm::isPowerOf2_32(w))); +} + +/// @brief Copy global variables to a module on demand. +class GlobalValueMaterializer final : public llvm::ValueMaterializer { +public: + /// @brief Create a new global variable materializer. + /// @param[in] M Module to materialize the variables in. + GlobalValueMaterializer(Module &M) : DestM(M) {} + + /// @brief List of variables created during materialization. + const std::vector<GlobalVariable *> &variables() const { return Variables; } + + /// @brief Materialize the given value. + /// + /// @param[in] V Value to materialize. + /// + /// @return A value that lives in the destination module, or nullptr if the + /// given value could not be materialized (e.g. it is not a global variable). + Value *materialize(Value *V) override final { + GlobalVariable *GV = dyn_cast<GlobalVariable>(V); + if (!GV) { + return nullptr; + } + GlobalVariable *NewGV = DestM.getGlobalVariable(GV->getName()); + if (!NewGV) { + NewGV = new GlobalVariable( + DestM, GV->getValueType(), GV->isConstant(), GV->getLinkage(), + (Constant *)nullptr, GV->getName(), (GlobalVariable *)nullptr, + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); + NewGV->copyAttributesFrom(GV); + Variables.push_back(GV); + } + return NewGV; + } + +private: + /// @brief Module to materialize variables in. + Module &DestM; + /// @brief Materialized variables. + std::vector<GlobalVariable *> Variables; +}; +} // namespace + +namespace compiler { +namespace utils { +using namespace llvm; + +std::unique_ptr createCLBuiltinInfo(Module *Builtins) { + return std::make_unique<CLBuiltinInfo>(Builtins); +} + +CLBuiltinInfo::CLBuiltinInfo(Module *builtins) + : Loader(std::make_unique(builtins)) {} + +CLBuiltinInfo::~CLBuiltinInfo() = default; + +/// @brief Create a call instruction to the given builtin and set the correct +/// calling convention. +/// +/// This function is intended as a helper function for creating calls to +/// builtins. For each call generated we need to set the calling convention +/// manually, which can lead to code bloat. This function will create the call +/// instruction and then it will either copy the calling convention for the +/// called function (if possible) or set it to the default value of spir_func. +/// +/// @param[in] B The IRBuilder to use when creating the CallInst +/// @param[in] Builtin The Function to call +/// @param[in] Args The call arguments +/// @param[in] NameStr The name for the new CallInst +/// @return The newly emitted CallInst +static CallInst *CreateBuiltinCall(IRBuilder<> &B, Function *Builtin, + ArrayRef<Value *> Args, + const Twine &NameStr = "") { + CallInst *CI = + B.CreateCall(Builtin->getFunctionType(), Builtin, Args, NameStr); + CI->setCallingConv(Builtin->getCallingConv()); + return CI; +} + +struct CLBuiltinEntry { + /// @brief Identifier for the builtin function. + BuiltinID ID; + /// @brief OpenCL name of the builtin function. + const char *OpenCLFnName; + /// @brief Minimum OpenCL version that supports this builtin. + uint32_t MinVer = OpenCLC10; +}; + +/// @brief Information about known OpenCL builtins. 
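+// Each entry pairs a builtin ID with its OpenCL C name and, optionally, the
+// minimum OpenCL C version that provides the name; identifyBuiltin() only
+// matches an entry when the module's OpenCL version is at least MinVer.
+// For example:
+//   {eCLBuiltinGetGlobalId, "get_global_id"},              // any version
+//   {eCLBuiltinWorkgroupAll, "work_group_all", OpenCLC20}, // 2.0 onwards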
+static constexpr CLBuiltinEntry Builtins[] = { + // Non-standard Builtin Functions + {eCLBuiltinConvertHalfToFloat, "convert_half_to_float"}, + {eCLBuiltinConvertFloatToHalf, "convert_float_to_half"}, + {eCLBuiltinConvertFloatToHalfRte, "convert_float_to_half_rte"}, + {eCLBuiltinConvertFloatToHalfRtz, "convert_float_to_half_rtz"}, + {eCLBuiltinConvertFloatToHalfRtp, "convert_float_to_half_rtp"}, + {eCLBuiltinConvertFloatToHalfRtn, "convert_float_to_half_rtn"}, + {eCLBuiltinConvertHalfToDouble, "convert_half_to_double"}, + {eCLBuiltinConvertDoubleToHalf, "convert_double_to_half"}, + {eCLBuiltinConvertDoubleToHalfRte, "convert_double_to_half_rte"}, + {eCLBuiltinConvertDoubleToHalfRtz, "convert_double_to_half_rtz"}, + {eCLBuiltinConvertDoubleToHalfRtp, "convert_double_to_half_rtp"}, + {eCLBuiltinConvertDoubleToHalfRtn, "convert_double_to_half_rtn"}, + + // 6.2.3 Explicit Conversions + {eCLBuiltinConvertChar, "convert_char"}, + {eCLBuiltinConvertShort, "convert_short"}, + {eCLBuiltinConvertInt, "convert_int"}, + {eCLBuiltinConvertLong, "convert_long"}, + {eCLBuiltinConvertUChar, "convert_uchar"}, + {eCLBuiltinConvertUShort, "convert_ushort"}, + {eCLBuiltinConvertUInt, "convert_uint"}, + {eCLBuiltinConvertULong, "convert_ulong"}, + + // 6.12.1 Work-Item Functions + {eCLBuiltinGetWorkDim, "get_work_dim"}, + {eCLBuiltinGetGroupId, "get_group_id"}, + {eCLBuiltinGetGlobalSize, "get_global_size"}, + {eCLBuiltinGetGlobalOffset, "get_global_offset"}, + {eCLBuiltinGetLocalId, "get_local_id"}, + {eCLBuiltinGetLocalSize, "get_local_size"}, + {eCLBuiltinGetEnqueuedLocalSize, "get_enqueued_local_size"}, + {eCLBuiltinGetNumGroups, "get_num_groups"}, + {eCLBuiltinGetGlobalId, "get_global_id"}, + {eCLBuiltinGetLocalLinearId, "get_local_linear_id", OpenCLC20}, + {eCLBuiltinGetGlobalLinearId, "get_global_linear_id", OpenCLC20}, + {eCLBuiltinGetSubgroupLocalId, "get_sub_group_local_id", OpenCLC30}, + {eCLBuiltinGetSubgroupSize, "get_sub_group_size", OpenCLC30}, + {eCLBuiltinGetMaxSubgroupSize, "get_max_sub_group_size", OpenCLC30}, + {eCLBuiltinGetNumSubgroups, "get_num_sub_groups", OpenCLC30}, + {eCLBuiltinGetEnqueuedNumSubgroups, "get_enqueued_num_sub_groups", + OpenCLC30}, + {eCLBuiltinGetSubgroupId, "get_sub_group_id", OpenCLC30}, + + // 6.12.2 Math Functions + {eCLBuiltinFMax, "fmax"}, + {eCLBuiltinFMin, "fmin"}, + {eCLBuiltinFract, "fract"}, + {eCLBuiltinFrexp, "frexp"}, + {eCLBuiltinLGammaR, "lgamma_r"}, + {eCLBuiltinModF, "modf"}, + {eCLBuiltinSinCos, "sincos"}, + {eCLBuiltinRemquo, "remquo"}, + + // 6.12.3 Integer Functions + {eCLBuiltinAddSat, "add_sat"}, + {eCLBuiltinSubSat, "sub_sat"}, + + // 6.12.5 Geometric Functions + {eCLBuiltinDot, "dot"}, + {eCLBuiltinCross, "cross"}, + {eCLBuiltinLength, "length"}, + {eCLBuiltinDistance, "distance"}, + {eCLBuiltinNormalize, "normalize"}, + {eCLBuiltinFastLength, "fast_length"}, + {eCLBuiltinFastDistance, "fast_distance"}, + {eCLBuiltinFastNormalize, "fast_normalize"}, + + // 6.12.6 Relational Functions + {eCLBuiltinAll, "all"}, + {eCLBuiltinAny, "any"}, + {eCLBuiltinIsEqual, "isequal"}, + {eCLBuiltinIsNotEqual, "isnotequal"}, + {eCLBuiltinIsGreater, "isgreater"}, + {eCLBuiltinIsGreaterEqual, "isgreaterequal"}, + {eCLBuiltinIsLess, "isless"}, + {eCLBuiltinIsLessEqual, "islessequal"}, + {eCLBuiltinIsLessGreater, "islessgreater"}, + {eCLBuiltinIsOrdered, "isordered"}, + {eCLBuiltinIsUnordered, "isunordered"}, + {eCLBuiltinIsFinite, "isfinite"}, + {eCLBuiltinIsInf, "isinf"}, + {eCLBuiltinIsNan, "isnan"}, + {eCLBuiltinIsNormal, "isnormal"}, + {eCLBuiltinSignBit, 
"signbit"}, + {eCLBuiltinSelect, "select"}, + + // 6.12.8 Synchronization Functions + {eCLBuiltinBarrier, "barrier"}, + {eCLBuiltinMemFence, "mem_fence"}, + {eCLBuiltinReadMemFence, "read_mem_fence"}, + {eCLBuiltinWriteMemFence, "write_mem_fence"}, + {eCLBuiltinAtomicWorkItemFence, "atomic_work_item_fence", OpenCLC20}, + {eCLBuiltinSubGroupBarrier, "sub_group_barrier", OpenCLC30}, + {eCLBuiltinWorkGroupBarrier, "work_group_barrier", OpenCLC20}, + + // 6.12.10 Async Copies and Prefetch Functions + {eCLBuiltinAsyncWorkGroupCopy, "async_work_group_copy"}, + {eCLBuiltinAsyncWorkGroupStridedCopy, "async_work_group_strided_copy"}, + {eCLBuiltinWaitGroupEvents, "wait_group_events"}, + {eCLBuiltinAsyncWorkGroupCopy2D2D, "async_work_group_copy_2D2D"}, + {eCLBuiltinAsyncWorkGroupCopy3D3D, "async_work_group_copy_3D3D"}, + + // 6.12.11 Atomic Functions + {eCLBuiltinAtomicAdd, "atom_add"}, + {eCLBuiltinAtomicSub, "atom_sub"}, + {eCLBuiltinAtomicXchg, "atom_xchg"}, + {eCLBuiltinAtomicInc, "atom_inc"}, + {eCLBuiltinAtomicDec, "atom_dec"}, + {eCLBuiltinAtomicCmpxchg, "atom_cmpxchg"}, + {eCLBuiltinAtomicMin, "atom_min"}, + {eCLBuiltinAtomicMax, "atom_max"}, + {eCLBuiltinAtomicAnd, "atom_and"}, + {eCLBuiltinAtomicOr, "atom_or"}, + {eCLBuiltinAtomicXor, "atom_xor"}, + {eCLBuiltinAtomicAdd, "atomic_add"}, + {eCLBuiltinAtomicSub, "atomic_sub"}, + {eCLBuiltinAtomicXchg, "atomic_xchg"}, + {eCLBuiltinAtomicInc, "atomic_inc"}, + {eCLBuiltinAtomicDec, "atomic_dec"}, + {eCLBuiltinAtomicCmpxchg, "atomic_cmpxchg"}, + {eCLBuiltinAtomicMin, "atomic_min"}, + {eCLBuiltinAtomicMax, "atomic_max"}, + {eCLBuiltinAtomicAnd, "atomic_and"}, + {eCLBuiltinAtomicOr, "atomic_or"}, + {eCLBuiltinAtomicXor, "atomic_xor"}, + + // 6.11.12 Miscellaneous Vector Functions + {eCLBuiltinShuffle, "shuffle"}, + {eCLBuiltinShuffle2, "shuffle2"}, + + // 6.12.13 printf + {eCLBuiltinPrintf, "printf"}, + + // 6.15.16 Work-group Collective Functions + {eCLBuiltinWorkgroupAll, "work_group_all", OpenCLC20}, + {eCLBuiltinWorkgroupAny, "work_group_any", OpenCLC20}, + {eCLBuiltinWorkgroupBroadcast, "work_group_broadcast", OpenCLC20}, + {eCLBuiltinWorkgroupReduceAdd, "work_group_reduce_add", OpenCLC20}, + {eCLBuiltinWorkgroupReduceMin, "work_group_reduce_min", OpenCLC20}, + {eCLBuiltinWorkgroupReduceMax, "work_group_reduce_max", OpenCLC20}, + {eCLBuiltinWorkgroupScanAddInclusive, "work_group_scan_inclusive_add", + OpenCLC20}, + {eCLBuiltinWorkgroupScanAddExclusive, "work_group_scan_exclusive_add", + OpenCLC20}, + {eCLBuiltinWorkgroupScanMinInclusive, "work_group_scan_inclusive_min", + OpenCLC20}, + {eCLBuiltinWorkgroupScanMinExclusive, "work_group_scan_exclusive_min", + OpenCLC20}, + {eCLBuiltinWorkgroupScanMaxInclusive, "work_group_scan_inclusive_max", + OpenCLC20}, + {eCLBuiltinWorkgroupScanMaxExclusive, "work_group_scan_exclusive_max", + OpenCLC20}, + + /// Provided by SPV_KHR_uniform_group_instructions. 
+ {eCLBuiltinWorkgroupReduceMul, "work_group_reduce_mul", OpenCLC20}, + {eCLBuiltinWorkgroupReduceAnd, "work_group_reduce_and", OpenCLC20}, + {eCLBuiltinWorkgroupReduceOr, "work_group_reduce_or", OpenCLC20}, + {eCLBuiltinWorkgroupReduceXor, "work_group_reduce_xor", OpenCLC20}, + {eCLBuiltinWorkgroupReduceLogicalAnd, "work_group_reduce_logical_and", + OpenCLC20}, + {eCLBuiltinWorkgroupReduceLogicalOr, "work_group_reduce_logical_or", + OpenCLC20}, + {eCLBuiltinWorkgroupReduceLogicalXor, "work_group_reduce_logical_xor", + OpenCLC20}, + {eCLBuiltinWorkgroupScanMulInclusive, "work_group_scan_inclusive_mul", + OpenCLC20}, + {eCLBuiltinWorkgroupScanMulExclusive, "work_group_scan_exclusive_mul", + OpenCLC20}, + {eCLBuiltinWorkgroupScanAndInclusive, "work_group_scan_inclusive_and", + OpenCLC20}, + {eCLBuiltinWorkgroupScanAndExclusive, "work_group_scan_exclusive_and", + OpenCLC20}, + {eCLBuiltinWorkgroupScanOrInclusive, "work_group_scan_inclusive_or", + OpenCLC20}, + {eCLBuiltinWorkgroupScanOrExclusive, "work_group_scan_exclusive_or", + OpenCLC20}, + {eCLBuiltinWorkgroupScanXorInclusive, "work_group_scan_inclusive_xor", + OpenCLC20}, + {eCLBuiltinWorkgroupScanXorExclusive, "work_group_scan_exclusive_xor", + OpenCLC20}, + {eCLBuiltinWorkgroupScanLogicalAndInclusive, + "work_group_scan_inclusive_logical_and", OpenCLC20}, + {eCLBuiltinWorkgroupScanLogicalAndExclusive, + "work_group_scan_exclusive_logical_and", OpenCLC20}, + {eCLBuiltinWorkgroupScanLogicalOrInclusive, + "work_group_scan_inclusive_logical_or", OpenCLC20}, + {eCLBuiltinWorkgroupScanLogicalOrExclusive, + "work_group_scan_exclusive_logical_or", OpenCLC20}, + {eCLBuiltinWorkgroupScanLogicalXorInclusive, + "work_group_scan_inclusive_logical_xor", OpenCLC20}, + {eCLBuiltinWorkgroupScanLogicalXorExclusive, + "work_group_scan_exclusive_logical_xor", OpenCLC20}, + + // 6.15.19 Subgroup Collective Functions + {eCLBuiltinSubgroupAll, "sub_group_all", OpenCLC30}, + {eCLBuiltinSubgroupAny, "sub_group_any", OpenCLC30}, + {eCLBuiltinSubgroupBroadcast, "sub_group_broadcast", OpenCLC30}, + {eCLBuiltinSubgroupReduceAdd, "sub_group_reduce_add", OpenCLC30}, + {eCLBuiltinSubgroupReduceMin, "sub_group_reduce_min", OpenCLC30}, + {eCLBuiltinSubgroupReduceMax, "sub_group_reduce_max", OpenCLC30}, + {eCLBuiltinSubgroupScanAddInclusive, "sub_group_scan_inclusive_add", + OpenCLC30}, + {eCLBuiltinSubgroupScanAddExclusive, "sub_group_scan_exclusive_add", + OpenCLC30}, + {eCLBuiltinSubgroupScanMinInclusive, "sub_group_scan_inclusive_min", + OpenCLC30}, + {eCLBuiltinSubgroupScanMinExclusive, "sub_group_scan_exclusive_min", + OpenCLC30}, + {eCLBuiltinSubgroupScanMaxInclusive, "sub_group_scan_inclusive_max", + OpenCLC30}, + {eCLBuiltinSubgroupScanMaxExclusive, "sub_group_scan_exclusive_max", + OpenCLC30}, + /// Provided by SPV_KHR_uniform_group_instructions. 
+ {eCLBuiltinSubgroupReduceMul, "sub_group_reduce_mul", OpenCLC30}, + {eCLBuiltinSubgroupReduceAnd, "sub_group_reduce_and", OpenCLC30}, + {eCLBuiltinSubgroupReduceOr, "sub_group_reduce_or", OpenCLC30}, + {eCLBuiltinSubgroupReduceXor, "sub_group_reduce_xor", OpenCLC30}, + {eCLBuiltinSubgroupReduceLogicalAnd, "sub_group_reduce_logical_and", + OpenCLC30}, + {eCLBuiltinSubgroupReduceLogicalOr, "sub_group_reduce_logical_or", + OpenCLC30}, + {eCLBuiltinSubgroupReduceLogicalXor, "sub_group_reduce_logical_xor", + OpenCLC30}, + {eCLBuiltinSubgroupScanMulInclusive, "sub_group_scan_inclusive_mul", + OpenCLC30}, + {eCLBuiltinSubgroupScanMulExclusive, "sub_group_scan_exclusive_mul", + OpenCLC30}, + {eCLBuiltinSubgroupScanAndInclusive, "sub_group_scan_inclusive_and", + OpenCLC30}, + {eCLBuiltinSubgroupScanAndExclusive, "sub_group_scan_exclusive_and", + OpenCLC30}, + {eCLBuiltinSubgroupScanOrInclusive, "sub_group_scan_inclusive_or", + OpenCLC30}, + {eCLBuiltinSubgroupScanOrExclusive, "sub_group_scan_exclusive_or", + OpenCLC30}, + {eCLBuiltinSubgroupScanXorInclusive, "sub_group_scan_inclusive_xor", + OpenCLC30}, + {eCLBuiltinSubgroupScanXorExclusive, "sub_group_scan_exclusive_xor", + OpenCLC30}, + {eCLBuiltinSubgroupScanLogicalAndInclusive, + "sub_group_scan_inclusive_logical_and", OpenCLC30}, + {eCLBuiltinSubgroupScanLogicalAndExclusive, + "sub_group_scan_exclusive_logical_and", OpenCLC30}, + {eCLBuiltinSubgroupScanLogicalOrInclusive, + "sub_group_scan_inclusive_logical_or", OpenCLC30}, + {eCLBuiltinSubgroupScanLogicalOrExclusive, + "sub_group_scan_exclusive_logical_or", OpenCLC30}, + {eCLBuiltinSubgroupScanLogicalXorInclusive, + "sub_group_scan_inclusive_logical_xor", OpenCLC30}, + {eCLBuiltinSubgroupScanLogicalXorExclusive, + "sub_group_scan_exclusive_logical_xor", OpenCLC30}, + + {eBuiltinUnknown, nullptr}}; + +//////////////////////////////////////////////////////////////////////////////// + +Function *CLBuiltinInfo::declareBuiltin(Module *M, BuiltinID ID, Type *RetTy, + ArrayRef ArgTys, + ArrayRef ArgQuals, + Twine Suffix) { + // Determine the builtin function name. + if (!M) { + return nullptr; + } + std::string BuiltinName = getBuiltinName(ID).str(); + if (BuiltinName.empty()) { + return nullptr; + } + + // Add the optional suffix. + SmallVector SuffixVec; + Suffix.toVector(SuffixVec); + if (!SuffixVec.empty()) { + BuiltinName.append(SuffixVec.begin(), SuffixVec.end()); + } + + // Mangle the function name and look it up in the module. + NameMangler Mangler(&M->getContext()); + const std::string MangledName = + Mangler.mangleName(BuiltinName, ArgTys, ArgQuals); + Function *Builtin = M->getFunction(MangledName); + + // Declare the builtin if necessary. + if (!Builtin) { + FunctionType *FT = FunctionType::get(RetTy, ArgTys, false); + M->getOrInsertFunction(MangledName, FT); + Builtin = M->getFunction(MangledName); + Builtin->setCallingConv(CallingConv::SPIR_FUNC); + } + return Builtin; +} + +std::optional CLBuiltinInfo::getPrintfBuiltin() const { + return eCLBuiltinPrintf; +} + +Module *CLBuiltinInfo::getBuiltinsModule() { + if (!Loader) { + return nullptr; + } + return Loader->getBuiltinsModule(); +} + +Function *CLBuiltinInfo::materializeBuiltin(StringRef BuiltinName, + Module *DestM, + BuiltinMatFlags Flags) { + // First try to find the builtin in the target module. + if (DestM) { + Function *Builtin = DestM->getFunction(BuiltinName); + // If a builtin was found, it might be either a declaration or a definition. 
+ // If the definition flag (eBuiltinMatDefinition) is set, we cannot return + just a declaration. + if (Builtin && + (!(Flags & eBuiltinMatDefinition) || !Builtin->isDeclaration())) { + return Builtin; + } + } + + if (!Loader) { + return nullptr; + } + // Try to find the builtin in the builtins module. + return Loader->materializeBuiltin(BuiltinName, DestM, Flags); +} + +std::optional<BuiltinID> +CLBuiltinInfo::identifyBuiltin(const Function &F) const { + NameMangler Mangler(nullptr); + const StringRef Name = F.getName(); + const CLBuiltinEntry *entry = Builtins; + const auto Version = getOpenCLVersion(*F.getParent()); + const StringRef DemangledName = Mangler.demangleName(Name); + while (entry->ID != eBuiltinUnknown) { + if (Version >= entry->MinVer && DemangledName == entry->OpenCLFnName) { + return entry->ID; + } + entry++; + } + + if (DemangledName == Name) { + // The function name is not mangled and so it cannot be an OpenCL builtin. + return std::nullopt; + } + + Lexer L(Mangler.demangleName(Name)); + if (L.Consume("vload")) { + unsigned Width = 0; + if (L.Consume("_half")) { + // We have both `vload_half` and `vload_halfN` variants. + if (!L.ConsumeInteger(Width) || isValidVecWidth(Width)) { + // If there's nothing left to parse we're good to go. + if (!L.Left()) { + return eCLBuiltinVLoadHalf; + } + } + } else if (L.ConsumeInteger(Width) && !L.Left() && isValidVecWidth(Width)) { + // There are no scalar variants of this builtin. + return eCLBuiltinVLoad; + } + } else if (L.Consume("vstore")) { + unsigned Width = 0; + if (L.Consume("_half")) { + // We have both `vstore_half` and `vstore_halfN` variants. + if (!L.ConsumeInteger(Width) || isValidVecWidth(Width)) { + // Rounding modes are optional. + L.Consume("_rte") || L.Consume("_rtz") || L.Consume("_rtp") || + L.Consume("_rtn"); + + // If there's nothing left to parse we're good to go. + if (!L.Left()) { + return eCLBuiltinVStoreHalf; + } + } + } else if (L.ConsumeInteger(Width) && !L.Left() && isValidVecWidth(Width)) { + // There are no scalar variants of this builtin. + return eCLBuiltinVStore; + } + } else if (L.Consume("as_")) { + if (L.Consume("char") || L.Consume("uchar") || L.Consume("short") || + L.Consume("ushort") || L.Consume("int") || L.Consume("uint") || + L.Consume("long") || L.Consume("ulong") || L.Consume("float") || + L.Consume("double") || L.Consume("half")) { + unsigned Width = 0; + if (!L.ConsumeInteger(Width) || isValidVecWidth(Width)) { + if (!L.Left()) { + return eCLBuiltinAs; + } + } + } + } + + return eBuiltinUnknown; +} + +llvm::StringRef CLBuiltinInfo::getBuiltinName(BuiltinID ID) const { + const CLBuiltinEntry *entry = Builtins; + while (entry->ID != eBuiltinUnknown) { + if (ID == entry->ID) { + return entry->OpenCLFnName; + } + entry++; + } + return llvm::StringRef(); +} + +BuiltinUniformity CLBuiltinInfo::isBuiltinUniform(const Builtin &, + const CallInst *CI, + unsigned) const { + // Assume that builtins with side effects are varying. + if (Function *Callee = CI->getCalledFunction()) { + if (auto B = analyzeBuiltin(*Callee)) { + const auto Props = B->properties; + if (Props & eBuiltinPropertySideEffects) { + return eBuiltinUniformityNever; + } + } + } + + return eBuiltinUniformityLikeInputs; +} + +std::optional<Builtin> +CLBuiltinInfo::analyzeBuiltin(const Function &Callee) const { + const auto ID = identifyBuiltin(Callee); + if (!ID) { + return std::nullopt; + } + + bool IsConvergent = false; + unsigned Properties = eBuiltinPropertyNone; + switch (*ID) { + default: + // Assume convergence on unknown builtins.
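+    // For instance, an unrecognized 'sub_group_*' extension builtin could
+    // communicate between work-items; treating it as convergent stops later
+    // transforms from introducing divergent control flow around the call.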
IsConvergent = true; + break; + case eBuiltinUnknown: { + // Assume convergence on unknown builtins. + IsConvergent = true; + // If we know that this is an OpenCL builtin, but we don't have any + special information about it, we can determine if it has side effects + or not by its return type and its parameters. This depends on being + able to identify all the "special" builtins, such as barriers and + fences. + bool HasSideEffects = false; + + // Void functions have side effects. + if (Callee.getReturnType() == Type::getVoidTy(Callee.getContext())) { + HasSideEffects = true; + } + // Functions that take pointers probably have side effects. + for (const auto &arg : Callee.args()) { + if (arg.getType()->isPointerTy()) { + HasSideEffects = true; + } + } + Properties |= HasSideEffects ? eBuiltinPropertySideEffects + : eBuiltinPropertyNoSideEffects; + } break; + case eCLBuiltinBarrier: + IsConvergent = true; + Properties |= eBuiltinPropertyExecutionFlow; + Properties |= eBuiltinPropertySideEffects; + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + case eCLBuiltinMemFence: + case eCLBuiltinReadMemFence: + case eCLBuiltinWriteMemFence: + Properties |= eBuiltinPropertySupportsInstantiation; + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + case eCLBuiltinPrintf: + Properties |= eBuiltinPropertySideEffects; + Properties |= eBuiltinPropertySupportsInstantiation; + break; + case eCLBuiltinAsyncWorkGroupCopy: + case eCLBuiltinAsyncWorkGroupStridedCopy: + case eCLBuiltinWaitGroupEvents: + case eCLBuiltinAsyncWorkGroupCopy2D2D: + case eCLBuiltinAsyncWorkGroupCopy3D3D: + // Our implementation of these builtins uses thread checks against + // specific work-item IDs, so they are convergent. + IsConvergent = true; + Properties |= eBuiltinPropertyNoSideEffects; + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + case eCLBuiltinAtomicAdd: + case eCLBuiltinAtomicSub: + case eCLBuiltinAtomicXchg: + case eCLBuiltinAtomicInc: + case eCLBuiltinAtomicDec: + case eCLBuiltinAtomicCmpxchg: + case eCLBuiltinAtomicMin: + case eCLBuiltinAtomicMax: + case eCLBuiltinAtomicAnd: + case eCLBuiltinAtomicOr: + case eCLBuiltinAtomicXor: + Properties |= eBuiltinPropertySideEffects; + Properties |= eBuiltinPropertySupportsInstantiation; + Properties |= eBuiltinPropertyAtomic; + break; + case eCLBuiltinGetWorkDim: + case eCLBuiltinGetGroupId: + case eCLBuiltinGetGlobalSize: + case eCLBuiltinGetGlobalOffset: + case eCLBuiltinGetNumGroups: + case eCLBuiltinGetGlobalId: + case eCLBuiltinGetLocalSize: + case eCLBuiltinGetEnqueuedLocalSize: + case eCLBuiltinGetLocalLinearId: + case eCLBuiltinGetGlobalLinearId: + case eCLBuiltinGetSubgroupLocalId: + Properties |= eBuiltinPropertyWorkItem; + Properties |= eBuiltinPropertyRematerializable; + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + case eCLBuiltinGetLocalId: + Properties |= eBuiltinPropertyWorkItem; + Properties |= eBuiltinPropertyLocalID; + Properties |= eBuiltinPropertyRematerializable; + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + case eCLBuiltinDot: + case eCLBuiltinCross: + case eCLBuiltinFastDistance: + case eCLBuiltinFastLength: + case eCLBuiltinFastNormalize: + Properties |= eBuiltinPropertyReduction; + Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinDistance: + case eCLBuiltinLength: + case eCLBuiltinNormalize: + Properties |= eBuiltinPropertyReduction; + Properties |= eBuiltinPropertyNoVectorEquivalent; + // XXX The inline 
implementation seems to have precision issues. The dot + // product can overflow to +inf which results in the wrong result. + // See redmine #6427 and #9115 + // Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinIsEqual: + case eCLBuiltinIsNotEqual: + case eCLBuiltinIsGreater: + case eCLBuiltinIsGreaterEqual: + case eCLBuiltinIsLess: + case eCLBuiltinIsLessEqual: + case eCLBuiltinIsLessGreater: + case eCLBuiltinIsOrdered: + case eCLBuiltinIsUnordered: + case eCLBuiltinIsFinite: + case eCLBuiltinIsInf: + case eCLBuiltinIsNan: + case eCLBuiltinIsNormal: + case eCLBuiltinSignBit: + // Scalar variants return '0' or '1', vector variants '0' or '111...1'. + Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertyCanEmitInline; + Properties |= eBuiltinPropertySupportsInstantiation; + break; + case eCLBuiltinAny: + case eCLBuiltinAll: + Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinFract: + case eCLBuiltinModF: + case eCLBuiltinSinCos: + Properties |= eBuiltinPropertyPointerReturnEqualRetTy; + break; + case eCLBuiltinFrexp: + case eCLBuiltinLGammaR: + case eCLBuiltinRemquo: + Properties |= eBuiltinPropertyPointerReturnEqualIntRetTy; + break; + case eCLBuiltinShuffle: + case eCLBuiltinShuffle2: + // While there are vector equivalents for these builtins, they require a + // modified mask, so we cannot use them by simply packetizing their + // arguments. + Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinFMax: + case eCLBuiltinFMin: + case eCLBuiltinAddSat: + case eCLBuiltinSubSat: + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinConvertChar: + case eCLBuiltinConvertShort: + case eCLBuiltinConvertInt: + case eCLBuiltinConvertLong: + case eCLBuiltinConvertUChar: + case eCLBuiltinConvertUShort: + case eCLBuiltinConvertUInt: + case eCLBuiltinConvertULong: + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinVLoad: + case eCLBuiltinVLoadHalf: + Properties |= eBuiltinPropertyNoSideEffects; + Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinVStore: + case eCLBuiltinVStoreHalf: + Properties |= eBuiltinPropertySideEffects; + Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinSelect: + case eCLBuiltinAs: + // Some of these builtins do have vector equivalents, but since we can + // emit all variants inline, we mark them as having none for simplicity. 
+ Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinWorkGroupBarrier: + case eCLBuiltinSubGroupBarrier: + IsConvergent = true; + LLVM_FALLTHROUGH; + case eCLBuiltinAtomicWorkItemFence: + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + case eCLBuiltinGetSubgroupSize: + case eCLBuiltinGetMaxSubgroupSize: + case eCLBuiltinGetNumSubgroups: + case eCLBuiltinGetEnqueuedNumSubgroups: + case eCLBuiltinGetSubgroupId: + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + // Subgroup collectives + case eCLBuiltinSubgroupAll: + case eCLBuiltinSubgroupAny: + case eCLBuiltinSubgroupBroadcast: + case eCLBuiltinSubgroupReduceAdd: + case eCLBuiltinSubgroupReduceMin: + case eCLBuiltinSubgroupReduceMax: + case eCLBuiltinSubgroupScanAddInclusive: + case eCLBuiltinSubgroupScanAddExclusive: + case eCLBuiltinSubgroupScanMinInclusive: + case eCLBuiltinSubgroupScanMinExclusive: + case eCLBuiltinSubgroupScanMaxInclusive: + case eCLBuiltinSubgroupScanMaxExclusive: + case eCLBuiltinSubgroupReduceMul: + case eCLBuiltinSubgroupReduceAnd: + case eCLBuiltinSubgroupReduceOr: + case eCLBuiltinSubgroupReduceXor: + case eCLBuiltinSubgroupReduceLogicalAnd: + case eCLBuiltinSubgroupReduceLogicalOr: + case eCLBuiltinSubgroupReduceLogicalXor: + case eCLBuiltinSubgroupScanMulInclusive: + case eCLBuiltinSubgroupScanMulExclusive: + case eCLBuiltinSubgroupScanAndInclusive: + case eCLBuiltinSubgroupScanAndExclusive: + case eCLBuiltinSubgroupScanOrInclusive: + case eCLBuiltinSubgroupScanOrExclusive: + case eCLBuiltinSubgroupScanXorInclusive: + case eCLBuiltinSubgroupScanXorExclusive: + case eCLBuiltinSubgroupScanLogicalAndInclusive: + case eCLBuiltinSubgroupScanLogicalAndExclusive: + case eCLBuiltinSubgroupScanLogicalOrInclusive: + case eCLBuiltinSubgroupScanLogicalOrExclusive: + case eCLBuiltinSubgroupScanLogicalXorInclusive: + case eCLBuiltinSubgroupScanLogicalXorExclusive: + // Work-group collectives + case eCLBuiltinWorkgroupAll: + case eCLBuiltinWorkgroupAny: + case eCLBuiltinWorkgroupBroadcast: + case eCLBuiltinWorkgroupReduceAdd: + case eCLBuiltinWorkgroupReduceMin: + case eCLBuiltinWorkgroupReduceMax: + case eCLBuiltinWorkgroupScanAddInclusive: + case eCLBuiltinWorkgroupScanAddExclusive: + case eCLBuiltinWorkgroupScanMinInclusive: + case eCLBuiltinWorkgroupScanMinExclusive: + case eCLBuiltinWorkgroupScanMaxInclusive: + case eCLBuiltinWorkgroupScanMaxExclusive: + case eCLBuiltinWorkgroupReduceMul: + case eCLBuiltinWorkgroupReduceAnd: + case eCLBuiltinWorkgroupReduceOr: + case eCLBuiltinWorkgroupReduceXor: + case eCLBuiltinWorkgroupReduceLogicalAnd: + case eCLBuiltinWorkgroupReduceLogicalOr: + case eCLBuiltinWorkgroupReduceLogicalXor: + case eCLBuiltinWorkgroupScanMulInclusive: + case eCLBuiltinWorkgroupScanMulExclusive: + case eCLBuiltinWorkgroupScanAndInclusive: + case eCLBuiltinWorkgroupScanAndExclusive: + case eCLBuiltinWorkgroupScanOrInclusive: + case eCLBuiltinWorkgroupScanOrExclusive: + case eCLBuiltinWorkgroupScanXorInclusive: + case eCLBuiltinWorkgroupScanXorExclusive: + case eCLBuiltinWorkgroupScanLogicalAndInclusive: + case eCLBuiltinWorkgroupScanLogicalAndExclusive: + case eCLBuiltinWorkgroupScanLogicalOrInclusive: + case eCLBuiltinWorkgroupScanLogicalOrExclusive: + case eCLBuiltinWorkgroupScanLogicalXorInclusive: + case eCLBuiltinWorkgroupScanLogicalXorExclusive: + IsConvergent = true; + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + } + + if (!IsConvergent) { + Properties |= eBuiltinPropertyKnownNonConvergent; 
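+  // Illustrative outcomes of the classification above (derived from this
+  // switch, not an exhaustive list):
+  //   barrier        -> convergent, ExecutionFlow | SideEffects | LowerToMuxBuiltin
+  //   fmax           -> KnownNonConvergent, CanEmitInline
+  //   work_group_all -> convergent, LowerToMuxBuiltin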
+ } + + return Builtin{Callee, *ID, (BuiltinProperties)Properties}; +} + +Function *CLBuiltinInfo::getVectorEquivalent(const Builtin &B, unsigned Width, + Module *M) { + // Analyze the builtin. Some functions have no vector equivalent. + const auto Props = B.properties; + if (Props & eBuiltinPropertyNoVectorEquivalent) { + return nullptr; + } + + // Builtin functions have mangled names. If it's not mangled, there will be + // no vector equivalent. + NameMangler Mangler(&B.function.getContext()); + SmallVector BuiltinArgTypes, BuiltinPointeeTypes; + SmallVector BuiltinArgQuals; + const StringRef BuiltinName = + Mangler.demangleName(B.function.getName(), BuiltinArgTypes, + BuiltinPointeeTypes, BuiltinArgQuals); + if (BuiltinName.empty()) { + return nullptr; + } + + // Determine the mangled name of the vector equivalent. + // This means creating a list of qualified types for the arguments. + SmallVector VectorTypes; + SmallVector VectorQuals; + for (unsigned i = 0; i < BuiltinArgTypes.size(); i++) { + Type *OldTy = BuiltinArgTypes[i]; + const TypeQualifiers OldQuals = BuiltinArgQuals[i]; + if (isa(OldTy)) { + return nullptr; + } + PointerType *OldPtrTy = dyn_cast(OldTy); + if (OldPtrTy) { + if (auto *const PtrRetPointeeTy = + getPointerReturnPointeeTy(B.function, Props)) { + [[maybe_unused]] auto *OldPointeeTy = BuiltinPointeeTypes[i]; + assert(OldPointeeTy && OldPointeeTy == PtrRetPointeeTy && + "Demangling inconsistency"); + if (!FixedVectorType::isValidElementType(PtrRetPointeeTy)) { + return nullptr; + } + Type *NewType = OldPtrTy; + TypeQualifiers NewQuals; + TypeQualifiers EleQuals = OldQuals; + NewQuals.push_back(EleQuals.pop_front()); // Pointer qualifier + NewQuals.push_back(eTypeQualNone); // Vector qualifier + NewQuals.push_back(EleQuals); + + VectorTypes.push_back(NewType); + VectorQuals.push_back(NewQuals); + + continue; + } + } + + if (!FixedVectorType::isValidElementType(OldTy)) { + return nullptr; + } + TypeQualifiers NewQuals; + Type *NewType = FixedVectorType::get(OldTy, Width); + NewQuals.push_back(eTypeQualNone); // Vector qualifier + NewQuals.push_back(OldQuals); // Element qualifier + + VectorTypes.push_back(NewType); + VectorQuals.push_back(NewQuals); + } + + // Handle special builtin naming equivalents. + std::string EquivNameBase = BuiltinName.str(); + StringRef FirstChunk; + Lexer L(BuiltinName); + if (L.ConsumeUntil('_', FirstChunk)) { + const bool AsBuiltin = FirstChunk == "as"; + const bool ConvertBuiltin = FirstChunk == "convert"; + if (!L.Consume("_")) { + return nullptr; + } + StringRef SecondChunkNoWidth; + if (!L.ConsumeAlpha(SecondChunkNoWidth)) { + return nullptr; + } + if (AsBuiltin || ConvertBuiltin) { + // as_* and convert_* builtins have vector equivalents, with a vector + // width suffix. Add the width suffix to the scalar builtin name. + if (AsBuiltin && L.Left()) { + return nullptr; + } + const Twine WidthText(Width); + EquivNameBase.insert(L.CurrentPos(), WidthText.str()); + } + } + + const std::string EquivName = + Mangler.mangleName(EquivNameBase, VectorTypes, VectorQuals); + + // Lookup the vector equivalent and make sure the return type agrees. 
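+  // For example, with Width == 4 the scalar fmax(float, float) is looked up
+  // as fmax(float4, float4), while convert_int(float) additionally gains a
+  // width suffix and becomes convert_int4(float4) before re-mangling.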
+ Function *VectorBuiltin = materializeBuiltin(EquivName, M); + if (VectorBuiltin) { + Type *RetTy = B.function.getReturnType(); + auto *VecRetTy = dyn_cast(VectorBuiltin->getReturnType()); + if (!VecRetTy || (VecRetTy->getElementType() != RetTy) || + (VecRetTy->getNumElements() != Width)) { + VectorBuiltin = nullptr; + } + } + return VectorBuiltin; +} + +Function *CLBuiltinInfo::getScalarEquivalent(const Builtin &B, Module *M) { + // Analyze the builtin. Some functions have no scalar equivalent. + const auto Props = B.properties; + if (Props & eBuiltinPropertyNoVectorEquivalent) { + return nullptr; + } + + // Check the return type. + auto *VecRetTy = dyn_cast(B.function.getReturnType()); + if (!VecRetTy) { + return nullptr; + } + + // Builtin functions have mangled names. If it's not mangled, there will be + // no scalar equivalent. + NameMangler Mangler(&B.function.getContext()); + SmallVector BuiltinArgTypes, BuiltinPointeeTypes; + SmallVector BuiltinArgQuals; + const StringRef BuiltinName = + Mangler.demangleName(B.function.getName(), BuiltinArgTypes, + BuiltinPointeeTypes, BuiltinArgQuals); + if (BuiltinName.empty()) { + return nullptr; + } + + // Determine the mangled name of the scalar equivalent. + // This means creating a list of qualified types for the arguments. + const unsigned Width = VecRetTy->getNumElements(); + SmallVector ScalarTypes; + SmallVector ScalarQuals; + for (unsigned i = 0; i < BuiltinArgTypes.size(); i++) { + Type *OldTy = BuiltinArgTypes[i]; + const TypeQualifiers OldQuals = BuiltinArgQuals[i]; + if (auto *OldVecTy = dyn_cast(OldTy)) { + if (OldVecTy->getNumElements() != Width) { + return nullptr; + } + Type *NewTy = OldVecTy->getElementType(); + TypeQualifiers NewQuals = OldQuals; + NewQuals.pop_front(); + + ScalarTypes.push_back(NewTy); + ScalarQuals.push_back(NewQuals); + } else if (PointerType *OldPtrTy = dyn_cast(OldTy)) { + Type *const PtrRetPointeeTy = + getPointerReturnPointeeTy(B.function, Props); + if (PtrRetPointeeTy && PtrRetPointeeTy->isVectorTy()) { + [[maybe_unused]] auto *OldPointeeTy = BuiltinPointeeTypes[i]; + assert(OldPointeeTy && OldPointeeTy == PtrRetPointeeTy && + "Demangling inconsistency"); + Type *NewTy = OldPtrTy; + TypeQualifiers NewQuals = OldQuals; + const TypeQualifier PtrQual = NewQuals.pop_front(); + const TypeQualifier VecQual = NewQuals.pop_front(); + (void)VecQual; + const TypeQualifier EleQual = NewQuals.pop_front(); + NewQuals.push_back(PtrQual); + NewQuals.push_back(EleQual); + ScalarTypes.push_back(NewTy); + ScalarQuals.push_back(NewQuals); + } else { + ScalarTypes.push_back(OldTy); + ScalarQuals.push_back(OldQuals); + } + } else { + if (!OldTy) { + return nullptr; + } + ScalarTypes.push_back(OldTy); + ScalarQuals.push_back(OldQuals); + } + } + + // Handle special builtin naming equivalents. + std::string EquivNameBase = BuiltinName.str(); + StringRef FirstChunk; + Lexer L(BuiltinName); + if (L.ConsumeUntil('_', FirstChunk)) { + const bool AsBuiltin = FirstChunk == "as"; + const bool ConvertBuiltin = FirstChunk == "convert"; + if (!L.Consume("_")) { + return nullptr; + } + StringRef SecondChunkNoWidth; + if (!L.ConsumeAlpha(SecondChunkNoWidth)) { + return nullptr; + } + if (AsBuiltin || ConvertBuiltin) { + // as_* and convert_* builtins have scalar equivalents, with no width + // suffix. Remove the width suffix from the vector builtin name. 
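+      // For example, as_int4(float4) maps back to as_int(float) and
+      // convert_float4(int4) to convert_float(int); other builtins such as
+      // fmax(float4, float4) keep their base name and only have their
+      // operand types scalarized.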
+ const unsigned WidthStart = L.CurrentPos(); + unsigned Width = 0; + if (!L.ConsumeInteger(Width)) { + return nullptr; + } + const unsigned WidthEnd = L.CurrentPos(); + EquivNameBase.erase(WidthStart, WidthEnd - WidthStart); + } + } + + const std::string EquivName = + Mangler.mangleName(EquivNameBase, ScalarTypes, ScalarQuals); + + // Look up the scalar equivalent and make sure the return type agrees. + Function *ScalarBuiltin = materializeBuiltin(EquivName, M); + if (!ScalarBuiltin) { + return nullptr; + } + Type *RetTy = ScalarBuiltin->getReturnType(); + if (VecRetTy->getElementType() != RetTy) { + return nullptr; + } + return ScalarBuiltin; +} + +/// @brief Returns whether the parameter corresponding to the given index of +/// the (assumed builtin) Function is known to possess the given qualifier. +/// @return true if the parameter is known to have the qualifier, false if not, +/// and None on error. +static std::optional<bool> +paramHasTypeQual(const Function &F, unsigned ParamIdx, TypeQualifier Q) { + // Demangle the function name to get the type qualifiers. + SmallVector Types; + SmallVector Quals; + NameMangler Mangler(&F.getContext()); + if (Mangler.demangleName(F.getName(), Types, Quals).empty()) { + return std::nullopt; + } + + if (ParamIdx >= Quals.size()) { + return std::nullopt; + } + + auto &Qual = Quals[ParamIdx]; + while (Qual.getCount()) { + if (Qual.pop_front() == Q) { + return true; + } + } + return false; +} + +Value *CLBuiltinInfo::emitBuiltinInline(Function *F, IRBuilder<> &B, + ArrayRef<Value *> Args) { + if (!F) { + return nullptr; + } + + // Handle 'common' builtins. + const auto BuiltinID = identifyBuiltin(*F); + if (BuiltinID && *BuiltinID != eBuiltinUnknown) { + // Note we have to handle these specially since we need to deduce whether + // the source operand is signed or not. It is not possible to do this based + // solely on the BuiltinID. + switch (*BuiltinID) { + // 6.2.3 Explicit Conversions + case eCLBuiltinConvertChar: + case eCLBuiltinConvertShort: + case eCLBuiltinConvertInt: + case eCLBuiltinConvertLong: + case eCLBuiltinConvertUChar: + case eCLBuiltinConvertUShort: + case eCLBuiltinConvertUInt: + case eCLBuiltinConvertULong: + return emitBuiltinInlineConvert(F, *BuiltinID, B, Args); + // 6.12.3 Integer Functions + case eCLBuiltinAddSat: + case eCLBuiltinSubSat: { + std::optional<bool> IsParamSignedOrNone = + paramHasTypeQual(*F, 0, eTypeQualSignedInt); + if (!IsParamSignedOrNone.has_value()) { + return nullptr; + } + const bool IsSigned = *IsParamSignedOrNone; + const Intrinsic::ID IntrinsicOpc = [=] { + if (BuiltinID == eCLBuiltinSubSat) { + return IsSigned ? Intrinsic::ssub_sat : Intrinsic::usub_sat; + } else { + return IsSigned ? 
Intrinsic::sadd_sat : Intrinsic::uadd_sat; + } + }(); + return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1], + IntrinsicOpc); + } + case eCLBuiltinVLoad: { + NameMangler Mangler(&F->getContext()); + Lexer L(Mangler.demangleName(F->getName())); + if (L.Consume("vload")) { + unsigned Width = 0; + if (L.ConsumeInteger(Width)) { + return emitBuiltinInlineVLoad(F, Width, B, Args); + } + } + } break; + case eCLBuiltinVLoadHalf: { + NameMangler Mangler(&F->getContext()); + const auto name = Mangler.demangleName(F->getName()); + if (name == "vload_half") { + // TODO handle "vload_halfn" + return emitBuiltinInlineVLoadHalf(F, B, Args); + } + } break; + case eCLBuiltinVStore: { + NameMangler Mangler(&F->getContext()); + Lexer L(Mangler.demangleName(F->getName())); + if (L.Consume("vstore")) { + unsigned Width = 0; + if (L.ConsumeInteger(Width)) { + return emitBuiltinInlineVStore(F, Width, B, Args); + } + } + } break; + case eCLBuiltinVStoreHalf: { + NameMangler Mangler(&F->getContext()); + Lexer L(Mangler.demangleName(F->getName())); + if (L.Consume("vstore_half")) { + // TODO handle "vstore_halfn" + return emitBuiltinInlineVStoreHalf(F, L.TextLeft(), B, Args); + } + } break; + case eCLBuiltinSelect: + return emitBuiltinInlineSelect(F, B, Args); + case eCLBuiltinAs: + return emitBuiltinInlineAs(F, B, Args); + default: + break; + } + return emitBuiltinInline(*BuiltinID, B, Args); + } + + return nullptr; +} + +Value *CLBuiltinInfo::emitBuiltinInline(BuiltinID BuiltinID, IRBuilder<> &B, + ArrayRef Args) { + switch (BuiltinID) { + default: + return nullptr; + + case eCLBuiltinDot: + case eCLBuiltinCross: + case eCLBuiltinLength: + case eCLBuiltinDistance: + case eCLBuiltinNormalize: + case eCLBuiltinFastLength: + case eCLBuiltinFastDistance: + case eCLBuiltinFastNormalize: + return emitBuiltinInlineGeometrics(BuiltinID, B, Args); + // 6.12.2 Math Functions + case eCLBuiltinFMax: + return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1], + llvm::Intrinsic::maxnum); + case eCLBuiltinFMin: + return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1], + llvm::Intrinsic::minnum); + // 6.12.6 Relational Functions + case eCLBuiltinAll: + return emitBuiltinInlineAll(B, Args); + case eCLBuiltinAny: + return emitBuiltinInlineAny(B, Args); + case eCLBuiltinIsEqual: + case eCLBuiltinIsNotEqual: + case eCLBuiltinIsGreater: + case eCLBuiltinIsGreaterEqual: + case eCLBuiltinIsLess: + case eCLBuiltinIsLessEqual: + case eCLBuiltinIsLessGreater: + case eCLBuiltinIsOrdered: + case eCLBuiltinIsUnordered: + return emitBuiltinInlineRelationalsWithTwoArguments(BuiltinID, B, Args); + case eCLBuiltinIsFinite: + case eCLBuiltinIsInf: + case eCLBuiltinIsNan: + case eCLBuiltinIsNormal: + case eCLBuiltinSignBit: + assert(Args.size() == 1 && "Invalid number of arguments"); + return emitBuiltinInlineRelationalsWithOneArgument(BuiltinID, B, Args[0]); + // 6.12.12 Miscellaneous Vector Functions + case eCLBuiltinShuffle: + case eCLBuiltinShuffle2: + return emitBuiltinInlineShuffle(BuiltinID, B, Args); + + case eCLBuiltinPrintf: + return emitBuiltinInlinePrintf(BuiltinID, B, Args); + } +} + +Value *CLBuiltinInfo::emitBuiltinInlineGeometrics(BuiltinID BuiltinID, + IRBuilder<> &B, + ArrayRef Args) { + Value *Src = nullptr; + switch (BuiltinID) { + default: + return nullptr; + case eCLBuiltinDot: + return emitBuiltinInlineDot(B, Args); + case eCLBuiltinCross: + return emitBuiltinInlineCross(B, Args); + case eCLBuiltinLength: + case eCLBuiltinFastLength: + return emitBuiltinInlineLength(B, Args); + case 
eCLBuiltinDistance: + case eCLBuiltinFastDistance: + if (Args.size() != 2) { + return nullptr; + } + Src = B.CreateFSub(Args[0], Args[1], "distance"); + return emitBuiltinInlineLength(B, ArrayRef(&Src, 1)); + case eCLBuiltinNormalize: + case eCLBuiltinFastNormalize: + return emitBuiltinInlineNormalize(B, Args); + } +} + +Value *CLBuiltinInfo::emitBuiltinInlineDot(IRBuilder<> &B, + ArrayRef Args) { + if (Args.size() != 2) { + return nullptr; + } + Value *Src0 = Args[0]; + Value *Src1 = Args[1]; + auto *SrcVecTy = dyn_cast(Src0->getType()); + if (SrcVecTy) { + Value *LHS0 = B.CreateExtractElement(Src0, B.getInt32(0), "lhs"); + Value *RHS0 = B.CreateExtractElement(Src1, B.getInt32(0), "rhs"); + Value *Sum = B.CreateFMul(LHS0, RHS0, "dot"); + for (unsigned i = 1; i < SrcVecTy->getNumElements(); i++) { + Value *LHS = B.CreateExtractElement(Src0, B.getInt32(i), "lhs"); + Value *RHS = B.CreateExtractElement(Src1, B.getInt32(i), "rhs"); + Sum = B.CreateFAdd(Sum, B.CreateFMul(LHS, RHS, "dot"), "dot"); + } + return Sum; + } else { + return B.CreateFMul(Src0, Src1, "dot"); + } +} + +Value *CLBuiltinInfo::emitBuiltinInlineCross(IRBuilder<> &B, + ArrayRef Args) { + if (Args.size() != 2) { + return nullptr; + } + Value *Src0 = Args[0]; + Value *Src1 = Args[1]; + auto *RetTy = dyn_cast(Src0->getType()); + if (!RetTy) { + return nullptr; + } + const int SrcIndices[] = {1, 2, 2, 0, 0, 1}; + SmallVector Src0Lanes; + SmallVector Src1Lanes; + for (unsigned i = 0; i < 3; i++) { + Src0Lanes.push_back(B.CreateExtractElement(Src0, B.getInt32(i))); + Src1Lanes.push_back(B.CreateExtractElement(Src1, B.getInt32(i))); + } + + Value *Result = PoisonValue::get(RetTy); + for (unsigned i = 0; i < 3; i++) { + const int Idx0 = SrcIndices[(i * 2) + 0]; + const int Idx1 = SrcIndices[(i * 2) + 1]; + Value *Src0A = Src0Lanes[Idx0]; + Value *Src1A = Src1Lanes[Idx1]; + Value *TempA = B.CreateFMul(Src0A, Src1A); + Value *Src0B = Src0Lanes[Idx1]; + Value *Src1B = Src1Lanes[Idx0]; + Value *TempB = B.CreateFMul(Src0B, Src1B); + Value *Lane = B.CreateFSub(TempA, TempB); + Result = B.CreateInsertElement(Result, Lane, B.getInt32(i)); + } + if (RetTy->getNumElements() == 4) { + Type *EleTy = RetTy->getElementType(); + Result = B.CreateInsertElement(Result, Constant::getNullValue(EleTy), + B.getInt32(3)); + } + return Result; +} + +Value *CLBuiltinInfo::emitBuiltinInlineLength(IRBuilder<> &B, + ArrayRef Args) { + if (Args.size() != 1) { + return nullptr; + } + Value *Src0 = Args[0]; + Value *Src1 = Src0; + + NameMangler Mangler(&B.getContext()); + Type *SrcType = Src0->getType(); + auto *SrcVecType = dyn_cast(SrcType); + if (SrcVecType) { + SrcType = SrcVecType->getElementType(); + } + + TypeQualifiers SrcQuals; + SmallVector Tys; + SmallVector Quals; + SrcQuals.push_back(eTypeQualNone); + + // Materialize 'sqrt', 'fabs' and 'isinf'. + Tys.push_back(SrcType); + Quals.push_back(SrcQuals); + BasicBlock *BB = B.GetInsertBlock(); + if (!BB) { + return nullptr; + } + Function *F = BB->getParent(); + if (!F) { + return nullptr; + } + Module *M = F->getParent(); + if (!M) { + return nullptr; + } + + const std::string FabsName = Mangler.mangleName("fabs", Tys, Quals); + Function *Fabs = materializeBuiltin(FabsName, M); + if (!Fabs) { + return nullptr; + } + if (!SrcVecType) { + // The "length" of a scalar is just the absolute value. 
+ return CreateBuiltinCall(B, Fabs, Src0, "scalar_length"); + } + + const std::string SqrtName = Mangler.mangleName("sqrt", Tys, Quals); + Function *Sqrt = materializeBuiltin(SqrtName, M); + if (!Sqrt) { + return nullptr; + } + + const std::string IsInfName = Mangler.mangleName("isinf", Tys, Quals); + Function *IsInf = materializeBuiltin(IsInfName, M); + if (!IsInf) { + return nullptr; + } + Tys.clear(); + Quals.clear(); + + // Materialize 'fmax'. + Tys.push_back(SrcType); + Quals.push_back(SrcQuals); + Tys.push_back(SrcType); + Quals.push_back(SrcQuals); + const std::string FmaxName = Mangler.mangleName("fmax", Tys, Quals); + Function *Fmax = materializeBuiltin(FmaxName, M); + if (!Fmax) { + return nullptr; + } + + // Emit length or distance inline. + SmallVector Ops; + Ops.push_back(Src0); + Ops.push_back(Src1); + Value *Result = emitBuiltinInline(eCLBuiltinDot, B, Ops); + Result = CreateBuiltinCall(B, Sqrt, Result, "result"); + + // Handle the case where the result is infinite. + Value *AltResult = ConstantFP::get(SrcType, 0.0); + if (SrcVecType) { + for (unsigned i = 0; i < SrcVecType->getNumElements(); i++) { + Value *SrcLane = B.CreateExtractElement(Src0, B.getInt32(i), "src_lane"); + SrcLane = CreateBuiltinCall(B, Fabs, SrcLane, "src_lane"); + AltResult = + CreateBuiltinCall(B, Fmax, {SrcLane, AltResult}, "alt_result"); + } + } else { + Value *SrcLane = CreateBuiltinCall(B, Fabs, Src0, "src_lane"); + AltResult = CreateBuiltinCall(B, Fmax, {SrcLane, AltResult}, "alt_result"); + } + Value *Cond = CreateBuiltinCall(B, IsInf, Result, "cond"); + Cond = B.CreateICmpEQ(Cond, B.getInt32(0), "cmp"); + Result = B.CreateSelect(Cond, Result, AltResult, "final_result"); + return Result; +} + +Value *CLBuiltinInfo::emitBuiltinInlineNormalize(IRBuilder<> &B, + ArrayRef Args) { + if (Args.size() != 1) { + return nullptr; + } + + Value *Src0 = Args[0]; + + NameMangler Mangler(&B.getContext()); + Type *SrcType = Src0->getType(); + auto *SrcVecType = dyn_cast(SrcType); + if (SrcVecType) { + SrcType = SrcVecType->getElementType(); + } + + TypeQualifiers SrcQuals; + SmallVector Tys; + SmallVector Quals; + SrcQuals.push_back(eTypeQualNone); + + // Materialize 'rsqrt'. + Tys.push_back(SrcType); + Quals.push_back(SrcQuals); + BasicBlock *BB = B.GetInsertBlock(); + if (!BB) { + return nullptr; + } + Function *F = BB->getParent(); + if (!F) { + return nullptr; + } + Module *M = F->getParent(); + if (!M) { + return nullptr; + } + + if (!SrcVecType) { + // A normalized scalar is either 1.0 or -1.0, unless the input was NaN, or + // in other words, just the sign. + const std::string SignName = Mangler.mangleName("sign", Tys, Quals); + Function *Sign = materializeBuiltin(SignName, M); + if (!Sign) { + return nullptr; + } + return CreateBuiltinCall(B, Sign, Src0, "scalar_normalize"); + } + + const std::string RSqrtName = Mangler.mangleName("rsqrt", Tys, Quals); + Function *RSqrt = materializeBuiltin(RSqrtName, M); + if (!RSqrt) { + return nullptr; + } + + // Call 'dot' on the input. 
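+ // normalize(v) is computed as v * rsqrt(dot(v, v)); for example, + // normalize((float2)(3.0f, 4.0f)) yields (0.6f, 0.8f).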
+ SmallVector DotArgs; + DotArgs.push_back(Src0); + DotArgs.push_back(Src0); + Value *Result = emitBuiltinInlineDot(B, DotArgs); + Result = CreateBuiltinCall(B, RSqrt, Result, "normalize"); + if (SrcVecType) { + Result = B.CreateVectorSplat(SrcVecType->getNumElements(), Result); + } + Result = B.CreateFMul(Result, Src0, "normalized"); + return Result; +} + +static Value *emitAllAnyReduction(IRBuilder<> &B, ArrayRef Args, + Instruction::BinaryOps ReduceOp) { + if (Args.size() != 1) { + return nullptr; + } + Value *Arg0 = Args[0]; + IntegerType *EleTy = dyn_cast(Arg0->getType()->getScalarType()); + if (!EleTy) { + return nullptr; + } + + // Reduce the MSB of all vector lanes. + Value *ReducedVal = nullptr; + auto *VecTy = dyn_cast(Arg0->getType()); + if (VecTy) { + ReducedVal = B.CreateExtractElement(Arg0, B.getInt32(0)); + for (unsigned i = 1; i < VecTy->getNumElements(); i++) { + Value *Lane = B.CreateExtractElement(Arg0, B.getInt32(i)); + ReducedVal = B.CreateBinOp(ReduceOp, ReducedVal, Lane); + } + } else { + ReducedVal = Arg0; + } + + // Shift the MSB to return either 0 or 1. + const unsigned ShiftAmount = EleTy->getPrimitiveSizeInBits() - 1; + Value *ShiftAmountVal = ConstantInt::get(EleTy, ShiftAmount); + Value *Result = B.CreateLShr(ReducedVal, ShiftAmountVal); + return B.CreateZExtOrTrunc(Result, B.getInt32Ty()); +} + +Value *CLBuiltinInfo::emitBuiltinInlineAll(IRBuilder<> &B, + ArrayRef Args) { + return emitAllAnyReduction(B, Args, Instruction::And); +} + +Value *CLBuiltinInfo::emitBuiltinInlineAny(IRBuilder<> &B, + ArrayRef Args) { + return emitAllAnyReduction(B, Args, Instruction::Or); +} + +Value *CLBuiltinInfo::emitBuiltinInlineSelect(Function *F, IRBuilder<> &B, + ArrayRef Args) { + if (F->arg_size() != 3) { + return nullptr; + } + Value *FalseVal = Args[0]; + Value *TrueVal = Args[1]; + Value *Cond = Args[2]; + Type *RetTy = F->getReturnType(); + auto *VecRetTy = dyn_cast(RetTy); + Type *CondEleTy = Cond->getType()->getScalarType(); + const unsigned CondEleBits = CondEleTy->getPrimitiveSizeInBits(); + if (VecRetTy) { + const unsigned SimdWidth = VecRetTy->getNumElements(); + Constant *ShiftAmount = ConstantInt::get(CondEleTy, CondEleBits - 1); + Constant *VecShiftAmount = ConstantVector::getSplat( + ElementCount::getFixed(SimdWidth), ShiftAmount); + Value *Mask = B.CreateAShr(Cond, VecShiftAmount); + Value *TrueValRaw = TrueVal; + Value *FalseValRaw = FalseVal; + if (VecRetTy->getElementType()->isFloatingPointTy()) { + auto *RawType = FixedVectorType::getInteger(VecRetTy); + TrueValRaw = B.CreateBitCast(TrueVal, RawType); + FalseValRaw = B.CreateBitCast(FalseVal, RawType); + } + Value *Result = B.CreateXor(TrueValRaw, FalseValRaw); + Result = B.CreateAnd(Result, Mask); + Result = B.CreateXor(Result, FalseValRaw); + if (Result->getType() != VecRetTy) { + Result = B.CreateBitCast(Result, VecRetTy); + } + return Result; + } else { + Value *Cmp = B.CreateICmpNE(Cond, Constant::getNullValue(CondEleTy)); + return B.CreateSelect(Cmp, TrueVal, FalseVal); + } +} + +/// @brief Emit the body of a builtin function as a call to a binary LLVM +/// intrinsic. If one argument is a scalar type and the other a vector type, +/// the scalar argument is splatted to the vector type. +/// +/// @param[in] B Builder used to emit instructions. +/// @param[in] LHS first argument to be passed to the intrinsic. +/// @param[in] RHS second argument to be passed to the intrinsic. +/// @param[in] ID the LLVM intrinsic ID. +/// +/// @return Value returned by the builtin implementation or null on failure. 
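+/// +/// For example, fmax(float4, float) splats the scalar operand to a float4 and +/// emits a single call to the llvm.maxnum.v4f32 intrinsic.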
+Value *CLBuiltinInfo::emitBuiltinInlineAsLLVMBinaryIntrinsic( + IRBuilder<> &B, Value *LHS, Value *RHS, llvm::Intrinsic::ID ID) { + const Triple TT(B.GetInsertBlock()->getModule()->getTargetTriple()); + if (TT.getArch() == Triple::arm || TT.getArch() == Triple::aarch64) { + // fmin and fmax fail CTS on arm targets. + // This is a HACK and should be removed when it is resolved. + return nullptr; + } + + const auto *LHSTy = LHS->getType(); + const auto *RHSTy = RHS->getType(); + if (LHSTy->isVectorTy() != RHSTy->isVectorTy()) { + auto VectorEC = + multi_llvm::getVectorElementCount(LHSTy->isVectorTy() ? LHSTy : RHSTy); + if (!LHS->getType()->isVectorTy()) { + LHS = B.CreateVectorSplat(VectorEC, LHS); + } + if (!RHS->getType()->isVectorTy()) { + RHS = B.CreateVectorSplat(VectorEC, RHS); + } + } + return B.CreateBinaryIntrinsic(ID, LHS, RHS); +} + +/// @brief Emit the body of the 'as_*' builtin function. +/// +/// @param[in] F Function to emit the body inline. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. +Value *CLBuiltinInfo::emitBuiltinInlineAs(Function *F, llvm::IRBuilder<> &B, + llvm::ArrayRef Args) { + if (Args.size() != 1) { + return nullptr; + } + Value *Src = Args[0]; + Type *SrcTy = Src->getType(); + Type *DstTy = F->getReturnType(); + auto *SrcVecTy = dyn_cast(SrcTy); + auto *DstVecTy = dyn_cast(DstTy); + Type *SrcEleTy = SrcVecTy ? SrcVecTy->getElementType() : nullptr; + Type *DstEleTy = DstVecTy ? DstVecTy->getElementType() : nullptr; + const unsigned SrcEleBits = SrcEleTy ? SrcEleTy->getPrimitiveSizeInBits() : 0; + const unsigned DstEleBits = DstEleTy ? DstEleTy->getPrimitiveSizeInBits() : 0; + const bool SrcDstHaveSameWidth = + SrcEleTy && DstEleTy && (SrcEleBits == DstEleBits); + const bool SrcVec3 = SrcVecTy && (SrcVecTy->getNumElements() == 3); + const bool SrcVec4 = SrcVecTy && (SrcVecTy->getNumElements() == 4); + const bool DstVec3 = DstVecTy && (DstVecTy->getNumElements() == 3); + const bool DstVec4 = DstVecTy && (DstVecTy->getNumElements() == 4); + bool LowerAsShuffle = false; + if (SrcVec3 && !DstVec3) { + if (!DstVec4 || !SrcDstHaveSameWidth) { + return nullptr; + } + LowerAsShuffle = true; + } else if (DstVec3 && !SrcVec3) { + if (!SrcVec4 || !SrcDstHaveSameWidth) { + return nullptr; + } + LowerAsShuffle = true; + } + + // Lower some vec3 variants of as_* using vector shuffles. + if (LowerAsShuffle) { + SmallVector Indices; + for (unsigned i = 0; i < DstVecTy->getNumElements(); i++) { + if (i < SrcVecTy->getNumElements()) { + Indices.push_back(B.getInt32(i)); + } else { + Indices.push_back(PoisonValue::get(B.getInt32Ty())); + } + } + Value *Mask = ConstantVector::get(Indices); + Src = B.CreateShuffleVector(Src, PoisonValue::get(SrcVecTy), Mask); + } + + // Common case: as_* is a simple bitcast. + return B.CreateBitCast(Src, DstTy, "as"); +} + +/// @brief Emit the body of the 'convert_*' builtin functions. +/// +/// @param[in] F the function to emit inline. +/// @param[in] builtinID Builtin ID of the function. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. 
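+/// +/// For example, convert_int(float x) lowers to 'fptosi float %x to i32' and +/// convert_uchar(int x) lowers to 'trunc i32 %x to i8'.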
+Value *CLBuiltinInfo::emitBuiltinInlineConvert(Function *F, BuiltinID builtinID, + IRBuilder<> &B, + ArrayRef Args) { + if (Args.size() != 1) { + return nullptr; + } + Type *DstTy = nullptr; + bool DstIsSigned = false; + auto &Ctx = B.getContext(); + switch (builtinID) { + case eCLBuiltinConvertChar: + DstIsSigned = true; + LLVM_FALLTHROUGH; + case eCLBuiltinConvertUChar: + DstTy = IntegerType::getInt8Ty(Ctx); + break; + case eCLBuiltinConvertShort: + DstIsSigned = true; + LLVM_FALLTHROUGH; + case eCLBuiltinConvertUShort: + DstTy = IntegerType::getInt16Ty(Ctx); + break; + case eCLBuiltinConvertInt: + DstIsSigned = true; + LLVM_FALLTHROUGH; + case eCLBuiltinConvertUInt: + DstTy = IntegerType::getInt32Ty(Ctx); + break; + case eCLBuiltinConvertLong: + DstIsSigned = true; + LLVM_FALLTHROUGH; + case eCLBuiltinConvertULong: + DstTy = IntegerType::getInt64Ty(Ctx); + break; + + default: + return nullptr; + } + if (!DstTy) { + return nullptr; + } + + Value *Src = Args[0]; + bool SrcIsSigned; + if (Src->getType()->isFloatingPointTy()) { + // All floating point types are signed + SrcIsSigned = true; + } else { + auto IsParamSignedOrNone = paramHasTypeQual(*F, 0, eTypeQualSignedInt); + if (!IsParamSignedOrNone) { + return nullptr; + } + SrcIsSigned = *IsParamSignedOrNone; + } + + auto Opcode = CastInst::getCastOpcode(Src, SrcIsSigned, DstTy, DstIsSigned); + return B.CreateCast(Opcode, Src, DstTy, "inline_convert"); +} + +/// @brief Emit the body of the 'vloadN' builtin function. +/// +/// @param[in] F Function to emit the body inline. +/// @param[in] Width Number of elements to load. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. +Value *CLBuiltinInfo::emitBuiltinInlineVLoad(Function *F, unsigned Width, + IRBuilder<> &B, + ArrayRef Args) { + if (Width < 2) { + return nullptr; + } + (void)F; + + Type *RetTy = F->getReturnType(); + assert(isa(RetTy) && "vloadN must return a vector type"); + Type *EltTy = RetTy->getScalarType(); + + Value *Ptr = Args[1]; + PointerType *PtrTy = dyn_cast(Ptr->getType()); + if (!PtrTy) { + return nullptr; + } + auto *DataTy = FixedVectorType::get(EltTy, Width); + Value *Data = PoisonValue::get(DataTy); + + // Emit the base pointer. + Value *Offset = Args[0]; + IntegerType *OffsetTy = dyn_cast(Offset->getType()); + if (!OffsetTy) { + return nullptr; + } + Value *Stride = ConstantInt::get(OffsetTy, Width); + Offset = B.CreateMul(Offset, Stride); + Value *GEPBase = B.CreateGEP(EltTy, Ptr, Offset, "vload_base"); + + if (Width == 3) { + for (unsigned i = 0; i < Width; i++) { + Value *Index = B.getInt32(i); + Value *GEP = B.CreateGEP(EltTy, GEPBase, Index); + Value *Lane = B.CreateLoad(EltTy, GEP, false, "vload"); + Data = B.CreateInsertElement(Data, Lane, Index, "vload_insert"); + } + } else { + auto *Load = B.CreateLoad(DataTy, GEPBase, false, "vload"); + + const unsigned Align = DataTy->getScalarSizeInBits() / 8; + Load->setAlignment(MaybeAlign(Align).valueOrOne()); + Data = Load; + } + + return Data; +} + +/// @brief Emit the body of the 'vstoreN' builtin function. +/// +/// @param[in] F Function to emit the body inline. +/// @param[in] Width Number of elements to store. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. 
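+/// +/// As with vloadN, the three-element case is emitted as per-element stores so +/// that no memory beyond the third element is touched; all other widths use a +/// single vector store.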
+Value *CLBuiltinInfo::emitBuiltinInlineVStore(Function *F, unsigned Width, + IRBuilder<> &B, + ArrayRef Args) { + if (Width < 2) { + return nullptr; + } + (void)F; + + Value *Data = Args[0]; + auto *VecDataTy = dyn_cast(Data->getType()); + if (!VecDataTy || (VecDataTy->getNumElements() != Width)) { + return nullptr; + } + + Value *Ptr = Args[2]; + PointerType *PtrTy = dyn_cast(Ptr->getType()); + if (!PtrTy) { + return nullptr; + } + + // Emit the base pointer. + Value *Offset = Args[1]; + IntegerType *OffsetTy = dyn_cast(Offset->getType()); + if (!OffsetTy) { + return nullptr; + } + Value *Stride = ConstantInt::get(OffsetTy, Width); + Offset = B.CreateMul(Offset, Stride); + Value *GEPBase = + B.CreateGEP(VecDataTy->getElementType(), Ptr, Offset, "vstore_base"); + + // Emit store(s). + StoreInst *Store = nullptr; + if (Width == 3) { + for (unsigned i = 0; i < Width; i++) { + Value *Index = B.getInt32(i); + Value *Lane = B.CreateExtractElement(Data, Index, "vstore_extract"); + Value *GEP = B.CreateGEP(VecDataTy->getElementType(), GEPBase, Index); + Store = B.CreateStore(Lane, GEP, false); + } + } else { + Store = B.CreateStore(Data, GEPBase, false); + + const unsigned Align = VecDataTy->getScalarSizeInBits() / 8; + Store->setAlignment(MaybeAlign(Align).valueOrOne()); + } + return Store; +} + +/// @brief Emit the body of the 'vload_half' builtin function. +/// +/// @param[in] F Function to emit the body inline. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. +Value *CLBuiltinInfo::emitBuiltinInlineVLoadHalf(Function *F, IRBuilder<> &B, + ArrayRef Args) { + if (F->getType()->isVectorTy()) { + return nullptr; + } + + // Cast the pointer to ushort*. + Value *Ptr = Args[1]; + PointerType *PtrTy = dyn_cast(Ptr->getType()); + if (!PtrTy) { + return nullptr; + } + Type *U16Ty = B.getInt16Ty(); + + // Emit the base pointer. + Value *Offset = Args[0]; + Value *DataPtr = B.CreateGEP(U16Ty, Ptr, Offset, "vload_base"); + + // Load a ushort. + Value *Data = B.CreateLoad(B.getInt16Ty(), DataPtr, "vload_half"); + + // Declare the conversion builtin. + Module *M = F->getParent(); + Function *HalfToFloatFn = + declareBuiltin(M, eCLBuiltinConvertHalfToFloat, B.getFloatTy(), + {B.getInt16Ty()}, {eTypeQualNone}); + if (!HalfToFloatFn) { + return nullptr; + } + + // Convert it to float. + CallInst *CI = CreateBuiltinCall(B, HalfToFloatFn, {Data}); + CI->setCallingConv(F->getCallingConv()); + + return CI; +} + +/// @brief Emit the body of the 'vstore_half' builtin function. +/// +/// @param[in] F Function to emit the body inline. +/// @param[in] Mode Rounding mode to use, e.g. '_rte'. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. +Value *CLBuiltinInfo::emitBuiltinInlineVStoreHalf(Function *F, StringRef Mode, + IRBuilder<> &B, + ArrayRef Args) { + Value *Data = Args[0]; + if (!Data || Data->getType()->isVectorTy()) { + return nullptr; + } + + // Declare the conversion builtin. 
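+ // The rounding-mode suffix parsed from the mangled name selects the + // conversion builtin, e.g. vstore_half_rtz maps to the round-towards-zero + // float-to-half (or double-to-half) conversion.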
+ std::optional<BuiltinID> ConvID; + + if (Data->getType() == B.getFloatTy()) { + ConvID = StringSwitch<std::optional<BuiltinID>>(Mode) + .Case("", eCLBuiltinConvertFloatToHalf) + .Case("_rte", eCLBuiltinConvertFloatToHalfRte) + .Case("_rtz", eCLBuiltinConvertFloatToHalfRtz) + .Case("_rtp", eCLBuiltinConvertFloatToHalfRtp) + .Case("_rtn", eCLBuiltinConvertFloatToHalfRtn) + .Default(std::nullopt); + } else { + ConvID = StringSwitch<std::optional<BuiltinID>>(Mode) + .Case("", eCLBuiltinConvertDoubleToHalf) + .Case("_rte", eCLBuiltinConvertDoubleToHalfRte) + .Case("_rtz", eCLBuiltinConvertDoubleToHalfRtz) + .Case("_rtp", eCLBuiltinConvertDoubleToHalfRtp) + .Case("_rtn", eCLBuiltinConvertDoubleToHalfRtn) + .Default(std::nullopt); + } + if (!ConvID) { + return nullptr; + } + Module *M = F->getParent(); + + // Normally, the vstore_half functions take the number to store as a float. + // However, if the double extension is enabled, it is also possible to use + // double instead. This means that we might have to convert either a float or + // a double to a half. + Function *FloatToHalfFn = declareBuiltin(M, *ConvID, B.getInt16Ty(), + {Data->getType()}, {eTypeQualNone}); + if (!FloatToHalfFn) { + return nullptr; + } + + // Convert the data from float/double to half. + CallInst *CI = CreateBuiltinCall(B, FloatToHalfFn, {Data}); + CI->setCallingConv(F->getCallingConv()); + Data = CI; + + // Cast the pointer to ushort*. + Value *Ptr = Args[2]; + PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); + if (!PtrTy) { + return nullptr; + } + auto U16Ty = B.getInt16Ty(); + + // Emit the base pointer. + Value *Offset = Args[1]; + Value *DataPtr = B.CreateGEP(U16Ty, Ptr, Offset, "vstore_base"); + + // Store the ushort. + return B.CreateStore(Data, DataPtr); +} + +/// @brief Emit the body of a relational builtin function. +/// +/// This function handles relational builtins that accept two arguments, such as +/// the comparison builtins. +/// +/// @param[in] BuiltinID Identifier of the builtin to emit the body inline. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. +Value *CLBuiltinInfo::emitBuiltinInlineRelationalsWithTwoArguments( + BuiltinID BuiltinID, IRBuilder<> &B, ArrayRef<Value *> Args) { + CmpInst::Predicate Pred = CmpInst::FCMP_FALSE; + CmpInst::Predicate Pred2 = CmpInst::FCMP_FALSE; + switch (BuiltinID) { + default: + return nullptr; + case eCLBuiltinIsEqual: + Pred = CmpInst::FCMP_OEQ; + break; + case eCLBuiltinIsNotEqual: + Pred = CmpInst::FCMP_UNE; + break; + case eCLBuiltinIsGreater: + Pred = CmpInst::FCMP_OGT; + break; + case eCLBuiltinIsGreaterEqual: + Pred = CmpInst::FCMP_OGE; + break; + case eCLBuiltinIsLess: + Pred = CmpInst::FCMP_OLT; + break; + case eCLBuiltinIsLessEqual: + Pred = CmpInst::FCMP_OLE; + break; + case eCLBuiltinIsLessGreater: + Pred = CmpInst::FCMP_OLT; + Pred2 = CmpInst::FCMP_OGT; + break; + case eCLBuiltinIsOrdered: + Pred = CmpInst::FCMP_ORD; + break; + case eCLBuiltinIsUnordered: + Pred = CmpInst::FCMP_UNO; + break; + } + + if (Args.size() != 2) { + return nullptr; + } + Value *Src0 = Args[0], *Src1 = Args[1]; + Value *Cmp = B.CreateFCmp(Pred, Src0, Src1, "relational"); + + Type *ResultEleTy = nullptr; + Type *Src0Ty = Src0->getType(); + if (Src0->getType() == B.getDoubleTy()) { + // Special case because relational(doubleN, doubleN) returns longn while + // relational(double, double) returns int. 
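+ // e.g. isless(double2, double2) returns long2, whereas isless(double, + // double) returns int.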
+ if (Src0Ty->isVectorTy()) { + ResultEleTy = B.getInt64Ty(); + } else { + ResultEleTy = B.getInt32Ty(); + } + } else if (Src0->getType() == B.getHalfTy()) { + // Special case because relational(HalfTyN, HalfTyN) returns i16 while + // relational(HalfTy, HalfTy) returns int. + if (Src0Ty->isVectorTy()) { + ResultEleTy = B.getInt16Ty(); + } else { + ResultEleTy = B.getInt32Ty(); + } + } else { + // All the other cases can be handled here. + ResultEleTy = B.getIntNTy(Src0->getType()->getScalarSizeInBits()); + } + Value *Result = nullptr; + auto *SrcVecTy = dyn_cast(Src0->getType()); + if (SrcVecTy) { + auto *ResultVecTy = + FixedVectorType::get(ResultEleTy, SrcVecTy->getNumElements()); + Result = B.CreateSExt(Cmp, ResultVecTy, "relational"); + } else { + Result = B.CreateZExt(Cmp, ResultEleTy, "relational"); + } + + if (Pred2 != CmpInst::FCMP_FALSE) { + Value *Cmp2 = B.CreateFCmp(Pred2, Src0, Src1, "relational"); + Value *True = SrcVecTy ? Constant::getAllOnesValue(Result->getType()) + : ConstantInt::get(Result->getType(), 1); + Result = B.CreateSelect(Cmp2, True, Result); + } + + return Result; +} + +/// @brief Emit the body of a relational builtin function. +/// +/// This function handles relational builtins that accept a single argument, +/// such as the builtins checking if the argument is infinite or not. +/// +/// @param[in] BuiltinID Identifier of the builtin to emit the body inline. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Arg Argument passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. +Value *CLBuiltinInfo::emitBuiltinInlineRelationalsWithOneArgument( + BuiltinID BuiltinID, IRBuilder<> &B, Value *Arg) { + Value *Result = nullptr; + // The types (and misc info) that we will be using + Type *ArgTy = Arg->getType(); + const bool isVectorTy = ArgTy->isVectorTy(); + const unsigned Width = + isVectorTy ? multi_llvm::getVectorNumElements(ArgTy) : 0; + Type *ArgEleTy = isVectorTy ? multi_llvm::getVectorElementType(ArgTy) : ArgTy; + Type *SignedTy = ArgEleTy == B.getFloatTy() ? B.getInt32Ty() : B.getInt64Ty(); + Type *ReturnTy = (ArgEleTy == B.getDoubleTy() && isVectorTy) ? 
B.getInt64Ty() + : B.getInt32Ty(); + + if (ArgEleTy != B.getFloatTy() && ArgEleTy != B.getDoubleTy()) { + return nullptr; + } + // Create all the masks we are going to be using + Constant *ExponentMask = nullptr; + Constant *MantissaMask = nullptr; + Constant *NonSignMask = nullptr; + Constant *Zero = nullptr; + if (ArgEleTy == B.getFloatTy()) { + ExponentMask = B.getInt32(0x7F800000u); + MantissaMask = B.getInt32(0x007FFFFFu); + NonSignMask = B.getInt32(0x7FFFFFFFu); + Zero = B.getInt32(0u); + } else if (ArgEleTy == B.getDoubleTy()) { + ExponentMask = B.getInt64(0x7FF0000000000000u); + MantissaMask = B.getInt64(0x000FFFFFFFFFFFFFu); + NonSignMask = B.getInt64(0x7FFFFFFFFFFFFFFFu); + Zero = B.getInt64(0u); + } + + // For the vector versions, we need to create vector types and values + if (isVectorTy) { + SignedTy = FixedVectorType::get(SignedTy, Width); + ReturnTy = FixedVectorType::get(ReturnTy, Width); + const auto EC = ElementCount::getFixed(Width); + ExponentMask = ConstantVector::getSplat(EC, ExponentMask); + MantissaMask = ConstantVector::getSplat(EC, MantissaMask); + NonSignMask = ConstantVector::getSplat(EC, NonSignMask); + Zero = ConstantVector::getSplat(EC, Zero); + } + + // We will be needing access to the argument as an integer (bitcast) value + Value *STArg = B.CreateBitCast(Arg, SignedTy); + + // Emit the IR that will calculate the result + switch (BuiltinID) { + default: + llvm_unreachable("Invalid Builtin ID"); + break; + case eCLBuiltinIsFinite: + Result = B.CreateAnd(STArg, NonSignMask); + Result = B.CreateICmpSLT(Result, ExponentMask); + break; + case eCLBuiltinIsInf: + Result = B.CreateAnd(STArg, NonSignMask); + Result = B.CreateICmpEQ(Result, ExponentMask); + break; + case eCLBuiltinIsNan: { + Result = B.CreateAnd(STArg, NonSignMask); + // This checks if the exponent is all ones (the same as the ExponentMask) + // and also if the significand (the mantissa) is not zero. If the mantissa + // is zero then it would be infinite, not NaN. + Value *ExponentAllOnes = + B.CreateICmpEQ(ExponentMask, B.CreateAnd(ExponentMask, Result)); + Value *MantissaNotZero = + B.CreateICmpSGT(B.CreateAnd(MantissaMask, Result), Zero); + Result = B.CreateAnd(ExponentAllOnes, MantissaNotZero); + break; + } + case eCLBuiltinIsNormal: { + Result = B.CreateAnd(STArg, NonSignMask); + Value *ExponentBitsNotAllSet = B.CreateICmpSLT(Result, ExponentMask); + Value *ExponentBitsNonZero = B.CreateICmpSGT(Result, MantissaMask); + Result = B.CreateAnd(ExponentBitsNotAllSet, ExponentBitsNonZero); + break; + } + case eCLBuiltinSignBit: + Result = B.CreateICmpSLT(STArg, Zero); + break; + } + + // Convert the i1 result from the comparison instruction to the type that the + // builtin returns + if (isVectorTy) { + // 0 for false, -1 (all 1s) for true + Result = B.CreateSExt(Result, ReturnTy); + } else { + // 0 for false, 1 for true + Result = B.CreateZExt(Result, ReturnTy); + } + + return Result; +} + +/// @brief Emit the body of a vector shuffle builtin function. +/// +/// @param[in] BuiltinID Identifier of the builtin to emit the body inline. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. +Value *CLBuiltinInfo::emitBuiltinInlineShuffle(BuiltinID BuiltinID, + IRBuilder<> &B, + ArrayRef<Value *> Args) { + // Make sure we have the correct number of arguments. 
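+ // shuffle(x, mask) takes two arguments; shuffle2(x, y, mask) takes three.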
+ assert(((BuiltinID == eCLBuiltinShuffle && Args.size() == 2) || + (BuiltinID == eCLBuiltinShuffle2 && Args.size() == 3)) && + "Wrong number of arguments!"); + + // It is not worth splitting shuffle and shuffle2 into two functions as a lot + // of the code is the same. + const bool isShuffle2 = (BuiltinID == eCLBuiltinShuffle2); + + // Get the mask and the mask type. + Value *Mask = Args[isShuffle2 ? 2 : 1]; + auto MaskVecTy = cast<FixedVectorType>(Mask->getType()); + IntegerType *MaskTy = cast<IntegerType>(MaskVecTy->getElementType()); + const int MaskWidth = MaskVecTy->getNumElements(); + + // TODO: Support non-constant masks (in a less efficient way) + if (!isa<Constant>(Mask)) { + return nullptr; + } + + // We need to mask the mask elements, since the OpenCL standard specifies that + // we should only take the ilogb(2N-1)+1 least significant bits from each mask + // element into consideration, where N is the number of elements in the vector + // according to vec_step. + auto ShuffleTy = cast<FixedVectorType>(Args[0]->getType()); + const int Width = ShuffleTy->getNumElements(); + // Vectors of size 3 are not supported by the shuffle builtin. + assert(Width != 3 && "Invalid vector width of 3!"); + const int N = (Width == 3 ? 4 : Width); + const int SignificantBits = + stdcompat::ilogb((2 * N) - 1) + (isShuffle2 ? 1 : 0); + const unsigned BitMask = ~((~0u) << SignificantBits); + Value *BitMaskV = ConstantVector::getSplat(ElementCount::getFixed(MaskWidth), + ConstantInt::get(MaskTy, BitMask)); + // The builtin's mask may have different integer types, while the LLVM + // instruction only supports i32. + // Mask the mask. + Value *MaskedMask = B.CreateAnd(Mask, BitMaskV, "mask"); + MaskedMask = B.CreateIntCast( + MaskedMask, FixedVectorType::get(B.getInt32Ty(), MaskWidth), false); + + // Create the shufflevector instruction. + Value *Arg1 = (isShuffle2 ? Args[1] : PoisonValue::get(ShuffleTy)); + return B.CreateShuffleVector(Args[0], Arg1, MaskedMask, "shuffle"); +} + +Value *CLBuiltinInfo::emitBuiltinInlinePrintf(BuiltinID, IRBuilder<> &B, + ArrayRef<Value *> Args) { + Module &M = *(B.GetInsertBlock()->getModule()); + + // Declare printf if needed. + Function *Printf = M.getFunction("printf"); + if (!Printf) { + PointerType *PtrTy = B.getPtrTy(/*AddrSpace=*/0); + FunctionType *PrintfTy = FunctionType::get(B.getInt32Ty(), {PtrTy}, true); + Printf = + Function::Create(PrintfTy, GlobalValue::ExternalLinkage, "printf", &M); + Printf->setCallingConv(CallingConv::SPIR_FUNC); + } + + return CreateBuiltinCall(B, Printf, Args); +} + +// Must be kept in sync with our OpenCL headers! +enum : uint32_t { + CLK_LOCAL_MEM_FENCE = 1, + CLK_GLOBAL_MEM_FENCE = 2, + // FIXME: We don't support image fences in our headers +}; + +// Must be kept in sync with our OpenCL headers! +enum : uint32_t { + memory_scope_work_item = 1, + memory_scope_sub_group = 2, + memory_scope_work_group = 3, + memory_scope_device = 4, + memory_scope_all_svm_devices = 5, + memory_scope_all_devices = 6, +}; + +// Must be kept in sync with our OpenCL headers! +enum : uint32_t { + memory_order_relaxed = 0, + memory_order_acquire = 1, + memory_order_release = 2, + memory_order_acq_rel = 3, + memory_order_seq_cst = 4, +}; + +static std::optional<uint32_t> parseMemFenceFlagsParam(Value *const P) { + // Grab the 'flags' parameter. + if (auto *const Flags = dyn_cast<ConstantInt>(P)) { + // cl_mem_fence_flags is a bitfield and can be 0 or a combination of + // CLK_(GLOBAL|LOCAL|IMAGE)_MEM_FENCE values ORed together. 
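+ // e.g. the flags CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE map to the union + // of the work-group and cross-work-group memory semantics below.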
+ switch (Flags->getZExtValue()) { + case 0: + return std::nullopt; + case CLK_LOCAL_MEM_FENCE: + return BIMuxInfoConcept::MemSemanticsWorkGroupMemory; + case CLK_GLOBAL_MEM_FENCE: + return BIMuxInfoConcept::MemSemanticsCrossWorkGroupMemory; + case CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE: + return (BIMuxInfoConcept::MemSemanticsWorkGroupMemory | + BIMuxInfoConcept::MemSemanticsCrossWorkGroupMemory); + default: + llvm_unreachable("unhandled memory fence flags"); + } + } + return std::nullopt; +} + +static std::optional parseMemoryScopeParam(Value *const P) { + if (auto *const Scope = dyn_cast(P)) { + switch (Scope->getZExtValue()) { + case memory_scope_work_item: + return BIMuxInfoConcept::MemScopeWorkItem; + case memory_scope_sub_group: + return BIMuxInfoConcept::MemScopeSubGroup; + case memory_scope_work_group: + return BIMuxInfoConcept::MemScopeWorkGroup; + case memory_scope_device: + return BIMuxInfoConcept::MemScopeDevice; + // 3.3.5. memory_scope_all_devices is an alias for + // memory_scope_all_svm_devices. + case memory_scope_all_devices: + case memory_scope_all_svm_devices: + return BIMuxInfoConcept::MemScopeCrossDevice; + default: + llvm_unreachable("unhandled memory scope"); + } + } + return std::nullopt; +} + +static std::optional parseMemoryOrderParam(Value *const P) { + if (auto *const Order = dyn_cast(P)) { + switch (Order->getZExtValue()) { + case memory_order_relaxed: + return BIMuxInfoConcept::MemSemanticsRelaxed; + case memory_order_acquire: + return BIMuxInfoConcept::MemSemanticsAcquire; + case memory_order_release: + return BIMuxInfoConcept::MemSemanticsRelease; + case memory_order_acq_rel: + return BIMuxInfoConcept::MemSemanticsAcquireRelease; + case memory_order_seq_cst: + return BIMuxInfoConcept::MemSemanticsSequentiallyConsistent; + default: + llvm_unreachable("unhandled memory order"); + } + } + return std::nullopt; +} + +// This function returns a mux builtin ID for the corresponding CL builtin ID +// when that lowering is straightforward and the function types of each builtin +// are identical. +static std::optional get1To1BuiltinLowering(BuiltinID CLBuiltinID) { + switch (CLBuiltinID) { + default: + return std::nullopt; + case eCLBuiltinGetWorkDim: + return eMuxBuiltinGetWorkDim; + case eCLBuiltinGetGroupId: + return eMuxBuiltinGetGroupId; + case eCLBuiltinGetGlobalSize: + return eMuxBuiltinGetGlobalSize; + case eCLBuiltinGetGlobalOffset: + return eMuxBuiltinGetGlobalOffset; + case eCLBuiltinGetLocalId: + return eMuxBuiltinGetLocalId; + case eCLBuiltinGetLocalSize: + return eMuxBuiltinGetLocalSize; + case eCLBuiltinGetEnqueuedLocalSize: + return eMuxBuiltinGetEnqueuedLocalSize; + case eCLBuiltinGetNumGroups: + return eMuxBuiltinGetNumGroups; + case eCLBuiltinGetGlobalId: + return eMuxBuiltinGetGlobalId; + case eCLBuiltinGetLocalLinearId: + return eMuxBuiltinGetLocalLinearId; + case eCLBuiltinGetGlobalLinearId: + return eMuxBuiltinGetGlobalLinearId; + case eCLBuiltinGetSubgroupSize: + return eMuxBuiltinGetSubGroupSize; + case eCLBuiltinGetMaxSubgroupSize: + return eMuxBuiltinGetMaxSubGroupSize; + case eCLBuiltinGetSubgroupLocalId: + return eMuxBuiltinGetSubGroupLocalId; + case eCLBuiltinGetNumSubgroups: + return eMuxBuiltinGetNumSubGroups; + case eCLBuiltinGetEnqueuedNumSubgroups: + // Note - this is mapping to the same builtin as + // eCLBuiltinGetNumSubgroups, as we don't currently support + // non-uniform work-group sizes. 
+ return eMuxBuiltinGetNumSubGroups; + case eCLBuiltinGetSubgroupId: + return eMuxBuiltinGetSubGroupId; + } +} + +Instruction * +CLBuiltinInfo::lowerBuiltinToMuxBuiltin(CallInst &CI, + BIMuxInfoConcept &BIMuxImpl) { + auto &M = *CI.getModule(); + auto *const F = CI.getCalledFunction(); + if (!F) { + return nullptr; + } + const auto ID = identifyBuiltin(*F); + if (!ID) { + return nullptr; + } + + // Handle straightforward 1:1 mappings. + if (auto MuxID = get1To1BuiltinLowering(*ID)) { + auto *const MuxBuiltinFn = BIMuxImpl.getOrDeclareMuxBuiltin(*MuxID, M); + assert(MuxBuiltinFn && "Could not get/declare mux builtin"); + const SmallVector Args(CI.args()); + auto *const NewCI = CallInst::Create(MuxBuiltinFn, Args, CI.getName()); + NewCI->insertBefore(CI.getIterator()); + NewCI->takeName(&CI); + NewCI->setAttributes(MuxBuiltinFn->getAttributes()); + return NewCI; + } + + IRBuilder<> B(&CI); + LLVMContext &Ctx = M.getContext(); + auto *const I32Ty = Type::getInt32Ty(Ctx); + + auto CtrlBarrierID = eMuxBuiltinWorkGroupBarrier; + unsigned DefaultMemScope = BIMuxInfoConcept::MemScopeWorkGroup; + unsigned DefaultMemOrder = + BIMuxInfoConcept::MemSemanticsSequentiallyConsistent; + + switch (*ID) { + default: + // Sub-group and work-group builtins need lowering to their mux + // equivalents. + if (auto *const NewI = lowerGroupBuiltinToMuxBuiltin(CI, *ID, BIMuxImpl)) { + return NewI; + } + return nullptr; + case eCLBuiltinSubGroupBarrier: + CtrlBarrierID = eMuxBuiltinSubGroupBarrier; + DefaultMemScope = BIMuxInfoConcept::MemScopeSubGroup; + LLVM_FALLTHROUGH; + case eCLBuiltinBarrier: + case eCLBuiltinWorkGroupBarrier: { + // Memory Scope which the barrier controls. Defaults to 'workgroup' or + // 'subgroup' scope depending on the barrier, but sub_group_barrier and + // work_group_barrier can optionally provide a scope. + unsigned ScopeVal = DefaultMemScope; + if ((ID == eCLBuiltinSubGroupBarrier || ID == eCLBuiltinWorkGroupBarrier) && + F->arg_size() == 2) { + if (auto Scope = parseMemoryScopeParam(CI.getOperand(1))) { + ScopeVal = *Scope; + } + } + + const unsigned SemanticsVal = + DefaultMemOrder | parseMemFenceFlagsParam(CI.getOperand(0)).value_or(0); + + auto *const CtrlBarrier = + BIMuxImpl.getOrDeclareMuxBuiltin(CtrlBarrierID, M); + + auto *const BarrierID = ConstantInt::get(I32Ty, 0); + auto *const Scope = ConstantInt::get(I32Ty, ScopeVal); + auto *const Semantics = ConstantInt::get(I32Ty, SemanticsVal); + auto *const NewCI = + B.CreateCall(CtrlBarrier, {BarrierID, Scope, Semantics}, CI.getName()); + NewCI->setAttributes(CtrlBarrier->getAttributes()); + NewCI->takeName(&CI); + return NewCI; + } + case eCLBuiltinAtomicWorkItemFence: + // atomic_work_item_fence has two parameters which we can parse. + DefaultMemOrder = + parseMemoryOrderParam(CI.getOperand(1)).value_or(DefaultMemOrder); + DefaultMemScope = + parseMemoryScopeParam(CI.getOperand(2)).value_or(DefaultMemScope); + LLVM_FALLTHROUGH; + case eCLBuiltinMemFence: + case eCLBuiltinReadMemFence: + case eCLBuiltinWriteMemFence: { + // The deprecated 'fence' builtins default to memory_scope_work_group and + // have one possible order each. 
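+ // read_mem_fence is an acquire fence, write_mem_fence is a release fence, + // and mem_fence orders both reads and writes (acquire-release).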
+ if (ID == eCLBuiltinMemFence) { + DefaultMemOrder = BIMuxInfoConcept::MemSemanticsAcquireRelease; + } else if (ID == eCLBuiltinReadMemFence) { + DefaultMemOrder = BIMuxInfoConcept::MemSemanticsAcquire; + } else if (ID == eCLBuiltinWriteMemFence) { + DefaultMemOrder = BIMuxInfoConcept::MemSemanticsRelease; + } + const unsigned SemanticsVal = + DefaultMemOrder | parseMemFenceFlagsParam(CI.getOperand(0)).value_or(0); + auto *const MemBarrier = + BIMuxImpl.getOrDeclareMuxBuiltin(eMuxBuiltinMemBarrier, M); + auto *const Scope = ConstantInt::get(I32Ty, DefaultMemScope); + auto *const Semantics = ConstantInt::get(I32Ty, SemanticsVal); + auto *const NewCI = + B.CreateCall(MemBarrier, {Scope, Semantics}, CI.getName()); + NewCI->setAttributes(MemBarrier->getAttributes()); + NewCI->takeName(&CI); + return NewCI; + } + case eCLBuiltinAsyncWorkGroupCopy: + case eCLBuiltinAsyncWorkGroupStridedCopy: + case eCLBuiltinAsyncWorkGroupCopy2D2D: + case eCLBuiltinAsyncWorkGroupCopy3D3D: + return lowerAsyncBuiltinToMuxBuiltin(CI, *ID, BIMuxImpl); + case eCLBuiltinWaitGroupEvents: { + auto *const MuxWait = + BIMuxImpl.getOrDeclareMuxBuiltin(eMuxBuiltinDMAWait, M); + assert(MuxWait && "Could not get/declare __mux_dma_wait"); + auto *const Count = CI.getArgOperand(0); + auto *Events = CI.getArgOperand(1); + + assert(Events->getType()->isPointerTy() && + (Events->getType()->getPointerAddressSpace() == + compiler::utils::AddressSpace::Private || + Events->getType()->getPointerAddressSpace() == + compiler::utils::AddressSpace::Generic) && + "Pointer to event must be in address space 0 or 4."); + + Events = B.CreatePointerBitCastOrAddrSpaceCast( + Events, PointerType::getUnqual(Ctx), "mux.events"); + auto *const NewCI = B.CreateCall(MuxWait, {Count, Events}, CI.getName()); + NewCI->setAttributes(MuxWait->getAttributes()); + NewCI->takeName(&CI); + return NewCI; + } + } +} + +Instruction * +CLBuiltinInfo::lowerGroupBuiltinToMuxBuiltin(CallInst &CI, BuiltinID ID, + BIMuxInfoConcept &BIMuxImpl) { + auto &M = *CI.getModule(); + auto *const F = CI.getCalledFunction(); + assert(F && "No calling function?"); + + // Some ops need extra checking to determine their mux ID: + // * add/mul operations are split into integer/float + // * min/max operations are split into signed/unsigned/float + // So we set a 'base' builtin ID for these operations to the (unsigned) + // integer variant and do a checking step afterwards where we refine the + // builtin ID. 
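+ // e.g. sub_group_reduce_min starts out as eMuxBuiltinSubgroupReduceUMin and + // is refined below to the SMin or FMin variant if the demangled argument + // type turns out to be a signed integer or a floating point type.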
+ bool RecheckOpType = false; + BaseBuiltinID MuxBuiltinID; + switch (ID) { + default: + return nullptr; + case eCLBuiltinSubgroupAll: + MuxBuiltinID = eMuxBuiltinSubgroupAll; + break; + case eCLBuiltinSubgroupAny: + MuxBuiltinID = eMuxBuiltinSubgroupAny; + break; + case eCLBuiltinSubgroupBroadcast: + MuxBuiltinID = eMuxBuiltinSubgroupBroadcast; + break; + case eCLBuiltinSubgroupReduceAdd: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupReduceAdd; + break; + case eCLBuiltinSubgroupReduceMin: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupReduceUMin; + break; + case eCLBuiltinSubgroupReduceMax: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupReduceUMax; + break; + case eCLBuiltinSubgroupReduceMul: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupReduceMul; + break; + case eCLBuiltinSubgroupReduceAnd: + MuxBuiltinID = eMuxBuiltinSubgroupReduceAnd; + break; + case eCLBuiltinSubgroupReduceOr: + MuxBuiltinID = eMuxBuiltinSubgroupReduceOr; + break; + case eCLBuiltinSubgroupReduceXor: + MuxBuiltinID = eMuxBuiltinSubgroupReduceXor; + break; + case eCLBuiltinSubgroupReduceLogicalAnd: + MuxBuiltinID = eMuxBuiltinSubgroupReduceLogicalAnd; + break; + case eCLBuiltinSubgroupReduceLogicalOr: + MuxBuiltinID = eMuxBuiltinSubgroupReduceLogicalOr; + break; + case eCLBuiltinSubgroupReduceLogicalXor: + MuxBuiltinID = eMuxBuiltinSubgroupReduceLogicalXor; + break; + case eCLBuiltinSubgroupScanAddInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanAddInclusive; + break; + case eCLBuiltinSubgroupScanAddExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanAddExclusive; + break; + case eCLBuiltinSubgroupScanMinInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanUMinInclusive; + break; + case eCLBuiltinSubgroupScanMinExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanUMinExclusive; + break; + case eCLBuiltinSubgroupScanMaxInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanUMaxInclusive; + break; + case eCLBuiltinSubgroupScanMaxExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanUMaxExclusive; + break; + case eCLBuiltinSubgroupScanMulInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanMulInclusive; + break; + case eCLBuiltinSubgroupScanMulExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanMulExclusive; + break; + case eCLBuiltinSubgroupScanAndInclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanAndInclusive; + break; + case eCLBuiltinSubgroupScanAndExclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanAndExclusive; + break; + case eCLBuiltinSubgroupScanOrInclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanOrInclusive; + break; + case eCLBuiltinSubgroupScanOrExclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanOrExclusive; + break; + case eCLBuiltinSubgroupScanXorInclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanXorInclusive; + break; + case eCLBuiltinSubgroupScanXorExclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanXorExclusive; + break; + case eCLBuiltinSubgroupScanLogicalAndInclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalAndInclusive; + break; + case eCLBuiltinSubgroupScanLogicalAndExclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalAndExclusive; + break; + case eCLBuiltinSubgroupScanLogicalOrInclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalOrInclusive; + break; + case eCLBuiltinSubgroupScanLogicalOrExclusive: + MuxBuiltinID = 
eMuxBuiltinSubgroupScanLogicalOrExclusive; + break; + case eCLBuiltinSubgroupScanLogicalXorInclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalXorInclusive; + break; + case eCLBuiltinSubgroupScanLogicalXorExclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalXorExclusive; + break; + case eCLBuiltinWorkgroupAll: + MuxBuiltinID = eMuxBuiltinWorkgroupAll; + break; + case eCLBuiltinWorkgroupAny: + MuxBuiltinID = eMuxBuiltinWorkgroupAny; + break; + case eCLBuiltinWorkgroupBroadcast: + MuxBuiltinID = eMuxBuiltinWorkgroupBroadcast; + break; + case eCLBuiltinWorkgroupReduceAdd: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupReduceAdd; + break; + case eCLBuiltinWorkgroupReduceMin: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupReduceUMin; + break; + case eCLBuiltinWorkgroupReduceMax: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupReduceUMax; + break; + case eCLBuiltinWorkgroupReduceMul: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupReduceMul; + break; + case eCLBuiltinWorkgroupReduceAnd: + MuxBuiltinID = eMuxBuiltinWorkgroupReduceAnd; + break; + case eCLBuiltinWorkgroupReduceOr: + MuxBuiltinID = eMuxBuiltinWorkgroupReduceOr; + break; + case eCLBuiltinWorkgroupReduceXor: + MuxBuiltinID = eMuxBuiltinWorkgroupReduceXor; + break; + case eCLBuiltinWorkgroupReduceLogicalAnd: + MuxBuiltinID = eMuxBuiltinWorkgroupReduceLogicalAnd; + break; + case eCLBuiltinWorkgroupReduceLogicalOr: + MuxBuiltinID = eMuxBuiltinWorkgroupReduceLogicalOr; + break; + case eCLBuiltinWorkgroupReduceLogicalXor: + MuxBuiltinID = eMuxBuiltinWorkgroupReduceLogicalXor; + break; + case eCLBuiltinWorkgroupScanAddInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanAddInclusive; + break; + case eCLBuiltinWorkgroupScanAddExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanAddExclusive; + break; + case eCLBuiltinWorkgroupScanMinInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanUMinInclusive; + break; + case eCLBuiltinWorkgroupScanMinExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanUMinExclusive; + break; + case eCLBuiltinWorkgroupScanMaxInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanUMaxInclusive; + break; + case eCLBuiltinWorkgroupScanMaxExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanUMaxExclusive; + break; + case eCLBuiltinWorkgroupScanMulInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanMulInclusive; + break; + case eCLBuiltinWorkgroupScanMulExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanMulExclusive; + break; + case eCLBuiltinWorkgroupScanAndInclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanAndInclusive; + break; + case eCLBuiltinWorkgroupScanAndExclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanAndExclusive; + break; + case eCLBuiltinWorkgroupScanOrInclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanOrInclusive; + break; + case eCLBuiltinWorkgroupScanOrExclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanOrExclusive; + break; + case eCLBuiltinWorkgroupScanXorInclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanXorInclusive; + break; + case eCLBuiltinWorkgroupScanXorExclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanXorExclusive; + break; + case eCLBuiltinWorkgroupScanLogicalAndInclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalAndInclusive; + break; + case eCLBuiltinWorkgroupScanLogicalAndExclusive: + MuxBuiltinID = 
eMuxBuiltinWorkgroupScanLogicalAndExclusive; + break; + case eCLBuiltinWorkgroupScanLogicalOrInclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalOrInclusive; + break; + case eCLBuiltinWorkgroupScanLogicalOrExclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalOrExclusive; + break; + case eCLBuiltinWorkgroupScanLogicalXorInclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalXorInclusive; + break; + case eCLBuiltinWorkgroupScanLogicalXorExclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalXorExclusive; + break; + } + + if (RecheckOpType) { + // We've assumed (unsigned) integer operations, but we may actually have + // signed integer, or floating point, operations. Refine the builtin ID to + // the correct 'overload' now. + compiler::utils::NameMangler Mangler(&F->getContext()); + SmallVector ArgumentTypes; + SmallVector Qualifiers; + + Mangler.demangleName(F->getName(), ArgumentTypes, Qualifiers); + + assert(Qualifiers.size() == 1 && ArgumentTypes.size() == 1 && + "Unknown collective builtin"); + auto &Qual = Qualifiers[0]; + + bool IsSignedInt = false; + while (!IsSignedInt && Qual.getCount()) { + IsSignedInt |= Qual.pop_front() == compiler::utils::eTypeQualSignedInt; + } + + const bool IsFP = ArgumentTypes[0]->isFloatingPointTy(); + switch (MuxBuiltinID) { + default: + llvm_unreachable("unknown group operation for which to check the type"); + case eMuxBuiltinSubgroupReduceAdd: + if (IsFP) + MuxBuiltinID = eMuxBuiltinSubgroupReduceFAdd; + break; + case eMuxBuiltinSubgroupReduceMul: + if (IsFP) + MuxBuiltinID = eMuxBuiltinSubgroupReduceFMul; + break; + case eMuxBuiltinSubgroupReduceUMin: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinSubgroupReduceFMin; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinSubgroupReduceSMin; + } + break; + case eMuxBuiltinSubgroupReduceUMax: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinSubgroupReduceFMax; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinSubgroupReduceSMax; + } + break; + case eMuxBuiltinSubgroupScanAddInclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinSubgroupScanFAddInclusive; + break; + case eMuxBuiltinSubgroupScanAddExclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinSubgroupScanFAddExclusive; + break; + case eMuxBuiltinSubgroupScanMulInclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinSubgroupScanFMulInclusive; + break; + case eMuxBuiltinSubgroupScanMulExclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinSubgroupScanFMulExclusive; + break; + case eMuxBuiltinSubgroupScanUMinInclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinSubgroupScanFMinInclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinSubgroupScanSMinInclusive; + } + break; + case eMuxBuiltinSubgroupScanUMinExclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinSubgroupScanFMinExclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinSubgroupScanSMinExclusive; + } + break; + case eMuxBuiltinSubgroupScanUMaxInclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinSubgroupScanFMaxInclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinSubgroupScanSMaxInclusive; + } + break; + case eMuxBuiltinSubgroupScanUMaxExclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinSubgroupScanFMaxExclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinSubgroupScanSMaxExclusive; + } + break; + case eMuxBuiltinWorkgroupReduceAdd: + if (IsFP) + MuxBuiltinID = eMuxBuiltinWorkgroupReduceFAdd; + break; + case eMuxBuiltinWorkgroupReduceMul: + if (IsFP) + MuxBuiltinID = eMuxBuiltinWorkgroupReduceFMul; + break; + case eMuxBuiltinWorkgroupReduceUMin: + 
if (IsFP) { + MuxBuiltinID = eMuxBuiltinWorkgroupReduceFMin; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinWorkgroupReduceSMin; + } + break; + case eMuxBuiltinWorkgroupReduceUMax: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinWorkgroupReduceFMax; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinWorkgroupReduceSMax; + } + break; + case eMuxBuiltinWorkgroupScanAddInclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinWorkgroupScanFAddInclusive; + break; + case eMuxBuiltinWorkgroupScanAddExclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinWorkgroupScanFAddExclusive; + break; + case eMuxBuiltinWorkgroupScanMulInclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinWorkgroupScanFMulInclusive; + break; + case eMuxBuiltinWorkgroupScanMulExclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinWorkgroupScanFMulExclusive; + break; + case eMuxBuiltinWorkgroupScanUMinInclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanFMinInclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanSMinInclusive; + } + break; + case eMuxBuiltinWorkgroupScanUMinExclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanFMinExclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanSMinExclusive; + } + break; + case eMuxBuiltinWorkgroupScanUMaxInclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanFMaxInclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanSMaxInclusive; + } + break; + case eMuxBuiltinWorkgroupScanUMaxExclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanFMaxExclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanSMaxExclusive; + } + break; + } + } + + const bool IsAnyAll = MuxBuiltinID == eMuxBuiltinSubgroupAny || + MuxBuiltinID == eMuxBuiltinSubgroupAll || + MuxBuiltinID == eMuxBuiltinWorkgroupAny || + MuxBuiltinID == eMuxBuiltinWorkgroupAll; + SmallVector OverloadInfo; + if (!IsAnyAll) { + OverloadInfo.push_back(CI.getOperand(0)->getType()); + } else { + OverloadInfo.push_back(IntegerType::getInt1Ty(M.getContext())); + } + + auto *const MuxBuiltinFn = + BIMuxImpl.getOrDeclareMuxBuiltin(MuxBuiltinID, M, OverloadInfo); + + assert(MuxBuiltinFn && "Missing mux builtin"); + auto *const SizeTy = getSizeType(M); + auto *const I32Ty = Type::getInt32Ty(M.getContext()); + + SmallVector Args; + if (MuxBuiltinID >= eFirstMuxWorkgroupCollectiveBuiltin && + MuxBuiltinID <= eLastMuxWorkgroupCollectiveBuiltin) { + // Work-group operations have a barrier ID first. + Args.push_back(ConstantInt::get(I32Ty, 0)); + } + // Then the arg itself + // If it's an any/all operation, we must first reduce to i1 because that's how + // the mux builtins expect their arguments. + auto *Val = CI.getOperand(0); + if (!IsAnyAll) { + Args.push_back(Val); + } else { + assert(Val->getType()->isIntegerTy()); + auto *NEZero = ICmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_NE, Val, + ConstantInt::getNullValue(Val->getType())); + NEZero->insertBefore(CI.getIterator()); + Args.push_back(NEZero); + } + + if (MuxBuiltinID == eMuxBuiltinSubgroupBroadcast) { + // Pass on the ID parameter + Args.push_back(CI.getOperand(1)); + } + if (MuxBuiltinID == eMuxBuiltinWorkgroupBroadcast) { + // The mux version always has three indices. Any missing ones are replaced + // with zeros + for (unsigned i = 0, e = CI.arg_size(); i != 3; i++) { + Args.push_back(1 + i < e ? 
CI.getOperand(1 + i) + : ConstantInt::getNullValue(SizeTy)); + } + } + + auto *const NewCI = CallInst::Create(MuxBuiltinFn, Args, CI.getName()); + NewCI->insertBefore(CI.getIterator()); + NewCI->takeName(&CI); + NewCI->setAttributes(MuxBuiltinFn->getAttributes()); + + if (!IsAnyAll) { + return NewCI; + } + // For any/all we need to recreate the original i32 return value. + auto *SExt = SExtInst::Create(Instruction::SExt, NewCI, CI.getType(), "sext"); + SExt->insertBefore(CI.getIterator()); + return SExt; +} + +Instruction * +CLBuiltinInfo::lowerAsyncBuiltinToMuxBuiltin(CallInst &CI, BuiltinID ID, + BIMuxInfoConcept &BIMuxImpl) { + assert((ID == eCLBuiltinAsyncWorkGroupCopy || + ID == eCLBuiltinAsyncWorkGroupStridedCopy || + ID == eCLBuiltinAsyncWorkGroupCopy2D2D || + ID == eCLBuiltinAsyncWorkGroupCopy3D3D) && + "Invalid ID"); + + IRBuilder<> B(&CI); + auto &M = *CI.getModule(); + LLVMContext &Ctx = M.getContext(); + const auto &DL = M.getDataLayout(); + + switch (ID) { + default: + llvm_unreachable("Unhandled builtin"); + case eCLBuiltinAsyncWorkGroupCopy: + case eCLBuiltinAsyncWorkGroupStridedCopy: { + NameMangler Mangler(&Ctx); + + // Do a full demangle to determine the pointer element type of the first + // argument. + SmallVector<Type *> BuiltinArgTypes, BuiltinArgPointeeTypes; + SmallVector<TypeQualifiers> BuiltinArgQuals; + + [[maybe_unused]] const StringRef BuiltinName = + Mangler.demangleName(CI.getCalledFunction()->getName(), BuiltinArgTypes, + BuiltinArgPointeeTypes, BuiltinArgQuals); + assert(!BuiltinName.empty() && BuiltinArgTypes[0]->isPointerTy() && + BuiltinArgPointeeTypes[0] && "Could not demangle async builtin"); + + auto *const DataTy = BuiltinArgPointeeTypes[0]; + const bool IsStrided = ID == eCLBuiltinAsyncWorkGroupStridedCopy; + + auto *const Dst = CI.getArgOperand(0); + auto *const Src = CI.getArgOperand(1); + auto *const NumElements = CI.getArgOperand(2); + auto *const EventIn = CI.getArgOperand(3 + IsStrided); + + // Find out which way the DMA is going and declare the appropriate mux + // builtin. + const bool IsRead = Dst->getType()->getPointerAddressSpace() == + compiler::utils::AddressSpace::Local; + const auto ElementTypeWidthInBytes = + DL.getTypeAllocSize(DataTy).getFixedValue(); + auto *const ElementSize = + ConstantInt::get(NumElements->getType(), ElementTypeWidthInBytes); + + auto *const WidthInBytes = + IsStrided ? ElementSize + : B.CreateMul(ElementSize, NumElements, "width.bytes"); + + const BuiltinID MuxBuiltinID = [&] { + if (IsRead) { + return IsStrided ? eMuxBuiltinDMARead2D : eMuxBuiltinDMARead1D; + } else { + return IsStrided ? eMuxBuiltinDMAWrite2D : eMuxBuiltinDMAWrite1D; + } + }(); + + auto *const MuxDMA = + BIMuxImpl.getOrDeclareMuxBuiltin(MuxBuiltinID, M, EventIn->getType()); + assert(MuxDMA && "Could not get/declare mux dma read/write"); + + CallInst *NewCI = nullptr; + if (!IsStrided) { + NewCI = B.CreateCall(MuxDMA, {Dst, Src, WidthInBytes, EventIn}, + "mux.out.event"); + } else { + // The stride from async_work_group_strided_copy is in elements, but the + // strides in the __mux builtins are in bytes, so we need to scale the + // value. + auto *const Stride = CI.getArgOperand(3); + auto *const StrideInBytes = + B.CreateMul(ElementSize, Stride, "stride.bytes"); + + // For async_work_group_strided_copy, the stride only applies to the + // global memory, as we are doing scatters/gathers. + auto *const DstStride = IsRead ? ElementSize : StrideInBytes; + auto *const SrcStride = IsRead ? 
StrideInBytes : ElementSize; + + NewCI = B.CreateCall( + MuxDMA, + {Dst, Src, WidthInBytes, DstStride, SrcStride, NumElements, EventIn}, + "mux.out.event"); + } + NewCI->setAttributes(MuxDMA->getAttributes()); + NewCI->takeName(&CI); + return NewCI; + } + case eCLBuiltinAsyncWorkGroupCopy2D2D: { + // Unpack the arguments for ease of access. + auto *const Dst = CI.getArgOperand(0); + auto *const DstOffset = CI.getArgOperand(1); + auto *const Src = CI.getArgOperand(2); + auto *const SrcOffset = CI.getArgOperand(3); + auto *const NumBytesPerEl = CI.getArgOperand(4); + auto *const NumElsPerLine = CI.getArgOperand(5); + auto *const NumLines = CI.getArgOperand(6); + auto *const SrcTotalLineLength = CI.getArgOperand(7); + auto *const DstTotalLineLength = CI.getArgOperand(8); + auto *const EventIn = CI.getArgOperand(9); + + // Find out which way the DMA is going and declare the appropriate mux + // builtin. + const bool IsRead = Dst->getType()->getPointerAddressSpace() == + compiler::utils::AddressSpace::Local; + auto *const MuxDMA = BIMuxImpl.getOrDeclareMuxBuiltin( + IsRead ? eMuxBuiltinDMARead2D : eMuxBuiltinDMAWrite2D, M, + EventIn->getType()); + assert(MuxDMA && "Could not get/declare mux dma read/write"); + + auto *const DstOffsetBytes = B.CreateMul(DstOffset, NumBytesPerEl); + auto *const SrcOffsetBytes = B.CreateMul(SrcOffset, NumBytesPerEl); + auto *const LineSizeBytes = B.CreateMul(NumElsPerLine, NumBytesPerEl); + auto *const ByteTy = B.getInt8Ty(); + auto *const DstWithOffset = B.CreateGEP(ByteTy, Dst, DstOffsetBytes); + auto *const SrcWithOffset = B.CreateGEP(ByteTy, Src, SrcOffsetBytes); + auto *const SrcStrideBytes = B.CreateMul(SrcTotalLineLength, NumBytesPerEl); + auto *const DstStrideBytes = B.CreateMul(DstTotalLineLength, NumBytesPerEl); + auto *const NewCI = B.CreateCall( + MuxDMA, {DstWithOffset, SrcWithOffset, LineSizeBytes, DstStrideBytes, + SrcStrideBytes, NumLines, EventIn}); + NewCI->setAttributes(MuxDMA->getAttributes()); + NewCI->takeName(&CI); + return NewCI; + } + case eCLBuiltinAsyncWorkGroupCopy3D3D: { + auto *const Dst = CI.getArgOperand(0); + auto *const DstOffset = CI.getArgOperand(1); + auto *const Src = CI.getArgOperand(2); + auto *const SrcOffset = CI.getArgOperand(3); + auto *const NumBytesPerEl = CI.getArgOperand(4); + auto *const NumElsPerLine = CI.getArgOperand(5); + auto *const NumLines = CI.getArgOperand(6); + auto *const NumPlanes = CI.getArgOperand(7); + auto *const SrcTotalLineLength = CI.getArgOperand(8); + auto *const SrcTotalPlaneArea = CI.getArgOperand(9); + auto *const DstTotalLineLength = CI.getArgOperand(10); + auto *const DstTotalPlaneArea = CI.getArgOperand(11); + auto *const EventIn = CI.getArgOperand(12); + + // Find out which way the DMA is going and declare the appropriate mux + // builtin. + const bool IsRead = Dst->getType()->getPointerAddressSpace() == + compiler::utils::AddressSpace::Local; + auto *const MuxDMA = BIMuxImpl.getOrDeclareMuxBuiltin( + IsRead ? 
eMuxBuiltinDMARead3D : eMuxBuiltinDMAWrite3D, M,
+        EventIn->getType());
+    assert(MuxDMA && "Could not get/declare mux dma read/write");
+
+    auto *const DstOffsetBytes = B.CreateMul(DstOffset, NumBytesPerEl);
+    auto *const SrcOffsetBytes = B.CreateMul(SrcOffset, NumBytesPerEl);
+    auto *const LineSizeBytes = B.CreateMul(NumElsPerLine, NumBytesPerEl);
+    auto *const ByteTy = B.getInt8Ty();
+    auto *const DstWithOffset = B.CreateGEP(ByteTy, Dst, DstOffsetBytes);
+    auto *const SrcWithOffset = B.CreateGEP(ByteTy, Src, SrcOffsetBytes);
+    auto *const SrcLineStrideBytes =
+        B.CreateMul(SrcTotalLineLength, NumBytesPerEl);
+    auto *const DstLineStrideBytes =
+        B.CreateMul(DstTotalLineLength, NumBytesPerEl);
+    auto *const SrcPlaneStrideBytes =
+        B.CreateMul(SrcTotalPlaneArea, NumBytesPerEl);
+    auto *const DstPlaneStrideBytes =
+        B.CreateMul(DstTotalPlaneArea, NumBytesPerEl);
+    auto *const NewCI = B.CreateCall(
+        MuxDMA, {DstWithOffset, SrcWithOffset, LineSizeBytes,
+                 DstLineStrideBytes, SrcLineStrideBytes, NumLines,
+                 DstPlaneStrideBytes, SrcPlaneStrideBytes, NumPlanes, EventIn});
+    NewCI->setAttributes(MuxDMA->getAttributes());
+    NewCI->takeName(&CI);
+    return NewCI;
+  }
+  }
+
+  return nullptr;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+Function *CLBuiltinLoader::materializeBuiltin(StringRef BuiltinName,
+                                              Module *DestM,
+                                              BuiltinMatFlags Flags) {
+  auto *const BuiltinModule = this->getBuiltinsModule();
+
+  // Retrieve it from the builtin module.
+  if (!BuiltinModule) {
+    return nullptr;
+  }
+  Function *SrcBuiltin = BuiltinModule->getFunction(BuiltinName);
+  if (!SrcBuiltin) {
+    return nullptr;
+  }
+
+  // The user only wants a declaration.
+  if (!(Flags & eBuiltinMatDefinition)) {
+    if (!DestM) {
+      return SrcBuiltin;
+    } else {
+      FunctionType *FT = SrcBuiltin->getFunctionType();
+      Function *BuiltinDecl = cast<Function>(
+          DestM->getOrInsertFunction(BuiltinName, FT).getCallee());
+      BuiltinDecl->copyAttributesFrom(SrcBuiltin);
+      BuiltinDecl->setCallingConv(SrcBuiltin->getCallingConv());
+      return BuiltinDecl;
+    }
+  }
+
+  // Materialize the builtin and its callees.
+  std::set<Function *> Callees;
+  std::vector<Function *> Worklist;
+  Worklist.push_back(SrcBuiltin);
+  while (!Worklist.empty()) {
+    // Materialize the first function in the work list.
+    Function *Current = Worklist.front();
+    Worklist.erase(Worklist.begin());
+    if (!Callees.insert(Current).second) {
+      continue;
+    }
+    if (Error Err = BuiltinModule->materialize(Current)) {
+      consumeError(std::move(Err));
+      return nullptr;
+    }
+
+    // Find any callees in the function and add them to the list.
+    for (BasicBlock &BB : *Current) {
+      for (Instruction &I : BB) {
+        CallInst *CI = dyn_cast<CallInst>(&I);
+        if (!CI) {
+          continue;
+        }
+        Function *callee = CI->getCalledFunction();
+        if (!callee) {
+          continue;
+        }
+        Worklist.push_back(callee);
+      }
+    }
+  }
+
+  if (!DestM) {
+    return SrcBuiltin;
+  }
+
+  // Copy builtin and callees to the target module if requested by the user.
+  ValueToValueMapTy ValueMap;
+  SmallVector<ReturnInst *, 8> Returns;
+  // Avoid linking errors.
+  const GlobalValue::LinkageTypes Linkage = GlobalValue::LinkOnceAnyLinkage;
+
+  // Declare the callees in the module if they don't already exist.
+  for (Function *Callee : Callees) {
+    Function *NewCallee = DestM->getFunction(Callee->getName());
+    if (!NewCallee) {
+      FunctionType *FT = Callee->getFunctionType();
+      NewCallee = Function::Create(FT, Linkage, Callee->getName(), DestM);
+    } else {
+      NewCallee->setLinkage(Linkage);
+    }
+    Function::arg_iterator NewArgI = NewCallee->arg_begin();
+    for (Argument &Arg : Callee->args()) {
+      NewArgI->setName(Arg.getName());
+      ValueMap[&Arg] = &*(NewArgI++);
+    }
+    NewCallee->copyAttributesFrom(Callee);
+    ValueMap[Callee] = NewCallee;
+  }
+
+  // Clone the callees' bodies into the module.
+  GlobalValueMaterializer Materializer(*DestM);
+  for (Function *Callee : Callees) {
+    if (Callee->isDeclaration()) {
+      continue;
+    }
+    Function *NewCallee = cast<Function>(ValueMap[Callee]);
+    assert(DestM);
+    const auto CloneType = DestM == Callee->getParent()
+                               ? CloneFunctionChangeType::LocalChangesOnly
+                               : CloneFunctionChangeType::DifferentModule;
+    CloneFunctionInto(NewCallee, Callee, ValueMap, CloneType, Returns, "",
+                      nullptr, nullptr, &Materializer);
+    Returns.clear();
+  }
+
+  // Clone global variable initializers.
+  for (GlobalVariable *var : Materializer.variables()) {
+    GlobalVariable *newVar = dyn_cast_or_null<GlobalVariable>(ValueMap[var]);
+    if (!newVar) {
+      return nullptr;
+    }
+    Constant *oldInit = var->getInitializer();
+    Constant *newInit = MapValue(oldInit, ValueMap);
+    newVar->setInitializer(newInit);
+  }
+
+  return cast<Function>(ValueMap[SrcBuiltin]);
+}
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
new file mode 100644
index 0000000000000..a176ace88c196
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
@@ -0,0 +1,69 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+#include
+#include
+#include
+
+#define DEBUG_TYPE "define-mux-builtins"
+
+using namespace llvm;
+
+PreservedAnalyses
+compiler::utils::DefineMuxBuiltinsPass::run(Module &M,
+                                            ModuleAnalysisManager &AM) {
+  bool Changed = false;
+  auto &BI = AM.getResult<BuiltinInfoAnalysis>(M);
+
+  auto functionNeedsDefining = [&BI](Function &F) {
+    if (F.isDeclaration() && !F.isIntrinsic()) {
+      if (auto B = BI.analyzeBuiltin(F)) {
+        return BI.isMuxBuiltinID(B->ID);
+      }
+    }
+    return false;
+  };
+
+  // Define all mux builtins
+  for (auto &F : M.functions()) {
+    if (!functionNeedsDefining(F)) {
+      continue;
+    }
+    LLVM_DEBUG(dbgs() << " Defining mux builtin: " << F.getName() << "\n";);
+
+    // Define the builtin. If it declares any new dependent builtins, those
+    // will be appended to the module's function list and so will be
+    // encountered by later iterations.
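+    // For example, defining __mux_get_global_id (see defineGetGlobalId in
+    // mux_builtin_info.cpp) declares __mux_get_group_id,
+    // __mux_get_global_offset, __mux_get_local_id and __mux_get_local_size;
+    // those declarations land at the end of M's function list and are
+    // reached by this same loop.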
+ auto Builtin = BI.analyzeBuiltin(F); + assert(Builtin && "Failed to analyze builtin"); + if (BI.defineMuxBuiltin(Builtin->ID, M, Builtin->mux_overload_info)) { + Changed = true; + } + } + + // While declaring any builtins should go to the end of the module's list of + // functions, it's not technically impossible for something else to happen. + // As such, assert that we are leaving the module in the state we are + // contractually obliged to: with all functions that need defining having + // been defined. + assert(all_of(M.functions(), + [&](Function &F) { + return F.isDeclaration() || !functionNeedsDefining(F); + }) && + "Did not define a function that requires it"); + + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp new file mode 100644 index 0000000000000..66cb934125195 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp @@ -0,0 +1,74 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace compiler { +namespace utils { + +llvm::Value *isThreadEQ(llvm::BasicBlock *bb, unsigned x, unsigned y, + unsigned z, llvm::Function &LocalIDFn) { + llvm::IRBuilder<> builder(bb); + LocalIDFn.setCallingConv(llvm::CallingConv::SPIR_FUNC); + auto *const indexType = LocalIDFn.arg_begin()->getType(); + llvm::Value *result = llvm::ConstantInt::getTrue(bb->getContext()); + + const std::array threadIDs{x, y, z}; + for (unsigned i = 0; i < threadIDs.size(); ++i) { + auto *const index = llvm::ConstantInt::get(indexType, i); + auto *const localID = builder.CreateCall(&LocalIDFn, index); + localID->setCallingConv(LocalIDFn.getCallingConv()); + + auto *thread = + llvm::ConstantInt::get(LocalIDFn.getReturnType(), threadIDs[i]); + auto *const cmp = builder.CreateICmpEQ(localID, thread); + result = (i == 0) ? cmp : builder.CreateAnd(result, cmp); + } + + return result; +} + +llvm::Value *isThreadZero(llvm::BasicBlock *BB, llvm::Function &LocalIDFn) { + return isThreadEQ(BB, 0, 0, 0, LocalIDFn); +} + +void buildThreadCheck(llvm::BasicBlock *entryBlock, llvm::BasicBlock *trueBlock, + llvm::BasicBlock *falseBlock, llvm::Function &LocalIDFn) { + // only thread 0 in the work group should execute the DMA. 
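+  // The generated structure is:
+  //
+  //   entryBlock:
+  //     %cmp = <local id == (0,0,0), computed by isThreadZero>
+  //     br i1 %cmp, label %trueBlock, label %falseBlock
+  //
+  // so only work-item (0,0,0) runs the DMA body in trueBlock; the caller is
+  // expected to rejoin both paths afterwards (typically at a barrier).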
+ llvm::IRBuilder<> entryBuilder(entryBlock); + entryBuilder.CreateCondBr(isThreadZero(entryBlock, LocalIDFn), trueBlock, + falseBlock); +} + +llvm::StructType *getOrCreateMuxDMAEventType(llvm::Module &m) { + if (auto *eventType = llvm::StructType::getTypeByName( + m.getContext(), MuxBuiltins::dma_event_type)) { + return eventType; + } + + return llvm::StructType::create(m.getContext(), MuxBuiltins::dma_event_type); +} +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp new file mode 100644 index 0000000000000..5b4db40b0d6be --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp @@ -0,0 +1,54 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include + +using namespace llvm; + +PreservedAnalyses +compiler::utils::TransferKernelMetadataPass::run(Module &M, + ModuleAnalysisManager &) { + SmallVector Kernels; + populateKernelList(M, Kernels); + + for (const auto &Kernel : Kernels) { + if (auto *F = M.getFunction(Kernel.Name)) { + setOrigFnName(*F); + setIsKernelEntryPt(*F); + if (Kernel.ReqdWGSize) { + encodeLocalSizeMetadata(*F, *Kernel.ReqdWGSize); + } + } + } + + return PreservedAnalyses::all(); +} + +PreservedAnalyses +compiler::utils::EncodeKernelMetadataPass::run(Module &M, + ModuleAnalysisManager &) { + if (auto *F = M.getFunction(KernelName)) { + setOrigFnName(*F); + setIsKernelEntryPt(*F); + if (LocalSizes) { + encodeLocalSizeMetadata(*F, *LocalSizes); + } + } + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp new file mode 100644 index 0000000000000..ace34338bb5d5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp @@ -0,0 +1,71 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include + +using namespace llvm; +static llvm::Constant *getNeutralIdentityHelper(RecurKind Kind, Type *Ty, + bool UseNaN, bool UseFZero) { + switch (Kind) { + default: + return nullptr; + case RecurKind::And: + return ConstantInt::getAllOnesValue(Ty); + case RecurKind::Or: + case RecurKind::Add: + case RecurKind::Xor: + return ConstantInt::getNullValue(Ty); + case RecurKind::SMin: + return ConstantInt::get( + Ty, APInt::getSignedMaxValue(Ty->getScalarSizeInBits())); + case RecurKind::SMax: + return ConstantInt::get( + Ty, APInt::getSignedMinValue(Ty->getScalarSizeInBits())); + case RecurKind::UMin: + return ConstantInt::get(Ty, APInt::getMaxValue(Ty->getScalarSizeInBits())); + case RecurKind::UMax: + return ConstantInt::get(Ty, APInt::getMinValue(Ty->getScalarSizeInBits())); + case RecurKind::FAdd: + // -0.0 + 0.0 = 0.0 meaning -0.0 (not 0.0) is the neutral value for floats + // under addition. + return UseFZero ? ConstantFP::get(Ty, 0.0) : ConstantFP::get(Ty, -0.0); + case RecurKind::FMin: + return UseNaN ? ConstantFP::getQNaN(Ty, /*Negative*/ false) + : ConstantFP::getInfinity(Ty, /*Negative*/ false); + case RecurKind::FMax: + return UseNaN ? ConstantFP::getQNaN(Ty, /*Negative*/ true) + : ConstantFP::getInfinity(Ty, /*Negative*/ true); + case RecurKind::Mul: + return ConstantInt::get(Ty, 1); + case RecurKind::FMul: + return ConstantFP::get(Ty, 1.0); + } +} + +llvm::Constant *compiler::utils::getNeutralVal(RecurKind Kind, Type *Ty) { + return getNeutralIdentityHelper(Kind, Ty, /*UseNaN*/ true, + /*UseFZero*/ false); +} + +llvm::Constant *compiler::utils::getIdentityVal(RecurKind Kind, Type *Ty) { + return getNeutralIdentityHelper(Kind, Ty, /*UseNaN*/ false, /*UseFZero*/ + true); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp new file mode 100644 index 0000000000000..d31b3022c7eb5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp @@ -0,0 +1,889 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace compiler {
+namespace utils {
+using namespace llvm;
+
+NameMangler::NameMangler(LLVMContext *context) : Context(context) {}
+
+std::string NameMangler::mangleName(StringRef Name, ArrayRef<Type *> Tys,
+                                    ArrayRef<TypeQualifiers> Quals) {
+  std::string MangledName;
+  raw_string_ostream O(MangledName);
+  O << "_Z" << Name.size() << Name;
+  for (unsigned i = 0; i < Tys.size(); i++) {
+    const ArrayRef<Type *> PrevTys = Tys.slice(0, i);
+    const ArrayRef<TypeQualifiers> PrevQuals = Quals.slice(0, i);
+    if (!mangleType(O, Tys[i], Quals[i], PrevTys, PrevQuals)) {
+      return std::string();
+    }
+  }
+  O.flush();
+  return MangledName;
+}
+
+StringRef
+NameMangler::demangleName(StringRef Name, SmallVectorImpl<Type *> &Types,
+                          SmallVectorImpl<Type *> &PointerElementTypes,
+                          SmallVectorImpl<TypeQualifiers> &Quals) {
+  // Parse the name part.
+  Lexer L(Name);
+  Name = demangleName(L);
+  if (Name.empty()) {
+    return StringRef{};
+  }
+
+  // Parse the argument part.
+  while (L.Left() > 0) {
+    Type *ArgTy = nullptr;
+    Type *ArgEltTy = nullptr;
+    TypeQualifiers ArgQuals;
+    if (!demangleType(L, ArgTy, &ArgEltTy, ArgQuals, Types, Quals)) {
+      return StringRef{};
+    }
+    Types.push_back(ArgTy);
+    PointerElementTypes.push_back(ArgEltTy);
+    Quals.push_back(ArgQuals);
+  }
+  return Name;
+}
+
+StringRef NameMangler::demangleName(StringRef Name,
+                                    SmallVectorImpl<Type *> &Types,
+                                    SmallVectorImpl<TypeQualifiers> &Quals) {
+  SmallVector<Type *, 4> EltTys;
+  return demangleName(Name, Types, EltTys, Quals);
+}
+
+StringRef NameMangler::demangleName(StringRef Name) {
+  Lexer L(Name);
+  StringRef DemangledName = demangleName(L);
+  if (!DemangledName.empty()) {
+    return DemangledName;
+  }
+  return Name;
+}
+
+int NameMangler::resolveSubstitution(unsigned SubID,
+                                     SmallVectorImpl<Type *> &Tys,
+                                     SmallVectorImpl<TypeQualifiers> &Quals) {
+  unsigned CurrentSubID = 0;
+  int ResolvedID = -1;
+  for (unsigned i = 0; i < Tys.size(); i++) {
+    // Determine whether the type is a builtin or not.
+    // Builtin types cannot be substituted.
+    Type *Ty = Tys[i];
+    TypeQualifiers &TyQuals = Quals[i];
+    if (isTypeBuiltin(Ty, TyQuals)) {
+      continue;
+    }
+    if (CurrentSubID == SubID) {
+      ResolvedID = (int)i;
+      break;
+    }
+    CurrentSubID++;
+  }
+  return ResolvedID;
+}
+
+bool NameMangler::emitSubstitution(raw_ostream &O, Type *Ty,
+                                   TypeQualifiers Quals,
+                                   ArrayRef<Type *> PrevTys,
+                                   ArrayRef<TypeQualifiers> PrevQuals) {
+  if (isTypeBuiltin(Ty, Quals)) {
+    return false;
+  }
+
+  // Look for a previously-mangled non-builtin type we could use as a
+  // substitution.
+  int SubstitutionID = -1;
+  bool FoundMatch = false;
+  for (unsigned j = 0; j < PrevTys.size(); j++) {
+    Type *PrevTy = PrevTys[j];
+    TypeQualifiers PrevQual = PrevQuals[j];
+    if (!isTypeBuiltin(PrevTy, PrevQual)) {
+      SubstitutionID++;
+      if ((PrevTy == Ty) && (PrevQual == Quals)) {
+        FoundMatch = true;
+        break;
+      }
+    }
+  }
+  if (!FoundMatch) {
+    return false;
+  }
+
+  // Found a match, emit the substitution.
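+  // Substitutions follow the Itanium C++ ABI scheme: the first repeated
+  // non-builtin type is written "S_", the next "S0_", then "S1_", and so on.
+  // For example, foo(int *, int *) mangles as "_Z3fooPiS_", where "S_"
+  // refers back to the earlier "Pi".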
+ O << "S"; + if (SubstitutionID > 0) { + O << SubstitutionID; + } + O << "_"; + return true; +} + +bool NameMangler::isTypeBuiltin(Type *Ty, TypeQualifiers &Quals) { + (void)Quals; + switch (Ty->getTypeID()) { + default: + case Type::StructTyID: + case Type::ArrayTyID: + case Type::PointerTyID: + case Type::FixedVectorTyID: + return false; + case Type::VoidTyID: + case Type::HalfTyID: + case Type::FloatTyID: + case Type::DoubleTyID: + case Type::IntegerTyID: + return true; + } +} + +const char *NameMangler::mangleSimpleType(Type *Ty, TypeQualifier Qual) { + const bool IsSigned = (Qual & eTypeQualSignedInt); + switch (Ty->getTypeID()) { + default: + break; + case Type::VoidTyID: + return "v"; + case Type::HalfTyID: + return "Dh"; + case Type::FloatTyID: + return "f"; + case Type::DoubleTyID: + return "d"; + case Type::IntegerTyID: + switch (cast(Ty)->getBitWidth()) { + default: + break; + case 1: + return "b"; // bool + case 8: + return IsSigned ? "c" : "h"; + case 16: + return IsSigned ? "s" : "t"; + case 32: + return IsSigned ? "i" : "j"; + case 64: + return IsSigned ? "l" : "m"; + } + } + return nullptr; +} + +bool NameMangler::mangleType(raw_ostream &O, Type *Ty, TypeQualifiers Qual) { + return mangleType(O, Ty, Qual, ArrayRef(), + ArrayRef()); +} + +static void manglePointerQuals(raw_ostream &O, TypeQualifier Qual, + unsigned AddressSpace) { + if (Qual & eTypeQualPointerRestrict) { + O << 'r'; + } + if (Qual & eTypeQualPointerVolatile) { + O << 'V'; + } + if (Qual & eTypeQualPointerConst) { + O << 'K'; + } + if (AddressSpace > 0) { + O << "U3AS" << AddressSpace; + } +} + +bool NameMangler::mangleType(raw_ostream &O, Type *Ty, TypeQualifiers Quals, + ArrayRef PrevTys, + ArrayRef PrevQuals) { + if (emitSubstitution(O, Ty, Quals, PrevTys, PrevQuals)) { + return true; + } + + const TypeQualifier Qual = Quals.pop_front(); + if (const char *SimpleName = mangleSimpleType(Ty, Qual)) { + O << SimpleName; + return true; + } else if (isa(Ty)) { + std::string tmp; + raw_string_ostream Otmp(tmp); + auto *VecTy = cast(Ty); + Otmp << "nxv" + << multi_llvm::getVectorElementCount(VecTy).getKnownMinValue(); + if (!mangleType(Otmp, VecTy->getElementType(), Quals, PrevTys, PrevQuals)) { + return false; + } + O << "u" << tmp.size() << tmp; + return true; + } else if (Ty->isVectorTy()) { + auto *VecTy = cast(Ty); + O << "Dv" << VecTy->getNumElements() << "_"; + return mangleType(O, VecTy->getElementType(), Quals, PrevTys, PrevQuals); + } else if (Ty->isPointerTy()) { + PointerType *PtrTy = cast(Ty); + const unsigned AddressSpace = PtrTy->getAddressSpace(); + O << "u3ptr"; + manglePointerQuals(O, Qual, AddressSpace); + return true; + } else if (Ty->isTargetExtTy()) { + if (auto Name = mangleBuiltinType(Ty)) { + O << *Name; + return true; + } + return false; + } else { + return false; + } +} + +bool NameMangler::demangleSimpleType(Lexer &L, Type *&Ty, TypeQualifier &Qual) { + const int c = L.Current(); + Ty = nullptr; + Qual = eTypeQualNone; + if ((c < 0) || !Context) { + return false; + } + + switch (c) { + default: + return false; + case 'v': + Ty = llvm::Type::getVoidTy(*Context); + break; + case 'D': + if (!L.Consume("Dh")) { + return false; + } + Ty = llvm::Type::getHalfTy(*Context); + return true; + case 'f': + Ty = llvm::Type::getFloatTy(*Context); + break; + case 'd': + Ty = llvm::Type::getDoubleTy(*Context); + break; + case 'b': + Ty = llvm::Type::getInt1Ty(*Context); + break; + case 'c': + case 'h': + Ty = llvm::Type::getInt8Ty(*Context); + if (c == 'c') { + Qual = eTypeQualSignedInt; + } + break; 
+  case 's':
+  case 't':
+    Ty = llvm::Type::getInt16Ty(*Context);
+    if (c == 's') {
+      Qual = eTypeQualSignedInt;
+    }
+    break;
+  case 'i':
+  case 'j':
+    Ty = llvm::Type::getInt32Ty(*Context);
+    if (c == 'i') {
+      Qual = eTypeQualSignedInt;
+    }
+    break;
+  case 'l':
+  case 'm':
+    Ty = llvm::Type::getInt64Ty(*Context);
+    if (c == 'l') {
+      Qual = eTypeQualSignedInt;
+    }
+    break;
+  }
+  L.Consume();
+  return true;
+}
+
+std::optional<std::string> NameMangler::mangleBuiltinType(Type *Ty) {
+  auto *const TgtTy = cast<TargetExtType>(Ty);
+  const StringRef Name = TgtTy->getName();
+
+  if (Name == "spirv.Event") {
+    return "9ocl_event";
+  }
+
+  if (Name == "spirv.Sampler") {
+    return "11ocl_sampler";
+  }
+
+  if (Name != "spirv.Image") {
+    // FIXME: Some types don't have official target extension types.
+    // "opencl.clk_event_t" -> "12ocl_clkevent"
+    // "opencl.queue_t" -> "9ocl_queue"
+    // "opencl.ndrange_t" -> "11ocl_ndrange"
+    // "opencl.reserve_id_t" -> "13ocl_reserveid"
+    return std::nullopt;
+  }
+
+  auto Dim = TgtTy->getIntParameter(tgtext::ImageTyDimensionalityIdx);
+  auto Depth = TgtTy->getIntParameter(tgtext::ImageTyDepthIdx);
+  auto Arrayed = TgtTy->getIntParameter(tgtext::ImageTyArrayedIdx);
+  auto MS = TgtTy->getIntParameter(tgtext::ImageTyMSIdx);
+
+  std::string MangledName = "ocl_image";
+
+  switch (Dim) {
+  default:
+    return std::nullopt;
+  case tgtext::ImageDim1D:
+    MangledName += "1d";
+    break;
+  case tgtext::ImageDim2D:
+    MangledName += "2d";
+    break;
+  case tgtext::ImageDim3D:
+    MangledName += "3d";
+    break;
+  case tgtext::ImageDimBuffer:
+    MangledName += "1dbuffer";
+    break;
+  }
+
+  if (Arrayed == tgtext::ImageArrayed) {
+    MangledName += "array";
+  }
+
+  if (MS == tgtext::ImageMSMultiSampled) {
+    MangledName += "msaa";
+  }
+
+  if (Depth == tgtext::ImageDepth) {
+    MangledName += "depth";
+  }
+
+  return std::to_string(MangledName.size()) + MangledName;
+}
+
+bool NameMangler::demangleOpenCLBuiltinType(Lexer &L, Type *&Ty) {
+  if (L.Consume("12memory_scope") || L.Consume("12memory_order")) {
+    Ty = IntegerType::getInt32Ty(*Context);
+    return true;
+  }
+
+  if (auto *TargetExtTy = [this, &L]() -> Type * {
+        if (L.Consume("11ocl_image1d")) {
+          return compiler::utils::tgtext::getImage1DTy(*Context);
+        } else if (L.Consume("16ocl_image1darray")) {
+          return compiler::utils::tgtext::getImage1DArrayTy(*Context);
+        } else if (L.Consume("17ocl_image1dbuffer")) {
+          return compiler::utils::tgtext::getImage1DBufferTy(*Context);
+        } else if (L.Consume("11ocl_image2d")) {
+          return compiler::utils::tgtext::getImage2DTy(*Context);
+        } else if (L.Consume("16ocl_image2darray")) {
+          return compiler::utils::tgtext::getImage2DArrayTy(*Context);
+        } else if (L.Consume("16ocl_image2ddepth")) {
+          return compiler::utils::tgtext::getImage2DTy(*Context, /*Depth*/ true,
+                                                       /*MS*/ false);
+        } else if (L.Consume("21ocl_image2darraydepth")) {
+          return compiler::utils::tgtext::getImage2DArrayTy(
+              *Context, /*Depth*/ true, /*MS*/ false);
+        } else if (L.Consume("15ocl_image2dmsaa")) {
+          return compiler::utils::tgtext::getImage2DTy(
+              *Context, /*Depth*/ false, /*MS*/ true);
+        } else if (L.Consume("20ocl_image2darraymsaa")) {
+          return compiler::utils::tgtext::getImage2DArrayTy(
+              *Context, /*Depth*/ false, /*MS*/ true);
+        } else if (L.Consume("20ocl_image2dmsaadepth")) {
+          return compiler::utils::tgtext::getImage2DTy(*Context, /*Depth*/ true,
+                                                       /*MS*/ true);
+        } else if (L.Consume("25ocl_image2darraymsaadepth")) {
+          return compiler::utils::tgtext::getImage2DArrayTy(
+              *Context, /*Depth*/ true, /*MS*/ true);
+        } else if (L.Consume("11ocl_image3d")) {
+          return compiler::utils::tgtext::getImage3DTy(*Context);
+        } else if (L.Consume("11ocl_sampler")) {
+          return compiler::utils::tgtext::getSamplerTy(*Context);
+        } else if (L.Consume("9ocl_event")) {
+          return compiler::utils::tgtext::getEventTy(*Context);
+        }
+        return nullptr;
+      }()) {
+    Ty = TargetExtTy;
+    return true;
+  }
+
+  StringRef Name;
+  //
+  // TODO: Avoid hard-coded names. See redmine issue #8656.
+  //
+  if (L.Consume("11ocl_image1d")) {
+    Name = "opencl.image1d_t";
+  } else if (L.Consume("16ocl_image1darray")) {
+    Name = "opencl.image1d_array_t";
+  } else if (L.Consume("17ocl_image1dbuffer")) {
+    Name = "opencl.image1d_buffer_t";
+  } else if (L.Consume("11ocl_image2d")) {
+    Name = "opencl.image2d_t";
+  } else if (L.Consume("16ocl_image2darray")) {
+    Name = "opencl.image2d_array_t";
+  } else if (L.Consume("16ocl_image2ddepth")) {
+    Name = "opencl.image2d_depth_t";
+  } else if (L.Consume("21ocl_image2darraydepth")) {
+    Name = "opencl.image2d_array_depth_t";
+  } else if (L.Consume("15ocl_image2dmsaa")) {
+    Name = "opencl.image2d_msaa_t";
+  } else if (L.Consume("20ocl_image2darraymsaa")) {
+    Name = "opencl.image2d_array_msaa_t";
+  } else if (L.Consume("20ocl_image2dmsaadepth")) {
+    Name = "opencl.image2d_msaa_depth_t";
+  } else if (L.Consume("25ocl_image2darraymsaadepth")) {
+    Name = "opencl.image2d_array_msaa_depth_t";
+  } else if (L.Consume("11ocl_image3d")) {
+    Name = "opencl.image3d_t";
+  } else if (L.Consume("11ocl_sampler")) {
+    Name = "opencl.sampler_t";
+  } else if (L.Consume("9ocl_event")) {
+    Name = "opencl.event_t";
+  } else if (L.Consume("12ocl_clkevent")) {
+    Name = "opencl.clk_event_t";
+  } else if (L.Consume("9ocl_queue")) {
+    Name = "opencl.queue_t";
+  } else if (L.Consume("11ocl_ndrange")) {
+    Name = "opencl.ndrange_t";
+  } else if (L.Consume("13ocl_reserveid")) {
+    Name = "opencl.reserve_id_t";
+  } else {
+    return false;
+  }
+
+  if (auto *const OpenCLType =
+          llvm::StructType::getTypeByName(*Context, Name)) {
+    Ty = OpenCLType;
+  } else {
+    Ty = llvm::StructType::create(*Context, Name);
+  }
+
+  return true;
+}
+
+struct PointerASQuals {
+  unsigned AS;
+  TypeQualifier Qual;
+};
+
+static std::optional<PointerASQuals> demanglePointerQuals(Lexer &L) {
+  TypeQualifier PointerQual = eTypeQualNone;
+
+  // Parse the optional pointer qualifier.
+  if (L.Current() < 0) {
+    return std::nullopt;
+  }
+
+  // Parse the optional address space qualifier.
+  bool DemangledAS = false;
+  unsigned AddressSpace = 0;
+
+  if (L.Consume("U3AS")) {
+    if (!L.ConsumeInteger(AddressSpace)) {
+      return std::nullopt;
+    }
+    DemangledAS = true;
+  }
+
+  switch (L.Current()) {
+  default:
+    break;
+  case 'K':
+    PointerQual = eTypeQualPointerConst;
+    L.Consume();
+    break;
+  case 'r':
+    PointerQual = eTypeQualPointerRestrict;
+    L.Consume();
+    break;
+  case 'V':
+    PointerQual = eTypeQualPointerVolatile;
+    L.Consume();
+    break;
+  }
+
+  if (!DemangledAS && L.Consume("U3AS") && !L.ConsumeInteger(AddressSpace)) {
+    return std::nullopt;
+  }
+
+  return PointerASQuals{AddressSpace, PointerQual};
+}
+
+bool NameMangler::demangleType(Lexer &L, Type *&Ty, Type **PointerEltTy,
+                               TypeQualifiers &Quals,
+                               SmallVectorImpl<Type *> &CtxTypes,
+                               SmallVectorImpl<TypeQualifiers> &CtxQuals) {
+  Ty = nullptr;
+  if (L.Left() < 1) {
+    return false;
+  }
+
+  // Assume the element type is null, and set it if we find a pointer.
+  if (PointerEltTy) {
+    *PointerEltTy = nullptr;
+  }
+
+  // Match vector types.
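+  // e.g. "Dv4_f" denotes a vector of 4 floats (OpenCL float4).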
+ if (L.Consume("Dv")) { + const TypeQualifier VectorQual = eTypeQualNone; + unsigned NumElements = 0; + Quals.push_back(VectorQual); + if (!L.ConsumeInteger(NumElements) || !L.Consume("_")) { + return false; + } + + // Parse the vector element type. + Type *ElementType = nullptr; + if (!demangleType(L, ElementType, nullptr, Quals, CtxTypes, CtxQuals)) { + return false; + } + Ty = FixedVectorType::get(ElementType, NumElements); + return true; + } + + // Match opaque pointer types + if (L.Consume("u3ptr")) { + const auto QualsAS = demanglePointerQuals(L); + if (!QualsAS) { + return false; + } + Quals.push_back(QualsAS->Qual); + return llvm::PointerType::get(*Context, QualsAS->AS); + } + + // Match scalable vector types. + if (L.Consume("u")) { + unsigned TypeNameLength = 0; + if (!L.ConsumeInteger(TypeNameLength) || !L.Consume("nxv")) { + return false; + } + if (TypeNameLength > L.Left()) { + return false; + } + const TypeQualifier VectorQual = eTypeQualNone; + unsigned NumElements = 0; + Quals.push_back(VectorQual); + if (!L.ConsumeInteger(NumElements)) { + return false; + } + + // Parse the vector element type. + Type *ElementType = nullptr; + if (!demangleType(L, ElementType, nullptr, Quals, CtxTypes, CtxQuals)) { + return false; + } + Ty = llvm::VectorType::get(ElementType, + ElementCount::getScalable(NumElements)); + return true; + } + + // Match pointer types. + if (L.Consume("P")) { + const auto QualsAS = demanglePointerQuals(L); + if (!QualsAS) { + return false; + } + + Quals.push_back(QualsAS->Qual); + + // Parse the element type. + Type *ElementType = nullptr; + if (!demangleType(L, ElementType, nullptr, Quals, CtxTypes, CtxQuals)) { + return false; + } + assert(ElementType); + if (PointerEltTy) { + *PointerEltTy = ElementType; + } + Ty = llvm::PointerType::get(*Context, QualsAS->AS); + return true; + } + + // Match simple types. + TypeQualifier SimpleQual = eTypeQualNone; + if (demangleSimpleType(L, Ty, SimpleQual)) { + Quals.push_back(SimpleQual); + return true; + } + + // Handle substitutions. + if (L.Consume("S")) { + unsigned SubID = 0; + if (L.ConsumeInteger(SubID)) { + SubID++; + } + if (!L.Consume("_")) { + return false; + } + + // Resolve it, using a previous type and qualifier. 
+ const int entryIndex = resolveSubstitution(SubID, CtxTypes, CtxQuals); + if ((entryIndex < 0) || ((unsigned)entryIndex >= CtxTypes.size())) { + return false; + } + Ty = CtxTypes[entryIndex]; + Quals.push_back(CtxQuals[entryIndex]); + return true; + } + + if (demangleOpenCLBuiltinType(L, Ty)) { + return true; + } + + return false; +} + +StringRef NameMangler::demangleName(Lexer &L) { + unsigned NameLength = 0; + if (!L.Consume("_Z")) { + return StringRef(); + } else if (!L.ConsumeInteger(NameLength)) { + return StringRef(); + } else if (NameLength > L.Left()) { + return StringRef(); + } + StringRef Name = L.TextLeft().substr(0, NameLength); + L.Consume(NameLength); + return Name; +} + +//////////////////////////////////////////////////////////////////////////////// + +TypeQualifiers::TypeQualifiers() : storage_(0) {} + +TypeQualifiers::TypeQualifiers(TypeQualifier Qual) : storage_(0) { + push_back(Qual); +} + +TypeQualifiers::TypeQualifiers(TypeQualifier Qual1, TypeQualifier Qual2) + : storage_(0) { + push_back(Qual1); + push_back(Qual2); +} + +TypeQualifiers::TypeQualifiers(unsigned Qual) : storage_(0) { push_back(Qual); } + +TypeQualifiers::TypeQualifiers(unsigned Qual1, unsigned Qual2) : storage_(0) { + push_back(Qual1); + push_back(Qual2); +} + +TypeQualifiers::StorageT TypeQualifiers::getCount() const { + const StorageT Mask = ((1 << NumCountBits) - 1); + return storage_ & Mask; +} + +void TypeQualifiers::setCount(StorageT NewCount) { + const StorageT Mask = ((1 << NumCountBits) - 1); + // Clear the old count. + storage_ &= ~Mask; + // Set the new count. + storage_ |= ((NewCount << 0) & Mask); +} + +TypeQualifier TypeQualifiers::front() const { + const StorageT Size = getCount(); + if (Size == 0) { + return eTypeQualNone; + } + const unsigned Mask = ((1 << NumQualBits) - 1); + const unsigned Field = (storage_ >> NumCountBits) & Mask; + return (TypeQualifier)Field; +} + +TypeQualifier TypeQualifiers::pop_front() { + const TypeQualifier Qual = front(); + const StorageT Size = getCount(); + if (Size > 0) { + // Pop the field bits. + storage_ >>= NumQualBits; + // Set the new count, since the old one was overwritten. + setCount(Size - 1); + } + return Qual; +} + +TypeQualifier TypeQualifiers::at(unsigned Idx) const { + const StorageT Size = getCount(); + if (Idx >= Size) { + return eTypeQualNone; + } + const unsigned ShAmt = NumCountBits + (Idx * NumQualBits); + const unsigned Field = (storage_ >> ShAmt) & ((1 << NumQualBits) - 1); + return TypeQualifier(Field); +} + +bool TypeQualifiers::push_back(TypeQualifier Qual) { + const StorageT Size = getCount(); + if (Size == MaxSize) { + return false; + } + const unsigned Offset = NumCountBits + (Size * NumQualBits); + const unsigned Field = Qual & ((1 << NumQualBits) - 1); + storage_ |= (static_cast(Field) << Offset); + setCount(Size + 1); + return true; +} + +bool TypeQualifiers::push_back(unsigned Qual) { + return push_back((TypeQualifier)Qual); +} + +bool TypeQualifiers::push_back(TypeQualifiers Quals) { + while (Quals.getCount() > 0) { + if (!push_back(Quals.pop_front())) { + return false; + } + } + return true; +} + +//////////////////////////////////////////////////////////////////////////////// + +Lexer::Lexer(StringRef text) : Text(text), Pos(0) {} + +unsigned Lexer::Left() const { return Text.size() - Pos; } + +unsigned Lexer::CurrentPos() const { return Pos; } + +StringRef Lexer::TextLeft() const { return Text.substr(Pos); } + +int Lexer::Current() const { return Left() ? 
Text[Pos] : -1; } + +bool Lexer::Consume() { return Consume(1); } + +bool Lexer::Consume(unsigned Size) { + if (Left() < Size) { + return false; + } + Pos += Size; + return true; +} + +bool Lexer::Consume(StringRef Pattern) { + if (Left() < Pattern.size()) { + return false; + } else if (!TextLeft().starts_with(Pattern)) { + return false; + } + Pos += Pattern.size(); + return true; +} + +bool Lexer::ConsumeInteger(unsigned &Result) { + size_t NumDigits = 0; + size_t i = Pos; + while ((i < Text.size()) && isdigit(Text[i])) { + i++; + NumDigits++; + } + const StringRef NumText = Text.substr(Pos, NumDigits); + if (NumText.size() == 0) { + return false; + } + if (NumText.getAsInteger(10, Result)) { + return false; + } + Pos += NumDigits; + return true; +} + +bool Lexer::ConsumeSignedInteger(int &Result) { + size_t NumChars = 0; + size_t i = Pos; + if (Text[i] == '-') { + i++; + NumChars++; + } + while ((i < Text.size()) && isdigit(Text[i])) { + i++; + NumChars++; + } + const StringRef NumText = Text.substr(Pos, NumChars); + if (NumText.size() == 0) { + return false; + } + if (NumText.getAsInteger(10, Result)) { + return false; + } + Pos += NumChars; + return true; +} + +bool Lexer::ConsumeAlpha(StringRef &Result) { + size_t NumChars = 0; + size_t i = Pos; + while ((i < Text.size()) && isalpha(Text[i])) { + i++; + NumChars++; + } + if (NumChars == 0) { + return false; + } + Result = Text.substr(Pos, NumChars); + Pos += NumChars; + return true; +} + +bool Lexer::ConsumeAlphanumeric(StringRef &Result) { + size_t NumChars = 0; + size_t i = Pos; + while ((i < Text.size()) && isalnum(Text[i])) { + i++; + NumChars++; + } + if (NumChars == 0) { + return false; + } + Result = Text.substr(Pos, NumChars); + Pos += NumChars; + return true; +} + +bool Lexer::ConsumeUntil(char C, StringRef &Result) { + const size_t CPos = Text.find_first_of(C, Pos); + if (CPos == std::string::npos) { + Result = StringRef(); + return false; + } + Result = Text.substr(Pos, CPos - Pos); + Pos = CPos; + return true; +} + +bool Lexer::ConsumeWhitespace() { + bool consumed = false; + while (Pos < Text.size() && isspace(Text[Pos])) { + consumed = true; + ++Pos; + } + + return consumed; +} +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp new file mode 100644 index 0000000000000..985008873c7a8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp @@ -0,0 +1,395 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include + +using namespace llvm; + +namespace compiler { +namespace utils { + +uint32_t getOpenCLVersion(const llvm::Module &m) { + if (auto *const md = m.getNamedMetadata("opencl.ocl.version")) { + if (md->getNumOperands() == 1) { + auto *const op = md->getOperand(0); + if (op->getNumOperands() == 2) { + const auto major = + mdconst::extract(op->getOperand(0))->getZExtValue(); + const auto minor = + mdconst::extract(op->getOperand(1))->getZExtValue(); + return (major * 100 + minor) * 1000; + } + } + } + return OpenCLC12; +} + +static constexpr const char *ReqdWGSizeMD = "reqd_work_group_size"; + +static MDTuple *encodeVectorizationInfo(const VectorizationInfo &info, + LLVMContext &Ctx) { + auto *const i32Ty = Type::getInt32Ty(Ctx); + + return MDTuple::get( + Ctx, + {ConstantAsMetadata::get( + ConstantInt::get(i32Ty, info.vf.getKnownMinValue())), + ConstantAsMetadata::get(ConstantInt::get(i32Ty, info.vf.isScalable())), + ConstantAsMetadata::get(ConstantInt::get(i32Ty, info.simdDimIdx)), + ConstantAsMetadata::get( + ConstantInt::get(i32Ty, info.IsVectorPredicated))}); +} + +static std::optional extractVectorizationInfo(MDTuple *md) { + if (md->getNumOperands() != 4) { + return std::nullopt; + } + auto *const widthMD = mdconst::extract(md->getOperand(0)); + auto *const isScalableMD = mdconst::extract(md->getOperand(1)); + auto *const simdDimIdxMD = mdconst::extract(md->getOperand(2)); + auto *const isVPMD = mdconst::extract(md->getOperand(3)); + + VectorizationInfo info; + + info.vf = llvm::ElementCount::get(widthMD->getZExtValue(), + isScalableMD->equalsInt(1)); + info.simdDimIdx = simdDimIdxMD->getZExtValue(); + info.IsVectorPredicated = isVPMD->equalsInt(1); + + return info; +} + +static std::optional parseVectorLinkMD(MDNode *mdnode) { + if (auto info = + extractVectorizationInfo(dyn_cast(mdnode->getOperand(0)))) { + // The Function may well be null. 
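+    // (If the vectorized function has since been erased, LLVM's value
+    // tracking resets the ValueAsMetadata operand to null rather than
+    // leaving it dangling.)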
+ Function *vecFn = mdconst::extract_or_null(mdnode->getOperand(1)); + return LinkMetadataResult(vecFn, *info); + } + return std::nullopt; +} + +void encodeVectorizationFailedMetadata(Function &f, + const VectorizationInfo &info) { + auto *veczInfo = encodeVectorizationInfo(info, f.getContext()); + f.addMetadata("codeplay_ca_vecz.base.fail", *veczInfo); +} + +void linkOrigToVeczFnMetadata(Function &origF, Function &vectorF, + const VectorizationInfo &info) { + auto *veczInfo = encodeVectorizationInfo(info, origF.getContext()); + auto *const mdTuple = MDTuple::get( + origF.getContext(), {veczInfo, ValueAsMetadata::get(&vectorF)}); + origF.addMetadata("codeplay_ca_vecz.base", *mdTuple); +} + +void linkVeczToOrigFnMetadata(Function &vectorizedF, Function &origF, + const VectorizationInfo &info) { + auto *veczInfo = encodeVectorizationInfo(info, vectorizedF.getContext()); + auto *const mdTuple = MDTuple::get(origF.getContext(), + {veczInfo, ValueAsMetadata::get(&origF)}); + vectorizedF.addMetadata("codeplay_ca_vecz.derived", *mdTuple); +} + +static bool parseVectorizedFunctionLinkMetadata( + Function &f, StringRef mdName, + SmallVectorImpl &results) { + SmallVector nodes; + + f.getMetadata(mdName, nodes); + if (nodes.empty()) { + return false; + } + results.reserve(results.size() + nodes.size()); + for (auto *mdnode : nodes) { + if (auto link = parseVectorLinkMD(mdnode)) { + results.emplace_back(*link); + } else { + return false; + } + } + return true; +} + +bool parseOrigToVeczFnLinkMetadata(Function &f, + SmallVectorImpl &VFs) { + return parseVectorizedFunctionLinkMetadata(f, "codeplay_ca_vecz.base", VFs); +} + +std::optional parseVeczToOrigFnLinkMetadata(Function &f) { + auto *mdnode = f.getMetadata("codeplay_ca_vecz.derived"); + if (!mdnode) { + return std::nullopt; + } + return parseVectorLinkMD(mdnode); +} + +void dropVeczOrigMetadata(Function &f) { + f.setMetadata("codeplay_ca_vecz.base", nullptr); +} + +void dropVeczDerivedMetadata(Function &f) { + f.setMetadata("codeplay_ca_vecz.derived", nullptr); +} + +void encodeWrapperFnMetadata(Function &f, const VectorizationInfo &mainInfo, + std::optional tailInfo) { + MDTuple *tailInfoMD = nullptr; + auto *mainInfoMD = encodeVectorizationInfo(mainInfo, f.getContext()); + + if (tailInfo) { + tailInfoMD = encodeVectorizationInfo(*tailInfo, f.getContext()); + } + + f.setMetadata("codeplay_ca_wrapper", + MDTuple::get(f.getContext(), {mainInfoMD, tailInfoMD})); +} + +std::optional>> +parseWrapperFnMetadata(Function &f) { + auto *const mdnode = f.getMetadata("codeplay_ca_wrapper"); + if (!mdnode || mdnode->getNumOperands() != 2) { + return std::nullopt; + } + + auto *const mainTuple = dyn_cast_or_null(mdnode->getOperand(0)); + if (!mainTuple) { + return std::nullopt; + } + + VectorizationInfo mainInfo; + std::optional tailInfo; + + if (auto info = extractVectorizationInfo(mainTuple)) { + mainInfo = *info; + } else { + return std::nullopt; + } + + if (auto *const tailTuple = + dyn_cast_or_null(mdnode->getOperand(1))) { + if (auto info = extractVectorizationInfo(tailTuple)) { + tailInfo = info; + } + } + + return std::make_pair(mainInfo, tailInfo); +} + +void copyFunctionMetadata(Function &fromF, Function &toF, bool includeDebug) { + if (includeDebug) { + toF.copyMetadata(&fromF, 0); + return; + } + // Copy the metadata into the new kernel ignoring any debug info. + SmallVector, 5> metadata; + fromF.getAllMetadata(metadata); + + // Iterate through the metadata and only add nodes to the new one if they + // are not debug info. 
+ for (const auto &pair : metadata) { + if (auto *nonDebug = dyn_cast_or_null(pair.second)) { + toF.setMetadata(pair.first, nonDebug); + } + } +} + +void encodeLocalSizeMetadata(Function &f, const std::array &size) { + // We may be truncating i64 to i32 but we don't expect local sizes to ever + // exceed 32 bits. + auto *const i32Ty = Type::getInt32Ty(f.getContext()); + auto *const mdTuple = + MDTuple::get(f.getContext(), + {ConstantAsMetadata::get(ConstantInt::get(i32Ty, size[0])), + ConstantAsMetadata::get(ConstantInt::get(i32Ty, size[1])), + ConstantAsMetadata::get(ConstantInt::get(i32Ty, size[2]))}); + f.setMetadata(ReqdWGSizeMD, mdTuple); +} + +std::optional> getLocalSizeMetadata(const Function &f) { + if (auto *md = f.getMetadata(ReqdWGSizeMD)) { + return std::array{ + mdconst::extract(md->getOperand(0))->getZExtValue(), + mdconst::extract(md->getOperand(1))->getZExtValue(), + mdconst::extract(md->getOperand(2))->getZExtValue()}; + } + return std::nullopt; +} + +static constexpr const char *MuxScheduledFnMD = "mux_scheduled_fn"; + +void dropSchedulingParameterMetadata(Function &f) { + f.setMetadata(MuxScheduledFnMD, nullptr); +} + +SmallVector getSchedulingParameterFunctionMetadata(const Function &f) { + SmallVector idxs; + if (auto *md = f.getMetadata(MuxScheduledFnMD)) { + for (auto &op : md->operands()) { + idxs.push_back(mdconst::extract(op)->getSExtValue()); + } + } + return idxs; +} + +void setSchedulingParameterFunctionMetadata(Function &f, ArrayRef idxs) { + if (idxs.empty()) { + return; + } + SmallVector mdOps; + auto *const i32Ty = Type::getInt32Ty(f.getContext()); + for (auto idx : idxs) { + mdOps.push_back(ConstantAsMetadata::get(ConstantInt::get(i32Ty, idx))); + } + auto *const mdOpsTuple = MDTuple::get(f.getContext(), mdOps); + f.setMetadata(MuxScheduledFnMD, mdOpsTuple); +} + +static constexpr const char *MuxSchedulingParamsMD = "mux-scheduling-params"; + +void setSchedulingParameterModuleMetadata(Module &m, + ArrayRef names) { + SmallVector paramDebugNames; + for (const auto &name : names) { + paramDebugNames.push_back(MDString::get(m.getContext(), name)); + } + auto *const md = m.getOrInsertNamedMetadata(MuxSchedulingParamsMD); + md->clearOperands(); + md->addOperand(MDNode::get(m.getContext(), paramDebugNames)); +} + +NamedMDNode *getSchedulingParameterModuleMetadata(const Module &m) { + return m.getNamedMetadata(MuxSchedulingParamsMD); +} + +std::optional isSchedulingParameter(const Function &f, unsigned idx) { + if (auto *md = f.getMetadata(MuxScheduledFnMD)) { + for (const auto &op : enumerate(md->operands())) { + auto paramIdx = mdconst::extract(op.value())->getSExtValue(); + if (paramIdx >= 0 && (unsigned)paramIdx == idx) { + return op.index(); + } + } + } + return std::nullopt; +} + +// Uses the format of a metadata node directly applied to a function. +std::optional> +parseRequiredWGSMetadata(const Function &f) { + if (auto mdnode = f.getMetadata(ReqdWGSizeMD)) { + std::array wgs = {0, 1, 1}; + assert(mdnode->getNumOperands() >= 1 && mdnode->getNumOperands() <= 3 && + "Unsupported number of operands in reqd_work_group_size"); + for (const auto &[idx, op] : enumerate(mdnode->operands())) { + wgs[idx] = mdconst::extract(op)->getZExtValue(); + } + return wgs; + } + return std::nullopt; +} + +// Uses the format of a metadata node that's a part of the opencl.kernels node. 
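+// For example:
+//   !opencl.kernels = !{!0}
+//   !0 = !{ptr @foo, !1}
+//   !1 = !{!"reqd_work_group_size", i32 8, i32 8, i32 1}
+// Operand 0 is the kernel function itself; each subsequent operand is a
+// tuple whose first element names a kernel attribute.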
+std::optional> +parseRequiredWGSMetadata(const MDNode &node) { + for (uint32_t i = 1; i < node.getNumOperands(); ++i) { + MDNode *const subNode = cast(node.getOperand(i)); + MDString *const operandName = cast(subNode->getOperand(0)); + if (operandName->getString() == ReqdWGSizeMD) { + auto *const op0 = mdconst::extract(subNode->getOperand(1)); + auto *const op1 = mdconst::extract(subNode->getOperand(2)); + auto *const op2 = mdconst::extract(subNode->getOperand(3)); + // KLOCWORK "UNINIT.STACK.ARRAY.MUST" possible false positive + // Initialization of looks like an uninitialized access to Klocwork + std::array wgs = { + {op0->getZExtValue(), op1->getZExtValue(), op2->getZExtValue()}}; + return wgs; + } + } + return std::nullopt; +} + +std::optional parseMaxWorkDimMetadata(const Function &f) { + if (auto *mdnode = f.getMetadata("max_work_dim")) { + auto *op0 = mdconst::extract(mdnode->getOperand(0)); + return op0->getZExtValue(); + } + + return std::nullopt; +} + +void populateKernelList(Module &m, SmallVectorImpl &results) { + // Construct list of kernels from metadata, if present. + if (auto *md = m.getNamedMetadata("opencl.kernels")) { + for (uint32_t i = 0, e = md->getNumOperands(); i < e; ++i) { + MDNode *const kernelNode = md->getOperand(i); + ValueAsMetadata *vmdKernel = + cast(kernelNode->getOperand(0)); + KernelInfo info{vmdKernel->getValue()->getName()}; + if (auto wgs = parseRequiredWGSMetadata(*kernelNode)) { + info.ReqdWGSize = *wgs; + } + results.push_back(info); + } + return; + } + + // No metadata - assume all functions with the SPIR_KERNEL calling + // convention are kernels. + for (auto &f : m) { + if (f.hasName() && f.getCallingConv() == CallingConv::SPIR_KERNEL) { + KernelInfo info(f.getName()); + if (auto wgs = parseRequiredWGSMetadata(f)) { + info.ReqdWGSize = *wgs; + } + results.push_back(info); + } + } +} + +void replaceKernelInOpenCLKernelsMetadata(Function &fromF, Function &toF, + Module &M) { + // update the kernel metadata + if (auto *const namedMD = M.getNamedMetadata("opencl.kernels")) { + for (auto *md : namedMD->operands()) { + if (md && md->getOperand(0) == ValueAsMetadata::get(&fromF)) { + md->replaceOperandWith(0, ValueAsMetadata::get(&toF)); + } + } + } +} + +static constexpr const char *ReqdSGSizeMD = "intel_reqd_sub_group_size"; + +void encodeReqdSubgroupSizeMetadata(Function &f, uint32_t size) { + auto *const i32Ty = Type::getInt32Ty(f.getContext()); + auto *const mdTuple = MDTuple::get( + f.getContext(), ConstantAsMetadata::get(ConstantInt::get(i32Ty, size))); + f.setMetadata(ReqdSGSizeMD, mdTuple); +} + +std::optional getReqdSubgroupSize(const Function &f) { + if (auto *md = f.getMetadata(ReqdSGSizeMD)) { + return mdconst::extract(md->getOperand(0))->getZExtValue(); + } + return std::nullopt; +} + +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp new file mode 100644 index 0000000000000..51268147e1345 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp @@ -0,0 +1,1319 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace llvm; + +namespace compiler { +namespace utils { + +namespace SchedParamIndices { +enum { + WI = 0, + WG = 1, + TOTAL = 2, +}; +} + +static Function *defineLocalWorkItemBuiltin(BIMuxInfoConcept &BI, BuiltinID ID, + Module &M) { + // Simple 'local' work-item getters and setters. + bool IsSetter = false; + bool HasRankArg = false; + std::optional WIFieldIdx; + switch (ID) { + default: + return nullptr; + case eMuxBuiltinSetLocalId: + IsSetter = true; + LLVM_FALLTHROUGH; + case eMuxBuiltinGetLocalId: + HasRankArg = true; + WIFieldIdx = WorkItemInfoStructField::local_id; + break; + case eMuxBuiltinSetSubGroupId: + IsSetter = true; + LLVM_FALLTHROUGH; + case eMuxBuiltinGetSubGroupId: + WIFieldIdx = WorkItemInfoStructField::sub_group_id; + break; + case eMuxBuiltinSetNumSubGroups: + IsSetter = true; + LLVM_FALLTHROUGH; + case eMuxBuiltinGetNumSubGroups: + WIFieldIdx = WorkItemInfoStructField::num_sub_groups; + break; + case eMuxBuiltinSetMaxSubGroupSize: + IsSetter = true; + LLVM_FALLTHROUGH; + case eMuxBuiltinGetMaxSubGroupSize: + WIFieldIdx = WorkItemInfoStructField::max_sub_group_size; + break; + } + + Function *F = M.getFunction(BuiltinInfo::getMuxBuiltinName(ID)); + assert(F && WIFieldIdx); + + // Gather up the list of scheduling parameters on this builtin + const auto &SchedParams = BI.getFunctionSchedulingParameters(*F); + assert(SchedParamIndices::WI < SchedParams.size()); + + // Grab the work-item info argument + const auto &SchedParam = SchedParams[SchedParamIndices::WI]; + auto *const StructTy = dyn_cast(SchedParam.ParamPointeeTy); + assert(SchedParam.ArgVal && StructTy == getWorkItemInfoStructTy(M) && + "Inconsistent scheduling parameter data"); + + if (IsSetter) { + populateStructSetterFunction(*F, *SchedParam.ArgVal, StructTy, *WIFieldIdx, + HasRankArg); + } else { + populateStructGetterFunction(*F, *SchedParam.ArgVal, StructTy, *WIFieldIdx, + HasRankArg); + } + + return F; +} + +static Function *defineLocalWorkGroupBuiltin(BIMuxInfoConcept &BI, BuiltinID ID, + Module &M) { + // Simple work-group getters + bool HasRankArg = true; + size_t DefaultVal = 0; + std::optional WGFieldIdx; + switch (ID) { + default: + return nullptr; + case eMuxBuiltinGetLocalSize: + DefaultVal = 1; + WGFieldIdx = WorkGroupInfoStructField::local_size; + break; + case eMuxBuiltinGetGroupId: + DefaultVal = 0; + WGFieldIdx = WorkGroupInfoStructField::group_id; + break; + case eMuxBuiltinGetNumGroups: + DefaultVal = 1; + WGFieldIdx = WorkGroupInfoStructField::num_groups; + break; + case eMuxBuiltinGetGlobalOffset: + DefaultVal = 0; + WGFieldIdx = WorkGroupInfoStructField::global_offset; + break; + case eMuxBuiltinGetWorkDim: + DefaultVal = 1; + HasRankArg = false; + WGFieldIdx = WorkGroupInfoStructField::work_dim; + break; + } + + Function *F = M.getFunction(BuiltinInfo::getMuxBuiltinName(ID)); + assert(F && WGFieldIdx); + + // Gather 
up the list of scheduling parameters on this builtin + const auto &SchedParams = BI.getFunctionSchedulingParameters(*F); + assert(SchedParamIndices::WG < SchedParams.size()); + + // Grab the work-group info argument + const auto &SchedParam = SchedParams[SchedParamIndices::WG]; + auto *const StructTy = dyn_cast(SchedParam.ParamPointeeTy); + assert(SchedParam.ArgVal && StructTy == getWorkGroupInfoStructTy(M) && + "Inconsistent scheduling parameter data"); + + populateStructGetterFunction(*F, *SchedParam.ArgVal, StructTy, *WGFieldIdx, + HasRankArg, DefaultVal); + return F; +} + +// FIXME: Assumes a sub-group size of 1. +static Function *defineSubGroupGroupOpBuiltin(Function &F, + GroupCollective GroupOp, + ArrayRef OverloadInfo) { + if (!GroupOp.isSubGroupScope()) { + return nullptr; + } + + auto *Arg = F.getArg(0); + + IRBuilder<> B(BasicBlock::Create(F.getContext(), "entry", &F)); + + [&] { + switch (GroupOp.Op) { + case GroupCollective::OpKind::Any: + case GroupCollective::OpKind::All: + case GroupCollective::OpKind::Broadcast: + case GroupCollective::OpKind::Reduction: + case GroupCollective::OpKind::ScanInclusive: + // In the trivial size=1 case, all of these operations just return the + // argument back again + B.CreateRet(Arg); + return; + case GroupCollective::OpKind::ScanExclusive: { + // In the trivial size=1 case, exclusive scans return the identity. + assert(!OverloadInfo.empty()); + auto *const IdentityVal = + getIdentityVal(GroupOp.Recurrence, OverloadInfo[0]); + assert(IdentityVal && "Unable to deduce identity val"); + B.CreateRet(IdentityVal); + return; + } + case GroupCollective::OpKind::Shuffle: + case GroupCollective::OpKind::ShuffleXor: + // In the trivial size=1 case, all of these operations just return the + // argument back again. Any computed shuffle index other than the only + // one in the sub-group would be out of bounds anyway. + B.CreateRet(Arg); + return; + case GroupCollective::OpKind::ShuffleUp: { + auto *const Prev = F.getArg(0); + auto *const Curr = F.getArg(1); + auto *const Delta = F.getArg(2); + // In the trivial size=1 case, negative delta is the desired index + // (since we're subtracting it from zero). If it's greater than zero and + // less than the size, we return 'current', else if it's less than zero + // and greater than or equal to the negative size, we return 'prev'. So + // if 'delta' is zero, return 'current', else return 'prev'. Anything + // else is out of bounds so we can simplify things here. + auto *const EqZero = B.CreateICmpEQ(Delta, B.getInt32(0), "eqzero"); + auto *const Sel = B.CreateSelect(EqZero, Curr, Prev, "sel"); + B.CreateRet(Sel); + return; + } + case GroupCollective::OpKind::ShuffleDown: { + auto *const Curr = F.getArg(0); + auto *const Next = F.getArg(1); + auto *const Delta = F.getArg(2); + // In the trivial size=1 case, the delta is the desired index (since + // we're adding it to zero). If it's less than the size, we return + // 'current', else if it's greater or equal to the size but less than + // twice the size, we return 'next'. So if 'delta' is zero, return + // 'current', else return 'next'. Anything else is out of bounds so we + // can simplify things here. 
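+      // Concretely, with a single-invocation sub-group:
+      //   sub_group_shuffle_down(curr, next, 0) -> curr
+      //   sub_group_shuffle_down(curr, next, 1) -> next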
+ auto *const EqZero = B.CreateICmpEQ(Delta, B.getInt32(0), "eqzero"); + auto *const Sel = B.CreateSelect(EqZero, Curr, Next, "sel"); + B.CreateRet(Sel); + return; + } + } + + llvm_unreachable("Unhandled group operation"); + }(); + + return &F; +} + +static Value *createCallHelper(IRBuilder<> &B, Function &F, + ArrayRef Args) { + auto *const CI = B.CreateCall(&F, Args); + CI->setAttributes(F.getAttributes()); + CI->setCallingConv(F.getCallingConv()); + return CI; +} + +void BIMuxInfoConcept::setDefaultBuiltinAttributes(Function &F, + bool AlwaysInline) { + // Many of our mux builtin functions are marked alwaysinline (unless they're + // already marked noinline) + if (AlwaysInline && !F.hasFnAttribute(Attribute::NoInline)) { + F.addFnAttr(Attribute::AlwaysInline); + } + // We never use exceptions + F.addFnAttr(Attribute::NoUnwind); + // Recursion is not supported in ComputeMux + F.addFnAttr(Attribute::NoRecurse); +} + +Function *BIMuxInfoConcept::defineGetGlobalId(Module &M) { + Function *F = + M.getFunction(BuiltinInfo::getMuxBuiltinName(eMuxBuiltinGetGlobalId)); + assert(F); + setDefaultBuiltinAttributes(*F); + F->setLinkage(GlobalValue::InternalLinkage); + + // Create an IR builder with a single basic block in our function + IRBuilder<> B(BasicBlock::Create(M.getContext(), "entry", F)); + + auto *const MuxGetGroupIdFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetGroupId, M); + auto *const MuxGetGlobalOffsetFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetGlobalOffset, M); + auto *const MuxGetLocalIdFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalId, M); + auto *const MuxGetLocalSizeFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalSize, M); + assert(MuxGetGroupIdFn && MuxGetGlobalOffsetFn && MuxGetLocalIdFn && + MuxGetLocalSizeFn); + + // Pass on all arguments through to dependent builtins. We expect that each + // function has identical prototypes, regardless of whether scheduling + // parameters have been added + const SmallVector Args(make_pointer_range(F->args())); + + auto *const GetGroupIdCall = createCallHelper(B, *MuxGetGroupIdFn, Args); + auto *const GetGlobalOffsetCall = + createCallHelper(B, *MuxGetGlobalOffsetFn, Args); + auto *const GetLocalIdCall = createCallHelper(B, *MuxGetLocalIdFn, Args); + auto *const GetLocalSizeCall = createCallHelper(B, *MuxGetLocalSizeFn, Args); + + // (get_group_id(i) * get_local_size(i)) + auto *Ret = B.CreateMul(GetGroupIdCall, GetLocalSizeCall); + // (get_group_id(i) * get_local_size(i)) + get_local_id(i) + Ret = B.CreateAdd(Ret, GetLocalIdCall); + // get_global_id(i) = (get_group_id(i) * get_local_size(i)) + + // get_local_id(i) + get_global_offset(i) + Ret = B.CreateAdd(Ret, GetGlobalOffsetCall); + + // ... and return our result + B.CreateRet(Ret); + return F; +} + +// FIXME: Assumes a sub-group size of 1. +Function *BIMuxInfoConcept::defineGetSubGroupSize(Function &F) { + setDefaultBuiltinAttributes(F); + F.setLinkage(GlobalValue::InternalLinkage); + + IRBuilder<> B(BasicBlock::Create(F.getContext(), "entry", &F)); + + assert(F.getReturnType() == B.getInt32Ty()); + B.CreateRet(B.getInt32(1)); + + return &F; +} + +// FIXME: Assumes a sub-group size of 1. 
+Function *BIMuxInfoConcept::defineGetSubGroupLocalId(Function &F) { + setDefaultBuiltinAttributes(F); + F.setLinkage(GlobalValue::InternalLinkage); + + IRBuilder<> B(BasicBlock::Create(F.getContext(), "entry", &F)); + + assert(F.getReturnType() == B.getInt32Ty()); + B.CreateRet(B.getInt32(0)); + + return &F; +} + +Function *BIMuxInfoConcept::defineGetGlobalSize(Module &M) { + Function *F = + M.getFunction(BuiltinInfo::getMuxBuiltinName(eMuxBuiltinGetGlobalSize)); + assert(F); + setDefaultBuiltinAttributes(*F); + F->setLinkage(GlobalValue::InternalLinkage); + + auto *const MuxGetNumGroupsFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetNumGroups, M); + auto *const MuxGetLocalSizeFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalSize, M); + assert(MuxGetNumGroupsFn && MuxGetLocalSizeFn); + + // create an IR builder with a single basic block in our function + IRBuilder<> B(BasicBlock::Create(M.getContext(), "", F)); + + // Pass on all arguments through to dependent builtins. We expect that each + // function has identical prototypes, regardless of whether scheduling + // parameters have been added + const SmallVector Args(make_pointer_range(F->args())); + + // call get_num_groups + auto *const GetNumGroupsCall = createCallHelper(B, *MuxGetNumGroupsFn, Args); + + // call get_local_size + auto *const GetLocalSizeCall = createCallHelper(B, *MuxGetLocalSizeFn, Args); + + // get_global_size(i) = get_num_groups(i) * get_local_size(i) + auto *const Ret = B.CreateMul(GetNumGroupsCall, GetLocalSizeCall); + + // and return our result + B.CreateRet(Ret); + return F; +} + +Function *BIMuxInfoConcept::defineGetLocalLinearId(Module &M) { + Function *F = M.getFunction( + BuiltinInfo::getMuxBuiltinName(eMuxBuiltinGetLocalLinearId)); + assert(F); + setDefaultBuiltinAttributes(*F); + F->setLinkage(GlobalValue::InternalLinkage); + + auto *const MuxGetLocalIdFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalId, M); + auto *const MuxGetLocalSizeFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalSize, M); + assert(MuxGetLocalIdFn && MuxGetLocalSizeFn); + + // Create a call to all the required builtins. + IRBuilder<> B(BasicBlock::Create(M.getContext(), "", F)); + + // Pass on all arguments through to dependent builtins. Ignoring the index + // parameters we'll add, we expect that each function has identical + // prototypes, regardless of whether scheduling parameters have been added + SmallVector Args(make_pointer_range(F->args())); + + SmallVector Idx0Args = {B.getInt32(0)}; + append_range(Idx0Args, Args); + SmallVector Idx1Args = {B.getInt32(1)}; + append_range(Idx1Args, Args); + SmallVector Idx2Args = {B.getInt32(2)}; + append_range(Idx2Args, Args); + + auto *const GetLocalIDXCall = createCallHelper(B, *MuxGetLocalIdFn, Idx0Args); + auto *const GetLocalIDYCall = createCallHelper(B, *MuxGetLocalIdFn, Idx1Args); + auto *const GetLocalIDZCall = createCallHelper(B, *MuxGetLocalIdFn, Idx2Args); + + auto *const GetLocalSizeXCall = + createCallHelper(B, *MuxGetLocalSizeFn, Idx0Args); + auto *const GetLocalSizeYCall = + createCallHelper(B, *MuxGetLocalSizeFn, Idx1Args); + + // get_local_id(2) * get_local_size(1). + auto *ZTerm = B.CreateMul(GetLocalIDZCall, GetLocalSizeYCall); + // get_local_id(2) * get_local_size(1) * get_local_size(0). + ZTerm = B.CreateMul(ZTerm, GetLocalSizeXCall); + + // get_local_id(1) * get_local_size(0). + auto *const YTerm = B.CreateMul(GetLocalIDYCall, GetLocalSizeXCall); + + // get_local_id(2) * get_local_size(1) * get_local_size(0) + + // get_local_id(1) * get_local_size(0). 
+ auto *Ret = B.CreateAdd(ZTerm, YTerm); + // get_local_id(2) * get_local_size(1) * get_local_size(0) + + // get_local_id(1) * get_local_size(0) + get_local_id(0). + Ret = B.CreateAdd(Ret, GetLocalIDXCall); + + B.CreateRet(Ret); + return F; +} + +Function *BIMuxInfoConcept::defineGetGlobalLinearId(Module &M) { + Function *F = M.getFunction( + BuiltinInfo::getMuxBuiltinName(eMuxBuiltinGetGlobalLinearId)); + assert(F); + setDefaultBuiltinAttributes(*F); + F->setLinkage(GlobalValue::InternalLinkage); + + auto *const MuxGetGlobalIdFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetGlobalId, M); + auto *const MuxGetGlobalOffsetFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetGlobalOffset, M); + auto *const MuxGetGlobalSizeFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetGlobalSize, M); + assert(MuxGetGlobalIdFn && MuxGetGlobalOffsetFn && MuxGetGlobalSizeFn); + + // Create a call to all the required builtins. + IRBuilder<> B(BasicBlock::Create(M.getContext(), "", F)); + + // Pass on all arguments through to dependent builtins. Ignoring the index + // parameters we'll add, we expect that each function has identical + // prototypes, regardless of whether scheduling parameters have been added + SmallVector Args(make_pointer_range(F->args())); + + SmallVector Idx0Args = {B.getInt32(0)}; + append_range(Idx0Args, Args); + SmallVector Idx1Args = {B.getInt32(1)}; + append_range(Idx1Args, Args); + SmallVector Idx2Args = {B.getInt32(2)}; + append_range(Idx2Args, Args); + + auto *const GetGlobalIDXCall = + createCallHelper(B, *MuxGetGlobalIdFn, Idx0Args); + auto *const GetGlobalIDYCall = + createCallHelper(B, *MuxGetGlobalIdFn, Idx1Args); + auto *const GetGlobalIDZCall = + createCallHelper(B, *MuxGetGlobalIdFn, Idx2Args); + + auto *const GetGlobalOffsetXCall = + createCallHelper(B, *MuxGetGlobalOffsetFn, Idx0Args); + auto *const GetGlobalOffsetYCall = + createCallHelper(B, *MuxGetGlobalOffsetFn, Idx1Args); + auto *const GetGlobalOffsetZCall = + createCallHelper(B, *MuxGetGlobalOffsetFn, Idx2Args); + + auto *const GetGlobalSizeXCall = + createCallHelper(B, *MuxGetGlobalSizeFn, Idx0Args); + auto *const GetGlobalSizeYCall = + createCallHelper(B, *MuxGetGlobalSizeFn, Idx1Args); + + // global linear id is calculated as follows: + // get_global_linear_id() = + // (get_global_id(2) - get_global_offset(2)) * get_global_size(1) * + // get_global_size(0) + (get_global_id(1) - get_global_offset(1)) * + // get_global_size(0) + get_global_id(0) - get_global_offset(0). + // = + // ((get_global_id(2) - get_global_offset(2)) * get_global_size(1) + + // get_global_id(1) - get_global_offset(1)) * get_global_size(0) + + // get_global_id(0) - get_global_offset(0). + + auto *ZTerm = B.CreateSub(GetGlobalIDZCall, GetGlobalOffsetZCall); + // (get_global_id(2) - get_global_offset(2)) * get_global_size(1). + ZTerm = B.CreateMul(ZTerm, GetGlobalSizeYCall); + + // get_global_id(1) - get_global_offset(1). + auto *const YTerm = B.CreateSub(GetGlobalIDYCall, GetGlobalOffsetYCall); + + // (get_global_id(2) - get_global_offset(2)) * get_global_size(1) + + // get_global_id(1) - get_global_offset(1) + auto *YZTermsCombined = B.CreateAdd(ZTerm, YTerm); + + // ((get_global_id(2) - get_global_offset(2)) * get_global_size(1) + + // get_global_id(1) - get_global_offset(1)) * get_global_size(0). + YZTermsCombined = B.CreateMul(YZTermsCombined, GetGlobalSizeXCall); + + // get_global_id(0) - get_global_offset(0). 
+  auto *const XTerm = B.CreateSub(GetGlobalIDXCall, GetGlobalOffsetXCall);
+
+  // ((get_global_id(2) - get_global_offset(2)) * get_global_size(1) +
+  // get_global_id(1) - get_global_offset(1)) * get_global_size(0) +
+  // get_global_id(0) - get_global_offset(0).
+  auto *const Ret = B.CreateAdd(XTerm, YZTermsCombined);
+
+  B.CreateRet(Ret);
+  return F;
+}
+
+Function *BIMuxInfoConcept::defineGetEnqueuedLocalSize(Module &M) {
+  Function *F = M.getFunction(
+      BuiltinInfo::getMuxBuiltinName(eMuxBuiltinGetEnqueuedLocalSize));
+  assert(F);
+  setDefaultBuiltinAttributes(*F);
+  F->setLinkage(GlobalValue::InternalLinkage);
+
+  auto *const MuxGetLocalSizeFn =
+      getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalSize, M);
+  assert(MuxGetLocalSizeFn);
+
+  IRBuilder<> B(BasicBlock::Create(M.getContext(), "", F));
+
+  // Pass on all arguments through to dependent builtins. We expect that each
+  // function has identical prototypes, regardless of whether scheduling
+  // parameters have been added
+  const SmallVector Args(make_pointer_range(F->args()));
+
+  // Since we don't support non-uniform subgroups,
+  // get_enqueued_local_size(x) == get_local_size(x).
+  auto *const GetLocalSize = createCallHelper(B, *MuxGetLocalSizeFn, Args);
+
+  B.CreateRet(GetLocalSize);
+  return F;
+}
+
+Function *BIMuxInfoConcept::defineMemBarrier(Function &F, unsigned,
+                                             unsigned SemanticsIdx) {
+  // FIXME: We're ignoring some operands here. We're dropping the 'scope' but
+  // our default set of targets can't make use of anything but a
+  // single-threaded fence. We're also ignoring the kind of memory being
+  // controlled by the barrier.
+  auto &M = *F.getParent();
+  setDefaultBuiltinAttributes(F);
+  F.setLinkage(GlobalValue::InternalLinkage);
+  IRBuilder<> B(BasicBlock::Create(M.getContext(), "", &F));
+
+  // Grab the semantics argument.
+  Value *Semantics = F.getArg(SemanticsIdx);
+  // Mask out only the memory ordering value.
+  Semantics = B.CreateAnd(Semantics, B.getInt32(MemSemanticsMask));
+
+  // Don't insert this exit block just yet
+  auto *const ExitBB = BasicBlock::Create(M.getContext(), "exit");
+
+  auto *const DefaultBB =
+      BasicBlock::Create(M.getContext(), "case.default", &F);
+  auto *const Switch = B.CreateSwitch(Semantics, DefaultBB);
+
+  const struct {
+    StringRef Name;
+    unsigned SwitchVal;
+    AtomicOrdering Ordering;
+  } Data[4] = {
+      {"case.acquire", MemSemanticsAcquire, AtomicOrdering::Acquire},
+      {"case.release", MemSemanticsRelease, AtomicOrdering::Release},
+      {"case.acq_rel", MemSemanticsAcquireRelease,
+       AtomicOrdering::AcquireRelease},
+      {"case.seq_cst", MemSemanticsSequentiallyConsistent,
+       AtomicOrdering::SequentiallyConsistent},
+  };
+
+  for (const auto &D : Data) {
+    auto *const BB = BasicBlock::Create(M.getContext(), D.Name, &F);
+
+    Switch->addCase(B.getInt32(D.SwitchVal), BB);
+    B.SetInsertPoint(BB);
+    B.CreateFence(D.Ordering, SyncScope::SingleThread);
+    B.CreateBr(ExitBB);
+  }
+
+  // The default case assumes a 'relaxed' ordering and emits no fence
+  // whatsoever.
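+  // (Illustrative example: a semantics operand carrying Acquire ordering
+  // alongside any memory-kind bits is masked down to MemSemanticsAcquire
+  // above, reaches case.acquire, and emits
+  //   fence syncscope("singlethread") acquire
+  // while a relaxed ordering reaches this default case.)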
+ B.SetInsertPoint(DefaultBB); + B.CreateBr(ExitBB); + + ExitBB->insertInto(&F); + B.SetInsertPoint(ExitBB); + B.CreateRetVoid(); + + return &F; +} + +static BasicBlock *copy1D(Module &M, BasicBlock &ParentBB, Value *DstPtr, + Value *SrcPtr, Value *NumBytes) { + Type *const I8Ty = IntegerType::get(M.getContext(), 8); + + assert(SrcPtr->getType()->isPointerTy() && + "Mux DMA builtins are always byte-accessed"); + assert(DstPtr->getType()->isPointerTy() && + "Mux DMA builtins are always byte-accessed"); + + compiler::utils::CreateLoopOpts opts; + opts.IVs = {SrcPtr, DstPtr}; + opts.loopIVNames = {"dma.src", "dma.dst"}; + + // This is a simple loop copy a byte at a time from SrcPtr to DstPtr. + BasicBlock *ExitBB = compiler::utils::createLoop( + &ParentBB, nullptr, ConstantInt::get(getSizeType(M), 0), NumBytes, opts, + [&](BasicBlock *BB, Value *X, ArrayRef IVsCurr, + MutableArrayRef IVsNext) { + IRBuilder<> B(BB); + Value *const CurrentDmaSrcPtr1DPhi = IVsCurr[0]; + Value *const CurrentDmaDstPtr1DPhi = IVsCurr[1]; + Value *load = B.CreateLoad(I8Ty, CurrentDmaSrcPtr1DPhi); + B.CreateStore(load, CurrentDmaDstPtr1DPhi); + IVsNext[0] = B.CreateGEP(I8Ty, CurrentDmaSrcPtr1DPhi, + ConstantInt::get(X->getType(), 1)); + IVsNext[1] = B.CreateGEP(I8Ty, CurrentDmaDstPtr1DPhi, + ConstantInt::get(X->getType(), 1)); + return BB; + }); + + return ExitBB; +} + +static BasicBlock *copy2D(Module &M, BasicBlock &ParentBB, Value *DstPtr, + Value *SrcPtr, Value *LineSizeBytes, + Value *LineStrideDst, Value *LineStrideSrc, + Value *NumLines) { + Type *const I8Ty = IntegerType::get(M.getContext(), 8); + + assert(SrcPtr->getType()->isPointerTy() && + "Mux DMA builtins are always byte-accessed"); + assert(DstPtr->getType()->isPointerTy() && + "Mux DMA builtins are always byte-accessed"); + + compiler::utils::CreateLoopOpts opts; + opts.IVs = {SrcPtr, DstPtr}; + opts.loopIVNames = {"dma.src", "dma.dst"}; + + // This is a loop over the range of lines, calling a 1D copy on each line + BasicBlock *ExitBB = compiler::utils::createLoop( + &ParentBB, nullptr, ConstantInt::get(getSizeType(M), 0), NumLines, opts, + [&](BasicBlock *block, Value *, ArrayRef IVsCurr, + MutableArrayRef IVsNext) { + IRBuilder<> loopIr(block); + Value *CurrentDmaSrcPtrPhi = IVsCurr[0]; + Value *CurrentDmaDstPtrPhi = IVsCurr[1]; + + IVsNext[0] = loopIr.CreateGEP(I8Ty, CurrentDmaSrcPtrPhi, LineStrideSrc); + IVsNext[1] = loopIr.CreateGEP(I8Ty, CurrentDmaDstPtrPhi, LineStrideDst); + return copy1D(M, *block, CurrentDmaDstPtrPhi, CurrentDmaSrcPtrPhi, + LineSizeBytes); + }); + + return ExitBB; +} + +Function *BIMuxInfoConcept::defineDMA1D(Function &F) { + Argument *const ArgDstPtr = F.getArg(0); + Argument *const ArgSrcPtr = F.getArg(1); + Argument *const ArgWidth = F.getArg(2); + Argument *const ArgEvent = F.getArg(3); + + auto &M = *F.getParent(); + auto &Ctx = F.getContext(); + auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F); + auto *const LoopEntryBB = BasicBlock::Create(Ctx, "loop_entry", &F, ExitBB); + auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F, LoopEntryBB); + + auto *const GetLocalIDFn = getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalId, M); + compiler::utils::buildThreadCheck(EntryBB, LoopEntryBB, ExitBB, + *GetLocalIDFn); + + BasicBlock *const LoopExitBB = + copy1D(M, *LoopEntryBB, ArgDstPtr, ArgSrcPtr, ArgWidth); + IRBuilder<> LoopIRB(LoopExitBB); + LoopIRB.CreateBr(ExitBB); + + IRBuilder<> ExitIRB(ExitBB); + ExitIRB.CreateRet(ArgEvent); + + return &F; +} + +Function *BIMuxInfoConcept::defineDMA2D(Function &F) { + Argument 
*const ArgDstPtr = F.getArg(0);
+  Argument *const ArgSrcPtr = F.getArg(1);
+  Argument *const ArgWidth = F.getArg(2);
+  Argument *const ArgDstStride = F.getArg(3);
+  Argument *const ArgSrcStride = F.getArg(4);
+  Argument *const ArgNumLines = F.getArg(5);
+  Argument *const ArgEvent = F.getArg(6);
+
+  auto &M = *F.getParent();
+  auto &Ctx = F.getContext();
+  auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F);
+  auto *const LoopEntryBB = BasicBlock::Create(Ctx, "loop_entry", &F, ExitBB);
+  auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F, LoopEntryBB);
+
+  auto *const GetLocalIDFn = getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalId, M);
+  compiler::utils::buildThreadCheck(EntryBB, LoopEntryBB, ExitBB,
+                                    *GetLocalIDFn);
+
+  // Create a loop around 1D DMA memcpy, adding strides each time.
+  BasicBlock *const LoopExitBB =
+      copy2D(M, *LoopEntryBB, ArgDstPtr, ArgSrcPtr, ArgWidth, ArgDstStride,
+             ArgSrcStride, ArgNumLines);
+
+  IRBuilder<> LoopIRB(LoopExitBB);
+  LoopIRB.CreateBr(ExitBB);
+
+  IRBuilder<> ExitIRB(ExitBB);
+  ExitIRB.CreateRet(ArgEvent);
+
+  return &F;
+}
+
+Function *BIMuxInfoConcept::defineDMA3D(Function &F) {
+  Argument *const ArgDstPtr = F.getArg(0);
+  Argument *const ArgSrcPtr = F.getArg(1);
+  Argument *const ArgLineSize = F.getArg(2);
+  Argument *const ArgDstLineStride = F.getArg(3);
+  Argument *const ArgSrcLineStride = F.getArg(4);
+  Argument *const ArgNumLinesPerPlane = F.getArg(5);
+  Argument *const ArgDstPlaneStride = F.getArg(6);
+  Argument *const ArgSrcPlaneStride = F.getArg(7);
+  Argument *const ArgNumPlanes = F.getArg(8);
+  Argument *const ArgEvent = F.getArg(9);
+
+  auto &M = *F.getParent();
+  auto &Ctx = F.getContext();
+  Type *const I8Ty = IntegerType::get(Ctx, 8);
+
+  auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F);
+  auto *const LoopEntryBB = BasicBlock::Create(Ctx, "loop_entry", &F, ExitBB);
+  auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F, LoopEntryBB);
+
+  auto *const GetLocalIDFn = getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalId, M);
+  compiler::utils::buildThreadCheck(EntryBB, LoopEntryBB, ExitBB,
+                                    *GetLocalIDFn);
+
+  assert(ArgSrcPtr->getType()->isPointerTy() &&
+         "Mux DMA builtins are always byte-accessed");
+  assert(ArgDstPtr->getType()->isPointerTy() &&
+         "Mux DMA builtins are always byte-accessed");
+
+  compiler::utils::CreateLoopOpts opts;
+  opts.IVs = {ArgSrcPtr, ArgDstPtr};
+  opts.loopIVNames = {"dma.src", "dma.dst"};
+
+  // Create a loop around the 2D DMA memcpy, adding the plane strides each
+  // time.
+  BasicBlock *LoopExitBB = compiler::utils::createLoop(
+      LoopEntryBB, nullptr, ConstantInt::get(getSizeType(M), 0), ArgNumPlanes,
+      opts,
+      [&](BasicBlock *BB, Value *, ArrayRef IVsCurr,
+          MutableArrayRef IVsNext) {
+        IRBuilder<> loopIr(BB);
+        Value *CurrentDmaPlaneSrcPtrPhi = IVsCurr[0];
+        Value *CurrentDmaPlaneDstPtrPhi = IVsCurr[1];
+
+        IVsNext[0] =
+            loopIr.CreateGEP(I8Ty, CurrentDmaPlaneSrcPtrPhi, ArgSrcPlaneStride);
+        IVsNext[1] =
+            loopIr.CreateGEP(I8Ty, CurrentDmaPlaneDstPtrPhi, ArgDstPlaneStride);
+
+        return copy2D(M, *BB, CurrentDmaPlaneDstPtrPhi,
+                      CurrentDmaPlaneSrcPtrPhi, ArgLineSize, ArgDstLineStride,
+                      ArgSrcLineStride, ArgNumLinesPerPlane);
+      });
+
+  IRBuilder<> LoopExitIRB(LoopExitBB);
+  LoopExitIRB.CreateBr(ExitBB);
+
+  IRBuilder<> ExitIRB(ExitBB);
+  ExitIRB.CreateRet(ArgEvent);
+
+  return &F;
+}
+
+Function *BIMuxInfoConcept::defineDMAWait(Function &F) {
+  // By default this function is a simple return-void.
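+  // The copy loops built for the 1D/2D/3D builtins above run to completion
+  // before returning, so on this reference path there is nothing left to
+  // wait on and the event values are simply passed through. A target with
+  // genuinely asynchronous DMA would be expected to override these hooks.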
+ IRBuilder<> B(BasicBlock::Create(F.getContext(), "entry", &F)); + B.CreateRetVoid(); + + return &F; +} + +Function *BIMuxInfoConcept::defineMuxBuiltin(BuiltinID ID, Module &M, + ArrayRef OverloadInfo) { + assert(BuiltinInfo::isMuxBuiltinID(ID) && "Only handling mux builtins"); + Function *F = M.getFunction(BuiltinInfo::getMuxBuiltinName(ID, OverloadInfo)); + // FIXME: We'd ideally want to declare it here to reduce pass + // inter-dependencies. + assert(F && "Function should have been pre-declared"); + if (!F->isDeclaration()) { + return F; + } + + switch (ID) { + default: + break; + case eMuxBuiltinGetGlobalId: + return defineGetGlobalId(M); + case eMuxBuiltinGetGlobalSize: + return defineGetGlobalSize(M); + case eMuxBuiltinGetLocalLinearId: + return defineGetLocalLinearId(M); + case eMuxBuiltinGetGlobalLinearId: + return defineGetGlobalLinearId(M); + case eMuxBuiltinGetEnqueuedLocalSize: + return defineGetEnqueuedLocalSize(M); + // Just handle the memory synchronization requirements of any barrier + // builtin. We assume that the control requirements of work-group and + // sub-group control barriers have been handled by earlier passes. + case eMuxBuiltinMemBarrier: + return defineMemBarrier(*F, 0, 1); + case eMuxBuiltinSubGroupBarrier: + case eMuxBuiltinWorkGroupBarrier: + return defineMemBarrier(*F, 1, 2); + case eMuxBuiltinDMARead1D: + case eMuxBuiltinDMAWrite1D: + return defineDMA1D(*F); + case eMuxBuiltinDMARead2D: + case eMuxBuiltinDMAWrite2D: + return defineDMA2D(*F); + case eMuxBuiltinDMARead3D: + case eMuxBuiltinDMAWrite3D: + return defineDMA3D(*F); + case eMuxBuiltinDMAWait: + return defineDMAWait(*F); + case eMuxBuiltinGetSubGroupSize: + return defineGetSubGroupSize(*F); + case eMuxBuiltinGetSubGroupLocalId: + return defineGetSubGroupLocalId(*F); + } + + if (auto *const NewF = defineLocalWorkItemBuiltin(*this, ID, M)) { + return NewF; + } + + if (auto *const NewF = defineLocalWorkGroupBuiltin(*this, ID, M)) { + return NewF; + } + + if (auto GroupOp = BuiltinInfo::isMuxGroupCollective(ID)) { + if (auto *const NewF = + defineSubGroupGroupOpBuiltin(*F, *GroupOp, OverloadInfo)) { + return NewF; + } + } + + return nullptr; +} + +bool BIMuxInfoConcept::requiresSchedulingParameters(BuiltinID ID) { + switch (ID) { + default: + return false; + case eMuxBuiltinGetLocalId: + case eMuxBuiltinSetLocalId: + case eMuxBuiltinGetSubGroupId: + case eMuxBuiltinSetSubGroupId: + case eMuxBuiltinGetNumSubGroups: + case eMuxBuiltinSetNumSubGroups: + case eMuxBuiltinGetMaxSubGroupSize: + case eMuxBuiltinSetMaxSubGroupSize: + case eMuxBuiltinGetLocalLinearId: + // Work-item struct only + return true; + case eMuxBuiltinGetWorkDim: + case eMuxBuiltinGetGroupId: + case eMuxBuiltinGetNumGroups: + case eMuxBuiltinGetGlobalSize: + case eMuxBuiltinGetLocalSize: + case eMuxBuiltinGetGlobalOffset: + case eMuxBuiltinGetEnqueuedLocalSize: + // Work-group struct only + return true; + case eMuxBuiltinGetGlobalId: + case eMuxBuiltinGetGlobalLinearId: + // Work-item and work-group structs + return true; + } +} + +Type *BIMuxInfoConcept::getRemappedTargetExtTy(Type *Ty, Module &M) { + // We only map target extension types + assert(Ty && Ty->isTargetExtTy() && "Only expecting target extension types"); + auto &Ctx = Ty->getContext(); + auto *TgtExtTy = cast(Ty); + + // Samplers are replaced by default with size_t. + if (TgtExtTy == compiler::utils::tgtext::getSamplerTy(Ctx)) { + return getSizeType(M); + } + + // Events are replaced by default with size_t. 
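+  // (So, for example, a kernel parameter of type target("spirv.Event") is
+  // remapped to i64 on a 64-bit target, and any image parameter below simply
+  // becomes an opaque pointer.)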
+ if (TgtExtTy == compiler::utils::tgtext::getEventTy(Ctx)) { + return getSizeType(M); + } + + // *All* images are replaced by default with a pointer in the default address + // space to the same structure type (i.e., regardless of image dimensions, + // etc.) + if (TgtExtTy->getName() == "spirv.Image") { + return PointerType::getUnqual(Ctx); + } + + return nullptr; +} + +Function * +BIMuxInfoConcept::getOrDeclareMuxBuiltin(BuiltinID ID, Module &M, + ArrayRef OverloadInfo) { + assert(BuiltinInfo::isMuxBuiltinID(ID) && "Only handling mux builtins"); + auto FnName = BuiltinInfo::getMuxBuiltinName(ID, OverloadInfo); + if (auto *const F = M.getFunction(FnName)) { + return F; + } + auto &Ctx = M.getContext(); + AttrBuilder AB(Ctx); + auto *const SizeTy = getSizeType(M); + auto *const Int32Ty = Type::getInt32Ty(Ctx); + auto *const VoidTy = Type::getVoidTy(Ctx); + + Type *RetTy = nullptr; + SmallVector ParamTys; + SmallVector ParamNames; + + switch (ID) { + // Ranked Getters + case eMuxBuiltinGetLocalId: + case eMuxBuiltinGetGlobalId: + case eMuxBuiltinGetLocalSize: + case eMuxBuiltinGetGlobalSize: + case eMuxBuiltinGetGlobalOffset: + case eMuxBuiltinGetNumGroups: + case eMuxBuiltinGetGroupId: + case eMuxBuiltinGetEnqueuedLocalSize: + ParamTys.push_back(Int32Ty); + ParamNames.push_back("idx"); + LLVM_FALLTHROUGH; + // Unranked Getters + case eMuxBuiltinGetWorkDim: + case eMuxBuiltinGetSubGroupId: + case eMuxBuiltinGetNumSubGroups: + case eMuxBuiltinGetSubGroupSize: + case eMuxBuiltinGetMaxSubGroupSize: + case eMuxBuiltinGetSubGroupLocalId: + case eMuxBuiltinGetLocalLinearId: + case eMuxBuiltinGetGlobalLinearId: { + // Some builtins return uint, others return size_t + RetTy = + (ID == eMuxBuiltinGetWorkDim || ID == eMuxBuiltinGetSubGroupId || + ID == eMuxBuiltinGetNumSubGroups || ID == eMuxBuiltinGetSubGroupSize || + ID == eMuxBuiltinGetMaxSubGroupSize || + ID == eMuxBuiltinGetSubGroupLocalId) + ? Int32Ty + : SizeTy; + // All of our mux getters are readonly - they may never write data + AB.addMemoryAttr(MemoryEffects::readOnly()); + break; + } + // Ranked Setters + case eMuxBuiltinSetLocalId: + ParamTys.push_back(Int32Ty); + ParamNames.push_back("idx"); + LLVM_FALLTHROUGH; + // Unranked Setters + case eMuxBuiltinSetSubGroupId: + case eMuxBuiltinSetNumSubGroups: + case eMuxBuiltinSetMaxSubGroupSize: { + RetTy = VoidTy; + ParamTys.push_back(ID == eMuxBuiltinSetLocalId ? SizeTy : Int32Ty); + ParamNames.push_back("val"); + break; + } + case eMuxBuiltinMemBarrier: { + RetTy = VoidTy; + for (auto PName : {"scope", "semantics"}) { + ParamTys.push_back(Int32Ty); + ParamNames.push_back(PName); + } + AB.addAttribute(Attribute::NoMerge); + AB.addAttribute(Attribute::NoDuplicate); + AB.addAttribute(Attribute::Convergent); + break; + } + case eMuxBuiltinSubGroupBarrier: + case eMuxBuiltinWorkGroupBarrier: { + RetTy = VoidTy; + for (auto PName : {"id", "scope", "semantics"}) { + ParamTys.push_back(Int32Ty); + ParamNames.push_back(PName); + } + AB.addAttribute(Attribute::NoMerge); + AB.addAttribute(Attribute::NoDuplicate); + AB.addAttribute(Attribute::Convergent); + break; + } + case eMuxBuiltinDMAWait: + RetTy = VoidTy; + // Num events + ParamTys.push_back(Int32Ty); + ParamNames.push_back("num_events"); + // The events list + ParamTys.push_back(PointerType::getUnqual(Ctx)); + ParamNames.push_back("events"); + AB.addAttribute(Attribute::Convergent); + break; + case eMuxBuiltinDMARead1D: + case eMuxBuiltinDMAWrite1D: { + // We need to be told the target event type to declare this builtin. 
+ assert(!OverloadInfo.empty() && "Missing event type"); + auto *const EventTy = OverloadInfo[0]; + RetTy = EventTy; + const bool IsRead = ID == eMuxBuiltinDMARead1D; + + PointerType *const LocalPtrTy = PointerType::get(Ctx, AddressSpace::Local); + PointerType *const GlobalPtrTy = + PointerType::get(Ctx, AddressSpace::Global); + + ParamTys.push_back(IsRead ? LocalPtrTy : GlobalPtrTy); + ParamNames.push_back("dst"); + + ParamTys.push_back(IsRead ? GlobalPtrTy : LocalPtrTy); + ParamNames.push_back("src"); + + ParamTys.push_back(SizeTy); + ParamNames.push_back("num_bytes"); + + ParamTys.push_back(EventTy); + ParamNames.push_back("event"); + break; + } + case eMuxBuiltinDMARead2D: + case eMuxBuiltinDMAWrite2D: { + // We need to be told the target event type to declare this builtin. + assert(!OverloadInfo.empty() && "Missing event type"); + auto *const EventTy = OverloadInfo[0]; + RetTy = EventTy; + const bool IsRead = ID == eMuxBuiltinDMARead2D; + + PointerType *const LocalPtrTy = PointerType::get(Ctx, AddressSpace::Local); + PointerType *const GlobalPtrTy = + PointerType::get(Ctx, AddressSpace::Global); + + ParamTys.push_back(IsRead ? LocalPtrTy : GlobalPtrTy); + ParamNames.push_back("dst"); + + ParamTys.push_back(IsRead ? GlobalPtrTy : LocalPtrTy); + ParamNames.push_back("src"); + + for (auto &P : {"num_bytes", "dst_stride", "src_stride", "height"}) { + ParamTys.push_back(SizeTy); + ParamNames.push_back(P); + } + + ParamTys.push_back(EventTy); + ParamNames.push_back("event"); + break; + } + case eMuxBuiltinDMARead3D: + case eMuxBuiltinDMAWrite3D: { + // We need to be told the target event type to declare this builtin. + assert(!OverloadInfo.empty() && "Missing event type"); + auto *const EventTy = OverloadInfo[0]; + RetTy = EventTy; + const bool IsRead = ID == eMuxBuiltinDMARead3D; + + PointerType *const LocalPtrTy = PointerType::get(Ctx, AddressSpace::Local); + PointerType *const GlobalPtrTy = + PointerType::get(Ctx, AddressSpace::Global); + + ParamTys.push_back(IsRead ? LocalPtrTy : GlobalPtrTy); + ParamNames.push_back("dst"); + + ParamTys.push_back(IsRead ? GlobalPtrTy : LocalPtrTy); + ParamNames.push_back("src"); + + for (auto &P : {"num_bytes", "dst_line_stride", "src_line_stride", "height", + "dst_plane_stride", "src_plane_stride", "depth"}) { + ParamTys.push_back(SizeTy); + ParamNames.push_back(P); + } + + ParamTys.push_back(EventTy); + ParamNames.push_back("event"); + break; + } + default: + // Group builtins are more easily found using this helper rather than + // explicitly enumerating each switch case. 
+ if (auto Group = BuiltinInfo::isMuxGroupCollective(ID)) { + RetTy = OverloadInfo.front(); + AB.addAttribute(Attribute::Convergent); + switch (Group->Op) { + default: + ParamTys.push_back(RetTy); + ParamNames.push_back("val"); + break; + case GroupCollective::OpKind::Broadcast: + ParamTys.push_back(RetTy); + ParamNames.push_back("val"); + // Broadcasts additionally add ID parameters + if (Group->isSubGroupScope()) { + ParamTys.push_back(Int32Ty); + ParamNames.push_back("lid"); + } else { + ParamTys.push_back(SizeTy); + ParamNames.push_back("lidx"); + ParamTys.push_back(SizeTy); + ParamNames.push_back("lidy"); + ParamTys.push_back(SizeTy); + ParamNames.push_back("lidz"); + } + break; + case GroupCollective::OpKind::Shuffle: + ParamTys.push_back(RetTy); + ParamNames.push_back("val"); + ParamTys.push_back(Int32Ty); + ParamNames.push_back("lid"); + break; + case GroupCollective::OpKind::ShuffleXor: + ParamTys.push_back(RetTy); + ParamNames.push_back("val"); + ParamTys.push_back(Int32Ty); + ParamNames.push_back("xor_val"); + break; + case GroupCollective::OpKind::ShuffleUp: + ParamTys.push_back(RetTy); + ParamNames.push_back("prev"); + ParamTys.push_back(RetTy); + ParamNames.push_back("curr"); + ParamTys.push_back(Int32Ty); + ParamNames.push_back("delta"); + break; + case GroupCollective::OpKind::ShuffleDown: + ParamTys.push_back(RetTy); + ParamNames.push_back("curr"); + ParamTys.push_back(RetTy); + ParamNames.push_back("next"); + ParamTys.push_back(Int32Ty); + ParamNames.push_back("delta"); + break; + } + // All work-group operations have a 'barrier id' operand as their first + // parameter. + if (Group->isWorkGroupScope()) { + ParamTys.insert(ParamTys.begin(), Int32Ty); + ParamNames.insert(ParamNames.begin(), "id"); + } + } else { + // Unknown mux builtin + return nullptr; + } + } + + assert(RetTy); + assert(ParamTys.size() == ParamNames.size()); + + SmallVector SchedParamIdxs; + // Fill up the scalar parameters with the default attributes. + SmallVector ParamAttrs(ParamTys.size(), AttributeSet()); + + if (requiresSchedulingParameters(ID) && + getSchedulingParameterModuleMetadata(M)) { + for (const auto &P : getMuxSchedulingParameters(M)) { + ParamTys.push_back(P.ParamTy); + ParamNames.push_back(P.ParamName); + ParamAttrs.push_back(P.ParamAttrs); + SchedParamIdxs.push_back(ParamTys.size() - 1); + } + } + + auto *const FnTy = FunctionType::get(RetTy, ParamTys, /*isVarArg*/ false); + auto *const F = Function::Create(FnTy, Function::ExternalLinkage, FnName, &M); + F->addFnAttrs(AB); + + // Add some extra attributes we know are always true. 
+ setDefaultBuiltinAttributes(*F); + + for (unsigned i = 0, e = ParamNames.size(); i != e; i++) { + F->getArg(i)->setName(ParamNames[i]); + auto AB = AttrBuilder(Ctx, ParamAttrs[i]); + F->getArg(i)->addAttrs(AB); + } + + setSchedulingParameterFunctionMetadata(*F, SchedParamIdxs); + + return F; +} + +// By default we use two parameters: +// * one structure containing local work-group data +// * one structure containing non-local work-group data +SmallVector +BIMuxInfoConcept::getMuxSchedulingParameters(Module &M) { + auto &Ctx = M.getContext(); + auto &DL = M.getDataLayout(); + AttributeSet DefaultAttrs; + DefaultAttrs = DefaultAttrs.addAttribute(Ctx, Attribute::NonNull); + DefaultAttrs = DefaultAttrs.addAttribute(Ctx, Attribute::NoAlias); + + BuiltinInfo::SchedParamInfo WIInfo; + { + auto *const WIInfoS = getWorkItemInfoStructTy(M); + WIInfo.ID = SchedParamIndices::WI; + WIInfo.ParamPointeeTy = WIInfoS; + WIInfo.ParamTy = PointerType::get(Ctx, /*AddressSpace=*/0); + WIInfo.ParamName = "wi-info"; + WIInfo.ParamDebugName = WIInfoS->getStructName().str(); + WIInfo.PassedExternally = false; + + auto AB = AttrBuilder(Ctx, DefaultAttrs); + AB.addAlignmentAttr(DL.getABITypeAlign(WIInfoS)); + AB.addDereferenceableAttr(DL.getTypeAllocSize(WIInfoS)); + WIInfo.ParamAttrs = AttributeSet::get(Ctx, AB); + } + + BuiltinInfo::SchedParamInfo WGInfo; + { + auto *const WGInfoS = getWorkGroupInfoStructTy(M); + WGInfo.ID = SchedParamIndices::WG; + WGInfo.ParamPointeeTy = WGInfoS; + WGInfo.ParamTy = PointerType::get(Ctx, /*AddressSpace=*/0); + WGInfo.ParamName = "wg-info"; + WGInfo.ParamDebugName = WGInfoS->getStructName().str(); + WGInfo.PassedExternally = true; + + auto AB = AttrBuilder(Ctx, DefaultAttrs); + AB.addAlignmentAttr(DL.getABITypeAlign(WGInfoS)); + AB.addDereferenceableAttr(DL.getTypeAllocSize(WGInfoS)); + WGInfo.ParamAttrs = AttributeSet::get(Ctx, AB); + } + + return {WIInfo, WGInfo}; +} + +SmallVector +BIMuxInfoConcept::getFunctionSchedulingParameters(Function &F) { + // Query function metadata to determine whether this function has scheduling + // parameters + auto ParamIdxs = getSchedulingParameterFunctionMetadata(F); + if (ParamIdxs.empty()) { + return {}; + } + + auto SchedParamInfo = getMuxSchedulingParameters(*F.getParent()); + // We don't allow a function to have a subset of the global scheduling + // parameters. + assert(ParamIdxs.size() >= SchedParamInfo.size()); + // Set the concrete argument values on each of the scheduling parameter data. + for (auto it : zip(SchedParamInfo, ParamIdxs)) { + // Some scheduling parameters may not be present (returning an index of + // -1), in which case skip their concrete argument values. + if (std::get<1>(it) >= 0) { + std::get<0>(it).ArgVal = F.getArg(std::get<1>(it)); + } + } + + return SchedParamInfo; +} + +Value *BIMuxInfoConcept::initializeSchedulingParamForWrappedKernel( + const BuiltinInfo::SchedParamInfo &Info, IRBuilder<> &B, Function &IntoF, + Function &) { + // We only expect to have to initialize the work-item info. The work-group + // info is straight passed through. 
+ (void)IntoF; + assert(!Info.PassedExternally && Info.ID == SchedParamIndices::WI && + Info.ParamName == "wi-info" && + Info.ParamPointeeTy == getWorkItemInfoStructTy(*IntoF.getParent())); + return B.CreateAlloca(Info.ParamPointeeTy, + /*ArraySize*/ nullptr, Info.ParamName); +} + +std::optional BIMuxInfoConcept::getBuiltinRange( + llvm::CallInst &CI, BuiltinID ID, + std::array, 3> MaxLocalSizes, + std::array, 3> MaxGlobalSizes) const { + assert(CI.getCalledFunction() && CI.getType()->isIntegerTy() && + "Unexpected builtin"); + + auto Bits = CI.getType()->getIntegerBitWidth(); + // Assume we're indexing the global sizes array. + std::array, 3> *SizesPtr = &MaxGlobalSizes; + + switch (ID) { + default: + return std::nullopt; + case eMuxBuiltinGetWorkDim: + return ConstantRange::getNonEmpty(APInt(Bits, 1), APInt(Bits, 4)); + case eMuxBuiltinGetLocalId: + case eMuxBuiltinGetLocalSize: + case eMuxBuiltinGetEnqueuedLocalSize: + // Use the local sizes array, and fall through to common handling. + SizesPtr = &MaxLocalSizes; + [[fallthrough]]; + case eMuxBuiltinGetGlobalSize: { + auto *DimIdx = CI.getOperand(0); + if (!isa(DimIdx)) { + return std::nullopt; + } + const uint64_t DimVal = cast(DimIdx)->getZExtValue(); + if (DimVal >= SizesPtr->size()) { + return std::nullopt; + } + const std::optional Size = (*SizesPtr)[DimVal]; + if (!Size) { + return std::nullopt; + } + // ID builtins range [0,size) (exclusive), and size builtins [1,size] + // (inclusive). Thus offset the range by 1 at each low/high end when + // returning the range for a size builtin. + const int SizeAdjust = ID == eMuxBuiltinGetLocalSize || + ID == eMuxBuiltinGetEnqueuedLocalSize || + ID == eMuxBuiltinGetGlobalSize; + return ConstantRange::getNonEmpty(APInt(Bits, SizeAdjust), + APInt(Bits, Size.value() + SizeAdjust)); + } + } +} + +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp new file mode 100644 index 0000000000000..f735b1d1e6b8f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp @@ -0,0 +1,319 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// This pass replaces builtin functions with optimal equivalents. 
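+//
+// The default replacements registered below rewrite abacus library calls
+// into cheaper LLVM equivalents, roughly:
+//   __abacus_clz(x)       -> llvm.ctlz(x, false)
+//   __abacus_mul_hi(a, b) -> trunc((ext(a) * ext(b)) >> BitWidth)
+//   __abacus_fmin(a, b)   -> llvm.minnum(a, b)  (and fmax -> llvm.maxnum)
+// subject to the per-target restrictions checked in each helper.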
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define DEBUG_TYPE "ca-optimal-builtins"
+
+using namespace llvm;
+
+namespace {
+
+void removeCallSite(CallBase &CB, LazyCallGraph &CG) {
+  Function *Caller = CB.getCaller();
+  Function *Callee = CB.getCalledFunction();
+  auto CallerNode = CG.get(*Caller);
+  auto CalleeNode = CG.get(*Callee);
+  if (auto *CallerRef = CG.lookupRefSCC(CallerNode)) {
+    CallerRef->removeOutgoingEdge(CallerNode, CalleeNode);
+  }
+}
+
+} // namespace
+
+namespace compiler {
+namespace utils {
+
+Value *OptimalBuiltinReplacementPass::replaceAbacusCLZ(
+    CallBase &CB, StringRef BaseName, const SmallVectorImpl &,
+    const SmallVectorImpl &) {
+  if (BaseName != "__abacus_clz") {
+    return nullptr;
+  }
+  Module *M = CB.getModule();
+  SmallVector Args(CB.args());
+  // Get the declaration for the intrinsic
+  auto *const ArgTy = Args[0]->getType();
+  auto *const Intrinsic =
+      llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::ctlz, ArgTy);
+  // If we didn't find the intrinsic or the return type isn't what we
+  // expect, skip this optimization
+  Function *Callee = CB.getCalledFunction();
+  assert(Callee);
+  if (!Intrinsic || Intrinsic->getReturnType() != Callee->getReturnType()) {
+    return nullptr;
+  }
+
+  // On 32-bit ARM, the llvm.ctlz intrinsic on 64-bit types is expanded using
+  // compiler-rt. Without online linking, we can't support that.
+  const Triple TT(CB.getModule()->getTargetTriple());
+  if (TT.getArch() == Triple::arm && ArgTy->isIntOrIntVectorTy(64)) {
+    return nullptr;
+  }
+
+  // LLVM's ctlz has a second argument to specify that zeroes in the first
+  // argument produces a defined result.
+  LLVMContext &Ctx = M->getContext();
+  Args.push_back(ConstantInt::getFalse(Ctx));
+
+  auto *Call = CallInst::Create(Intrinsic, Args);
+  Call->insertBefore(CB.getIterator());
+  return Call;
+}
+
+Value *OptimalBuiltinReplacementPass::replaceAbacusMulhi(
+    CallBase &CB, StringRef BaseName, const SmallVectorImpl &,
+    const SmallVectorImpl &Quals) {
+  if (BaseName != "__abacus_mul_hi") {
+    return nullptr;
+  }
+  IRBuilder<> B(&CB);
+
+  auto I = CB.arg_begin();
+  Value *const LHS = *I++;
+  Value *const RHS = *I++;
+
+  const auto BitWidth = LHS->getType()->getScalarType()->getIntegerBitWidth();
+
+  // Don't perform this optimization on 64-bit types as 128-bit types aren't
+  // generally well supported.
+  if (BitWidth == 64) {
+    return nullptr;
+  }
+
+  unsigned VecWidth = 1;
+  if (const auto *VecTy = dyn_cast(LHS->getType())) {
+    VecWidth = multi_llvm::getVectorNumElements(VecTy);
+  }
+
+  Type *UpTy = B.getIntNTy(BitWidth * 2);
+  if (VecWidth != 1) {
+    UpTy = FixedVectorType::get(UpTy, VecWidth);
+  }
+
+  bool SrcIsSigned = false;
+  for (unsigned i = 0, e = Quals[0].getCount(); i != e; i++) {
+    if (Quals[0].at(i) == eTypeQualSignedInt) {
+      SrcIsSigned = true;
+      break;
+    }
+  }
+
+  const auto CastOp = SrcIsSigned ? Instruction::SExt : Instruction::ZExt;
+
+  auto *const UpLHS = B.CreateCast(CastOp, LHS, UpTy);
+  auto *const UpRHS = B.CreateCast(CastOp, RHS, UpTy);
+
+  auto *const Mul = B.CreateMul(UpLHS, UpRHS);
+
+  Constant *ShiftAmt = B.getIntN(BitWidth * 2, BitWidth);
+  if (VecWidth != 1) {
+    ShiftAmt = ConstantDataVector::getSplat(VecWidth, ShiftAmt);
+  }
+
+  auto *const Shift = B.CreateAShr(Mul, ShiftAmt);
+
+  return B.CreateTrunc(Shift, LHS->getType());
+}
+
+Value *OptimalBuiltinReplacementPass::replaceAbacusFMinFMax(
+    CallBase &CB, StringRef BaseName, const SmallVectorImpl &,
+    const SmallVectorImpl &) {
+  const bool IsFMin = BaseName == "__abacus_fmin";
+  if (!IsFMin && BaseName != "__abacus_fmax") {
+    return nullptr;
+  }
+
+  const Triple TT(CB.getModule()->getTargetTriple());
+  // minnum/maxnum intrinsics fail CTS on arm targets. See
+  // https://llvm.org/PR27363.
+  if (TT.getArch() == Triple::arm || TT.getArch() == Triple::aarch64) {
+    return nullptr;
+  }
+
+  IRBuilder<> B(&CB);
+
+  auto I = CB.arg_begin();
+  Value *LHS = *I++;
+  Value *RHS = *I++;
+
+  const auto *LHSTy = LHS->getType();
+  const auto *RHSTy = RHS->getType();
+
+  if (LHSTy->isVectorTy() != RHSTy->isVectorTy()) {
+    auto VectorEC =
+        multi_llvm::getVectorElementCount(LHSTy->isVectorTy() ? LHSTy : RHSTy);
+    if (!LHS->getType()->isVectorTy()) {
+      LHS = B.CreateVectorSplat(VectorEC, LHS);
+    }
+    if (!RHS->getType()->isVectorTy()) {
+      RHS = B.CreateVectorSplat(VectorEC, RHS);
+    }
+  }
+  return B.CreateBinaryIntrinsic(IsFMin ? Intrinsic::minnum : Intrinsic::maxnum,
+                                 LHS, RHS);
+}
+
+OptimalBuiltinReplacementPass::OptimalBuiltinReplacementPass() {
+  replacements.emplace_back(replaceAbacusCLZ);
+  replacements.emplace_back(replaceAbacusMulhi);
+  replacements.emplace_back(replaceAbacusFMinFMax);
+}
+
+Value *
+OptimalBuiltinReplacementPass::replaceBuiltinWithInlineIR(CallBase &CB) const {
+  auto *M = CB.getModule();
+  NameMangler mangler(&M->getContext());
+
+  SmallVector Types;
+  SmallVector Quals;
+  Function *Callee = CB.getCalledFunction();
+  assert(Callee);
+  const StringRef BaseName =
+      mangler.demangleName(Callee->getName(), Types, Quals);
+
+  for (const auto &replace_fn : replacements) {
+    if (replace_fn) {
+      if (auto *V = replace_fn(CB, BaseName, Types, Quals)) {
+        return V;
+      }
+    }
+  }
+
+  return nullptr;
+}
+
+PreservedAnalyses OptimalBuiltinReplacementPass::run(LazyCallGraph::SCC &C,
+                                                     CGSCCAnalysisManager &AM,
+                                                     LazyCallGraph &CG,
+                                                     CGSCCUpdateResult &) {
+  // Without the possibility of recursion, we can expect all meaningful
+  // OpenCL/ComputeMux programs to be contained within a single SCC serving
+  // as the entry point. We use this as the root.
+  if (C.size() != 1) {
+    return PreservedAnalyses::all();
+  }
+  Module &M = *C.begin()->getFunction().getParent();
+
+  // Check that at least one node in this graph is a kernel.
+  if (none_of(C, [](const LazyCallGraph::Node &N) {
+        return N.getFunction().getCallingConv() == CallingConv::SPIR_KERNEL;
+      })) {
+    return PreservedAnalyses::all();
+  }
+
+  const auto &MAMProxy = AM.getResult(C, CG);
+  if (auto *BI = MAMProxy.getCachedResult(M)) {
+    replacements.emplace_back([BI](CallBase &CB, StringRef,
+                                   const SmallVectorImpl &,
+                                   const SmallVectorImpl &)
+                                  -> Value * {
+      if (Function *Callee = CB.getCalledFunction()) {
+        if (const auto Builtin = BI->analyzeBuiltin(*Callee)) {
+          if (Builtin->properties & eBuiltinPropertyCanEmitInline) {
+            IRBuilder<> B(&CB);
+            const SmallVector Args(CB.args());
+            if (Value *Impl = BI->emitBuiltinInline(Callee, B, Args)) {
+              assert(Impl->getType() == CB.getType() &&
+                     "The inlined function type must match that of the "
+                     "original function");
+              return Impl;
+            }
+          }
+        }
+      }
+      return nullptr;
+    });
+  }
+
+  if (adjustReplacements) {
+    adjustReplacements(replacements);
+  }
+
+  // If there are no replacements to run, for whatever reason, we can bail
+  // early.
+  if (replacements.empty()) {
+    return PreservedAnalyses::all();
+  }
+
+  SmallVector ToDelete;
+  // The SmallPriorityWorklist prioritises nodes which have been inserted
+  // multiple times, and avoids duplication of already-inserted items, but
+  // *not* ones already visited and popped off.
+  SmallPriorityWorklist Worklist;
+  // Assuming we only have one node to begin with (see above), start off with
+  // that.
+  Worklist.insert(&*C.begin());
+  // While the worklist above prevents re-insertion, we might end up visiting
+  // the same function again after it has already been popped off the
+  // worklist. So we still have to keep track of recursion.
+  SmallPtrSet Visited;
+
+  // Now visit all nodes in this "root" graph in order. We will visit
+  // outer-most functions (kernels) first before descending the call graph.
+  // This gives precedence to "outer-most" replacements.
+  while (!Worklist.empty()) {
+    LazyCallGraph::Node *N = Worklist.pop_back_val();
+    LLVM_DEBUG(dbgs() << "OptimalBuiltinReplacement: visiting " << *N << "\n");
+    for (Instruction &I : instructions(N->getFunction())) {
+      if (auto *CB = dyn_cast(&I)) {
+        if (CB->getCalledFunction() && !isa(I)) {
+          if (Value *New = replaceBuiltinWithInlineIR(*CB)) {
+            LLVM_DEBUG(dbgs()
+                       << "\tOptimalBuiltinReplacement: replacing call to "
+                       << CB->getCalledFunction()->getName() << "\n");
+            ToDelete.push_back(CB);
+            removeCallSite(*CB, CG);
+            // Assume that replacements don't introduce new calls, and we can
+            // simply mark this one as gone and move on.
+            CB->replaceAllUsesWith(New);
+          } else if (auto *CalledN = CG.lookup(*CB->getCalledFunction())) {
+            if (Visited.insert(CalledN).second) {
+              Worklist.insert(CalledN);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  const bool Modified = !ToDelete.empty();
+
+  // Clean up any dead calls.
+  while (!ToDelete.empty()) {
+    Instruction *I = ToDelete.pop_back_val();
+    I->eraseFromParent();
+  }
+
+  return Modified ?
PreservedAnalyses::none() : PreservedAnalyses::all(); +} +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp new file mode 100644 index 0000000000000..d1e46ee67b290 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp @@ -0,0 +1,739 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +llvm::AnalysisKey compiler::utils::DeviceInfoAnalysis::Key; + +namespace compiler { +namespace utils { + +uint64_t computeApproximatePrivateMemoryUsage(const llvm::Function &fn) { + const llvm::Module *module = fn.getParent(); + const auto &layout = module->getDataLayout(); + uint64_t bytes = 0; + + // BarrierPass asserts that `allocas` only exist in the entry block + for (auto &inst : fn.getEntryBlock()) { + if (!llvm::isa(inst)) { + continue; + } + const auto &alloca_inst = llvm::cast(inst); + const auto *type = alloca_inst.getType(); + if (type->getAddressSpace() != AddressSpace::Private) { + continue; + } + auto *alloc_type = alloca_inst.getAllocatedType(); + const auto alloc_size = layout.getTypeAllocSize(alloc_type); + if (alloca_inst.isArrayAllocation()) { + auto *arr_size_val = alloca_inst.getArraySize(); + auto *const_int = llvm::dyn_cast(arr_size_val); + assert(const_int != nullptr && "Array Allocation of dynamic size"); + const uint64_t arr_size = const_int->getUniqueInteger().getLimitedValue(); + bytes += arr_size * alloc_size; + + } else { + bytes += alloc_size; + } + } + return bytes; +} + +static llvm::SmallVector +getNewOps(llvm::Constant *constant, llvm::Constant *from, llvm::Constant *to) { + llvm::SmallVector newOps; + // iterate through the constant and create a vector of old and new + // ones + for (unsigned i = 0, e = constant->getNumOperands(); i != e; ++i) { + auto op = constant->getOperand(i); + if (op == from) { + newOps.push_back(to); + } else { + newOps.push_back(llvm::cast(op)); + } + } + return newOps; +} + +void remapConstantArray(llvm::ConstantArray *arr, llvm::Constant *from, + llvm::Constant *to) { + const llvm::SmallVector newOps = getNewOps(arr, from, to); + // Create a new array with the list of operands and replace all uses with + llvm::Constant *newConstant = + llvm::ConstantArray::get(arr->getType(), newOps); + arr->replaceAllUsesWith(newConstant); + arr->destroyConstant(); +} + +void remapConstantExpr(llvm::ConstantExpr *expr, llvm::Constant *from, + llvm::Constant *to) { + const llvm::SmallVector newOps = getNewOps(expr, from, to); + // Create a new expression with the 
list of operands, and replace all uses with it
+  llvm::Constant *newConstant = expr->getWithOperands(newOps);
+  expr->replaceAllUsesWith(newConstant);
+  expr->destroyConstant();
+}
+
+bool funcContainsDebugMetadata(const llvm::Function &func,
+                               llvm::ValueToValueMapTy &vmap) {
+  // Check if function references debug info
+  bool foundDI = false;
+
+  // Function has a DISubprogram entry attached
+  if (auto DISubprogram = func.getSubprogram()) {
+    vmap.MD()[DISubprogram].reset(DISubprogram);
+    foundDI = true;
+  }
+
+  for (auto &BB : func) {
+    for (auto &Inst : BB) {
+      if (const auto &DL = Inst.getDebugLoc()) {
+        llvm::DILocation *loc = DL.get();
+        vmap.MD()[loc].reset(loc);
+        foundDI = true;
+      }
+    }
+  }
+
+  return foundDI;
+}
+
+void replaceConstantExpressionWithInstruction(llvm::Constant *const constant) {
+  // remove all dead constant users (sometimes these are left over by previous
+  // passes)
+  constant->removeDeadConstantUsers();
+
+  // Only handle constants which are ConstantExpr, ConstantVector or
+  // ConstantArray
+  assert((llvm::isa(constant) ||
+          llvm::isa(constant) ||
+          llvm::isa(constant)) &&
+         "Unsupported constant type in IR");
+
+  // For each user of a constant we will check to see if they in turn are
+  // constants. If they are, convert them to instructions first (still
+  // referencing this constant). We are then clear to convert the current
+  // constant to an instruction as the only users left are instructions.
+
+  llvm::SmallVector users;
+  // Create the list of users of this constant. We don't want duplicates here,
+  // which often happens with ConstantVectors, as we only want to convert them
+  // to an instruction once. We want determinism here so use a vector to
+  // maintain order.
+  for (auto *constantUser : constant->users()) {
+    if (std::find(users.begin(), users.end(), constantUser) == users.end()) {
+      users.push_back(constantUser);
+    }
+  }
+
+  for (auto *constantUser : users) {
+    if (llvm::isa(constantUser)) {
+      // instructions are our best case, do nothing!
+    } else if (llvm::Constant *subConstant =
+                   llvm::dyn_cast(constantUser)) {
+      replaceConstantExpressionWithInstruction(subConstant);
+    } else {
+      constantUser->print(llvm::errs());
+      llvm_unreachable("Constant user is not a constant or instruction!!");
+    }
+  }
+
+  // we record each use
+  llvm::SmallVector uses;
+
+  for (auto &use : constant->uses()) {
+    uses.push_back(&use);
+  }
+
+  for (auto *use : uses) {
+    // get the instruction that is the user of the use
+    auto inst = llvm::cast(use->getUser());
+
+    // get the function for this use
+    auto useFunc = inst->getFunction();
+
+    llvm::Instruction *newInst = nullptr;
+    // create a new instruction that matches the constant expression
+    if (llvm::ConstantExpr *constantExpr =
+            llvm::dyn_cast(constant)) {
+      newInst = constantExpr->getAsInstruction();
+      // insert the instruction at the beginning of the entry block
+      newInst->insertBefore(useFunc->getEntryBlock().getFirstNonPHIIt());
+    } else if (llvm::ConstantVector *constantVec =
+                   llvm::dyn_cast(constant)) {
+      // If it is a ConstantVector then only handle the case where it is
+      // a single splatted value. This is the only kind generated at present.
+      auto splatVal = constantVec->getSplatValue();
+      assert(splatVal &&
+             "ConstantVector does not contain identical constants so cannot "
+             "be splatted!");
+      // Take the splatted Value and create two instructions. The first is
+      // InsertElement to place it in a new vector and the second is a
+      // ShuffleVector to duplicate the value across the vector.
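+      // (E.g. a <4 x i32> splat of the constant 7 is materialised as:
+      //   %ins = insertelement <4 x i32> poison, i32 7, i32 0
+      //   %splat = shufflevector <4 x i32> %ins, <4 x i32> poison,
+      //                          <4 x i32> zeroinitializer
+      // at the top of the entry block.)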
+ auto numEls = constantVec->getNumOperands(); + llvm::Value *poison = llvm::PoisonValue::get( + llvm::FixedVectorType::get(splatVal->getType(), numEls)); + llvm::Type *i32Ty = llvm::Type::getInt32Ty(constant->getContext()); + auto insert = llvm::InsertElementInst::Create( + poison, splatVal, llvm::ConstantInt::get(i32Ty, 0)); + insert->insertBefore(useFunc->getEntryBlock().getFirstNonPHIIt()); + llvm::Value *zeros = llvm::ConstantAggregateZero::get( + llvm::FixedVectorType::get(i32Ty, numEls)); + newInst = new llvm::ShuffleVectorInst(insert, poison, zeros); + newInst->insertAfter(insert); + } else if (llvm::ConstantArray *constantArr = + llvm::dyn_cast(constant)) { + auto numEls = constantArr->getNumOperands(); + llvm::Value *poison = llvm::PoisonValue::get(constantArr->getType()); + llvm::Instruction *insertedIns = nullptr; + for (unsigned int i = 0; i < numEls; i++) { + auto *insertNext = + llvm::InsertValueInst::Create(insertedIns ? insertedIns : poison, + constantArr->getOperand(i), {i}); + if (insertedIns) { + insertNext->insertAfter(insertedIns); + } else { + insertNext->insertBefore(useFunc->getEntryBlock().getFirstNonPHIIt()); + } + insertedIns = insertNext; + } + newInst = insertedIns; + } + + // replace the use of the constant with the instruction + use->set(newInst); + } + + // lastly, destroy the constant we just replaced + constant->destroyConstant(); +} + +llvm::AttributeList getCopiedFunctionAttrs(const llvm::Function &oldFn, + int numParams) { + const unsigned numParamsToCopy = + numParams < 0 ? oldFn.arg_size() : (unsigned)numParams; + llvm::SmallVector newArgAttrs(numParamsToCopy); + const llvm::AttributeList oldAttrs = oldFn.getAttributes(); + // clone any argument attributes we're copying over. Note we can't simply + // call Function::copyAttributes as not all arguments are present in the new + // function. 
+ for (unsigned i = 0, e = numParamsToCopy; i != e; i++) { + newArgAttrs[i] = oldAttrs.getParamAttrs(i); + } + + return llvm::AttributeList::get(oldFn.getContext(), oldAttrs.getFnAttrs(), + oldAttrs.getRetAttrs(), newArgAttrs); +} + +void copyFunctionAttrs(const llvm::Function &oldFn, llvm::Function &newFn, + int numParams) { + newFn.setAttributes(getCopiedFunctionAttrs(oldFn, numParams)); +} + +bool cloneFunctionsAddArg( + llvm::Module &module, + std::function paramTypeFunc, + std::function + toBeCloned, + const UpdateMDCallbackFn &updateMetaDataCallback) { + // mapping from new -> old function + llvm::ValueMap newToOldMap; + + // Preserve the value map across all function clones + llvm::ValueToValueMapTy vmap; + + const ParamTypeAttrsPair paramInfo = paramTypeFunc(module); + + // For each function we run the function toBeCloned to set the bools + // doCloneNoBody and doCloneWithBody + // first, run through our functions and make copies of all functions that: + // 1) are not declarations (these will be builtins we expand later) or + // doCloneNoBody is set (don't clone but flesh out) + // 2) are not new functions that we just added + // 3) Functions marked by doCloneWithBody + for (auto &func : module.functions()) { + bool doCloneWithBody = false; + bool doCloneNoBody = false; + + toBeCloned(func, doCloneWithBody, doCloneNoBody); + const bool isDecl = func.isDeclaration(); + bool processFunc = (0 == newToOldMap.count(&func)); + + if (!isDecl) { + processFunc = processFunc && doCloneWithBody; + } else { + processFunc = processFunc && doCloneNoBody; + } + + if (processFunc) { + auto funcTy = func.getFunctionType(); + + const unsigned numParams = funcTy->getNumParams(); + llvm::SmallVector newParamTypes(numParams + 1); + + // add each param from the original function to the new one + for (unsigned i = 0; i < numParams; i++) { + newParamTypes[i] = funcTy->getParamType(i); + } + // and lastly add our extra arg as the last param + newParamTypes[numParams] = paramInfo.first; + + auto newFuncTy = llvm::FunctionType::get(funcTy->getReturnType(), + newParamTypes, false); + + // create our new function, using the linkage from the old one + auto newFunc = + llvm::Function::Create(newFuncTy, func.getLinkage(), "", &module); + + // set the correct calling convention + newFunc->setCallingConv(func.getCallingConv()); + + // take the name of the old function + newFunc->takeName(&func); + + // Copy names over for the parameters + llvm::Function::arg_iterator DestI = newFunc->arg_begin(); + for (const auto &I : func.args()) { + (*DestI).setName(I.getName()); // Copy the name over... + DestI++; + } + + if (isDecl) { + // copy debug info for function over; CloneFunctionInto takes care of + // this if this function has a body + newFunc->setSubprogram(func.getSubprogram()); + // copy the metadata into the new function, ignoring any debug info. + copyFunctionMetadata(func, *newFunc); + } else { + // map all original function arguments to the new function arguments + for (auto iter = func.arg_begin(), iter_end = func.arg_end(), + new_iter = newFunc->arg_begin(); + iter != iter_end; ++iter, ++new_iter) { + vmap[(&*iter)] = (&*new_iter); + } + + llvm::SmallVector returns; + + // we have module changes if our function contains any debug info + assert(newFunc->getParent() && + "assumed newFunc has an associated module"); + const bool hasDbgMetadata = funcContainsDebugMetadata(func, vmap); + const bool differentModules = newFunc->getParent() != func.getParent(); + auto changeType = differentModules + ? 
+        if (hasDbgMetadata) {
+          changeType = std::max(changeType,
+                                llvm::CloneFunctionChangeType::GlobalChanges);
+        }
+        CloneFunctionInto(newFunc, &func, vmap, changeType, returns);
+      }
+
+      // Add in the new parameter attributes here, because CloneFunctionInto
+      // wipes out pre-existing attributes on newFunc which aren't in oldFunc.
+      newFunc->addParamAttrs(
+          numParams, llvm::AttrBuilder(newFunc->getContext(), paramInfo.second));
+
+      // map new func -> old func
+      newToOldMap[newFunc] = &func;
+
+      // remove the body of the old function that we are going to delete
+      // anyway, so that none of its callsites get processed in the remainder
+      // of this pass
+      func.deleteBody();
+    }
+  }
+
+  // next, remap all callsites that would have called the old function, to the
+  // new function we just created
+  for (auto pair : newToOldMap) {
+    llvm::Function *newFunc = pair.first;
+    llvm::Function *oldFunc = pair.second;
+
+    remapClonedCallsites(*oldFunc, *newFunc, true);
+
+    // next, let the caller update any metadata.
+    if (updateMetaDataCallback) {
+      updateMetaDataCallback(*oldFunc, *newFunc,
+                             newFunc->getFunctionType()->getNumParams() - 1);
+    }
+  }
+
+  // lastly, remove all the old functions we no longer need
+  for (auto pair : newToOldMap) {
+    // the old function, no longer used
+    llvm::Function *const oldFunc = pair.second;
+
+    // then destroy the function
+    oldFunc->eraseFromParent();
+  }
+
+  return true;
+}
+
+void remapClonedCallsites(llvm::Function &oldFunc, llvm::Function &newFunc,
+                          bool extraArg) {
+  // list of calls we need to erase at the end
+  llvm::SmallVector<llvm::CallInst *> callsToErase;
+
+  // for everything that uses our old function
+  for (auto *user : oldFunc.users()) {
+    // if the user calls our old function
+    if (auto ci = llvm::dyn_cast<llvm::CallInst>(user)) {
+      // store the name from the old call
+      const std::string name = ci->getName().str();
+
+      // get the number of args at the old callsite
+      const unsigned numArgs = ci->arg_size();
+
+      // the number of args at the new callsite. If we're adding an extra
+      // argument this is incremented.
+      const unsigned newNumArgs = extraArg ? numArgs + 1 : numArgs;
+
+      // create a buffer for our args
+      llvm::SmallVector<llvm::Value *> args(newNumArgs);
+
+      // set all the new call args to be the old call args
+      for (unsigned i = 0; i < numArgs; i++) {
+        args[i] = ci->getArgOperand(i);
+      }
+
+      // if we're adding an extra param it's always the last
+      // argument, so propagate the value on from the parent
+      if (extraArg) {
+        llvm::Function *parentFunc = ci->getFunction();
+        llvm::Argument *lastArg = getLastArgument(parentFunc);
+        args[numArgs] = lastArg;
+      }
+
+      // create our new call instruction to replace the old one
+      auto newCi = llvm::CallInst::Create(&newFunc, args, name);
+      newCi->insertBefore(ci->getIterator());
+
+      // use the debug location from the old call (if any)
+      newCi->setDebugLoc(ci->getDebugLoc());
+
+      // set the calling convention for our new call the same as the old one
+      newCi->setCallingConv(ci->getCallingConv());
+
+      // replace anything that uses the old call with the new one
+      ci->replaceAllUsesWith(newCi);
+
+      // and remember to erase the old callsite
+      callsToErase.push_back(ci);
+    } else if (llvm::ConstantExpr *constant =
+                   llvm::dyn_cast<llvm::ConstantExpr>(user)) {
+      remapConstantExpr(constant, &oldFunc, &newFunc);
+    } else {
+      llvm_unreachable(
+          "UNHANDLED user for Function not a CallInst or ConstantExpr\n");
+    }
+  }
+
+  // remove all the old instructions we no longer need
+  for (llvm::CallInst *ci : callsToErase) {
+    // then destroy the call
+    ci->eraseFromParent();
+  }
+}
+
+llvm::BasicBlock *createLoop(llvm::BasicBlock *entry, llvm::BasicBlock *exit,
+                             llvm::Value *indexStart, llvm::Value *indexEnd,
+                             const CreateLoopOpts &opts,
+                             CreateLoopBodyFn body) {
+  // If the index increment is null, we default to 1 as our index.
+  auto indexInc = opts.indexInc
+                      ? opts.indexInc
+                      : llvm::ConstantInt::get(indexStart->getType(), 1);
+
+  llvm::LLVMContext &ctx = entry->getContext();
+
+  llvm::SmallVector<llvm::Value *> currIVs(opts.IVs.begin(), opts.IVs.end());
+  llvm::SmallVector<llvm::Value *> nextIVs(opts.IVs.size());
+
+  // the basic block that will link into our loop
+  llvm::IRBuilder<> entryIR(entry);
+
+  // the basic block that will form the start of our loop
+  llvm::IRBuilder<> loopIR(
+      llvm::BasicBlock::Create(ctx, opts.headerName, entry->getParent()));
+
+  // branch into our loop to begin executing
+  entryIR.CreateBr(loopIR.GetInsertBlock());
+
+  // first thing in the loop is our phi node for the loop counter
+  auto phi = loopIR.CreatePHI(indexInc->getType(), 2);
+
+  // and make the phi node equal the start index when coming from our entry
+  phi->addIncoming(indexStart, entryIR.GetInsertBlock());
+
+  // Set up all of our user PHIs
+  for (unsigned i = 0, e = currIVs.size(); i != e; i++) {
+    // For convenience to callers, permit nullptr and skip over it.
+    if (!currIVs[i])
+      continue;
+
+    auto *const phi = loopIR.CreatePHI(currIVs[i]->getType(), 2);
+    llvm::cast<llvm::PHINode>(phi)->addIncoming(currIVs[i],
+                                                entryIR.GetInsertBlock());
+    // Set IV names if they've been given to us.
+    if (i < opts.loopIVNames.size()) {
+      phi->setName(opts.loopIVNames[i]);
+    }
+    currIVs[i] = phi;
+  }
+
+  // run the lambda for the loop body, storing the block it finished at
+  llvm::BasicBlock *const latch =
+      body(loopIR.GetInsertBlock(), phi, currIVs, nextIVs);
+  llvm::IRBuilder<> bodyIR(latch);
+
+  // add to the phi node to increment our loop counter
+  auto *const add = bodyIR.CreateAdd(phi, indexInc);
+
+  // and set that if we loop back around, the phi node will be the increment
+  phi->addIncoming(add, latch);
+
+  // Update all of our PHIs
+  for (unsigned i = 0, e = currIVs.size(); i != e; i++) {
+    if (!currIVs[i])
+      continue;
+    llvm::cast<llvm::PHINode>(currIVs[i])->addIncoming(nextIVs[i], latch);
+  }
+
+  if (!exit) {
+    // the basic block to exit our loop when we are done
+    const llvm::IRBuilder<> exitIR(
+        llvm::BasicBlock::Create(ctx, "exitIR", entry->getParent()));
+    exit = exitIR.GetInsertBlock();
+  }
+
+  // last, branch condition either to the exit, or for another loop iteration
+  auto *const termBR = bodyIR.CreateCondBr(bodyIR.CreateICmpULT(add, indexEnd),
+                                           loopIR.GetInsertBlock(), exit);
+
+  if (opts.disableVectorize) {
+    auto *const vecDisable = llvm::MDNode::get(
+        ctx, {llvm::MDString::get(ctx, "llvm.loop.vectorize.enable"),
+              llvm::ConstantAsMetadata::get(
+                  llvm::ConstantInt::get(llvm::Type::getInt1Ty(ctx), false))});
+    // LLVM loop metadata -- for legacy reasons -- must have a reference to
+    // itself as its first operand. See
+    // https://llvm.org/docs/LangRef.html#llvm-loop.
+    auto *loopID = llvm::MDNode::get(ctx, {nullptr, vecDisable});
+    loopID->replaceOperandWith(0, loopID);
+    termBR->setMetadata(llvm::LLVMContext::MD_loop, loopID);
+  }
+
+  // we stopped executing in the exit block, so return that
+  return exit;
+}
+
+llvm::Argument *getLastArgument(llvm::Function *f) {
+  assert(!f->arg_empty() &&
+         "Can't get last argument if there are no arguments");
+  return f->arg_end() - 1;
+}
+
+unsigned getSizeTypeBytes(const llvm::Module &m) {
+  return m.getDataLayout().getPointerSize(0);
+}
+
+llvm::IntegerType *getSizeType(const llvm::Module &m) {
+  const llvm::DataLayout &dataLayout = m.getDataLayout();
+  return llvm::IntegerType::get(m.getContext(),
+                                dataLayout.getPointerSizeInBits(0));
+}
+
+static llvm::Function *
+createKernelWrapperFunctionImpl(llvm::Function &F, llvm::Function &NewFunction,
+                                llvm::StringRef Suffix,
+                                llvm::StringRef OldSuffix) {
+  // Make sure we take a copy of the basename as we're going to change the
+  // original function's name from underneath the StringRef.
+  const std::string baseName = getOrSetBaseFnName(NewFunction, F).str();
+
+  if (!OldSuffix.empty()) {
+    if (getBaseFnName(F).empty()) {
+      setBaseFnName(F, F.getName());
+    }
+    F.setName(F.getName() + OldSuffix);
+  }
+
+  NewFunction.setName(baseName + Suffix);
+
+  // we don't use exceptions
+  NewFunction.addFnAttr(llvm::Attribute::NoUnwind);
+
+  // copy the calling convention from the old function
+  NewFunction.setCallingConv(F.getCallingConv());
+
+  // and remove spir_kernel from the old function
+  if (F.getCallingConv() == llvm::CallingConv::SPIR_KERNEL) {
+    F.setCallingConv(llvm::CallingConv::SPIR_FUNC);
+  }
+
+  // copy the metadata into the new kernel ignoring any debug info.
+  copyFunctionMetadata(F, NewFunction);
+
+  // drop kernel (+ entry point) information from the old function: we've
+  // copied it over to the new one.
+  dropIsKernel(F);
+
+  // copy debug info for function over
+  if (auto *SP = F.getSubprogram()) {
+    const llvm::DIBuilder DIB(*F.getParent());
+    llvm::DISubprogram *const NewSP = DIB.createArtificialSubprogram(SP);
+    // Wipe the list of retained nodes, as this new function is a wrapper over
+    // the old one and does not itself contain/retain any of the wrapped
+    // function's nodes.
+    NewSP->replaceRetainedNodes({});
+    NewFunction.setSubprogram(NewSP);
+  }
+
+  // set the function to always inline: 'noinline' takes precedence, though
+  if (!F.hasFnAttribute(llvm::Attribute::NoInline)) {
+    F.addFnAttr(llvm::Attribute::AlwaysInline);
+  }
+
+  // lastly set the linkage to internal
+  F.setLinkage(llvm::GlobalValue::InternalLinkage);
+
+  return &NewFunction;
+}
+
+llvm::Function *createKernelWrapperFunction(llvm::Function &F,
+                                            llvm::StringRef Suffix,
+                                            llvm::StringRef OldSuffix) {
+  // Create our new function
+  llvm::Function *const NewFunction = llvm::Function::Create(
+      F.getFunctionType(), llvm::Function::ExternalLinkage, "", F.getParent());
+
+  // copy over function attributes, including parameter attributes
+  copyFunctionAttrs(F, *NewFunction);
+
+  // Copy over parameter names
+  for (auto it : zip(NewFunction->args(), F.args())) {
+    std::get<0>(it).setName(std::get<1>(it).getName());
+  }
+
+  return createKernelWrapperFunctionImpl(F, *NewFunction, Suffix, OldSuffix);
+}
+
+llvm::Function *
+createKernelWrapperFunction(llvm::Module &M, llvm::Function &F,
+                            llvm::ArrayRef<llvm::Type *> ArgTypes,
+                            llvm::StringRef Suffix, llvm::StringRef OldSuffix) {
+  llvm::FunctionType *NewFunctionType =
+      llvm::FunctionType::get(F.getReturnType(), ArgTypes, false);
+
+  // create our new function
+  llvm::Function *const NewFunction = llvm::Function::Create(
+      NewFunctionType, llvm::Function::ExternalLinkage, "", &M);
+
+  // copy over function attributes, ignoring all parameter attributes - we
+  // don't know what the parameter mapping is.
+  copyFunctionAttrs(F, *NewFunction, 0);
+
+  return createKernelWrapperFunctionImpl(F, *NewFunction, Suffix, OldSuffix);
+}
+
+llvm::CallInst *createCallToWrappedFunction(
+    llvm::Function &WrappedF, const llvm::SmallVectorImpl<llvm::Value *> &Args,
+    llvm::BasicBlock *BB, llvm::BasicBlock::iterator InsertPt,
+    llvm::StringRef Name) {
+  auto *const CI =
+      llvm::CallInst::Create(WrappedF.getFunctionType(), &WrappedF, Args);
+
+  CI->setName(Name);
+  CI->setCallingConv(WrappedF.getCallingConv());
+  CI->setAttributes(getCopiedFunctionAttrs(WrappedF));
+
+  if (BB) {
+    CI->insertInto(BB, InsertPt);
+
+    if (auto *const ParentF = BB->getParent()) {
+      // An inlinable function call in a function with debug info *must* be
+      // given a debug location.
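+      // Use an artificial 0:0 location; the synthetic call has no meaningful
+      // source position of its own.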
+      if (auto *const SP = ParentF->getSubprogram()) {
+        auto *const DbgLoc = llvm::DILocation::get(ParentF->getContext(),
+                                                   /*line*/ 0, /*col*/ 0, SP);
+        CI->setDebugLoc(DbgLoc);
+      }
+    }
+  }
+
+  return CI;
+}
+
+llvm::Value *createBinOpForRecurKind(llvm::IRBuilderBase &B, llvm::Value *LHS,
+                                     llvm::Value *RHS, llvm::RecurKind Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unexpected Kind");
+  case llvm::RecurKind::None:
+    return nullptr;
+  case llvm::RecurKind::Add:
+    return B.CreateAdd(LHS, RHS);
+  case llvm::RecurKind::Mul:
+    return B.CreateMul(LHS, RHS);
+  case llvm::RecurKind::Or:
+    return B.CreateOr(LHS, RHS);
+  case llvm::RecurKind::And:
+    return B.CreateAnd(LHS, RHS);
+  case llvm::RecurKind::Xor:
+    return B.CreateXor(LHS, RHS);
+  case llvm::RecurKind::SMin:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::smin, LHS, RHS);
+  case llvm::RecurKind::UMin:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::umin, LHS, RHS);
+  case llvm::RecurKind::SMax:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::smax, LHS, RHS);
+  case llvm::RecurKind::UMax:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::umax, LHS, RHS);
+  case llvm::RecurKind::FAdd:
+    return B.CreateFAdd(LHS, RHS);
+  case llvm::RecurKind::FMul:
+    return B.CreateFMul(LHS, RHS);
+  case llvm::RecurKind::FMin:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::minnum, LHS, RHS);
+  case llvm::RecurKind::FMax:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::maxnum, LHS, RHS);
+  }
+}
+
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
new file mode 100644
index 0000000000000..c9d66624db7ef
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
@@ -0,0 +1,134 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+
+using namespace llvm;
+
+namespace compiler {
+namespace utils {
+// Note that Clang has three on/off options for debugging pass managers:
+// `-fdebug-pass-manager`, `-fdebug-pass-structure`, and
+// `-fdebug-pass-arguments`.
+// LLVM's `opt` tool combines them all into one:
+//   --debug-pass-manager (Normal)
+//   --debug-pass-manager=verbose (Verbose)
+//   --debug-pass-manager=quiet (Quiet)
+// However, the mapping is not one-to-one:
+// opt:
+//   PrintPassOptions PrintPassOpts;
+//   PrintPassOpts.Verbose = DebugPM == DebugLogging::Verbose;
+//   PrintPassOpts.SkipAnalyses = DebugPM == DebugLogging::Quiet;
+//   StandardInstrumentations SI(DebugPM != DebugLogging::None, VerifyEachPass,
+//                               PrintPassOpts);
+// clang:
+//   bool DebugPassStructure = CodeGenOpts.DebugPass == "Structure";
+//   PrintPassOptions PrintPassOpts;
+//   PrintPassOpts.Indent = DebugPassStructure;
+//   PrintPassOpts.SkipAnalyses = DebugPassStructure;
+//   StandardInstrumentations SI(CodeGenOpts.DebugPassManager ||
+//                                   DebugPassStructure,
+//                               false, PrintPassOpts);
+// While clang also pushes `mdebug-pass` onto LLVM, it only works for the
+// legacy pass manager, and so we choose to only support and model the
+// `debug-pass-manager` form.
+DebugLogging DebugPasses;
+static cl::opt<DebugLogging, true> DebugPM(
+    "debug-pass-manager", cl::location(DebugPasses), cl::Hidden,
+    cl::ValueOptional, cl::desc("Print pass management debugging information"),
+    cl::init(DebugLogging::None),
+    cl::values(
+        clEnumValN(DebugLogging::Normal, "", ""),
+        clEnumValN(DebugLogging::Quiet, "quiet",
+                   "Skip printing info about analyses"),
+        clEnumValN(
+            DebugLogging::Verbose, "verbose",
+            "Print extra information about adaptors and pass managers")));
+
+bool VerifyEachIsEnabled;
+static cl::opt<bool, true> VerifyEach("verify-each",
+                                      cl::location(VerifyEachIsEnabled),
+                                      cl::desc("Verify after each transform"));
+
+PassMachinery::PassMachinery(LLVMContext &Ctx, TargetMachine *TM,
+                             bool VerifyEach, DebugLogging debugLogLevel)
+    : TM(TM) {
+  llvm::PrintPassOptions PrintPassOpts;
+  PrintPassOpts.Verbose = DebugPM == DebugLogging::Verbose;
+  PrintPassOpts.SkipAnalyses = DebugPM == DebugLogging::Quiet;
+  PrintPassOpts.Indent = debugLogLevel != DebugLogging::None;
+  SI = std::make_unique<StandardInstrumentations>(
+      Ctx, debugLogLevel != DebugLogging::None, VerifyEach, PrintPassOpts);
+}
+
+PassMachinery::~PassMachinery() {}
+
+void PassMachinery::initializeStart(PipelineTuningOptions PTO) {
+  const std::optional<PGOOptions> PGOOpt;
+  PB = PassBuilder(TM, PTO, PGOOpt, &PIC);
+}
+
+void PassMachinery::registerPasses() {
+  buildDefaultAAPipeline();
+  registerLLVMAnalyses();
+}
+
+void PassMachinery::initializeFinish() {
+  // Register LLVM analyses now, with the knowledge that users have had the
+  // chance to register their own versions of LLVM analyses first.
+  registerPasses();
+  // With all passes registered, cross-register all the proxies
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+  // Allow registration of callbacks and instrumentation machinery
+  addClassToPassNames();
+  registerPassCallbacks();
+
+  // Register pass instrumentation
+  SI->registerCallbacks(PIC, &MAM);
+}
+
+void PassMachinery::buildDefaultAAPipeline() {
+  FAM.registerPass([&] { return PB.buildDefaultAAPipeline(); });
+}
+
+void PassMachinery::registerLLVMAnalyses() {
+  // Register standard analyses
+  PB.registerModuleAnalyses(MAM);
+  PB.registerCGSCCAnalyses(CGAM);
+  PB.registerFunctionAnalyses(FAM);
+  PB.registerLoopAnalyses(LAM);
+}
+
+} // namespace utils
+} // namespace compiler
+
+namespace compiler {
+namespace utils {
+/// Helper functions for printing
+void printPassName(StringRef PassName, raw_ostream &OS) {
+  OS << " " << PassName << "\n";
+}
+
+void printPassName(StringRef PassName, StringRef Params, raw_ostream &OS) {
+  OS << " " << PassName << "<" << Params << ">\n";
+}
+
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
new file mode 100644
index 0000000000000..32d9feb5b41bd
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
@@ -0,0 +1,131 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ca-barriers"
+
+PreservedAnalyses
+compiler::utils::PrepareBarriersPass::run(Module &M,
+                                          ModuleAnalysisManager &AM) {
+  SmallPtrSet<Function *, 4> Kernels;
+  auto &BI = AM.getResult<BuiltinInfoAnalysis>(M);
+  for (auto &F : M.functions()) {
+    if (isKernelEntryPt(F)) {
+      Kernels.insert(&F);
+    }
+  }
+
+  SmallPtrSet<Function *, 4> FuncsWithBarriers;
+
+  for (Function &F : M) {
+    const auto B = BI.analyzeBuiltin(F);
+    // If the function does not have a barrier ID.
+    if (!B || !BI.isMuxBuiltinWithBarrierID(B->ID)) {
+      continue;
+    }
+
+    for (User *U : F.users()) {
+      if (auto *const CI = dyn_cast<CallInst>(U)) {
+        auto *const Callee = CI->getFunction();
+
+        // If it's one of our kernels don't inline it, and definitely don't
+        // delete it either. No need to inline already dead functions, either!
+        if (!Callee->isDefTriviallyDead() && !Kernels.contains(Callee)) {
+          FuncsWithBarriers.insert(Callee);
+        }
+      }
+    }
+  }
+
+  bool Changed = false;
+
+  // Walk the users of the barrier.
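+  // Iteratively inline every non-kernel function that contains a barrier into
+  // its callers; callers which gain a barrier this way are processed too.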
+  while (!FuncsWithBarriers.empty()) {
+    auto *F = *FuncsWithBarriers.begin();
+    FuncsWithBarriers.erase(F);
+
+    // Make a copy of the users of the function to be inlined since
+    // InlineFunction modifies the state of ci/F which affects
+    // the range being iterated over, resulting in use-after-free.
+    const SmallVector<User *, 4> Users{F->user_begin(), F->user_end()};
+
+    // Inline the function into each of its call sites.
+    for (User *U : Users) {
+      // Skip any users that are not call instructions.
+      if (!isa<CallBase>(U)) {
+        continue;
+      }
+
+      auto *const InfoF = cast<CallBase>(U)->getFunction();
+      InlineFunctionInfo IFI;
+      auto InlineResult =
+          InlineFunction(*cast<CallBase>(U), IFI, /*MergeAttributes*/ false,
+                         /*CalleeAAR*/ nullptr, /*InsertLifetime*/ true,
+                         /*ForwardVarArgsTo*/ nullptr);
+      if (InlineResult.isSuccess()) {
+        Changed = true;
+
+        // The function we inlined into now contains a barrier, so add it
+        // to the set.
+        if (!InfoF->isDefTriviallyDead() && !Kernels.contains(InfoF)) {
+          FuncsWithBarriers.insert(InfoF);
+        }
+      } else {
+        LLVM_DEBUG(dbgs() << "Could not inline: " << *U << '\n';);
+      }
+    }
+
+    // Delete the now-dead inlined function
+    if (F->isDefTriviallyDead()) {
+      F->eraseFromParent();
+    }
+  }
+
+  // Assign all barriers a unique ID
+  unsigned ID = 0U;
+  auto &Ctx = M.getContext();
+  auto *const I32Ty = IntegerType::get(Ctx, 32);
+  for (auto *F : Kernels) {
+    for (BasicBlock &BB : *F) {
+      for (Instruction &I : BB) {
+        // Check call instructions for barrier.
+        if (auto *const CI = dyn_cast<CallInst>(&I)) {
+          if (Function *Callee = CI->getCalledFunction()) {
+            if (auto B = BI.analyzeBuiltin(*Callee)) {
+              if (BI.isMuxBuiltinWithBarrierID(B->ID)) {
+                CI->setOperand(0, ConstantInt::get(I32Ty, ID++));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
new file mode 100644
index 0000000000000..396bc347f7fa1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
@@ -0,0 +1,767 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+using namespace llvm;
+
+#define DEBUG_TYPE "replace-module-scope-vars"
+
+namespace {
+using AlignIntTy = uint64_t;
+
+// Creates and returns a new GEP instruction, inserted before input parameter
+// 'inst'. This GEP points to the element at 'index' of the struct living at
+// the final argument of each function.
+GetElementPtrInst *generateStructGEP(Instruction &inst,
+                                     StructType *funcsStructTy,
+                                     unsigned index) {
+  // find the function the instruction is in
+  auto func = inst.getFunction();
+
+  // the local module-scope variables struct we added to each function
+  auto funcsStruct = compiler::utils::getLastArgument(func);
+
+  assert(funcsStruct->getType()->isPointerTy());
+
+  // the type with which to index into our struct type
+  auto indexTy = Type::getInt32Ty(inst.getModule()->getContext());
+
+  // create a new GEP just before the instruction
+  auto GEP = GetElementPtrInst::CreateInBounds(
+      funcsStructTy, funcsStruct,
+      {ConstantInt::get(indexTy, 0), ConstantInt::get(indexTy, index)});
+  GEP->insertBefore(inst.getIterator());
+  return GEP;
+}
+
+// Given the type of a __local variable about to be added to the struct,
+// calculates and returns the alignment of the type.
+AlignIntTy calculateTypeAlign(Type *type, const DataLayout &layout) {
+  // Get underlying type if variable is an array
+  while (type->isArrayTy()) {
+    type = type->getArrayElementType();
+  }
+
+  // 3 component wide vectors have the size of 4 components according to the
+  // OpenCL spec section 6.1.5 'Alignment of Types'
+  unsigned int vectorWidth =
+      type->isVectorTy() ? multi_llvm::getVectorNumElements(type) : 1;
+  if (3 == vectorWidth) {
+    vectorWidth = 4;
+  }
+
+  // if we have a pointer type return the size of a pointer on the target
+  if (type->isPointerTy()) {
+    return layout.getPointerSize();
+  }
+
+  // size of member in bytes - at least 8 bits to avoid zero alignment on
+  // integer types smaller than i8.
+  const unsigned int vectorSize =
+      (std::max(type->getScalarSizeInBits(), 8u) * vectorWidth) / 8;
+
+  return vectorSize;
+}
+
+// Variables in the local address space not passed as arguments can only be
+// declared in the outermost scope of a kernel function. Here we find the
+// kernel function the local address space global resides in.
+Function *determineKernel(GlobalVariable &global) {
+  auto global_user = *(global.user_begin());
+  if (auto instruction = dyn_cast<Instruction>(global_user)) {
+    return instruction->getFunction();
+  } else if (ConstantVector *cv = dyn_cast<ConstantVector>(global_user)) {
+    User *cv_user = *(cv->user_begin());
+    auto instruction = cast<Instruction>(cv_user);
+    return instruction->getFunction();
+  } else if (global_user) {
+    global_user->print(errs());
+    llvm_unreachable("Unknown user used the local module-scope variable!");
+  }
+  return nullptr;
+}
+
+// Information associated with a local address space module-scope variable
+// that is needed to update its debug info metadata
+struct GlobalVarDebugInfoWrapper final {
+  // Byte offset into struct of replacement variables
+  unsigned offset;
+  // Associated debug info metadata entry
+  DIGlobalVariable *DIGlobal;
+  // Kernel function variable was defined in
+  Function *function;
+};
+
+// Check if a user is an instruction and if so add it to the Visited, Worklist
+// and FuncsToClone. If it's not an instruction, repeat for all its users.
+void checkUsersForInstructions(
+    User *user, llvm::SmallPtrSet<llvm::Function *, 4> &Visited,
+    llvm::SmallVector<llvm::Function *, 4> &FuncsToClone,
+    llvm::SmallPriorityWorklist<llvm::Function *, 4> &Worklist) {
+  if (auto *I = dyn_cast<Instruction>(user)) {
+    auto *F = I->getFunction();
+    if (Visited.insert(F).second) {
+      Worklist.insert(F);
+      FuncsToClone.push_back(F);
+      LLVM_DEBUG(
+          dbgs() << "Function '" << F->getName()
+                 << "' requires additional local module struct parameter\n");
+    }
+  } else {
+    for (auto *user_of_user : user->users()) {
+      checkUsersForInstructions(user_of_user, Visited, FuncsToClone, Worklist);
+    }
+  }
+}
+
+/// @brief Clone all required functions in a module, appending an extra
+/// parameter to them if they are part of the call graph required for access to
+/// local variables.
+///
+/// @param module llvm module containing the functions
+/// @param newParamType Type of the parameter to be added
+/// @param newParamAttrs Parameter attributes of the parameter to be added
+/// @return bool if the module has changed (currently always true)
+///
+/// This recurses through all the users of the local variables to look for any
+/// functions which use them, as well as assuming that the top level kernels
+/// must have them.
+bool addParamToAllRequiredFunctions(llvm::Module &module,
+                                    llvm::Type *const newParamType,
+                                    const llvm::AttributeSet &newParamAttrs) {
+  llvm::SmallPtrSet<llvm::Function *, 4> Visited;
+  llvm::SmallVector<llvm::Function *, 4> FuncsToClone;
+  llvm::SmallPriorityWorklist<llvm::Function *, 4> Worklist;
+
+  // Iterate through the top level functions checking if they are kernels.
+  for (auto &F : module.functions()) {
+    // Kernel entry points must present a consistent ABI to external users
+    if (compiler::utils::isKernelEntryPt(F)) {
+      Visited.insert(&F);
+      Worklist.insert(&F);
+      FuncsToClone.push_back(&F);
+      LLVM_DEBUG(
+          dbgs() << "Function '" << F.getName()
+                 << "' requires additional local module struct parameter\n");
+      continue;
+    }
+  }
+
+  // Check each global's users if they are instructions or recurse up the user
+  // chain if not. If an Instruction is found we add it to the functions to
+  // clone.
+  for (auto &global : module.globals()) {
+    for (auto *user : global.users()) {
+      checkUsersForInstructions(user, Visited, FuncsToClone, Worklist);
+    }
+  }
+
+  // Iterate over the functions that require local struct parameters and
+  // recursively register all callers of those functions as needing local
+  // struct parameters too.
+  while (!Worklist.empty()) {
+    Function *F = Worklist.pop_back_val();
+    for (auto *U : F->users()) {
+      if (auto *CB = dyn_cast<CallBase>(U)) {
+        auto *Caller = CB->getFunction();
+        if (Visited.insert(Caller).second) {
+          Worklist.insert(Caller);
+          FuncsToClone.push_back(Caller);
+          LLVM_DEBUG(dbgs() << "Function '" << Caller->getName()
+                            << "' requires local struct parameters\n");
+        }
+      } else {
+        report_fatal_error("unhandled user type");
+      }
+    }
+  }
+
+  // Ideally cloneFunctionsAddArg() would take a list of functions, but it
+  // currently takes a std::function, so we search the created vector of
+  // functions.
+ return compiler::utils::cloneFunctionsAddArg( + module, + [newParamType, newParamAttrs](llvm::Module &) { + return compiler::utils::ParamTypeAttrsPair{newParamType, newParamAttrs}; + }, + [&FuncsToClone](const llvm::Function &func, bool &ClonedWithBody, + bool &ClonedNoBody) { + ClonedWithBody = llvm::is_contained(FuncsToClone, &func); + ClonedNoBody = false; + }, + nullptr /*updateMetaDataCallback*/); +} + +} // namespace + +PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run( + Module &M, ModuleAnalysisManager &) { + // the element types of the struct of replacement local module-scope + // variables we are replacing + SmallVector structElementTypes; + + // ordered list of kernel names which are used to find cached function + // types. StringRef is safe here because the names will be taken over from + // the old functions to the new ones. + SmallVector names; + + // unmodified function types of functions in the module + DenseMap functionTypes; + + for (auto &F : M.functions()) { + if (isKernel(F)) { + names.push_back(F.getName()); + functionTypes[F.getName()] = F.getFunctionType(); + } + } + + // a map from the original global variable to the index into + // structElementTypes + ValueMap index_map; + + // the global variables we need to process and remove + SmallVector globals; + + // maps variables in `globals` we're processing to helper information + // needed for updating debug info + DenseMap debug_info_map; + + // __local address space automatic variables are represented in the LLVM + // module as global variables with address space 3. + // + // This pass identifies these variables and places them into a struct + // allocated in a newly created wrapper function. A pointer to the struct + // is then passed via a parameter to the original kernel. + for (auto &global : M.globals()) { + // get the type of the global variable + const auto type = global.getType(); + + if (global.use_empty()) { + continue; + } + + if (type->isPointerTy() && + type->getPointerAddressSpace() == AddressSpace::Local) { + // and save that this is a global we care about + globals.push_back(&global); + } + } + + // if we found no local module-scope variables to be replaced... + if (globals.empty()) { + // ... then we're done! + return PreservedAnalyses::all(); + } + + // Pad struct so that members are aligned. + // + // Unlike x86, ARM architecture alignment can be different from the + // member size. So that __local alignment is OpenCL conformant + // we need to manually pad our struct. + // + // To do this we keep track of each local module-scope elements + // offset in the struct, and ensure that it is a multiple of + // that elements alignment. Finally we then align the whole struct + // to the largest alignment found out of all our __local members. + + // track largest member alignment found so far. + unsigned int maxAlignment = 0; + // byte offset in struct of current member + unsigned int offset = 0; + const auto &dl = M.getDataLayout(); + for (auto &global : globals) { + auto memberType = global->getValueType(); + + // alignment of the new struct member, in the case where we can't + // calculate this, e.g. struct types, use the alignment of the llvm + // global. This is also needed if '__attribute__(aligned)' was used to + // set a specific alignment. 
+ const unsigned int alignment = + std::max(global->getAlignment(), calculateTypeAlign(memberType, dl)); + assert(alignment > 0 && "'0' is an impossible alignment"); + + // check if this is the largest alignment seen so far + maxAlignment = std::max(alignment, maxAlignment); + + // check if member is not already aligned + const unsigned int remainder = offset % alignment; + if (0 != remainder) { + // calculate number of padding bytes + const unsigned int padding = alignment - remainder; + + // Use a byte array to pad struct rather than trying to create + // an arbitrary intNTy, since this may not be supported by the backend. + const auto padByteType = Type::getInt8Ty(M.getContext()); + const auto padByteArrayType = ArrayType::get(padByteType, padding); + structElementTypes.push_back(padByteArrayType); + + // bump offset by padding size + offset += padding; + } + + // we need the byte-offset when generating debug info + debug_info_map[global] = {offset, nullptr, nullptr}; + + // map the global variable to its index in structElementTypes + index_map[global] = structElementTypes.size(); + + // then add our element type to the struct + structElementTypes.push_back(memberType); + + // update the offset based on the type's size + auto allocSize = dl.getTypeAllocSize(memberType); + if (dl.getTypeAllocSize(memberType).isScalable()) { + // Not an assert because this can happen in user-supplied IR + report_fatal_error("Scalable types in local memory are not supported"); + } + const unsigned int totalSize = allocSize.getFixedValue(); + offset += totalSize; + } + + // create a struct containing all the local module-scope variables + auto structTy = StructType::create(structElementTypes, "localVarTypes"); + + // change all our functions to take a pointer to the new structTy we created + const AttributeSet defaultAttrs; + addParamToAllRequiredFunctions( + M, PointerType::get(M.getContext(), /*AddressSpace=*/0), defaultAttrs); + + // Check if we have debug info, if so we need to fix it up to turn global + // variable entries into local variable ones. + if (const auto NMD = M.getNamedMetadata("llvm.dbg.cu")) { + const DIBuilder DIB(M, /*AllowUnresolved*/ false); + + for (auto *CUOp : NMD->operands()) { + // Find module compilation unit + DICompileUnit *CU = cast(CUOp); + + // Check if there are any debug info global variables, as the DMA + // pass can create global variables without debug metadata attached. 
+ auto DIGlobalVariables = CU->getGlobalVariables(); + if (DIGlobalVariables.empty()) { + continue; + } + // Updated list of global debug info variables so that it no longer + // contains entries we will later replace with DILocalVariable metadata + SmallVector CU_DIExprs; + for (auto &global : M.globals()) { + // Get debug info expression for global variable + SmallVector Global_DIExprs; + global.getDebugInfo(Global_DIExprs); + + if (Global_DIExprs.empty()) { + continue; + } + + if (globals.end() == find(globals, &global)) { + // This is not a __local address space variable we will + // replace, so retain its debug info in the CU MDNode + CU_DIExprs.append(Global_DIExprs.begin(), Global_DIExprs.end()); + } else { + // We will replace this debug info variable later + assert(Global_DIExprs.size() == 1 && + "Only expecting a single debug info variable"); + debug_info_map[&global].DIGlobal = Global_DIExprs[0]->getVariable(); + } + } + CU->replaceGlobalVariables(MDTuple::get(M.getContext(), CU_DIExprs)); + } + } + + for (auto &global : globals) { + const SmallVector users(global->users()); + + for (auto *user : users) { + // if we have a constant expression, we need to force it back to a + // normal instruction, as we are removing the constant that the + // constant expression was associated with (we are removing the global + // variable), we can't use a constant expression to calculate the + // result. + if (auto *constant = dyn_cast(user)) { + replaceConstantExpressionWithInstruction(constant); + } + } + } + + for (auto &global : globals) { + if (debug_info_map[global].DIGlobal) { + // If global variable has debug info, find out what kernel the __local + // variable was defined in so we can use that information later. + debug_info_map[global].function = determineKernel(*global); + assert(debug_info_map[global].function); + } + + // For each user that matches a specific kind of instruction, we do 3 + // different things: + // 1) Create a GEP instruction to retrieve the address of the local + // version of 'global' in the newly created local struct. + // 2) We create a cast instruction to cast the type of the GEP created + // in 1) to the type of the global instruction. + // 3) Replace the use of the global instruction with the instruction + // created in 2). + const SmallVector users(global->users()); + for (auto *user : users) { + // if we have a GEP instruction... 
+ if (GetElementPtrInst *gep = dyn_cast(user)) { + auto local = generateStructGEP(*gep, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(gep->getIterator()); + + gep->setOperand(0, castedLocal); + gep->setIsInBounds(); + } else if (CastInst *cast = dyn_cast(user)) { + auto local = generateStructGEP(*cast, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(cast->getIterator()); + + cast->setOperand(0, castedLocal); + } else if (LoadInst *load = dyn_cast(user)) { + auto local = generateStructGEP(*load, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(load->getIterator()); + + load->setOperand(0, castedLocal); + } else if (StoreInst *store = dyn_cast(user)) { + auto local = generateStructGEP(*store, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(store->getIterator()); + + // global could be pointer or value operand of the store + if (store->getValueOperand() == global) { + store->setOperand(0, castedLocal); + } else { + store->setOperand(1, castedLocal); + } + } else if (ConstantVector *cv = dyn_cast(user)) { + // Because 'cv' is not an instruction, we have to iterate over all its + // users and do the work for all of them individually. + for (auto cvIt = cv->user_begin(); cvIt != cv->user_end();) { + auto cvUser = *cvIt++; + auto inst = ::cast(cvUser); + auto local = generateStructGEP(*inst, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(inst->getIterator()); + + auto indexTy = Type::getInt32Ty(M.getContext()); + Value *newCv = PoisonValue::get(cv->getType()); + + // We can't simply 'setOperand' in a 'ConstantVector'. We have to + // recreate it from scratch. + for (unsigned i = 0; i < cv->getNumOperands(); ++i) { + Instruction *newCvInst; + if (cv->getOperand(i) == global) { + newCvInst = InsertElementInst::Create( + newCv, castedLocal, ConstantInt::get(indexTy, i)); + } else { + newCvInst = InsertElementInst::Create( + newCv, cv->getOperand(i), ConstantInt::get(indexTy, i)); + } + newCvInst->insertBefore(inst->getIterator()); + newCv = newCvInst; + } + + // And don't forget to replace 'cv' by 'newCv'. + inst->replaceUsesOfWith(cv, newCv); + } + } else if (PHINode *phi = dyn_cast(user)) { + // Because we can't create 1) before a phi node, we have to create it + // before the terminator of the incoming block. 
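+        // Each incoming edge gets its own GEP and cast so that the new value
+        // dominates the phi's use on that edge.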
+ for (unsigned i = 0; i < phi->getNumIncomingValues(); ++i) { + if (phi->getIncomingValue(i) == global) { + auto incomingBlock = phi->getIncomingBlock(i); + auto incomingBlockT = incomingBlock->getTerminator(); + auto local = + generateStructGEP(*incomingBlockT, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(incomingBlockT->getIterator()); + + phi->setIncomingValue(i, castedLocal); + } + } + } else if (AtomicRMWInst *atomic = dyn_cast(user)) { + auto local = generateStructGEP(*atomic, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(atomic->getIterator()); + + // global could be pointer or value operand of the atomic + if (atomic->getPointerOperand() == global) { + atomic->setOperand(0, castedLocal); + } else { + atomic->setOperand(1, castedLocal); + } + } else if (auto *atomic = dyn_cast(user)) { + const auto local = + generateStructGEP(*atomic, structTy, index_map[global]); + const auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(atomic->getIterator()); + + // global could be the pointer + if (atomic->getPointerOperand() == global) { + atomic->setOperand(0, castedLocal); + } + // the comparison value + if (atomic->getCompareOperand() == global) { + atomic->setOperand(1, castedLocal); + } + // the new value + if (atomic->getNewValOperand() == global) { + atomic->setOperand(2, castedLocal); + } + } else if (SelectInst *select = dyn_cast(user)) { + auto local = generateStructGEP(*select, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(select->getIterator()); + + // global could be the true or false value of the select + if (select->getTrueValue() == global) { + select->setOperand(1, castedLocal); + } else { + select->setOperand(2, castedLocal); + } + } else if (CallInst *call = dyn_cast(user)) { + auto local = generateStructGEP(*call, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(call->getIterator()); + + unsigned i = 0; + for (; i < call->getNumOperands(); ++i) { + if (call->getOperand(i) == global) { + call->setOperand(i, castedLocal); + } + } + } else if (InsertElementInst *insertIns = + dyn_cast(user)) { + auto local = generateStructGEP(*insertIns, structTy, index_map[global]); + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(insertIns->getIterator()); + + // Update middle operand as the others are the vector and index + insertIns->setOperand(1, castedLocal); + } else if (auto *cmpIns = dyn_cast(user)) { + const auto local = + generateStructGEP(*cmpIns, structTy, index_map[global]); + const auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(cmpIns->getIterator()); + + // global could be either side of the compare + if (cmpIns->getOperand(0) == global) { + cmpIns->setOperand(0, castedLocal); + } + if (cmpIns->getOperand(1) == global) { + cmpIns->setOperand(1, castedLocal); + } + } else { + user->print(errs()); + llvm_unreachable("Unknown user used the local module-scope variable!"); + } + } + } + + // lastly, we create a wrapper function with the original kernel signature + // of each kernel, which will alloca the struct for the remapped local + // module-scope 
variables + for (const auto &name : names) { + // the original kernel function + auto *kernelFunc = M.getFunction(name); + + // the original kernel function type, saved earlier + auto kernelFuncTy = functionTypes[name]; + + auto newFunc = + Function::Create(kernelFuncTy, kernelFunc->getLinkage(), "", &M); + + // copy over function parameter names + for (unsigned i = 0, e = newFunc->arg_size(); i != e; i++) { + newFunc->getArg(i)->setName(kernelFunc->getArg(i)->getName()); + } + // copy over function/parameter/ret attributes + copyFunctionAttrs(*kernelFunc, *newFunc, newFunc->arg_size()); + + auto baseName = getOrSetBaseFnName(*newFunc, *kernelFunc); + newFunc->setName(baseName + ".mux-local-var-wrapper"); + + // copy over function metadata + copyFunctionMetadata(*kernelFunc, *newFunc); + // drop the old function's kernel information - we've stolen it. + dropIsKernel(*kernelFunc); + + // copy the calling convention too + newFunc->setCallingConv(kernelFunc->getCallingConv()); + + // and clear spir_kernel from the original function + if (kernelFunc->getCallingConv() == llvm::CallingConv::SPIR_KERNEL) { + kernelFunc->setCallingConv(llvm::CallingConv::SPIR_FUNC); + } + + // we don't use exceptions + newFunc->addFnAttr(Attribute::NoUnwind); + + // next, set the function to always inline unless it has a noinline + // attribute. + if (!kernelFunc->hasFnAttribute(Attribute::NoInline)) { + kernelFunc->addFnAttr(Attribute::AlwaysInline); + } + + // lastly set the linkage to internal + kernelFunc->setLinkage(GlobalValue::InternalLinkage); + + // move debug info for function over + newFunc->setSubprogram(kernelFunc->getSubprogram()); + kernelFunc->setSubprogram(nullptr); + + // create an irbuilder and basic block for our new function + IRBuilder<> ir(BasicBlock::Create(newFunc->getContext(), "", newFunc)); + + // stack allocate the local module-scope variables struct + auto alloca = ir.CreateAlloca(structTy); + alloca->setAlignment(MaybeAlign(maxAlignment).valueOrOne()); + + // Generate debug info metadata for the globals we have replaced + // which previously had debug info attached + for (auto global : globals) { + auto debug_info_wrapper = debug_info_map[global]; + auto DIGlobal = debug_info_wrapper.DIGlobal; + if (!DIGlobal) { + // No debug info for GlobalVariable + continue; + } + + // Expression for byte offset in newly allocated struct where our + // replacement variable lives + const unsigned offset = debug_info_wrapper.offset; + const uint64_t dwPlusOp = dwarf::DW_OP_plus_uconst; + DIBuilder DIB(M, /*AllowUnresolved*/ false); + auto offset_expr = + DIB.createExpression(ArrayRef{dwPlusOp, offset}); + + // enqueued_kernel_scope is true if the variable was originally defined + // in kernelFunc, the kernel being enqueued by the user, rather than + // another kernel function being called by kernelFunc. + auto func = debug_info_wrapper.function; + const bool enqueued_kernel_scope = !func->getSubprogram(); + auto DISubprogram = enqueued_kernel_scope ? newFunc->getSubprogram() + : func->getSubprogram(); + + // We can't guarantee a subprogram for all functions. + // FIXME: Should we be able to? Do we need to clone subprograms somehow? + if (!DISubprogram) { + continue; + } + + // Create replacement debug metadata entry representing the global + // as a DILocalVariable in the kernel function scope. 
+ auto DILocal = DIB.createAutoVariable( + DISubprogram, DIGlobal->getName(), DIGlobal->getFile(), + DIGlobal->getLine(), dyn_cast(DIGlobal->getType())); + + // Insert debug declare intrinsic pointing to the location of + // the variable in our allocated struct + auto *location = + DILocation::get(DISubprogram->getContext(), DIGlobal->getLine(), + /*Column*/ 0, DISubprogram); + if (enqueued_kernel_scope) { + DIB.insertDeclare(alloca, DILocal, offset_expr, location, + alloca->getParent()); + } else { + // A pointer to our struct is passed as the last argument to each + // function, use this argument if the global came from another kernel + // function which is called by kernelFunc. + auto last_arg = func->arg_end() - 1; + DIB.insertDeclare(last_arg, DILocal, offset_expr, location, + func->getEntryBlock().getFirstNonPHIOrDbg()); + } + } + + // create a buffer for our args + SmallVector args; + + for (auto &arg : newFunc->args()) { + args.push_back(&arg); + } + + // add the new alloca for the local module-scope variables struct + args.push_back(alloca); + + // call the original function + auto ci = ir.CreateCall(kernelFunc, args); + ci->setCallingConv(kernelFunc->getCallingConv()); + ci->setAttributes(getCopiedFunctionAttrs(*kernelFunc)); + + // and return void + ir.CreateRetVoid(); + } + + // erase all the global variables that we have removed all uses for + for (auto global : globals) { + // Vecz generates constant vector with global variable with local scope. + // In this case, if we try to remove the global variable, llvm generates + // assert because there are still uses with constant vector in + // LLVMContext. As a result, if constant vector uses global variable with + // local scope, keep it. + bool keepIt = false; + for (auto *user : global->users()) { + if (isa(user)) { + keepIt = true; + break; + } + } + + if (!keepIt) { + global->eraseFromParent(); + } + } + + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp new file mode 100644 index 0000000000000..a05ff3e077c80 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp @@ -0,0 +1,155 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +namespace compiler { +namespace utils { + +static constexpr const char *WorkItemParamName = "MuxWorkItemInfo"; +static constexpr const char *WorkGroupParamName = "MuxWorkGroupInfo"; + +StructType *getWorkItemInfoStructTy(llvm::Module &M) { + LLVMContext &ctx = M.getContext(); + // Check whether this struct has previously been defined. 
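+  // (StructType::create below registers the name in the context, so repeated
+  // calls return the cached type instead of creating a suffixed duplicate.)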
+ if (auto *ty = StructType::getTypeByName(ctx, WorkItemParamName)) { + return ty; + } + auto *uint_type = Type::getInt32Ty(ctx); + auto *size_type = getSizeType(M); + auto *array_type = ArrayType::get(size_type, 3); + + SmallVector elements( + WorkItemInfoStructField::total); + + elements[WorkItemInfoStructField::local_id] = array_type; + elements[WorkItemInfoStructField::sub_group_id] = uint_type; + elements[WorkItemInfoStructField::num_sub_groups] = uint_type; + elements[WorkItemInfoStructField::max_sub_group_size] = uint_type; + + return StructType::create(elements, WorkItemParamName); +} + +StructType *getWorkGroupInfoStructTy(llvm::Module &M) { + LLVMContext &ctx = M.getContext(); + // Check whether this struct has previously been defined. + if (auto *ty = StructType::getTypeByName(ctx, WorkGroupParamName)) { + return ty; + } + auto *uint_type = Type::getInt32Ty(ctx); + auto *size_type = getSizeType(M); + auto *array_type = ArrayType::get(size_type, 3); + + SmallVector elements( + WorkGroupInfoStructField::total); + + elements[WorkGroupInfoStructField::group_id] = array_type; + elements[WorkGroupInfoStructField::num_groups] = array_type; + elements[WorkGroupInfoStructField::global_offset] = array_type; + elements[WorkGroupInfoStructField::local_size] = array_type; + elements[WorkGroupInfoStructField::work_dim] = uint_type; + + return StructType::create(elements, WorkGroupParamName); +} + +void populateStructSetterFunction(Function &F, Argument &structPtrArg, + StructType *const structTy, + uint32_t structFieldIdx, bool hasRankArg) { + assert(F.isDeclaration() && "Scrubbing existing function"); + + F.addFnAttr(Attribute::AlwaysInline); + F.setLinkage(GlobalValue::InternalLinkage); + + auto argIter = F.arg_begin(); + + Value *const indexArg = hasRankArg ? argIter++ : nullptr; + + Value *const valueArg = argIter++; + + IRBuilder<> ir(BasicBlock::Create(F.getContext(), "", &F)); + + SmallVector gep_indices{ir.getInt32(0), + ir.getInt32(structFieldIdx)}; + + if (hasRankArg) { + gep_indices.push_back(indexArg); + } + + assert(structPtrArg.getType()->isPointerTy() && + "Assuming a pointer type as the last argument"); + + Value *gep = ir.CreateGEP(structTy, &structPtrArg, gep_indices); + + ir.CreateStore(valueArg, gep); + + ir.CreateRetVoid(); +} + +void populateStructGetterFunction(llvm::Function &F, Argument &structPtrArg, + llvm::StructType *const structTy, + uint32_t structFieldIdx, bool hasRankArg, + size_t defaultValue) { + assert(F.isDeclaration() && "Scrubbing existing function"); + F.addFnAttr(Attribute::AlwaysInline); + F.setLinkage(GlobalValue::InternalLinkage); + + auto *indexArg = hasRankArg ? 
F.arg_begin() : nullptr; + + assert(structPtrArg.getType()->isPointerTy() && + "Assuming a pointer type as the last argument"); + + IRBuilder<> ir(BasicBlock::Create(F.getContext(), "", &F)); + + SmallVector gep_indices{ir.getInt32(0), + ir.getInt32(structFieldIdx)}; + + Value *ret = nullptr; + Value *cmp = nullptr; + + if (hasRankArg) { + // we have 3 dimensions; x, y & z + auto *maxValidIndex = ir.getInt32(3); + + cmp = ir.CreateICmp(CmpInst::ICMP_ULT, indexArg, maxValidIndex); + + auto *sel = ir.CreateSelect(cmp, indexArg, ir.getInt32(0)); + + gep_indices.push_back(sel); + } + + auto gep = ir.CreateGEP(structTy, &structPtrArg, gep_indices); + + ret = ir.CreateLoad(F.getReturnType(), gep); + + if (hasRankArg) { + ret = ir.CreateSelect(cmp, ret, + ConstantInt::get(F.getReturnType(), defaultValue)); + } + + ir.CreateRet(ret); +} + +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp new file mode 100644 index 0000000000000..8b421ccaf4c30 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp @@ -0,0 +1,172 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +namespace compiler { +namespace utils { + +GlobalSubgroupInfo::GlobalSubgroupInfo(Module &M, BuiltinInfo &BI) : BI(BI) { + SmallPtrSet UsesSubgroups; + SmallPriorityWorklist Worklist; + + for (auto &F : M) { + if (F.isDeclaration()) { + continue; + } + auto SGI = std::make_unique(); + + // Assume the 'mux-no-subgroups' attribute is correct. If a pass introduces + // the use of sub-groups, then it should remove the attribute itself! + if (hasNoExplicitSubgroups(F)) { + FunctionMap.insert({&F, std::move(SGI)}); + continue; + } + + for (auto &BB : F) { + for (const auto &I : BB) { + if (auto *const CI = dyn_cast(&I)) { + if (auto SGBuiltin = isMuxSubgroupBuiltin(CI->getCalledFunction())) { + // Only add each function to the worklist once + if (UsesSubgroups.insert(&F).second) { + Worklist.insert(&F); + } + // Track this function's use of this builtin + SGI->UsedSubgroupBuiltins.insert(SGBuiltin->ID); + } + } + } + } + FunctionMap.insert({&F, std::move(SGI)}); + } + + // Collect all functions that contain sub-group calls, including calls to + // other functions in the module that contain sub-group calls. + while (!Worklist.empty()) { + auto *const F = Worklist.pop_back_val(); + const auto &FSubgroups = FunctionMap[F]->UsedSubgroupBuiltins; + // Track which unique call-graph edges we've traversed, in case F ends up + // calling the same function multiple times. 
The set of builtins used by + // this item isn't going to change while we're working on it. + SmallPtrSet AlreadyUnioned; + for (auto *const U : F->users()) { + if (auto *const CI = dyn_cast(U)) { + auto *const CallerF = CI->getFunction(); + // If we haven't seen this function before, we need to process it and + // propagate its users. + if (UsesSubgroups.insert(CallerF).second) { + Worklist.insert(CallerF); + } + // If we've recorded that CallerF calls F for the first time in this + // loop, CallerF's set of used builtins gains all the builtins used by + // F. + if (AlreadyUnioned.insert(CallerF).second) { + auto &CallerSubgroups = FunctionMap[CallerF]->UsedSubgroupBuiltins; + // If the set union produces a new set... + if (set_union(CallerSubgroups, FSubgroups)) { + // ... we might have previously visited CallerF when it had fewer + // registered uses of sub-groups. Thus we need to stick it back on + // the worklist to propagate these to its users. + Worklist.insert(CallerF); + } + } + } + } + } +} + +bool GlobalSubgroupInfo::usesSubgroups(const llvm::Function &F) const { + auto I = FunctionMap.find(&F); + assert(I != FunctionMap.end() && "Missing entry for function"); + return !I->second->UsedSubgroupBuiltins.empty(); +} + +std::optional +GlobalSubgroupInfo::isMuxSubgroupBuiltin(const Function *F) const { + if (!F) { + return std::nullopt; + } + auto SGBuiltin = BI.analyzeBuiltin(*F); + if (!SGBuiltin) { + return std::nullopt; + } + + switch (SGBuiltin->ID) { + default: + break; + case eMuxBuiltinSubGroupBarrier: + case eMuxBuiltinGetSubGroupSize: + case eMuxBuiltinGetMaxSubGroupSize: + case eMuxBuiltinGetNumSubGroups: + case eMuxBuiltinGetSubGroupId: + case eMuxBuiltinGetSubGroupLocalId: + return SGBuiltin; + } + + if (auto GroupOp = BI.isMuxGroupCollective(SGBuiltin->ID); + GroupOp && GroupOp->isSubGroupScope()) { + return SGBuiltin; + } + + return std::nullopt; +} + +AnalysisKey SubgroupAnalysis::Key; + +SubgroupAnalysis::Result SubgroupAnalysis::run(Module &M, + ModuleAnalysisManager &AM) { + return GlobalSubgroupInfo(M, AM.getResult(M)); +} + +PreservedAnalyses SubgroupAnalysisPrinterPass::run(Module &M, + ModuleAnalysisManager &AM) { + const auto &Info = AM.getResult(M); + + for (auto &F : M) { + if (F.isDeclaration()) { + continue; + } + OS << "Function '" << F.getName() << "' uses"; + if (!Info.usesSubgroups(F)) { + OS << " no sub-group builtins\n"; + continue; + } + auto *FInfo = Info[&F]; + assert(FInfo && "Missing function info"); + const auto &UsedBuiltins = FInfo->UsedSubgroupBuiltins; + // Note: this output isn't stable and shouldn't be relied upon. It's mostly + // for developer analysis. + OS << " " << UsedBuiltins.size() << " sub-group builtin" + << (UsedBuiltins.size() == 1 ? 
"" : "s") << ": " + << static_cast(*UsedBuiltins.begin()); + for (auto B : + make_range(std::next(UsedBuiltins.begin()), UsedBuiltins.end())) { + OS << "," << static_cast(B); + } + OS << "\n"; + } + + return PreservedAnalyses::all(); +} +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp new file mode 100644 index 0000000000000..1b6f0de967602 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp @@ -0,0 +1,103 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include + +using namespace compiler::utils; +using namespace llvm; + +namespace compiler { +namespace utils { +namespace tgtext { + +Type *getEventTy(LLVMContext &Ctx) { + return TargetExtType::get(Ctx, "spirv.Event"); +} + +Type *getSamplerTy(LLVMContext &Ctx) { + return TargetExtType::get(Ctx, "spirv.Sampler"); +} + +[[maybe_unused]] static Type * +getImageTyHelper(LLVMContext &Ctx, ImageTyDimensionalityParam Dim, + ImageTyDepthParam Depth, ImageTyArrayedParam Arrayed, + ImageTyMSParam MS, ImageTySampledParam Sampled, + ImageTyAccessQualParam AccessQual) { + unsigned IntParams[7]; + IntParams[ImageTyDimensionalityIdx] = Dim; + IntParams[ImageTyDepthIdx] = Depth; + IntParams[ImageTyArrayedIdx] = Arrayed; + IntParams[ImageTyMSIdx] = MS; + IntParams[ImageTySampledIdx] = Sampled; + IntParams[ImageTyFormatIdx] = /*Unknown*/ 0; + IntParams[ImageTyAccessQualIdx] = AccessQual; + return TargetExtType::get(Ctx, "spirv.Image", Type::getVoidTy(Ctx), + IntParams); +} + +[[maybe_unused]] static Type * +getOpenCLImageTyHelper(LLVMContext &Ctx, ImageTyDimensionalityParam Dim, + ImageTyArrayedParam Arrayed, ImageTyDepthParam Depth, + ImageTyMSParam MS, ImageTyAccessQualParam AccessQual) { + return getImageTyHelper(Ctx, Dim, Depth, Arrayed, MS, ImageSampledRuntime, + AccessQual); +} + +[[maybe_unused]] static Type * +getOpenCLImageTyHelper(LLVMContext &Ctx, ImageTyDimensionalityParam Dim, + ImageTyArrayedParam Arrayed, + ImageTyAccessQualParam AccessQual) { + return getOpenCLImageTyHelper(Ctx, Dim, Arrayed, ImageDepthNone, + ImageMSSingleSampled, AccessQual); +} + +Type *getImage1DTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) { + return getOpenCLImageTyHelper(Ctx, ImageDim1D, ImageNonArrayed, AccessQual); +} + +Type *getImage1DArrayTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) { + return getOpenCLImageTyHelper(Ctx, ImageDim1D, ImageArrayed, AccessQual); +} + +Type *getImage1DBufferTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) { + return getOpenCLImageTyHelper(Ctx, ImageDimBuffer, ImageNonArrayed, + AccessQual); +} + +Type *getImage2DTy(LLVMContext &Ctx, bool 
Depth, bool MS, + ImageTyAccessQualParam AccessQual) { + return getOpenCLImageTyHelper( + Ctx, ImageDim2D, ImageNonArrayed, Depth ? ImageDepth : ImageDepthNone, + MS ? ImageMSMultiSampled : ImageMSSingleSampled, AccessQual); +} + +Type *getImage2DArrayTy(LLVMContext &Ctx, bool Depth, bool MS, + ImageTyAccessQualParam AccessQual) { + return getOpenCLImageTyHelper( + Ctx, ImageDim2D, ImageArrayed, Depth ? ImageDepth : ImageDepthNone, + MS ? ImageMSMultiSampled : ImageMSSingleSampled, AccessQual); +} + +Type *getImage3DTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) { + return getOpenCLImageTyHelper(Ctx, ImageDim3D, ImageNonArrayed, AccessQual); +} + +} // namespace tgtext +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp new file mode 100644 index 0000000000000..dafbd1484f3c8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp @@ -0,0 +1,284 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Defines the RenameStructsPass. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace compiler::utils; +using namespace llvm; + +/// @brief Indicates whether a function needs to be cloned. +/// +/// There are a few ways the undesirable types can exist in a function: +/// * As a return type. +/// * As a parameter type. +/// * As a call to a function returning undesirable type. +/// * The result of an alloca. +/// * Result of a cast of some type. +/// * Reference to a global. +/// +/// @param[in] StructTypeRemapper Map from suffixed opaque structs to +/// unsuffixed opaque structs. +/// @param[in] Function function to be checked for cloning. +/// +/// @return Whether function should be cloned. +/// @retval true if function should be cloned. +/// @retval false otherwise. +static bool shouldClone(compiler::utils::StructTypeRemapper &StructTypeRemapper, + const Function &Func) { + // First check the return type. + if (StructTypeRemapper.isRemapped(Func.getReturnType())) { + return true; + } + + // Then the arguments. + for (const Argument &Arg : Func.args()) { + if (StructTypeRemapper.isRemapped(Arg.getType())) { + return true; + } + } + + // Now look for specific instructions that could introduce the type. + for (auto &BB : Func) { + for (auto &I : BB) { + // We can catch any instruction that produces an undesirable type by + // just checking its type. + if (StructTypeRemapper.isRemapped(I.getType())) { + return true; + } + } + } + + // TODO: Check globals. + + // If an instruction makes use of a type but + // isn't of that type e.g. 
a cast, it will necessarily be caught by + // the above case, since it is a use of something which produced that + // type. + + // If we've got here, we've checked all the cases, so no need to clone. + return false; +} + +/// @brief Constructs a map of suffixed opaque structure types to their +/// unsuffixed versions. +/// +/// If a module references opaque structs that have identical names up to a +/// suffix within the context, e.g. opencl.event_t and opencl.event_t.0, this +/// function will return a map mapping the suffixed versions to the unsuffixed +/// versions e.g. map[opencl.event_t.0] = opencl.event_t. +/// +/// @param module Module referencing the types in the context. +/// +/// @return The map of suffixed structures to the unsuffixed structures. +static compiler::utils::StructMap +uniqueOpaqueSuffixedStructs(llvm::Module &module) { + StructMap map; + for (auto *structTy : module.getIdentifiedStructTypes()) { + if (!structTy->isOpaque()) { + continue; + } + + // Look up each struct in the module by name. + auto structName = structTy->getName(); + const char *Suffix = ".0123456789"; + + // Check whether there is a type in the context with the same name minus a + // suffix. + if (auto *ctxStructTy = llvm::StructType::getTypeByName( + module.getContext(), structName.rtrim(Suffix))) { + // Make sure it is also opaque. + if (!ctxStructTy->isOpaque()) { + continue; + } + + // If it isn't the same type as the first, map the suffixed + // type to the unsuffixed type. + if (ctxStructTy != structTy) { + map[structTy] = ctxStructTy; + } + } + } + return map; +} + +/// @brief Populates list of functions that need to be cloned. +/// +/// @param[in] Module module containing the functions to be inspected. +/// @param[in] StructTypeRemapper Map from suffixed opaque structs to +/// unsuffixed opaque structs. +/// @param[out] WorkList vector of functions that need to be processed. +static void +populateWorkList(Module &Module, + compiler::utils::StructTypeRemapper &StructTypeRemapper, + SmallVectorImpl<Function *> &WorkList) { + for (auto &Function : Module) { + // We don't need to touch intrinsics. + if (Function.isIntrinsic()) { + continue; + } + + // Check the function for undesirable types. + if (shouldClone(StructTypeRemapper, Function)) { + WorkList.push_back(&Function); + } + } +} + +static void removeOldFunctions(const SmallVectorImpl<Function *> &OldFuncs) { + // First we have to delete the bodies of the functions, otherwise we will + // get issues about uses missing their defs. + for (auto &OldFunc : OldFuncs) { + OldFunc->deleteBody(); + } + + // Now we can delete the actual functions. + for (auto &OldFunc : OldFuncs) { + OldFunc->eraseFromParent(); + } +} + +/// @brief Clones a list of functions, updating types within each function. +/// +/// Clones a list of functions updating the types of any instances of the +/// undesirable types according to the map that was passed to this pass. A new +/// call graph is constructed and the old functions' names are taken by the +/// new functions. +/// +/// @param[in] StructTypeRemapper Map from suffixed opaque structs to +/// unsuffixed opaque structs. +/// @param[in] OldFuncs list of functions to clone and update. +static void +replaceRemappedTypeRefs(compiler::utils::StructTypeRemapper &StructTypeRemapper, + const SmallVectorImpl<Function *> &OldFuncs) { + // Maps the old functions to their new versions with updated types.
+ // Note: it is important we do this before cloning to catch the case that + // functions A and B both need updating, but function A calls function B and + // A is processed before B, otherwise function calls won't be updated during + // the clone. + SmallDenseMap<Function *, Function *> FFMap; + for (auto &OldFunc : OldFuncs) { + auto *OldFuncTy = OldFunc->getFunctionType(); + // First map the return type. + auto *RetTy = StructTypeRemapper.remapType(OldFuncTy->getReturnType()); + + // Then map the parameter types. + SmallVector<Type *> ParamTys; + for (auto ParamTy : OldFuncTy->params()) { + ParamTys.push_back(StructTypeRemapper.remapType(ParamTy)); + } + + // Create the new function with updated types. + auto *NewFuncTy = FunctionType::get(RetTy, ParamTys, OldFuncTy->isVarArg()); + auto *NewFunc = Function::Create(NewFuncTy, OldFunc->getLinkage(), "", + OldFunc->getParent()); + NewFunc->setCallingConv(OldFunc->getCallingConv()); + + FFMap[OldFunc] = NewFunc; + } + + // Here we actually do the cloning. + for (auto &OldFunc : OldFuncs) { + // We construct a new value map on each iteration to avoid entries in the + // value map potentially being overwritten during cloning, which would then + // be used by subsequent loop iterations. + ValueToValueMapTy ValueMap; + for (auto &pair : FFMap) { + ValueMap[pair.getFirst()] = pair.getSecond(); + } + auto *NewFunc = FFMap[OldFunc]; + auto NewArgIterator = NewFunc->arg_begin(); + for (llvm::Argument &Arg : OldFunc->args()) { + NewArgIterator->setName(Arg.getName()); + ValueMap[&Arg] = &*(NewArgIterator++); + } + NewFunc->takeName(OldFunc); + + if (OldFunc->isDeclaration()) { + // Everything that follows requires a body. + continue; + } + + SmallVector<ReturnInst *> Returns; + CloneFunctionInto(NewFunc, OldFunc, ValueMap, + CloneFunctionChangeType::GlobalChanges, Returns, "", + /* CodeInfo */ nullptr, &StructTypeRemapper); + Returns.clear(); + + // It's possible we still have references to the old types in our + // new function; this can happen via allocas and casts as well as + // references to global variables. + for (auto &BB : *NewFunc) { + for (auto &I : BB) { + // Anything that defines an undesirable instance will get caught + // here. + I.mutateType(StructTypeRemapper.remapType(I.getType())); + + // GEP instructions need to be handled separately. + if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { + if (StructTypeRemapper.isRemapped(GEP->getSourceElementType())) { + GEP->setSourceElementType( + StructTypeRemapper.remapType(GEP->getSourceElementType())); + } + } + } + } + } + + // We can now remove any of the misnamed types and any functions that used + // them. + removeOldFunctions(OldFuncs); +} + +namespace compiler { +namespace utils { +PreservedAnalyses UniqueOpaqueStructsPass::run(Module &Module, + ModuleAnalysisManager &) { + // Find the opaque types in the module that have suffixes and map them to + // their unsuffixed versions. + auto StructMap = uniqueOpaqueSuffixedStructs(Module); + StructTypeRemapper StructTypeRemapper(StructMap); + + // Build the list of functions we need to process. + SmallVector<Function *> WorkList; + populateWorkList(Module, StructTypeRemapper, WorkList); + + // If the set is empty we have no work and can exit early. + if (WorkList.empty()) { + return PreservedAnalyses::all(); + } + + // Otherwise, clone the functions, updating the types. + replaceRemappedTypeRefs(StructTypeRemapper, WorkList); + + // We definitely cloned something by this point, so the module has been + // modified.
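+  // Conservatively report that no analyses are preserved: functions have been recreated and types rewritten.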
+ return PreservedAnalyses::none(); +} +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp new file mode 100644 index 0000000000000..4569df09a5495 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp @@ -0,0 +1,1927 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "work-item-loops" + +namespace compiler { +namespace utils { + +/// @brief A subclass of the generic Barrier which is used by the +/// WorkItemLoopsPass. +/// +/// It adds additional fields used when creating wrapper kernels. +class BarrierWithLiveVars : public Barrier { +public: + BarrierWithLiveVars(llvm::Module &m, llvm::Function &f, + VectorizationInfo vf_info, bool IsDebug) + : Barrier(m, f, IsDebug), vf_info(vf_info) {} + + VectorizationInfo getVFInfo() const { return vf_info; } + + AllocaInst *getMemSpace() const { return mem_space; } + void setMemSpace(AllocaInst *ai) { mem_space = ai; } + + void setSize0(Value *v) { size0 = v; } + Value *getSize0() const { return size0; } + + void setTotalSize(Value *v) { totalSize = v; } + Value *getTotalSize() const { return totalSize; } + + Value *getStructSize() const { return structSize; } + void setStructSize(Value *v) { structSize = v; } + + AllocaInst *getDebugAddr() const { return debug_addr; } + void setDebugAddr(AllocaInst *ai) { debug_addr = ai; } + +private: + VectorizationInfo vf_info; + + // Alloca representing the memory for the live variables for a given kernel, + // with enough space for each individual work-item in a work-group to have + // its own view. + // + // This is typically used to hold Z*Y*(X/vec_width) individual instances of + // the live-variables structure. + AllocaInst *mem_space = nullptr; + + // Alloca holding the address of the live vars struct for the + // currently executing work item. 
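+  // The debug declares recreated by the pass dereference this alloca, so a debugger can find the current item's copy of each source variable inside the live-variables struct.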
+ AllocaInst *debug_addr = nullptr; + + // The number of items along the primary dimension + Value *size0 = nullptr; + + // The total number of items + Value *totalSize = nullptr; + + /// @brief The size of the struct in bytes, if the barrier contains + /// scalables + Value *structSize = nullptr; +}; + +} // namespace utils +} // namespace compiler + +namespace { + +struct ScheduleGenerator { + ScheduleGenerator(Module &m, + const compiler::utils::BarrierWithLiveVars &barrierMain, + const compiler::utils::BarrierWithLiveVars *barrierTail, + compiler::utils::BuiltinInfo &BI) + : module(m), context(m.getContext()), barrierMain(barrierMain), + barrierTail(barrierTail), BI(BI), i32Ty(Type::getInt32Ty(context)) { + set_local_id = + BI.getOrDeclareMuxBuiltin(compiler::utils::eMuxBuiltinSetLocalId, m); + set_subgroup_id = + BI.getOrDeclareMuxBuiltin(compiler::utils::eMuxBuiltinSetSubGroupId, m); + assert(set_local_id && set_subgroup_id && "Missing mux builtins"); + } + Module &module; + LLVMContext &context; + const compiler::utils::BarrierWithLiveVars &barrierMain; + const compiler::utils::BarrierWithLiveVars *barrierTail; + compiler::utils::BuiltinInfo &BI; + + SmallVector args; + Function *set_local_id = nullptr; + Function *set_subgroup_id = nullptr; + Type *const i32Ty; + + uint32_t workItemDim0 = 0; + uint32_t workItemDim1 = 1; + uint32_t workItemDim2 = 2; + Value *localSizeDim[3]; + + AllocaInst *nextID = nullptr; + Value *mainLoopLimit = nullptr; + Value *peel = nullptr; + bool noExplicitSubgroups = false; + bool emitTail = true; + bool wrapperHasMain = false; + bool wrapperHasTail = false; + + DILocation *wrapperDbgLoc = nullptr; + + Value * + createLinearLiveVarsPtr(const compiler::utils::BarrierWithLiveVars &barrier, + IRBuilder<> &ir, Value *index) { + Value *const mem_space = barrier.getMemSpace(); + if (!mem_space) { + return nullptr; + } + + // Calculate the offset for where the live variables of the current + // work item (within the nested loops) are stored. + // Loop i,j,k --> ((i * dim1) + j) * size0 + k + // memory access pattern should not depend on the vectorization + // dimension + + Value *live_var_ptr; + if (!barrier.getStructSize()) { + Value *const live_var_mem_idxs[] = {index}; + live_var_ptr = ir.CreateInBoundsGEP(barrier.getLiveVarsType(), mem_space, + live_var_mem_idxs); + } else { + // index into the byte buffer + auto *const byteOffset = ir.CreateMul(index, barrier.getStructSize()); + Value *const live_var_mem_idxs[] = {byteOffset}; + live_var_ptr = + ir.CreateInBoundsGEP(ir.getInt8Ty(), mem_space, live_var_mem_idxs); + } + + return live_var_ptr; + } + + Value *createLiveVarsPtr(const compiler::utils::BarrierWithLiveVars &barrier, + IRBuilder<> &ir, Value *dim_0, Value *dim_1, + Value *dim_2, Value *VF = nullptr) { + Value *const mem_space = barrier.getMemSpace(); + if (!mem_space) { + return nullptr; + } + + // Calculate the offset for where the live variables of the current + // work item (within the nested loops) are stored. + // Loop i,j,k --> ((i * dim1) + j) * size0 + k + // memory access pattern should not depend on the vectorization + // dimension + auto *const i_offset = ir.CreateMul(dim_2, localSizeDim[workItemDim1]); + auto *const j_offset = + ir.CreateMul(ir.CreateAdd(i_offset, dim_1), barrier.getSize0()); + auto *const k_offset = VF ? 
ir.CreateUDiv(dim_0, VF) : dim_0; + auto *const offset = ir.CreateAdd(j_offset, k_offset); + + return createLinearLiveVarsPtr(barrier, ir, offset); + } + + void + recreateDebugIntrinsics(const compiler::utils::BarrierWithLiveVars &barrier, + BasicBlock *block, StoreInst *SI) { + DIBuilder DIB(module, /*AllowUnresolved*/ false); + auto RecreateDebugIntrinsic = [&](DILocalVariable *const old_var, + const unsigned live_var_offset) { + const uint64_t dwPlusOp = dwarf::DW_OP_plus_uconst; + // Use a DWARF expression to point to byte offset in struct where + // the variable lives. This involves dereferencing the pointer + // stored in `live_vars_debug_addr` to get the start of the live + // vars struct, then using a byte offset into the struct for the + // particular variable. + auto expr = DIB.createExpression( + ArrayRef{dwarf::DW_OP_deref, dwPlusOp, live_var_offset}); + // Remap this debug variable to its new scope. + auto *new_var = DIB.createAutoVariable( + block->getParent()->getSubprogram(), old_var->getName(), + old_var->getFile(), old_var->getLine(), old_var->getType(), + /*AlwaysPreserve=*/false, DINode::FlagZero, + old_var->getAlignInBits()); + + // Create intrinsic + +#if LLVM_VERSION_LESS(21, 0) + assert(module.IsNewDbgInfoFormat && + "Modules should be using the new debug info format"); +#endif + auto *const DVR = + static_cast(cast(DIB.insertDeclare( + barrier.getDebugAddr(), new_var, expr, wrapperDbgLoc, block))); + + // This is nasty, but LLVM errors out on trailing debug info, we need a + // subsequent instruction even if we delete it immediately afterwards. + auto *DummyInst = new UnreachableInst(module.getContext(), block); + + // Bit of a HACK to produce the same debug output as the Mem2Reg + // pass used to do. + ConvertDebugDeclareToDebugValue(DVR, SI, DIB); + + DummyInst->eraseFromParent(); + }; + for (auto debug_pair : barrier.getDebugDbgVariableRecords()) { + RecreateDebugIntrinsic(debug_pair.first->getVariable(), + debug_pair.second); + } + } + + void + createWorkItemLoopBody(const compiler::utils::BarrierWithLiveVars &barrier, + IRBuilder<> &ir, BasicBlock *block, unsigned i, + Value *dim_0, Value *dim_1, Value *dim_2, + Value *accumulator = nullptr, Value *VF = nullptr, + Value *offset = nullptr) { + auto new_kernel_args = args; + if (accumulator) { + new_kernel_args.push_back(accumulator); + } + + // If the work item ID is a nullptr we take it to mean this barrier region + // doesn't need to use the barrier struct. + if (dim_0) { + assert(dim_1 && dim_2 && "unexpected null Work item IDs"); + + // set our local id + auto *const local_id = offset ? ir.CreateAdd(offset, dim_0) : dim_0; + ir.CreateCall(set_local_id, + {ConstantInt::get(i32Ty, workItemDim0), local_id}) + ->setCallingConv(set_local_id->getCallingConv()); + + auto *const live_var_ptr = + createLiveVarsPtr(barrier, ir, dim_0, dim_1, dim_2, VF); + if (live_var_ptr) { + new_kernel_args.push_back(live_var_ptr); + + if (auto *debug_addr = barrier.getDebugAddr()) { + // Update the alloca holding the address of the live vars struct for + // currently executing work item. + auto *const live_var_ptr_cast = + ir.CreatePointerBitCastOrAddrSpaceCast( + live_var_ptr, debug_addr->getAllocatedType()); + auto *const SI = ir.CreateStore(live_var_ptr_cast, debug_addr); + + // Recreate all the debug intrinsics pointing at location in live + // variables struct. We only need to do this once before the first + // barrier. 
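+          // All the declares reference the single debug_addr alloca, and the store above refreshes its contents per work-item, so later barrier regions need no further declares.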
+ if (i == compiler::utils::kBarrier_FirstID) { + recreateDebugIntrinsics(barrier, block, SI); + } + } + } + } + + auto &subkernel = *barrier.getSubkernel(i); + + // call the original function now we've setup all the info! + CallInst *ci = ir.CreateCall(&subkernel, new_kernel_args); + // add a debug location for this call so that later inlining correctly + // updates the debug metadata of all inlined instructions. + if (wrapperDbgLoc) { + ci->setDebugLoc(wrapperDbgLoc); + } + ci->setCallingConv(subkernel.getCallingConv()); + ci->setAttributes(compiler::utils::getCopiedFunctionAttrs(subkernel)); + + // And update the location of where we need to go to next (if we need to) + const auto &successors = barrier.getSuccessorIds(i); + if (successors.size() > 1) { + ir.CreateStore(ci, nextID); + } + } + + // Create a 1D loop to execute all the work items in a 'barrier', reducing + // across an accumulator. + std::pair + makeReductionLoop(const compiler::utils::BarrierWithLiveVars &barrier, + const compiler::utils::GroupCollective &WGC, + BasicBlock *block, Value *op, Value *accumulator) { + auto *const accTy = accumulator->getType(); + Function *const func = block->getParent(); + + // Induction variables + auto *const totalSize = barrier.getTotalSize(); + + compiler::utils::CreateLoopOpts inner_opts; + inner_opts.IVs = {accumulator}; + inner_opts.disableVectorize = true; + + BasicBlock *preheader = block; + BasicBlock *exitBlock = nullptr; + PHINode *resultPhi = nullptr; + + auto *const zero = + Constant::getNullValue(compiler::utils::getSizeType(module)); + + if (auto *const loopLimitConst = dyn_cast(totalSize)) { + if (loopLimitConst->isZeroValue()) { + // No iterations at all! + return {block, accumulator}; + } + preheader = block; + } else { + preheader = + BasicBlock::Create(context, "ca_work_group_reduce_preheader", func); + + exitBlock = + BasicBlock::Create(context, "ca_work_group_reduce_exit", func); + preheader->moveAfter(block); + exitBlock->moveAfter(preheader); + + auto *const needLoop = CmpInst::Create( + Instruction::ICmp, CmpInst::ICMP_NE, zero, totalSize, "", block); + + BranchInst::Create(preheader, exitBlock, needLoop, block); + + resultPhi = PHINode::Create(accTy, 2, "WGC_reduce", exitBlock); + resultPhi->addIncoming(accumulator, block); + } + + BasicBlock *latchBlock = nullptr; + + // linearly looping through the work items + exitBlock = compiler::utils::createLoop( + preheader, exitBlock, zero, totalSize, inner_opts, + [&](BasicBlock *block, Value *index, ArrayRef ivs, + MutableArrayRef ivsNext) -> BasicBlock * { + IRBuilder<> ir(block); + auto *const liveVars = createLinearLiveVarsPtr(barrier, ir, index); + compiler::utils::Barrier::LiveValuesHelper live_values(barrier, block, + liveVars); + + IRBuilder<> ir_load(block); + auto *const itemOp = + live_values.getReload(op, ir_load, "_load", /*reuse*/ true); + + // Do the reduction here.. 
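+          // For example, an integer add reduction emits acc.next = add(acc, item): the running value arrives in ivs[0] and is fed back through ivsNext[0].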
+ accumulator = compiler::utils::createBinOpForRecurKind( + ir, ivs[0], itemOp, WGC.Recurrence); + ivsNext[0] = accumulator; + latchBlock = block; + + return block; + }); + + if (!resultPhi) { + assert(exitBlock != latchBlock && "createLoop didn't create a loop"); + resultPhi = PHINode::Create(accTy, 1, "WGC_reduce", exitBlock); + } + resultPhi->addIncoming(accumulator, latchBlock); + return {exitBlock, resultPhi}; + } + + void getUniformValues(BasicBlock *block, + const compiler::utils::BarrierWithLiveVars &barrier, + MutableArrayRef values) { + auto *const zero = + Constant::getNullValue(compiler::utils::getSizeType(module)); + IRBuilder<> ir(block); + auto *const barrier0 = ir.CreateInBoundsGEP(barrier.getLiveVarsType(), + barrier.getMemSpace(), {zero}); + compiler::utils::Barrier::LiveValuesHelper live_values(barrier, block, + barrier0); + for (auto &value : values) { + value = live_values.getReload(value, ir, "_load", true); + } + } + + std::optional + getBarrierGroupCollective(const compiler::utils::BarrierWithLiveVars &Barrier, + unsigned BarrierID) { + auto *const BarrierCall = Barrier.getBarrierCall(BarrierID); + if (!BarrierCall) { + return std::nullopt; + } + + auto Builtin = BI.analyzeBuiltin(*BarrierCall->getCalledFunction()); + assert(Builtin && "Barrier call must be a known builtin"); + return BI.isMuxGroupCollective(Builtin->ID); + } + + std::tuple> + makeWorkGroupCollectiveLoops(BasicBlock *block, unsigned barrierID) { + auto *const groupCall = barrierMain.getBarrierCall(barrierID); + if (!groupCall) { + return {block, nullptr, std::nullopt}; + } + + auto Info = getBarrierGroupCollective(barrierMain, barrierID); + if (!Info || !Info->isWorkGroupScope()) { + return {block, nullptr, std::nullopt}; + } + + switch (Info->Op) { + case compiler::utils::GroupCollective::OpKind::Reduction: + case compiler::utils::GroupCollective::OpKind::All: + case compiler::utils::GroupCollective::OpKind::Any: { + auto *const ty = groupCall->getType(); + auto *const accumulator = + compiler::utils::getNeutralVal(Info->Recurrence, ty); + auto [loop_exit_block, accum] = makeReductionLoop( + barrierMain, *Info, block, groupCall->getOperand(1), accumulator); + if (barrierTail) { + auto *const groupTailInst = barrierTail->getBarrierCall(barrierID); + std::tie(loop_exit_block, accum) = + makeReductionLoop(*barrierTail, *Info, loop_exit_block, + groupTailInst->getOperand(1), accum); + } + if (groupCall->hasName()) { + accum->takeName(groupCall); + } + return std::make_tuple(loop_exit_block, accum, Info); + } + case compiler::utils::GroupCollective::OpKind::ScanInclusive: + case compiler::utils::GroupCollective::OpKind::ScanExclusive: { + auto *const ty = groupCall->getType(); + auto *const accumulator = + compiler::utils::getIdentityVal(Info->Recurrence, ty); + return {block, accumulator, Info}; + } + case compiler::utils::GroupCollective::OpKind::Broadcast: { + // First we need to get the item ID values from the barrier struct. + // These should be uniform but they may still be variables. It should + // be safe to get them from the barrier struct at index zero. + auto *const zero = + Constant::getNullValue(compiler::utils::getSizeType(module)); + + Function *const func = block->getParent(); + BasicBlock *mainUniformBlock = block; + BasicBlock *tailUniformBlock = nullptr; + + auto *const totalSize = barrierMain.getTotalSize(); + if (auto *const loopLimitConst = dyn_cast(totalSize)) { + // If we know for a fact that the main struct has at least one item, + // we can just use that. 
Otherwise, we need to use the tail struct. + if (loopLimitConst->isZeroValue()) { + mainUniformBlock = nullptr; + if (barrierTail) { + tailUniformBlock = block; + } + } + } else if (barrierTail) { + // If we have a variable number of main items, it could be zero at + // runtime, so we need an alternative way to get the values. + mainUniformBlock = + BasicBlock::Create(context, "ca_main_uniform_load", func); + tailUniformBlock = + BasicBlock::Create(context, "ca_tail_uniform_load", func); + + auto *const needTail = CmpInst::Create( + Instruction::ICmp, CmpInst::ICMP_EQ, totalSize, zero, "", block); + BranchInst::Create(tailUniformBlock, mainUniformBlock, needTail, block); + } + + if (!mainUniformBlock && !tailUniformBlock) { + return {block, nullptr, std::nullopt}; + } + + Value *idsMain[] = {zero, zero, zero}; + Value *idsTail[] = {zero, zero, zero}; + if (mainUniformBlock) { + idsMain[0] = groupCall->getOperand(2); + idsMain[1] = groupCall->getOperand(3); + idsMain[2] = groupCall->getOperand(4); + getUniformValues(mainUniformBlock, barrierMain, idsMain); + } + + if (tailUniformBlock) { + auto *const tailGroupCall = barrierTail->getBarrierCall(barrierID); + assert(tailGroupCall && + "No corresponding work group broadcast in tail kernel"); + idsTail[0] = tailGroupCall->getOperand(2); + idsTail[1] = tailGroupCall->getOperand(3); + idsTail[2] = tailGroupCall->getOperand(4); + getUniformValues(tailUniformBlock, *barrierTail, idsTail); + + if (mainUniformBlock) { + // If both barrier structs had to be used, we need to merge the + // result. + block = BasicBlock::Create(context, "ca_merge_uniform_load", func); + BranchInst::Create(block, tailUniformBlock); + BranchInst::Create(block, mainUniformBlock); + + for (size_t i = 0; i != 3; ++i) { + auto *mergePhi = PHINode::Create(idsMain[i]->getType(), 2, + "uniform_merge", block); + mergePhi->addIncoming(idsMain[i], mainUniformBlock); + mergePhi->addIncoming(idsTail[i], tailUniformBlock); + idsMain[i] = mergePhi; + } + } else { + // Otherwise we can use the tail. + for (size_t i = 0; i != 3; ++i) { + idsMain[i] = idsTail[i]; + } + } + } + + IRBuilder<> ir(block); + auto *const op = groupCall->getOperand(1); + + // Compute the address of the value in the main barrier struct + auto *const VF = ir.CreateElementCount( + compiler::utils::getSizeType(module), barrierMain.getVFInfo().vf); + auto *const liveVars = createLiveVarsPtr(barrierMain, ir, idsMain[0], + idsMain[1], idsMain[2], VF); + compiler::utils::Barrier::LiveValuesHelper live_values(barrierMain, block, + liveVars); + auto *const GEPmain = live_values.getGEP(op); + assert(GEPmain && "Could not get broadcasted value"); + + if (barrierTail) { + const bool VP = barrierTail->getVFInfo().IsVectorPredicated; + + // Compute the address of the value in the tail barrier struct + auto *const offsetDim0 = ir.CreateSub(idsMain[0], mainLoopLimit); + auto *const liveVarsTail = + createLiveVarsPtr(*barrierTail, ir, offsetDim0, idsMain[1], + idsMain[2], VP ? 
VF : nullptr); + compiler::utils::Barrier::LiveValuesHelper live_values( + *barrierTail, block, liveVarsTail); + + auto *const opTail = + barrierTail->getBarrierCall(barrierID)->getOperand(1); + auto *const GEPtail = live_values.getGEP(opTail); + assert(GEPtail && "Could not get tail-broadcasted value"); + + // Select the main GEP or the tail GEP to load from + auto *const cond = ir.CreateICmpUGE(idsMain[0], mainLoopLimit); + + auto *const select = ir.CreateSelect(cond, GEPtail, GEPmain); + + auto *const result = ir.CreateLoad(op->getType(), select); + result->takeName(groupCall); + + return {block, result, Info}; + } else { + auto *const result = ir.CreateLoad(op->getType(), GEPmain); + result->takeName(groupCall); + return {block, result, Info}; + } + } + default: + break; + } + return {block, nullptr, std::nullopt}; + } + + // Create loops to execute all the main work items, and then all the + // left-over tail work items at the end. + BasicBlock *makeWorkItemLoops(BasicBlock *block, unsigned barrierID) { + Value *accum = nullptr; + std::optional collective; + std::tie(block, accum, collective) = + makeWorkGroupCollectiveLoops(block, barrierID); + + // Work-group scans should be using linear work-item loops. + assert((!collective || !collective->isScan()) && "No support for scans"); + + auto *const zero = + Constant::getNullValue(compiler::utils::getSizeType(module)); + auto *const i32Zero = Constant::getNullValue(i32Ty); + auto *const func = block->getParent(); + + // The subgroup induction variable, set to the value of the subgroup ID at + // the end of the last loop (i.e. beginning of the next loop) + Value *nextSubgroupIV = i32Zero; + + // looping through num groups in the first (innermost) + // dimension + BasicBlock *mainPreheaderBB = block; + BasicBlock *mainExitBB = nullptr; + + // We need to ensure any subgroup IV is defined on the path in which + // the vector loop is skipped. + PHINode *subgroupMergePhi = nullptr; + + // If we are emitting a tail, we might need to bypass the vector loop (if + // the local size is less than the vector width). + if (emitTail) { + if (auto *const loopLimitConst = dyn_cast(mainLoopLimit)) { + if (loopLimitConst->isZeroValue()) { + // No vector iterations at all! 
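+            // The scalar tail loops below will cover the entire local range instead.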
+ mainPreheaderBB = nullptr; + mainExitBB = block; + } + } else { + mainPreheaderBB = BasicBlock::Create( + context, "ca_work_item_x_vector_preheader", func); + + mainExitBB = + BasicBlock::Create(context, "ca_work_item_x_vector_exit", func); + mainPreheaderBB->moveAfter(block); + mainExitBB->moveAfter(mainPreheaderBB); + + if (!noExplicitSubgroups) { + subgroupMergePhi = PHINode::Create(i32Ty, 2, "", mainExitBB); + subgroupMergePhi->addIncoming(i32Zero, block); + } + + auto *const needMain = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, zero, + mainLoopLimit, "", block); + + BranchInst::Create(mainPreheaderBB, mainExitBB, needMain, block); + } + } + + assert((mainPreheaderBB || !wrapperHasMain) && + "Vector loops in one barrier block but not another?"); + + if (mainPreheaderBB) { + wrapperHasMain = true; + // Subgroup induction variables + compiler::utils::CreateLoopOpts outer_opts; + if (!noExplicitSubgroups) { + outer_opts.IVs = {i32Zero}; + } + + // looping through num groups in the third (outermost) dimension + mainExitBB = compiler::utils::createLoop( + mainPreheaderBB, mainExitBB, zero, localSizeDim[workItemDim2], + outer_opts, + [&](BasicBlock *block, Value *dim_2, ArrayRef ivs2, + MutableArrayRef ivsNext2) -> BasicBlock * { + // if we need to set the local id, do so here. + IRBuilder<> ir(block); + ir.CreateCall(set_local_id, + {ConstantInt::get(i32Ty, workItemDim2), dim_2}) + ->setCallingConv(set_local_id->getCallingConv()); + + compiler::utils::CreateLoopOpts middle_opts; + middle_opts.IVs = ivs2.vec(); + + // looping through num groups in the second dimension + BasicBlock *exit1 = compiler::utils::createLoop( + block, nullptr, zero, localSizeDim[workItemDim1], middle_opts, + [&](BasicBlock *block, Value *dim_1, ArrayRef ivs1, + MutableArrayRef ivsNext1) -> BasicBlock * { + IRBuilder<> ir(block); + ir.CreateCall(set_local_id, + {ConstantInt::get(i32Ty, workItemDim1), dim_1}) + ->setCallingConv(set_local_id->getCallingConv()); + + // Materialize the scale factor at the beginning of the + // preheader + IRBuilder<> irph(mainPreheaderBB, + mainPreheaderBB->getFirstInsertionPt()); + auto *VF = irph.CreateElementCount( + compiler::utils::getSizeType(module), + barrierMain.getVFInfo().vf); + + compiler::utils::CreateLoopOpts inner_opts; + inner_opts.indexInc = VF; + inner_opts.IVs = ivs1.vec(); + + BasicBlock *exit0 = compiler::utils::createLoop( + block, nullptr, zero, mainLoopLimit, inner_opts, + [&](BasicBlock *block, Value *dim_0, + ArrayRef ivs0, + MutableArrayRef ivsNext0) -> BasicBlock * { + IRBuilder<> ir(block); + + if (!noExplicitSubgroups) { + // set our subgroup id + ir.CreateCall(set_subgroup_id, {ivs0[0]}) + ->setCallingConv( + set_subgroup_id->getCallingConv()); + } + + createWorkItemLoopBody(barrierMain, ir, block, + barrierID, dim_0, dim_1, dim_2, + accum, VF); + + if (!noExplicitSubgroups) { + nextSubgroupIV = + ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1)); + ivsNext0[0] = nextSubgroupIV; + } + + return block; + }); + + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. + ivsNext1[0] = nextSubgroupIV; + } + + return exit0; + }); + + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. 
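+            // The sub-group ID is threaded through the y and z loop phis, so it keeps incrementing across the whole work-group rather than resetting per row.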
+ ivsNext2[0] = nextSubgroupIV; + + if (subgroupMergePhi) { + subgroupMergePhi->addIncoming(nextSubgroupIV, exit1); + } + } + + return exit1; + }); + } + + // looping through num groups in the first + // (innermost) dimension + BasicBlock *tailPreheaderBB = mainExitBB; + BasicBlock *tailExitBB = nullptr; + + if (emitTail && peel) { + // We might need to bypass the tail loop. + if (auto *const peelConst = dyn_cast(peel)) { + if (peelConst->isZeroValue()) { + // No tail iterations at all! + tailPreheaderBB = nullptr; + tailExitBB = mainExitBB; + } + } else { + tailPreheaderBB = BasicBlock::Create( + context, "ca_work_item_x_scalar_preheader", func); + + tailExitBB = + BasicBlock::Create(context, "ca_work_item_x_scalar_exit", func); + tailPreheaderBB->moveAfter(mainExitBB); + tailExitBB->moveAfter(tailPreheaderBB); + + auto *const needPeeling = CmpInst::Create( + Instruction::ICmp, CmpInst::ICMP_NE, zero, peel, "", mainExitBB); + + BranchInst::Create(tailPreheaderBB, tailExitBB, needPeeling, + mainExitBB); + } + } else { + tailPreheaderBB = nullptr; + tailExitBB = mainExitBB; + } + + assert((tailPreheaderBB || !wrapperHasTail) && + "Tail loops in one barrier block but not another?"); + + if (tailPreheaderBB) { + assert(barrierTail); + wrapperHasTail = true; + // Subgroup induction variables + compiler::utils::CreateLoopOpts outer_opts; + if (!noExplicitSubgroups) { + outer_opts.IVs = {subgroupMergePhi ? subgroupMergePhi : nextSubgroupIV}; + } + + // looping through num groups in the third (outermost) dimension + tailExitBB = compiler::utils::createLoop( + tailPreheaderBB, tailExitBB, zero, localSizeDim[workItemDim2], + outer_opts, + [&](BasicBlock *block, Value *dim_2, ArrayRef ivs2, + MutableArrayRef ivsNext2) -> BasicBlock * { + // set the local id + IRBuilder<> ir(block); + ir.CreateCall(set_local_id, + {ConstantInt::get(i32Ty, workItemDim2), dim_2}) + ->setCallingConv(set_local_id->getCallingConv()); + + compiler::utils::CreateLoopOpts middle_opts; + middle_opts.IVs = ivs2.vec(); + + // looping through num groups in the second dimension + BasicBlock *exit1 = compiler::utils::createLoop( + block, nullptr, zero, localSizeDim[workItemDim1], middle_opts, + [&](BasicBlock *block, Value *dim_1, ArrayRef ivs1, + MutableArrayRef ivsNext1) -> BasicBlock * { + IRBuilder<> ir(block); + ir.CreateCall(set_local_id, + {ConstantInt::get(i32Ty, workItemDim1), dim_1}) + ->setCallingConv(set_local_id->getCallingConv()); + + compiler::utils::CreateLoopOpts inner_opts; + inner_opts.IVs = ivs1.vec(); + inner_opts.disableVectorize = true; + + BasicBlock *exit0 = compiler::utils::createLoop( + block, nullptr, zero, peel, inner_opts, + [&](BasicBlock *block, Value *dim_0, + ArrayRef ivs0, + MutableArrayRef ivsNext0) -> BasicBlock * { + IRBuilder<> ir(block); + + if (!noExplicitSubgroups) { + // set our subgroup id + ir.CreateCall(set_subgroup_id, {ivs0[0]}) + ->setCallingConv( + set_subgroup_id->getCallingConv()); + } + + createWorkItemLoopBody( + *barrierTail, ir, block, barrierID, dim_0, dim_1, + dim_2, accum, /*VF*/ nullptr, mainLoopLimit); + + if (!noExplicitSubgroups) { + nextSubgroupIV = + ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1)); + ivsNext0[0] = nextSubgroupIV; + } + + return block; + }); + + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. + ivsNext1[0] = nextSubgroupIV; + } + + return exit0; + }); + + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. 
+ ivsNext2[0] = nextSubgroupIV; + } + + return exit1; + }); + } + return tailExitBB; + } + + // Create loops to execute all work items in local linear ID order. + BasicBlock *makeLinearWorkItemLoops(BasicBlock *block, unsigned barrierID) { + Value *accum = nullptr; + std::optional collective; + std::tie(block, accum, collective) = + makeWorkGroupCollectiveLoops(block, barrierID); + + bool isScan = collective && collective->isScan(); + bool isExclusiveScan = + isScan && collective->Op == + compiler::utils::GroupCollective::OpKind::ScanExclusive; + // The scan types can differ between 'main' and 'tail' kernels. + bool isTailExclusiveScan = false; + if (isScan && barrierTail) { + const auto tailInfo = getBarrierGroupCollective(*barrierTail, barrierID); + assert(tailInfo && "No corresponding work group scan in tail kernel"); + isTailExclusiveScan = + tailInfo->Op == + compiler::utils::GroupCollective::OpKind::ScanExclusive; + } + + auto *const zero = + Constant::getNullValue(compiler::utils::getSizeType(module)); + auto *const i32Zero = Constant::getNullValue(i32Ty); + auto *const func = block->getParent(); + + // The subgroup induction variable, set to the value of the subgroup ID at + // the end of the last loop (i.e. beginning of the next loop) + Value *nextSubgroupIV = noExplicitSubgroups ? nullptr : i32Zero; + + // The work-group scan induction variable, set to the current scan value at + // the end of the last loop (i.e. beginning of the next loop) + Value *nextScanIV = isScan ? accum : nullptr; + + // We need to ensure any subgroup IV is defined on the path in which + // the vector loop is skipped. + PHINode *subgroupMergePhi = nullptr; + // Same with the scan IV + PHINode *scanMergePhi = nullptr; + + compiler::utils::CreateLoopOpts outer_opts; + outer_opts.IVs = {nextSubgroupIV, nextScanIV}; + outer_opts.loopIVNames = {"sg.z", "scan.z"}; + + // looping through num groups in the third (outermost) dimension + return compiler::utils::createLoop( + block, nullptr, zero, localSizeDim[workItemDim2], outer_opts, + [&](BasicBlock *block, Value *dim_2, ArrayRef ivs2, + MutableArrayRef ivsNext2) -> BasicBlock * { + // set the local id + IRBuilder<> ir(block); + ir.CreateCall(set_local_id, + {ConstantInt::get(i32Ty, workItemDim2), dim_2}) + ->setCallingConv(set_local_id->getCallingConv()); + + compiler::utils::CreateLoopOpts middle_opts; + middle_opts.IVs = ivs2.vec(); + middle_opts.loopIVNames = {"sg.y", "scan.y"}; + + // looping through num groups in the second dimension + BasicBlock *exit1 = compiler::utils::createLoop( + block, nullptr, zero, localSizeDim[workItemDim1], middle_opts, + [&](BasicBlock *block, Value *dim_1, ArrayRef ivs1, + MutableArrayRef ivsNext1) -> BasicBlock * { + IRBuilder<> ir(block); + ir.CreateCall(set_local_id, + {ConstantInt::get(i32Ty, workItemDim1), dim_1}) + ->setCallingConv(set_local_id->getCallingConv()); + + // looping through num groups in the first (innermost) + // dimension + BasicBlock *mainPreheaderBB = block; + BasicBlock *mainExitBB = nullptr; + + // If we are emitting a tail, we might need to bypass the + // main loop (if the local size is less than the main loop + // width). + if (emitTail) { + if (auto *const loopLimitConst = + dyn_cast(mainLoopLimit)) { + if (loopLimitConst->isZeroValue()) { + // No main iterations at all! 
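+                        // Fall straight through to the tail loops, carrying the incoming sub-group and scan IVs forward unchanged.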
+ mainPreheaderBB = nullptr; + mainExitBB = block; + if (!noExplicitSubgroups) { + nextSubgroupIV = ivs1[0]; + } + if (isScan) { + nextScanIV = ivs1[1]; + } + } + } else { + mainPreheaderBB = BasicBlock::Create( + context, "ca_work_item_x_main_preheader", func); + + mainExitBB = BasicBlock::Create( + context, "ca_work_item_x_main_exit", func); + mainPreheaderBB->moveAfter(block); + mainExitBB->moveAfter(mainPreheaderBB); + + if (!noExplicitSubgroups) { + subgroupMergePhi = + PHINode::Create(i32Ty, 2, "sg.merge", mainExitBB); + subgroupMergePhi->addIncoming(ivs1[0], block); + } + + if (isScan) { + scanMergePhi = PHINode::Create(accum->getType(), 2, + "scan.merge", mainExitBB); + scanMergePhi->addIncoming(ivs1[1], block); + } + + auto *const needMain = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, + zero, mainLoopLimit, "", block); + + BranchInst::Create(mainPreheaderBB, mainExitBB, needMain, + block); + } + } + + assert((mainPreheaderBB || !wrapperHasMain) && + "Main loops in one barrier block but not another?"); + + if (mainPreheaderBB) { + wrapperHasMain = true; + BasicBlock *mainLoopBB = nullptr; + + // Materialize the scale factor at the beginning of the + // preheader + IRBuilder<> irph(mainPreheaderBB, + mainPreheaderBB->getFirstInsertionPt()); + auto *VF = irph.CreateElementCount( + compiler::utils::getSizeType(module), + barrierMain.getVFInfo().vf); + + compiler::utils::CreateLoopOpts inner_vf_opts; + inner_vf_opts.indexInc = VF; + inner_vf_opts.IVs = ivs1.vec(); + inner_vf_opts.loopIVNames = {"sg.x.main", "scan.x.main"}; + + mainExitBB = compiler::utils::createLoop( + mainPreheaderBB, mainExitBB, zero, mainLoopLimit, + inner_vf_opts, + [&](BasicBlock *block, Value *dim_0, + ArrayRef ivs0, + MutableArrayRef ivsNext0) -> BasicBlock * { + IRBuilder<> ir(block); + + if (!noExplicitSubgroups) { + // set our subgroup id + ir.CreateCall(set_subgroup_id, {ivs0[0]}) + ->setCallingConv( + set_subgroup_id->getCallingConv()); + } + + if (isScan) { + auto *const barrierCall = + barrierMain.getBarrierCall(barrierID); + auto *const liveVars = createLiveVarsPtr( + barrierMain, ir, dim_0, dim_1, dim_2, VF); + compiler::utils::Barrier::LiveValuesHelper + live_values(barrierMain, block, liveVars); + auto *const itemOp = live_values.getReload( + barrierCall->getOperand(1), ir, "_load", + /*reuse*/ true); + nextScanIV = compiler::utils::createBinOpForRecurKind( + ir, ivs0[1], itemOp, collective->Recurrence); + accum = isExclusiveScan ? ivs0[1] : nextScanIV; + ivsNext0[1] = nextScanIV; + } + + createWorkItemLoopBody(barrierMain, ir, block, + barrierID, dim_0, dim_1, dim_2, + accum, VF); + + if (!noExplicitSubgroups) { + nextSubgroupIV = + ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1), + "sg.x.main.inc"); + ivsNext0[0] = nextSubgroupIV; + } + + // Move the exit after the loop block, as it reads more + // logically. + mainLoopBB = block; + if (mainExitBB) { + mainExitBB->moveAfter(mainLoopBB); + } + + return block; + }); + + if (subgroupMergePhi) { + subgroupMergePhi->addIncoming(nextSubgroupIV, mainLoopBB); + nextSubgroupIV = subgroupMergePhi; + } + + if (scanMergePhi) { + scanMergePhi->addIncoming(nextScanIV, mainLoopBB); + nextScanIV = scanMergePhi; + } + } + assert(mainExitBB && "didn't create a loop exit block!"); + + // looping through num groups in the first + // (innermost) dimension + BasicBlock *tailPreheaderBB = mainExitBB; + BasicBlock *tailExitBB = nullptr; + + if (emitTail && peel) { + // We might need to bypass the tail loop. 
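+                    // A peel known to be zero drops the tail entirely; a peel only known at runtime gets the conditional branch below.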
+ if (auto *const peelConst = dyn_cast(peel)) { + if (peelConst->isZeroValue()) { + // No tail iterations at all! + tailPreheaderBB = nullptr; + tailExitBB = mainExitBB; + } + } else { + tailPreheaderBB = BasicBlock::Create( + context, "ca_work_item_x_tail_preheader", func); + + tailExitBB = BasicBlock::Create( + context, "ca_work_item_x_tail_exit", func); + tailPreheaderBB->moveAfter(mainExitBB); + tailExitBB->moveAfter(tailPreheaderBB); + + auto *const needPeeling = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, + zero, peel, "", mainExitBB); + + BranchInst::Create(tailPreheaderBB, tailExitBB, needPeeling, + mainExitBB); + } + } else { + tailPreheaderBB = nullptr; + tailExitBB = mainExitBB; + } + + assert((tailPreheaderBB || !wrapperHasTail) && + "Tail loops in one barrier block but not another?"); + + if (tailPreheaderBB) { + assert(barrierTail); + wrapperHasTail = true; + // Subgroup induction variables + SmallVector subgroupIVs0 = {nextSubgroupIV, + nextScanIV}; + + BasicBlock *tailLoopBB = nullptr; + if (barrierTail->getVFInfo().IsVectorPredicated) { + IRBuilder<> ir(tailPreheaderBB); + if (!noExplicitSubgroups) { + // set our subgroup id + ir.CreateCall(set_subgroup_id, {subgroupIVs0[0]}) + ->setCallingConv(set_subgroup_id->getCallingConv()); + } + + if (isScan) { + assert(barrierTail); + auto *const barrierCall = + barrierTail->getBarrierCall(barrierID); + auto *const liveVars = createLiveVarsPtr( + *barrierTail, ir, zero, dim_1, dim_2, nullptr); + compiler::utils::Barrier::LiveValuesHelper live_values( + *barrierTail, tailPreheaderBB, liveVars); + auto *const itemOp = live_values.getReload( + barrierCall->getOperand(1), ir, "_load", + /*reuse*/ true); + nextScanIV = compiler::utils::createBinOpForRecurKind( + ir, subgroupIVs0[1], itemOp, collective->Recurrence); + accum = + isTailExclusiveScan ? subgroupIVs0[1] : nextScanIV; + } + + createWorkItemLoopBody(*barrierTail, ir, tailPreheaderBB, + barrierID, zero, dim_1, dim_2, accum, + /*VF*/ nullptr, mainLoopLimit); + + if (!noExplicitSubgroups) { + nextSubgroupIV = ir.CreateAdd(subgroupIVs0[0], + ConstantInt::get(i32Ty, 1), + "sg.x.tail.inc"); + } + + assert(tailExitBB); + ir.CreateBr(tailExitBB); + tailLoopBB = tailPreheaderBB; + } else { + compiler::utils::CreateLoopOpts inner_scalar_opts; + inner_scalar_opts.disableVectorize = true; + inner_scalar_opts.IVs.assign(subgroupIVs0.begin(), + subgroupIVs0.end()); + inner_scalar_opts.loopIVNames = {"sg.x.tail", + "scan.x.tail"}; + + tailExitBB = compiler::utils::createLoop( + tailPreheaderBB, tailExitBB, zero, peel, + inner_scalar_opts, + [&](BasicBlock *block, Value *dim_0, + ArrayRef ivs0, + MutableArrayRef ivsNext0) -> BasicBlock * { + IRBuilder<> ir(block); + + if (!noExplicitSubgroups) { + // set our subgroup id + ir.CreateCall(set_subgroup_id, {ivs0[0]}) + ->setCallingConv( + set_subgroup_id->getCallingConv()); + } + + if (isScan) { + assert(barrierTail); + auto *const barrierCall = + barrierTail->getBarrierCall(barrierID); + auto *const liveVars = createLiveVarsPtr( + *barrierTail, ir, dim_0, dim_1, dim_2, nullptr); + compiler::utils::Barrier::LiveValuesHelper + live_values(*barrierTail, block, liveVars); + auto *const itemOp = live_values.getReload( + barrierCall->getOperand(1), ir, "_load", + /*reuse*/ true); + nextScanIV = + compiler::utils::createBinOpForRecurKind( + ir, ivs0[1], itemOp, + collective->Recurrence); + accum = isTailExclusiveScan ? 
ivs0[1] : nextScanIV; + ivsNext0[1] = nextScanIV; + } + + createWorkItemLoopBody( + *barrierTail, ir, block, barrierID, dim_0, dim_1, + dim_2, accum, /*VF*/ nullptr, mainLoopLimit); + + if (!noExplicitSubgroups) { + nextSubgroupIV = ir.CreateAdd( + ivs0[0], ConstantInt::get(i32Ty, 1), + "sg.x.tail.inc"); + ivsNext0[0] = nextSubgroupIV; + } + + tailLoopBB = block; + // Move the exit after the loop block, as it reads + // more logically. + if (tailExitBB) { + tailExitBB->moveAfter(tailLoopBB); + } + + return block; + }); + } + + // Merge the main and tail subgroup IVs together in the + // tail exit, since we may have skipped either main or + // tail loops. + if (subgroupMergePhi) { + auto *scalarSubgroupIV = nextSubgroupIV; + nextSubgroupIV = PHINode::Create( + i32Ty, 2, "sg.main.tail.merge", tailExitBB); + cast(nextSubgroupIV) + ->addIncoming(scalarSubgroupIV, tailLoopBB); + cast(nextSubgroupIV) + ->addIncoming(subgroupMergePhi, mainExitBB); + } + + if (scanMergePhi) { + auto *scalarScanIV = nextScanIV; + nextScanIV = + PHINode::Create(accum->getType(), 2, + "scan.main.tail.merge", tailExitBB); + cast(nextScanIV) + ->addIncoming(scalarScanIV, tailLoopBB); + cast(nextScanIV) + ->addIncoming(scanMergePhi, mainExitBB); + } + } + + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. + ivsNext1[0] = nextSubgroupIV; + } + + if (isScan) { + // ... or the scan IV phi. + ivsNext1[1] = nextScanIV; + } + + return tailExitBB; + }); + + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. + ivsNext2[0] = nextSubgroupIV; + } + if (isScan) { + // ... or the scan IV phi. + ivsNext2[1] = nextScanIV; + } + + return exit1; + }); + } + + // It executes only the first work item in the work group + BasicBlock *makeRunOneWorkItem(BasicBlock *block, unsigned barrierID) { + // "Once" scheduled barriers shouldn't need the local id set. + IRBuilder<> ir(block); + createWorkItemLoopBody(barrierTail ? *barrierTail : barrierMain, ir, block, + barrierID, nullptr, nullptr, nullptr, nullptr); + return block; + } +}; + +// Emits code to set up the storage allocated to a live-vars structure. +// +// Allocates enough space for sizeZ * sizeY * sizeX work-items. Note that Z/Y/X +// here corresponds to the current outermost to innermost vectorized +// dimensions, rather than in their absolutist sense. +void setUpLiveVarsAlloca(compiler::utils::BarrierWithLiveVars &barrier, + IRBuilder<> &B, Value *const sizeZ, Value *const sizeY, + Value *const sizeX, StringRef name, bool isDebug) { + barrier.setSize0(sizeX); + Value *const live_var_size = B.CreateMul(sizeX, B.CreateMul(sizeY, sizeZ)); + barrier.setTotalSize(live_var_size); + AllocaInst *live_var_mem_space; + auto &m = *B.GetInsertBlock()->getModule(); + auto *const size_ty = compiler::utils::getSizeType(m); + const auto scalablesSize = barrier.getLiveVarMemSizeScalable(); + if (scalablesSize == 0) { + live_var_mem_space = + B.CreateAlloca(barrier.getLiveVarsType(), live_var_size, name); + live_var_mem_space->setAlignment( + MaybeAlign(barrier.getLiveVarMaxAlignment()).valueOrOne()); + barrier.setMemSpace(live_var_mem_space); + } else { + const auto fixedSize = barrier.getLiveVarMemSizeFixed(); + // We ensure that the VFs are the same between the main and tail. 
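+    // Each work-item's slot occupies structSize = fixedSize + vscale * scalablesSize bytes, so the buffer allocated below totals structSize * (sizeX * sizeY * sizeZ) bytes.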
+ auto *const vscale = + B.CreateElementCount(size_ty, ElementCount::getScalable(scalablesSize)); + auto *const structSize = + B.CreateAdd(vscale, ConstantInt::get(size_ty, fixedSize)); + auto *const buffer_size = B.CreateMul(structSize, live_var_size); + + live_var_mem_space = B.CreateAlloca(B.getInt8Ty(), buffer_size, name); + live_var_mem_space->setAlignment( + MaybeAlign(barrier.getLiveVarMaxAlignment()).valueOrOne()); + barrier.setMemSpace(live_var_mem_space); + barrier.setStructSize(structSize); + } + + if (isDebug) { + barrier.setDebugAddr(B.CreateAlloca(live_var_mem_space->getType(), nullptr, + "live_vars_peel_dbg")); + } +} + +} // namespace + +Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction( + BarrierWithLiveVars &barrierMain, BarrierWithLiveVars *barrierTail, + StringRef baseName, Module &M, compiler::utils::BuiltinInfo &BI) { + Function &mainF = barrierMain.getFunc(); + + // The reference function is that which we expect to hold the reference + // version of various pieces of data, such as metadata. It's the tail + // function if one exists, else it's the main function. + Function &refF = barrierTail ? barrierTail->getFunc() : barrierMain.getFunc(); + + const bool emitTail = barrierTail != nullptr; + + auto mainInfo = barrierMain.getVFInfo(); + auto tailInfo = + emitTail ? barrierTail->getVFInfo() : std::optional(); + + const auto workItemDim0 = 0; + const auto workItemDim1 = 1; + const auto workItemDim2 = 2; + + LLVMContext &context = M.getContext(); + + Function *new_wrapper = + createKernelWrapperFunction(mainF, ".mux-barrier-wrapper"); + + new_wrapper->setName(baseName + ".mux-barrier-wrapper"); + // Ensure the base name is recorded + setBaseFnName(*new_wrapper, baseName); + + // An inlinable function call in a function with debug info *must* be given + // a debug location. + DILocation *wrapperDbgLoc = nullptr; + if (new_wrapper->getSubprogram()) { + wrapperDbgLoc = DILocation::get(context, /*line*/ 0, /*col*/ 0, + new_wrapper->getSubprogram()); + } + + IRBuilder<> entryIR(BasicBlock::Create(context, "entry", new_wrapper)); + + auto *const i32Ty = Type::getInt32Ty(context); + + auto sizeTyBytes = getSizeTypeBytes(M); + + auto *VF = entryIR.CreateElementCount(compiler::utils::getSizeType(M), + barrierMain.getVFInfo().vf); + Value *localSizeDim[3]; + + if (auto wgs = parseRequiredWGSMetadata(refF)) { + localSizeDim[0] = entryIR.getIntN(8 * sizeTyBytes, (*wgs)[0]); + localSizeDim[1] = entryIR.getIntN(8 * sizeTyBytes, (*wgs)[1]); + localSizeDim[2] = entryIR.getIntN(8 * sizeTyBytes, (*wgs)[2]); + } else { + const uint32_t max_work_dim = parseMaxWorkDimMetadata(refF).value_or(3); + + // Fill out a default local size of 1x1x1. 
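+    // Only dimensions below max_work_dim are queried with __mux_get_local_size; the remaining entries keep this default.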
+ std::fill(std::begin(localSizeDim), std::end(localSizeDim), + entryIR.getIntN(8 * sizeTyBytes, 1)); + + auto *const get_local_size = + BI.getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalSize, M); + assert(get_local_size && "Missing __mux_get_local_size"); + + auto ci0 = + entryIR.CreateCall(get_local_size, entryIR.getInt32(0), "local_size.x"); + ci0->setCallingConv(get_local_size->getCallingConv()); + localSizeDim[0] = ci0; + + if (max_work_dim > 1) { + auto ci1 = entryIR.CreateCall(get_local_size, entryIR.getInt32(1), + "local_size.y"); + ci1->setCallingConv(get_local_size->getCallingConv()); + localSizeDim[1] = ci1; + } + + if (max_work_dim > 2) { + auto ci2 = entryIR.CreateCall(get_local_size, entryIR.getInt32(2), + "local_size.z"); + ci2->setCallingConv(get_local_size->getCallingConv()); + localSizeDim[2] = ci2; + } + } + + // Assume that local sizes are never zero. This prevents LLVM "saving" our + // loops by inserting llvm.umax (or its equivalent) to stop the loops we're + // about to create from causing headaches: + // %iv.next = add i64 nuw %iv, 1 + // %exit = icmp eq i64 %iv.next, %localsizeY + // br i1 %exit, label %exit.the.loop, label %continue.the.loop + // If LLVM doesn't know that %localsizeY is never zero, it rightly determines + // that a zero size would cause problems, since we'd have to overflow our i64 + // to exit the loop, but we've marked the increment as 'nuw'. So it inserts + // an llvm.umax to ensure the size is at least 1. Since we know our local + // sizes are never zero, an llvm.assume intrinsic prevents this from + // happening. + // We want to insert a call to __mux_set_max_sub_group_size after these + // assumptions, to keep track of the last one we've inserted. + for (auto i = 0; i < 3; i++) { + auto *const nonZero = entryIR.CreateICmpNE( + localSizeDim[i], ConstantInt::get(localSizeDim[i]->getType(), 0)); + entryIR.CreateAssumption(nonZero); + } + + // There are four cases: + // + // 1. If !emitTail: in this case, only the main function will be called. The + // main function may be a scalar function, may be a predicated vector + // function, or may be an unpredicated vector function where the local size is + // known to be a multiple of the vectorization factor. + // + // 2. Otherwise, if tailInfo->IsVectorPredicated: in this case, the main + // function will be unpredicated and will be called for any multiples of vf, + // and one tail call will handle any remainder. vf of the main function and + // the tail function are the same. + // + // 3. Otherwise, if hasNoExplicitSubgroups(refF): in this case, the main + // function will be unpredicated and will be called for any multiples of vf, + // and one tail loop will handle any remainder. vf of the main function is + // used. + // + // 4. Otherwise: if local_size_x is a multiple of the main function's vf, the + // main function will handle the full loop and the main function's vf is used, + // else the tail function will handle the full loop and the tail function's vf + // is used. + // + // Unless hasNoExplicitSubgroups(refF), the subgroups are calculated as + // + // get_max_sub_group_size() = min(vf, local_size_x) + // get_num_sub_groups() = ((local_size_x + vf - 1) / vf) + // * local_size_y * local_size_z + // + // If hasNoExplicitSubgroups(refF) (even for cases 1 and 2), the subgroups are + // not calculated.
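+ // As a worked example of the formulae above (illustrative numbers only): + // with vf == 8 and a 19 x 2 x 1 local size, + // get_max_sub_group_size() = min(8, 19) = 8 + // get_num_sub_groups() = ((19 + 8 - 1) / 8) * 2 * 1 = 3 * 2 = 6, + // i.e., each row of 19 work-items is two full sub-groups of 8 plus one + // remainder sub-group of 3.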
+ + const bool noExplicitSubgroups = hasNoExplicitSubgroups(refF); + + Value *mainLoopLimit = localSizeDim[workItemDim0]; + Value *peel = nullptr; + + Value *effectiveVF = VF; + + if (emitTail) { + auto *const rem = entryIR.CreateSRem(mainLoopLimit, VF, "rem"); + if (tailInfo->IsVectorPredicated || noExplicitSubgroups) { + peel = rem; + } else { + // We must have no more than one iteration with a subgroup size below the + // maximum subgroup size. To meet this requirement, if the tail is scalar + // and the vector size does not divide the workgroup size, do not use the + // vectorized kernel at all. + auto *const remcond = entryIR.CreateICmpNE( + rem, Constant::getNullValue(rem->getType()), "remcond"); + peel = entryIR.CreateSelect( + remcond, mainLoopLimit, + Constant::getNullValue(mainLoopLimit->getType()), "peel"); + effectiveVF = + entryIR.CreateSelect(remcond, + entryIR.CreateElementCount( + VF->getType(), barrierTail->getVFInfo().vf), + VF); + } + mainLoopLimit = entryIR.CreateSub(mainLoopLimit, peel, "mainLoopLimit"); + } + + // Set the subgroup maximum size and number of subgroups in this kernel + // wrapper. + if (!noExplicitSubgroups) { + auto setMaxSubgroupSizeFn = + BI.getOrDeclareMuxBuiltin(eMuxBuiltinSetMaxSubGroupSize, M); + assert(setMaxSubgroupSizeFn && "Missing __mux_set_max_sub_group_size"); + auto setNumSubgroupsFn = + BI.getOrDeclareMuxBuiltin(eMuxBuiltinSetNumSubGroups, M); + assert(setNumSubgroupsFn && "Missing __mux_set_num_sub_groups"); + auto *const localSizeInVecDim = localSizeDim[workItemDim0]; + auto *const localSizeInNonVecDim = entryIR.CreateMul( + localSizeDim[workItemDim1], localSizeDim[workItemDim2], "wg.yz"); + auto *maxSubgroupSize = entryIR.CreateBinaryIntrinsic( + Intrinsic::umin, localSizeInVecDim, effectiveVF, {}, "sg.x"); + entryIR.CreateCall(setMaxSubgroupSizeFn, + {entryIR.CreateTrunc(maxSubgroupSize, i32Ty)}); + auto *const numSubgroupsInVecDim = entryIR.CreateUDiv( + entryIR.CreateAdd( + localSizeInVecDim, + entryIR.CreateSub(effectiveVF, + ConstantInt::get(effectiveVF->getType(), 1))), + effectiveVF, "sgs.x"); + auto *const numSubgroups = + entryIR.CreateMul(numSubgroupsInVecDim, localSizeInNonVecDim, "sgs"); + entryIR.CreateCall(setNumSubgroupsFn, + {entryIR.CreateTrunc(numSubgroups, i32Ty)}); + } + + if (barrierMain.hasLiveVars()) { + // The size in the first dimension is divided by the vectorization factor. + // When vector-predicated, this result is rounded up: (LIM + VF - 1) / VF. + // This catches cases where we need two loop iterations, e.g., VF=4 and + // size=7, where rounding down would give one. + Value *numerator = mainLoopLimit; + if (mainInfo.IsVectorPredicated) { + Value *const vf_minus_1 = + entryIR.CreateSub(VF, ConstantInt::get(VF->getType(), 1)); + numerator = entryIR.CreateAdd(mainLoopLimit, vf_minus_1); + } + Value *const size0 = entryIR.CreateUDiv(numerator, VF); + + setUpLiveVarsAlloca(barrierMain, entryIR, localSizeDim[workItemDim2], + localSizeDim[workItemDim1], size0, "live_variables", + IsDebug); + } + + // Amazingly, it's possible for the tail kernel to have live vars in its + // barriers, even when the main kernel does not. + if (emitTail && barrierTail->hasLiveVars()) { + Value *size0 = peel; + if (tailInfo->IsVectorPredicated) { + // If the tail is predicated, it will only have a single (vectorized) item + // along the X axis, or none. 
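+ // For instance (hypothetical values): a local size of 19 with VF 8 leaves + // peel == 3, so hasLeftover below is true and size0 becomes 1 (a single + // predicated iteration); a local size of 16 leaves peel == 0 and size0 == 0.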
+ auto *const hasLeftover = entryIR.CreateICmp( + CmpInst::ICMP_NE, peel, ConstantInt::get(peel->getType(), 0), + "tail.has.vp"); + size0 = entryIR.CreateZExt(hasLeftover, peel->getType()); + } + setUpLiveVarsAlloca(*barrierTail, entryIR, localSizeDim[workItemDim2], + localSizeDim[workItemDim1], size0, + "live_variables_peel", IsDebug); + } + + // next means the next barrier id. This variable is uninitialized to begin + // with, and is set by the first pass below. + IntegerType *index_type = i32Ty; + AllocaInst *nextID = + entryIR.CreateAlloca(index_type, nullptr, "next_barrier_id"); + + std::map<unsigned, BasicBlock *> bbs; + // The vectorized kernel has been further optimized and may have removed + // unreachable barriers that are still present in the scalar kernel. But if + // they are unreachable, we know they must also be unreachable in the scalar + // kernel even if we have not yet detected that. + + for (auto &[i, subkernel] : barrierMain.getSubkernels()) { + bbs[i] = BasicBlock::Create(context, "sw.bb", new_wrapper); + } + + ScheduleGenerator schedule(M, barrierMain, barrierTail, BI); + schedule.workItemDim0 = workItemDim0; + schedule.workItemDim1 = workItemDim1; + schedule.workItemDim2 = workItemDim2; + schedule.localSizeDim[0] = localSizeDim[0]; + schedule.localSizeDim[1] = localSizeDim[1]; + schedule.localSizeDim[2] = localSizeDim[2]; + schedule.wrapperDbgLoc = wrapperDbgLoc; + schedule.nextID = nextID; + schedule.mainLoopLimit = mainLoopLimit; + schedule.noExplicitSubgroups = noExplicitSubgroups; + schedule.emitTail = emitTail; + schedule.peel = peel; + + // Make the call instruction for the first new kernel, following the wrapper + // function's parameters. + for (auto &arg : new_wrapper->args()) { + schedule.args.push_back(&arg); + } + + // Branch directly into the first basic block. + entryIR.CreateBr(bbs[kBarrier_FirstID]); + + for (auto &[i_, subkernel_] : barrierMain.getSubkernels()) { + auto i = i_; + + // Keep it linear + BasicBlock *const block = bbs[i]; + block->moveAfter(&new_wrapper->back()); + + if (i == kBarrier_EndID) { + // This basic block breaks us out of our function, thus we return! + ReturnInst::Create(context, block); + } else { + // Re-issue the barrier's memory fence before the work-item loops + if (auto *const CI = barrierMain.getBarrierCall(i)) { + auto *const callee = CI->getCalledFunction(); + const auto builtin = BI.analyzeBuiltin(*callee); + if (builtin && + builtin->ID == compiler::utils::eMuxBuiltinWorkGroupBarrier) { + IRBuilder<> B(block); + auto *MemBarrier = + BI.getOrDeclareMuxBuiltin(eMuxBuiltinMemBarrier, M); + assert(MemBarrier); + Value *Ops[2] = {CI->getOperand(1), CI->getOperand(2)}; + + auto *const Call = B.CreateCall(MemBarrier, Ops); + + // Patch up any operands that were non-constants by fetching them from + // the barrier struct. We do this after creating the call because we + // need an instruction to function as an insert point. + if (!isa<Constant>(Ops[0]) || !isa<Constant>(Ops[1])) { + // We expect these values to be uniform so it should be safe to get + // from the barrier struct at index zero. Barriers are convergent, + // so there should be no chance that the value does not exist.
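+ // E.g. if a memory-scope operand was computed rather than constant, + // every work-item stored the same (uniform) value to its live-vars + // slot before the barrier, so reloading work-item (0,0,0)'s copy + // below yields the value for the whole work-group.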
+ auto *const zero = + Constant::getNullValue(compiler::utils::getSizeType(M)); + IRBuilder<> ir(Call); + auto *const barrier0 = + ir.CreateInBoundsGEP(barrierMain.getLiveVarsType(), + barrierMain.getMemSpace(), {zero}); + + Barrier::LiveValuesHelper live_values(barrierMain, Call, barrier0); + + size_t op_index = 0; + for (auto *const op : Ops) { + if (!isa<Constant>(op)) { + auto *const new_op = + live_values.getReload(op, ir, "_load", /*reuse*/ true); + Call->setArgOperand(op_index, new_op); + } + ++op_index; + } + } + Call->setDebugLoc(wrapperDbgLoc); + } + } + + auto *const exitBlock = [&]() { + switch (barrierMain.getSchedule(i)) { + case BarrierSchedule::Unordered: + case BarrierSchedule::ScalarTail: + if (tailInfo && tailInfo->IsVectorPredicated) { + return schedule.makeLinearWorkItemLoops(block, i); + } + return schedule.makeWorkItemLoops(block, i); + + case BarrierSchedule::Once: + return schedule.makeRunOneWorkItem(block, i); + + case BarrierSchedule::Linear: + return schedule.makeLinearWorkItemLoops(block, i); + } + + llvm_unreachable("Unexpected barrier schedule enum"); + }(); + + // the last basic block in our function! + IRBuilder<> exitIR(exitBlock); + + const auto &successors = barrierMain.getSuccessorIds(i); + const auto num_succ = successors.size(); + + if (num_succ == 1) { + // If there is only one successor, we can branch directly to it + exitIR.CreateBr(bbs.find(successors.front())->second); + } else if (num_succ == 2) { + // If there are exactly two successors, we can use a conditional branch + auto *const bb_id = ConstantInt::get(index_type, successors[0]); + auto *const br_block = + BasicBlock::Create(context, "barrier.branch", new_wrapper); + auto *const ld_next_id = new LoadInst(index_type, nextID, "", br_block); + auto *const cmp_id = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, ld_next_id, + bb_id, "", br_block); + BranchInst::Create(bbs.find(successors[0])->second, + bbs.find(successors[1])->second, cmp_id, br_block); + + exitIR.CreateBr(br_block); + } else if (num_succ == 0) { + // If a barrier region has no successor, we just emit a call to + // llvm.trap and unreachable. A barrier region can have zero successors + // if all its terminators end in unreachable. Since there are no + // successors, it is not possible to continue and therefore we emit an + // unreachable here. + + // TODO: we should be flagging up unreachables sooner, so that we avoid + // wrapping barrier regions with no successors with work item loops, + // and we should also make sure that the barrier region has no + // successors because of all its terminators ending in unreachable. + // If it's not the case we may want to handle that differently.
+ auto trap = + M.getOrInsertFunction("llvm.trap", Type::getVoidTy(context)); + exitIR.CreateCall(trap); + exitIR.CreateUnreachable(); + } else { + // Make a basic block with a switch to jump to the next subkernel + auto *const switch_body = + BasicBlock::Create(context, "barrier.switch", new_wrapper); + LoadInst *const ld_next_id = + new LoadInst(index_type, nextID, "", switch_body); + SwitchInst *const sw = SwitchInst::Create( + ld_next_id, bbs.find(successors[0])->second, num_succ, switch_body); + for (const auto i : successors) { + sw->addCase(ConstantInt::get(index_type, i), bbs.find(i)->second); + } + exitIR.CreateBr(switch_body); + } + } + } + + bbs[kBarrier_EndID]->moveAfter(&new_wrapper->back()); + bbs[kBarrier_EndID]->setName("kernel.exit"); + + // Remap any constant expression which takes a reference to the old function + // FIXME: What about the main function? + for (auto *user : make_early_inc_range(refF.users())) { + if (ConstantExpr *constant = dyn_cast<ConstantExpr>(user)) { + remapConstantExpr(constant, &refF, new_wrapper); + } else if (ConstantArray *ca = dyn_cast<ConstantArray>(user)) { + remapConstantArray(ca, &refF, new_wrapper); + } else if (!isa<CallInst>(user)) { + llvm_unreachable( + "Cannot handle user of function being anything other than a " + "ConstantExpr, ConstantArray or CallInst"); + } + } + // We output the number of uses here to lit test that the number of uses was + // not increased by the remap functions. + LLVM_DEBUG(dbgs() << "Uses of " << refF.getName() << ": " << refF.getNumUses() + << "\n"); + + // Forcibly disable the tail info if we know we've omitted it. + if (!schedule.wrapperHasMain || !schedule.wrapperHasTail) { + // If we're missing a main loop then the tail loop becomes the main from + // the perspective of the metadata: have that steal the tail loop info. We + // should always have a main loop with an optional tail. + if (!schedule.wrapperHasMain) { + if (schedule.wrapperHasTail && tailInfo) { + mainInfo = *tailInfo; + } else { + // If we have neither a main nor a tail (which may happen at kernel + // compile time but we should never actually execute such a kernel - + // we already assume the local sizes are never zero, see elsewhere in + // this pass) then encode a token info metadata of 1. + mainInfo = VectorizationInfo{ElementCount::getFixed(1), workItemDim0, + /*isVectorPredicated*/ false}; + } + } + tailInfo = std::nullopt; + } + + encodeWrapperFnMetadata(*new_wrapper, mainInfo, tailInfo); + + // The subkernels can be marked as internal since their external uses have + // been superseded by this wrapper. This will help them get DCE'd once + // inlined. Any existing calls to this subkernel (e.g., another kernel + // calling this kernel) will prevent it from being removed unnecessarily. + barrierMain.getFunc().setLinkage(Function::InternalLinkage); + if (barrierTail) { + barrierTail->getFunc().setLinkage(Function::InternalLinkage); + } + + return new_wrapper; +} + +struct BarrierWrapperInfo { + StringRef BaseName; + // Information about the 'main' kernel + Function *MainF; + compiler::utils::VectorizationInfo MainInfo; + // Optional information about the 'tail' kernel + Function *TailF = nullptr; + std::optional<compiler::utils::VectorizationInfo> TailInfo = std::nullopt; + // A 'tail' kernel which was explicitly omitted. + Function *SkippedTailF = nullptr; +}; + +PreservedAnalyses +compiler::utils::WorkItemLoopsPass::run(Module &M, ModuleAnalysisManager &MAM) { + // Cache the functions we're interested in as this pass introduces new ones + // which we don't want to run over.
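+ // For example (hypothetical module): a kernel "foo" vectorized by 8 + // produces a pair whose MainF is the vectorized function and whose TailF is + // the original scalar "foo", recovered from the vecz link metadata; a kernel + // with no vectorization metadata becomes a pair with scalar main info and no + // tail.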
SmallVector<BarrierWrapperInfo> MainTailPairs; + const auto &GSGI = MAM.getResult<SubgroupAnalysis>(M); + + for (auto &F : M.functions()) { + if (!isKernelEntryPt(F)) { + continue; + } + + const auto BaseName = getBaseFnNameOrFnName(F); + auto VeczToOrigFnData = parseVeczToOrigFnLinkMetadata(F); + + const auto WorkItemDim0 = 0; + + const VectorizationInfo scalarTailInfo{ElementCount::getFixed(1), + WorkItemDim0, + /*IsVectorPredicated*/ false}; + + if (!VeczToOrigFnData) { + // If there was no vectorization metadata, it's a scalar kernel. + MainTailPairs.push_back({BaseName, &F, scalarTailInfo}); + continue; + } + + // If we got a vectorized kernel, wrap it using the vectorization factor. + const auto MainInfo = VeczToOrigFnData->second; + + // Start out assuming scalar tail, which is the default behaviour... + auto TailInfo = scalarTailInfo; + auto *TailFunc = VeczToOrigFnData->first; + // ... and search for a linked vector-predicated tail, which we prefer. + if (!MainInfo.IsVectorPredicated && TailFunc) { + SmallVector<LinkMetadataResult, 4> LinkedFns; + parseOrigToVeczFnLinkMetadata(*TailFunc, LinkedFns); + for (const auto &Link : LinkedFns) { + // Restrict our option to strict VF==VF matches. + if (Link.first != &F && Link.second.vf == MainInfo.vf && + Link.second.IsVectorPredicated) { + TailFunc = Link.first; + TailInfo = Link.second; + break; + } + } + } + + std::optional<uint64_t> LocalSizeInVecDim; + if (auto WGS = parseRequiredWGSMetadata(F)) { + LocalSizeInVecDim = (*WGS)[WorkItemDim0]; + } + + // We can skip the tail in the following circumstances: + // * If we have no tail function (trusting that this is okay) + // * Vector-predicated kernels handle their own tails + // * The user has explicitly forced us to omit tails + // * We can prove that the vectorization factor fits the required/known + // local work-group size + if (!TailFunc || MainInfo.IsVectorPredicated || ForceNoTail || + (LocalSizeInVecDim && !MainInfo.vf.isScalable() && + *LocalSizeInVecDim % MainInfo.vf.getKnownMinValue() == 0)) { + MainTailPairs.push_back({BaseName, &F, MainInfo, /*TailF*/ nullptr, + /*TailInfo*/ std::nullopt, + /*SkippedTailF*/ TailFunc}); + } else { + // Else, emit a tail using the tail function. + MainTailPairs.push_back({BaseName, &F, MainInfo, TailFunc, TailInfo}); + } + } + + if (MainTailPairs.empty()) { + return PreservedAnalyses::all(); + } + + // Prune redundant wrappers we don't want to create for the sake of compile + // time. + SmallPtrSet<Function *, 4> RedundantMains; + for (const auto &P : MainTailPairs) { + // If we're creating a wrapper with a skipped 'tail' or a scalar 'tail', we + // don't want to create another wrapper where the scalar tail is the + // 'main', unless that tail is useful as a fallback sub-group kernel. A + // fallback sub-group kernel is one for which: + // * The 'main' has a required sub-group size that isn't the scalar size. + // * The 'main' and 'tail' kernels both make use of sub-group builtins. If + // neither do, there's no need for the fallback. + // * The 'main' kernel uses sub-groups but the 'main' vectorization factor + // cleanly divides the known local work-group size. + if (P.SkippedTailF || (P.TailInfo && P.TailInfo->vf.isScalar())) { + const auto *TailF = P.SkippedTailF ?
P.SkippedTailF : P.TailF; + if (getReqdSubgroupSize(*P.MainF).value_or(1) != 1 || + (!GSGI.usesSubgroups(*P.MainF) && !GSGI.usesSubgroups(*TailF))) { + RedundantMains.insert(TailF); + } else if (auto wgs = parseRequiredWGSMetadata(*P.MainF)) { + const uint64_t local_size_x = wgs.value()[0]; + if (!P.MainInfo.IsVectorPredicated && + !(local_size_x % P.MainInfo.vf.getKnownMinValue())) { + RedundantMains.insert(TailF); + } + } + } + // If we're creating a wrapper with a VP 'tail', we don't want to create + // another wrapper where the VP is the 'main'. + if (!P.MainInfo.IsVectorPredicated && P.TailInfo && + P.TailInfo->IsVectorPredicated) { + RedundantMains.insert(P.TailF); + } + } + + MainTailPairs.erase( + std::remove_if(MainTailPairs.begin(), MainTailPairs.end(), + [&RedundantMains](const BarrierWrapperInfo &I) { + return RedundantMains.contains(I.MainF); + }), + MainTailPairs.end()); + + SmallPtrSet<Function *, 4> Wrappers; + auto &BI = MAM.getResult<BuiltinInfoAnalysis>(M); + + for (const auto &P : MainTailPairs) { + assert(P.MainF && "Missing main function"); + // Construct the main barrier + BarrierWithLiveVars MainBarrier(M, *P.MainF, P.MainInfo, IsDebug); + MainBarrier.Run(MAM); + + // Tail kernels are optional + if (!P.TailF) { + Wrappers.insert( + makeWrapperFunction(MainBarrier, nullptr, P.BaseName, M, BI)); + } else { + // Construct the tail barrier + assert(P.TailInfo && "Missing tail info"); + BarrierWithLiveVars TailBarrier(M, *P.TailF, *P.TailInfo, IsDebug); + TailBarrier.Run(MAM); + + Wrappers.insert( + makeWrapperFunction(MainBarrier, &TailBarrier, P.BaseName, M, BI)); + } + } + + // At this point we mandate that any kernels that haven't been wrapped with + // work-item loops can't be kernels, nor entry points. + for (auto &F : M) { + if (isKernelEntryPt(F) && !Wrappers.contains(&F)) { + dropIsKernel(F); + // FIXME: Also mark them as internal in case they contain symbols we + // haven't resolved as part of the work-item loop wrapping process. We + // rely on GlobalOptPass to remove such functions; this is the same root + // issue as some mux builtins requiring DCE for correctness.
+ F.setLinkage(GlobalValue::InternalLinkage); + } + } + + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/CMakeLists.txt new file mode 100644 index 0000000000000..7aa151998effa --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/CMakeLists.txt @@ -0,0 +1,135 @@ +set(VECZ_PUBLIC_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) +set(VECZ_PRIVATE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/source) +set(VECZ_PRIVATE_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/source/include) + +set(COMMON_SRCS + ${VECZ_PUBLIC_INCLUDE_DIR}/vecz/pass.h + ${VECZ_PUBLIC_INCLUDE_DIR}/vecz/vecz_choices.h + ${VECZ_PUBLIC_INCLUDE_DIR}/vecz/vecz_target_info.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/control_flow_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/divergence_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/instantiation_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/liveness_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/packetization_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/simd_width_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/stride_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/uniform_value_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/vectorizable_function_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/vectorization_unit_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/common_gep_elimination_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/control_flow_conversion_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/inline_post_vectorization_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/instantiation_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/interleaved_group_combine_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/packetization_helpers.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/packetization_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/packetizer.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/passes.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/printf_scalarizer.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/scalarization_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/scalarizer.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/ternary_transform_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/control_flow_boscc.h + ${VECZ_PRIVATE_INCLUDE_DIR}/control_flow_roscc.h + ${VECZ_PRIVATE_INCLUDE_DIR}/debugging.h + ${VECZ_PRIVATE_INCLUDE_DIR}/ir_cleanup.h + ${VECZ_PRIVATE_INCLUDE_DIR}/llvm_helpers.h + ${VECZ_PRIVATE_INCLUDE_DIR}/memory_operations.h + ${VECZ_PRIVATE_INCLUDE_DIR}/offset_info.h + ${VECZ_PRIVATE_INCLUDE_DIR}/reachability.h + ${VECZ_PRIVATE_INCLUDE_DIR}/simd_packet.h + ${VECZ_PRIVATE_INCLUDE_DIR}/vectorization_context.h + ${VECZ_PRIVATE_INCLUDE_DIR}/vectorization_helpers.h + ${VECZ_PRIVATE_INCLUDE_DIR}/vectorization_heuristics.h + ${VECZ_PRIVATE_INCLUDE_DIR}/vectorization_unit.h + ${VECZ_PRIVATE_INCLUDE_DIR}/vectorizer.h + ${VECZ_PRIVATE_INCLUDE_DIR}/vecz_pass_builder.h + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/control_flow_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/divergence_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/instantiation_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/liveness_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/packetization_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/simd_width_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/stride_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/uniform_value_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/vectorizable_function_analysis.cpp + 
${VECZ_PRIVATE_SOURCE_DIR}/analysis/vectorization_unit_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/basic_mem2reg_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/builtin_inlining_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/common_gep_elimination_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/control_flow_conversion_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/inline_post_vectorization_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/loop_rotate_custom_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/instantiation_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/interleaved_group_combine_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/packetization_helpers.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/packetization_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/packetizer.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/passes.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/pre_linearize_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/printf_scalarizer.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/remove_intptr_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/scalarization_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/scalarizer.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/simplify_infinite_loop_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/squash_small_vectors_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/ternary_transform_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/uniform_reassociation_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/control_flow_boscc.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/control_flow_roscc.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/debugging.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/ir_cleanup.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/llvm_helpers.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/memory_operations.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/offset_info.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/reachability.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/simd_packet.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vector_target_info.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vector_target_info_arm.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vector_target_info_riscv.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vectorization_choices.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vectorization_context.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vectorization_helpers.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vectorization_heuristics.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vectorization_unit.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vectorizer.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vecz_pass_builder.cpp +) + +if(MSVC) + # Disable: unreferenced formal parameter. 
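+ # VECZ_COMPILE_OPTIONS may already carry -we4100 (which promotes C4100 to an + # error) from flags inherited by an including scope; the integrated sources + # do not build cleanly under it, so demote it to a plain -wd4100 disable + # below.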
+ list(REMOVE_ITEM VECZ_COMPILE_OPTIONS -we4100) + list(APPEND VECZ_COMPILE_OPTIONS -wd4100) +endif() + +add_llvm_component_library(LLVMNativeCPUVecz + ${COMMON_SRCS} + LINK_COMPONENTS + NativeCPUPipeline + support + core + analysis + instcombine + aggressiveinstcombine + transformutils + scalaropts + ipo + passes + ) + +target_include_directories(LLVMNativeCPUVecz + PUBLIC $<BUILD_INTERFACE:${VECZ_PUBLIC_INCLUDE_DIR}> + PRIVATE $<BUILD_INTERFACE:${VECZ_PRIVATE_INCLUDE_DIR}> +) +target_compile_options(LLVMNativeCPUVecz PRIVATE ${VECZ_COMPILE_OPTIONS}) +target_compile_definitions(LLVMNativeCPUVecz PRIVATE + ${VECZ_COMPILE_DEFINITIONS}) + +# Currently disabled by default; these allow us to run lit tests using veczc +# with the target check-sycl-vecz. +set(NATIVE_CPU_BUILD_VECZ_TEST_TOOLS OFF CACHE BOOL "Build vecz test and tools") +if (NATIVE_CPU_BUILD_VECZ_TEST_TOOLS) + add_subdirectory(tools) + add_subdirectory(test) +endif() diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h new file mode 100644 index 0000000000000..d7e59337fc261 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h @@ -0,0 +1,150 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Vecz passes header. + +#ifndef VECZ_PASS_H +#define VECZ_PASS_H + +#include +#include +#include + +#include +#include + +#include "vecz/vecz_choices.h" + +namespace llvm { +class ModulePass; +class StringRef; +class Module; +class TargetMachine; +} // namespace llvm + +namespace compiler { +namespace utils { +class BuiltinInfo; +} // namespace utils +} // namespace compiler + +namespace vecz { +/// @addtogroup vecz +/// @{ + +struct VeczPassOptions { + /// @brief boolean choices such as double support, partial scalarization + vecz::VectorizationChoices choices; + + /// @brief vectorization factor, including known min and scalable flag + llvm::ElementCount factor = llvm::ElementCount::getFixed(1); + + /// @brief automatically work out factor + bool vecz_auto = false; + + /// @brief Index of vectorization dimension to use (0 => x, 1 => y, 2 => z). + uint32_t vec_dim_idx = 0; + + /// @brief local_size Value specifying the local size for the function (0 + /// means unknown) + uint64_t local_size = 0; +}; + +/// @brief Returns the vectorization options that would vectorize the provided +/// function to its required sub-group size. +std::optional<VeczPassOptions> getReqdSubgroupSizeOpts(llvm::Function &); + +/// @brief Returns the vectorization options that would vectorize the provided +/// function to its required sub-group size (if set) or one of the device's +/// sub-group sizes. +/// +/// Only returns options if the function uses sub-group operations, as +/// determined by the SubGroupAnalysis pass.
+/// +/// Tries to find a good fit that produces one of the device's sub-group sizes, +/// preferring ones which fit the known local work-group size and powers of +/// two. The device's sub-group sizes can be sorted such that preferable sizes +/// are placed towards the front. +std::optional<VeczPassOptions> +getAutoSubgroupSizeOpts(llvm::Function &, llvm::ModuleAnalysisManager &); + +/// @brief Analysis pass which determines on which functions @ref RunVeczPass +/// should operate. +class VeczPassOptionsAnalysis + : public llvm::AnalysisInfoMixin<VeczPassOptionsAnalysis> { + using VeczPassOptionsCallbackFn = + std::function<bool(llvm::Function &, llvm::ModuleAnalysisManager &, + llvm::SmallVectorImpl<VeczPassOptions> &)>; + friend AnalysisInfoMixin<VeczPassOptionsAnalysis>; + static llvm::AnalysisKey Key; + VeczPassOptionsCallbackFn queryFunc = + [](llvm::Function &F, llvm::ModuleAnalysisManager &, + llvm::SmallVectorImpl<VeczPassOptions> &Opts) -> bool { + if (F.getCallingConv() != llvm::CallingConv::SPIR_KERNEL) { + return false; + } + // TODO what are our defaults, here? + Opts.emplace_back(); + return true; + }; + +public: + VeczPassOptionsAnalysis() = default; + /// @brief Explicit constructor which uses the given callback to determine + /// whether vectorization should be performed on the passed function. If the + /// default constructor is used, all functions with a SPIR calling convention + /// will be vectorized. + explicit VeczPassOptionsAnalysis(VeczPassOptionsCallbackFn queryFunc) + : queryFunc(queryFunc) {} + using Result = VeczPassOptionsCallbackFn; + Result run(llvm::Module &, llvm::ModuleAnalysisManager &) { + return queryFunc; + } +}; + +/// @brief A helper pass which can be used to inspect and test the +/// vectorization options set on a per-function basis. +class VeczPassOptionsPrinterPass + : public llvm::PassInfoMixin<VeczPassOptionsPrinterPass> { + llvm::raw_ostream &OS; + +public: + explicit VeczPassOptionsPrinterPass(llvm::raw_ostream &OS) : OS(OS) {} + + llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &); +}; + +/// @brief A new-style module pass that provides a wrapper for using the +/// ComputeAorta IR vectorizer. This vectorizes kernels +/// to the vectorization factor specified when the pass is created. In our case +/// this is typically the local size in the first dimension but there are other +/// factors to consider when picking the vectorization factor, like being a +/// power of 2. This pass queries the @ref `VeczPassOptionsAnalysis`, so +/// if you do not wish all kernels to be vectorized, you must ensure your pass +/// manager's ModuleAnalysisManager is configured with a custom @ref +/// `VeczPassOptionsAnalysis` +class RunVeczPass : public llvm::PassInfoMixin<RunVeczPass> { +public: + /// @brief llvm's entry point for the PassManager + llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &); +}; + +/// @} +} // namespace vecz + +#endif // VECZ_PASS_H diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h new file mode 100644 index 0000000000000..64ed72c120d98 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h @@ -0,0 +1,294 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Internal Vecz Choices header. + +#ifndef VECZ_VECZ_CHOICES_H_INCLUDED +#define VECZ_VECZ_CHOICES_H_INCLUDED + +#include +#include +#include + +// Forward declaration +namespace llvm { +class StringRef; +class Twine; +} // namespace llvm + +namespace vecz { + +/// @brief Describes and holds various Vecz choices. +/// +/// These choices can affect the code generated and are usually optimization +/// related. Since they are not always the best choice for a given target, they +/// are controlled at runtime by this class. +class VectorizationChoices { +public: + VectorizationChoices(); + ~VectorizationChoices() = default; + + /// @brief Enumeration with the available choices for Vecz. + /// + /// These are choices that can affect the code generated, often for + /// optimization reasons. The Choices are prefixed by an `e<Prefix>` prefix, + /// where `<Prefix>` is an arbitrary string to help document the intention + /// of the Choice. For example, optimizations are prefixed with + /// `eOptimization`. + /// + /// @note Each Choice has to be uniquely named without taking into account + /// its prefix, i.e. there shouldn't be any Choices sharing the same name + /// but with different prefixes. Also, Choices names must not start with + /// `"no"`, although different capitalizations (e.g. `"No"`) are allowed. + /// Additionally, Choices' names should contain only alphanumeric characters. + /// These restrictions are in place to allow for a `Choices` string to be + /// parsable easily. See, for example, `parseChoicesString` . If you add a + /// new Choice here, please also update the parseChoicesString function, as + /// well as the two relevant `cl::opt` in `vectorizer.cpp`. + enum Choice { + /// @brief An invalid Choice ID, useful for error checking etc. Equals 0. + eInvalid = 0, + /// @brief Packetize uniform instructions instead of using a vector splat. + /// + /// When going through the packetization process, the default behaviour when + /// encountering a uniform instruction is creating a vector splat + /// with its value and stopping the packetization there. This option changes + /// that behaviour, and instead makes the packetizer packetize even the + /// uniform instructions, provided that they are used by a varying + /// instruction. + eOptimizationPacketizeUniform, + /// @brief Packetize uniform instructions, but only in loops. + /// + /// This is similar to eOptimizationPacketizeUniform, with the difference + /// that it only affects uniform values used inside loops. + eOptimizationPacketizeUniformInLoops, + /// @brief Emit loops for instantiated call instructions + /// + /// This will emit instantiated call instruction in loops instead of + /// actually instantiating them. It only works when the call instruction has + /// no users. + eOptimizationInstantiateCallsInLoops, + /// @brief Use the BOSCC linearization algorithm during Control-Flow + // Conversion. + // + // @note This optimization retains uniform branches by duplicating pieces + // of the code.
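+ // (BOSCC is short for 'branch on superword condition code': roughly, a + // duplicated, unpredicated copy of a region is kept, and a runtime check + // on the controlling predicate branches to it when all lanes agree, + // falling back to the linearized copy otherwise.)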
+ eLinearizeBOSCC, + /// @brief Turn on full scalarization in the Scalarization pass + // + // This is useful for testing the scalarizer, and isn't intended to deliver + // any performance benefits. + eFullScalarization, + /// @brief Treat division operations as being able to throw CPU exceptions + /// + /// @note This choice must be enabled for strict correctness on targets that + /// support hardware exceptions on division by zero/division overflow, which + /// require extra code to prevent traps on inactive vector lanes during + /// linearization. However, any trapping behaviour of the input IR may be + /// preserved (that is, on positively-executed code paths); it is left to + /// the front end to conform to OpenCL spec in this regard. + eDivisionExceptions, + /// @brief Generate a vector-predicated kernel such that no work-items + /// (vector elements) with side effects with IDs beyond the local workgroup + /// size are enabled. + /// + /// @note The exact semantics concerning which operations are + /// masked/unmasked are not defined. The guarantee is that the vectorized + /// kernel will be safe to execute on workgroups with sizes smaller than + /// the vector width. Some architectures may want to predicate beyond that + /// remit for performance reasons, even if the vector-predicated operations + /// are safe to execute on any input. + eVectorPredication, + /// @brief Force a default vectorization width, made without + /// target-specific knowledge. + /// + /// @note This is most-commonly used in testing. Packetization may make + /// decisions based on the target, which can make testing more difficult. + /// This choice forces the default vector register width. + eTargetIndependentPacketization, + }; + + /// @brief Check if a choice is enabled or not + /// @param C The choice to check for + /// @return true if the choice is enabled, false otherwise + bool isEnabled(Choice C) const { return Enabled.contains(C); } + + /// @brief Enable a choice + /// @param C The choice to enable + /// @return true if the choice was already enabled, false otherwise + bool enable(Choice C) { + auto res = Enabled.insert(C); + return res.second; + } + + /// @brief Disable a choice + /// @param C The choice to disable + /// @return true if the choice was enabled, false otherwise + bool disable(Choice C) { return Enabled.erase(C); } + + /// @brief Parse a semicolon-separated list of Choices to enable or disable + /// + /// This function accepts a string of Choices, separated by semicolons, and + /// enables or disables them. The Choices are parsed according to the + /// following rules: + /// - The Choices are separated by a semicolon (';') character + /// - Only one separator is allowed between each Choice. + /// - Trailing separators are ignored (but only one is allowed still). + /// - Choices are specified as they are in their enumerations, without the + /// "e" prefix. + /// - Choices can be prefixed with the "no" prefix (without any whitespace), + /// which specifies that the Choice needs to be disabled instead of being + /// enabled. + /// - The "no" prefix only applies to the Choice it is attached to and not to + /// any following Choices. + /// - Whitespace between the Choices and the separators, as well as leading + /// and trailing whitespace at the beginning and end of the string, is + /// ignored.
+ /// + /// Examples: + /// - "PacketizeUniform" + /// - "PacketizeUniform;InstantiateCallsInLoops" + /// - "PacketizeUniform ; noInstantiateCallsInLoops \n" + /// - " noPacketizeUniform;noInstantiateCallsInLoops; " + /// + /// @param[in] Str The string containing the Choices to enable/disable + /// @return true on success, false if the parsing failed + bool parseChoicesString(llvm::StringRef Str); + + /// @brief Convert a Choice name from a string to the matching Choice value + /// + /// The choices are matched without their e prefix. + /// + /// @param[in] Str The string with the Choice name + /// @return The Choice name, or eInvalid in case of error + static Choice fromString(llvm::StringRef Str); + + // + // Specific getters and setters for the most commonly used choices + // + + /// @brief Check if the eOptimizationPacketizeUniform choice is set + /// @return true if the choice is set, false otherwise + bool packetizeUniform() const { + return isEnabled(eOptimizationPacketizeUniform); + } + /// @brief Enable the eOptimizationPacketizeUniform choice + /// @return true if eOptimizationPacketizeUniform was already enabled + bool enablePacketizeUniform() { + return enable(eOptimizationPacketizeUniform); + } + /// @brief Disable the eOptimizationPacketizeUniform choice + /// @return true if eOptimizationPacketizeUniform was enabled + bool disablePacketizeUniform() { + return disable(eOptimizationPacketizeUniform); + } + + /// @brief Check if the eOptimizationPacketizeUniformInLoops choice is set + /// @return true if the choice is set, false otherwise + bool packetizeUniformInLoops() const { + return isEnabled(eOptimizationPacketizeUniformInLoops); + } + /// @brief Enable the eOptimizationPacketizeUniformInLoops choice + /// @return true if eOptimizationPacketizeUniformInLoops was already enabled + bool enablePacketizeUniformInLoops() { + return enable(eOptimizationPacketizeUniformInLoops); + } + /// @brief Disable the eOptimizationPacketizeUniformInLoops choice + /// @return true if eOptimizationPacketizeUniformInLoops was enabled + bool disablePacketizeUniformInLoops() { + return disable(eOptimizationPacketizeUniformInLoops); + } + + /// @brief Check if the eOptimizationInstantiateCallsInLoops choice is set + /// @return true if the choice is set, false otherwise + bool instantiateCallsInLoops() const { + return isEnabled(eOptimizationInstantiateCallsInLoops); + } + /// @brief Enable the eOptimizationInstantiateCallsInLoops choice + /// @return true if eOptimizationInstantiateCallsInLoops was already enabled + bool enableInstantiateCallsInLoops() { + return enable(eOptimizationInstantiateCallsInLoops); + } + /// @brief Disable the eOptimizationInstantiateCallsInLoops choice + /// @return true if eOptimizationInstantiateCallsInLoops was enabled + bool disableInstantiateCallsInLoops() { + return disable(eOptimizationInstantiateCallsInLoops); + } + + /// @brief Check if the eLinearizeBOSCC choice is set + /// @return true if the choice is set, false otherwise + bool linearizeBOSCC() const { return isEnabled(eLinearizeBOSCC); } + /// @brief Enable the eLinearizeBOSCC choice + /// @return true if eLinearizeBOSCC was already enabled + bool enableLinearizeBOSCC() { return enable(eLinearizeBOSCC); } + /// @brief Disable the eLinearizeBOSCC choice + /// @return true if eLinearizeBOSCC was enabled + bool disableLinearizeBOSCC() { return disable(eLinearizeBOSCC); } + + /// @brief Check if the eVectorPredication choice is set + /// @return true if the choice is set, false otherwise + bool 
vectorPredication() const { return isEnabled(eVectorPredication); } + /// @brief Enable the eVectorPredication choice + /// @return true if eVectorPredication was already enabled + bool enableVectorPredication() { return enable(eVectorPredication); } + /// @brief Disable the eVectorPredication choice + /// @return true if eVectorPredication was enabled + bool disableVectorPredication() { return disable(eVectorPredication); } + + /// @brief Check if the eTargetIndependentPacketization choice is set + /// @return true if the choice is set, false otherwise + bool targetIndependentPacketization() const { + return isEnabled(eTargetIndependentPacketization); + } + /// @brief Enable the eTargetIndependentPacketization choice + /// @return true if eTargetIndependentPacketization was already enabled + bool enableTargetIndependentPacketization() { + return enable(eTargetIndependentPacketization); + } + /// @brief Disable the eTargetIndependentPacketization choice + /// @return true if eTargetIndependentPacketization was enabled + bool disableTargetIndependentPacketization() { + return disable(eTargetIndependentPacketization); + } + + struct ChoiceInfo { + llvm::StringLiteral name; + Choice number; + llvm::StringLiteral desc; + }; + + static llvm::ArrayRef<ChoiceInfo> queryAvailableChoices(); + +private: + /// @brief All the choices enabled + llvm::SmallSet<Choice, 16> Enabled; + + /// @brief Print an error message, used by parseChoicesString + /// + /// The error message will contain the message given as well as the Choices + /// string being parsed and the position that the error occurred. + // + /// @param[in] Input The Choices string being parsed + /// @param[in] Position The position where the parsing error occurred + /// @param[in] Msg The accompanying error message + static void printChoicesParseError(llvm::StringRef Input, unsigned Position, + llvm::Twine Msg); +}; + +} // namespace vecz +#endif // VECZ_VECZ_CHOICES_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h new file mode 100644 index 0000000000000..490247e70c995 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h @@ -0,0 +1,716 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief External vecz header. Contains the API to the vectorizer. + +#ifndef VECZ_VECZ_TARGET_INFO_H_INCLUDED +#define VECZ_VECZ_TARGET_INFO_H_INCLUDED + +#include +#include +#include +#include + +namespace llvm { +class TargetMachine; +class TargetTransformInfo; +class Type; +} // namespace llvm + +namespace vecz { +class VectorizationContext; + +/// @addtogroup vecz +/// @{ + +/// @brief Kinds of interleaved memory operations. +enum InterleavedOperation : int { + /// @brief Invalid memory operation.
+ eInterleavedInvalid = 0, + /// @brief Store memory operation. + eInterleavedStore, + /// @brief Load memory operation. + eInterleavedLoad, + /// @brief Masked Store memory operation. + eMaskedInterleavedStore, + /// @brief Masked Load memory operation. + eMaskedInterleavedLoad +}; + +/// @brief Used by the vectorizer to query for target capabilities and +/// materialize memory intrinsics. +class TargetInfo { +public: + /// @brief Create a new vector target info instance. + /// @param[in] tm LLVM target machine that will be used for compilation, can + /// be NULL if no target data is available. + TargetInfo(llvm::TargetMachine *tm); + + virtual ~TargetInfo() = default; + + /// @brief Return the target machine. + llvm::TargetMachine *getTargetMachine() const { return TM_; } + + /// @brief Create a vector load. If a stride greater than one is used, the + /// load will be interleaved, i.e. lanes are loaded from non-contiguous + /// memory. + /// + /// @note ptr refers to the unwidened element type, not the wide type. + /// ptr needs to be 'element aligned'. The element can itself be a + /// vector. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] ty Value type to load from memory. + /// @param[in] ptr Memory address to load a vector value from. + /// @param[in] stride Distance in elements between two lanes in memory. + /// A stride of one represents a contiguous load. + /// @param[in] alignment The alignment of the load, in bytes + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If null, the operation is unpredicated: + /// it is executed on all lanes. + /// + /// @return IR value that results from the vector load. + virtual llvm::Value *createLoad(llvm::IRBuilder<> &builder, llvm::Type *ty, + llvm::Value *ptr, llvm::Value *stride, + unsigned alignment, + llvm::Value *evl = nullptr) const; + + /// @brief Create a vector store. If a stride greater than one is used, the + /// store will be interleaved, i.e. lanes are stored to non-contiguous memory. + /// + /// @note ptr refers to the unwidened element type, not the wide type. + /// ptr needs to be 'element aligned'. The element can itself be a + /// vector. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] data Vector value to store to memory. + /// @param[in] ptr Memory address to store a vector value to. + /// @param[in] stride Distance in elements between two lanes in memory. + /// A stride of one represents a contiguous store. + /// @param[in] alignment The alignment of the store, in bytes + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If null, the operation is unpredicated: + /// it is executed on all lanes. + /// + /// @return IR value that results from the vector store. + virtual llvm::Value *createStore(llvm::IRBuilder<> &builder, + llvm::Value *data, llvm::Value *ptr, + llvm::Value *stride, unsigned alignment, + llvm::Value *evl = nullptr) const; + + /// @brief Create a masked vector load. + /// Only lanes with a non-zero mask will be loaded from the address. + /// Other lanes will contain undefined data. + /// + /// @note ptr refers to the unwidened element type, not the wide type. + /// ptr needs to be 'element aligned'. The element can itself be a + /// vector. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] ty Value type to load from memory. + /// @param[in] ptr Memory address to load a vector value from. 
+ /// @param[in] mask Vector mask used to disable loading certain lanes. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes but obeys the mask parameter. + /// @param[in] alignment Alignment of the load. + /// + /// @return IR value that results from the masked vector load. + virtual llvm::Value *createMaskedLoad(llvm::IRBuilder<> &builder, + llvm::Type *ty, llvm::Value *ptr, + llvm::Value *mask, llvm::Value *evl, + unsigned alignment) const; + + /// @brief Create a masked vector store. + /// Only lanes with a non-zero mask will be stored to the address. + /// + /// @note ptr refers to the unwidened element type, not the wide type. + /// ptr needs to be 'element aligned'. The element can itself be a + /// vector. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] data Vector value to store to memory. + /// @param[in] ptr Memory address to store a vector value to. + /// @param[in] mask Vector mask used to disable storing certain lanes. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes but obeys the mask parameter. + /// @param[in] alignment Alignment of the store. + /// + /// @return IR value that results from the masked vector store. + virtual llvm::Value *createMaskedStore(llvm::IRBuilder<> &builder, + llvm::Value *data, llvm::Value *ptr, + llvm::Value *mask, llvm::Value *evl, + unsigned alignment) const; + + /// @brief Create an interleaved vector load. + /// + /// @note Pointers are scalar and need to be 'scalar aligned'. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] ty Value type to load from memory. + /// @param[in] ptr Memory address to load a vector value from. + /// @param[in] stride Stride for interleaved memory operation. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes. + /// @param[in] alignment Alignment of the load. + /// + /// @return IR value that results from the interleaved load. + virtual llvm::Value *createInterleavedLoad(llvm::IRBuilder<> &builder, + llvm::Type *ty, llvm::Value *ptr, + llvm::Value *stride, + llvm::Value *evl, + unsigned alignment) const; + + /// @brief Create an interleaved vector store. + /// + /// @note Pointers are scalar and need to be 'scalar aligned'. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] data Vector value to store to memory. + /// @param[in] ptr Memory address to store a vector value to. + /// @param[in] stride Stride for interleaved memory operation. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes. + /// @param[in] alignment Alignment of the store. + /// + /// @return IR value that results from the interleaved vector store. + virtual llvm::Value * + createInterleavedStore(llvm::IRBuilder<> &builder, llvm::Value *data, + llvm::Value *ptr, llvm::Value *stride, + llvm::Value *evl, unsigned alignment) const; + + /// @brief Create a masked interleaved vector load.
+ /// Only lanes with a non-zero mask will be loaded from the address. + /// + /// @note Pointers are scalar and need to be 'scalar aligned'. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] ty Value type to load from memory. + /// @param[in] ptr Memory address to load a vector value from. + /// @param[in] mask Vector mask used to disable loading certain lanes. + /// @param[in] stride Stride for interleaved memory operation. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes but obeys the mask parameter. + /// @param[in] alignment Alignment of the load. + /// + /// @return IR value that results from the masked interleaved vector load. + virtual llvm::Value * + createMaskedInterleavedLoad(llvm::IRBuilder<> &builder, llvm::Type *ty, + llvm::Value *ptr, llvm::Value *mask, + llvm::Value *stride, llvm::Value *evl, + unsigned alignment) const; + + /// @brief Create a masked interleaved vector store. + /// Only lanes with a non-zero mask will be stored to the address. + /// + /// @note Pointers are scalar and need to be 'scalar aligned'. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] data Vector value to store to memory. + /// @param[in] ptr Memory address to store a vector value to. + /// @param[in] mask Vector mask used to disable storing certain lanes. + /// @param[in] stride Stride for interleaved memory operation. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes but obeys the mask parameter. + /// @param[in] alignment Alignment of the store. + /// + /// @return IR value that results from the masked interleaved vector store. + virtual llvm::Value * + createMaskedInterleavedStore(llvm::IRBuilder<> &builder, llvm::Value *data, + llvm::Value *ptr, llvm::Value *mask, + llvm::Value *stride, llvm::Value *evl, + unsigned alignment) const; + + /// @brief Create a gather vector load. + /// Vector lanes are loaded from different memory addresses. + /// + /// @note Pointers are scalar and need to be 'scalar aligned'. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] ty Value type to load from memory. + /// @param[in] ptr Memory address to load a vector value from. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes. + /// @param[in] alignment Alignment of the load. + /// + /// @return IR value that results from the gather vector load. + virtual llvm::Value *createGatherLoad(llvm::IRBuilder<> &builder, + llvm::Type *ty, llvm::Value *ptr, + llvm::Value *evl, + unsigned alignment) const; + + /// @brief Create a scatter vector store. + /// Vector lanes are stored to different memory addresses. + /// + /// @note Pointers are scalar and need to be 'scalar aligned'. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] data Vector value to store to memory. + /// @param[in] ptr Memory address to store a vector value to. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes.
+ /// @param[in] alignment Alignment of the store.
+ ///
+ /// @return IR value that results from the scatter vector store.
+ virtual llvm::Value *createScatterStore(llvm::IRBuilder<> &builder,
+ llvm::Value *data, llvm::Value *ptr,
+ llvm::Value *evl,
+ unsigned alignment) const;
+
+ /// @brief Create a masked gather vector load.
+ /// Only lanes with a non-zero mask will be loaded from different
+ /// addresses.
+ /// Other lanes will contain undefined data.
+ ///
+ /// @note Pointers are scalar and need to be 'scalar aligned'.
+ ///
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] ty Value type to load from memory.
+ /// @param[in] ptr Memory address to load a vector value from.
+ /// @param[in] mask Vector mask used to disable loading certain lanes.
+ /// @param[in] evl 'effective vector length' of the operation. Must be
+ /// pre-scaled for vector operations. If evl is null, the operation is not
+ /// length-predicated: it executes on all lanes but obeys the mask parameter.
+ /// @param[in] alignment Alignment of the load.
+ ///
+ /// @return IR value that results from the masked gather vector load.
+ virtual llvm::Value *createMaskedGatherLoad(llvm::IRBuilder<> &builder,
+ llvm::Type *ty, llvm::Value *ptr,
+ llvm::Value *mask,
+ llvm::Value *evl,
+ unsigned alignment) const;
+
+ /// @brief Create a masked scatter vector store.
+ /// Only lanes with a non-zero mask will be stored to the address.
+ ///
+ /// @note Pointers are scalar and need to be 'scalar aligned'.
+ ///
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] data Vector value to store to memory.
+ /// @param[in] ptr Memory address to store a vector value to.
+ /// @param[in] mask Vector mask used to disable storing certain lanes.
+ /// @param[in] evl 'effective vector length' of the operation. Must be
+ /// pre-scaled for vector operations. If evl is null, the operation is not
+ /// length-predicated: it executes on all lanes but obeys the mask parameter.
+ /// @param[in] alignment Alignment of the store.
+ ///
+ /// @return IR value that results from the masked scatter vector store.
+ virtual llvm::Value *
+ createMaskedScatterStore(llvm::IRBuilder<> &builder, llvm::Value *data,
+ llvm::Value *ptr, llvm::Value *mask,
+ llvm::Value *evl, unsigned alignment) const;
+
+ /// @brief Create a scalable extractelement instruction. Note that the
+ /// operands are expected to have been pre-packetized before passing to this
+ /// function.
+ ///
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] Ctx Vectorization context.
+ /// @param[in] extract The original pre-packetized extractelement Instruction
+ /// @param[in] narrowTy Narrowed type of @a extract.
+ /// @param[in] src The packetized source vector
+ /// @param[in] index The packetized extraction index
+ /// @param[in] evl 'Effective vector length' of the operation. Must be
+ /// pre-scaled for vector operations. If evl is null, the operation is not
+ /// length-predicated: it executes on all lanes.
+ ///
+ /// @return A value identical to the requested extractelement
+ virtual llvm::Value *createScalableExtractElement(
+ llvm::IRBuilder<> &builder, vecz::VectorizationContext &Ctx,
+ llvm::Instruction *extract, llvm::Type *narrowTy, llvm::Value *src,
+ llvm::Value *index, llvm::Value *evl) const;
+
+ /// @brief Create an outer broadcast of a vector.
An outer broadcast is one
+ /// where a vector with length V is replicated in its entirety N times across
+ /// the lanes of a larger vector with length L x V. The broadcast factor is
+ /// expected to be scalable:
+ ///
+ /// outer_broadcast(<A,B>, vscale x 1) -> <A,B, A,B, ..., A,B>
+ ///
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] vector Vector to broadcast.
+ /// @param[in] VL Vector length.
+ /// @param[in] factor Broadcast factor.
+ virtual llvm::Value *
+ createOuterScalableBroadcast(llvm::IRBuilder<> &builder, llvm::Value *vector,
+ llvm::Value *VL,
+ llvm::ElementCount factor) const;
+
+ /// @brief Create an inner broadcast of a vector. An inner broadcast is one
+ /// where a vector with length V has its lanes individually and sequentially
+ /// replicated N times to fill a larger vector with length L x V. The
+ /// broadcast factor is expected to be a fixed amount:
+ ///
+ /// inner_broadcast(<A,B>, 2) -> <A,A,B,B>
+ ///
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] vector Vector to broadcast.
+ /// @param[in] VL Vector length.
+ /// @param[in] factor Broadcast factor.
+ virtual llvm::Value *
+ createInnerScalableBroadcast(llvm::IRBuilder<> &builder, llvm::Value *vector,
+ llvm::Value *VL,
+ llvm::ElementCount factor) const;
+
+ /// @brief Utility function for packetizing an insertelement instruction by a
+ /// scalable factor. Note that the operands are expected to have been
+ /// pre-packetized before passing to this function.
+ ///
+ /// @param[in] builder the builder to create the needed instructions
+ /// @param[in] Ctx Vectorization context.
+ /// @param[in] insert the original pre-packetized insertelement Instruction
+ /// @param[in] elt the packetized element to insert
+ /// @param[in] into the packetized source vector
+ /// @param[in] index the packetized insertion index
+ /// @param[in] evl 'Effective vector length' of the operation. Must be
+ /// pre-scaled for vector operations. If evl is null, the operation is not
+ /// length-predicated: it executes on all lanes.
+ ///
+ /// @return a value identical to the requested insertelement
+ virtual llvm::Value *createScalableInsertElement(
+ llvm::IRBuilder<> &builder, vecz::VectorizationContext &Ctx,
+ llvm::Instruction *insert, llvm::Value *elt, llvm::Value *into,
+ llvm::Value *index, llvm::Value *evl) const;
+
+ /// @brief Function allowing targets to customize the insertion of
+ /// instructions to calculate the vector-predicated kernel width.
+ ///
+ /// Note that this must return an i32 expression equivalent to:
+ /// umin(%factor, %remainingIters)
+ /// This is the expression computed if this function returns nullptr.
+ ///
+ /// @param[in] builder the builder to create the needed instructions
+ /// @param[in] remainingIters the remaining number of work-items being
+ /// executed in the work-group in the dimension being vectorized.
+ /// @param[in] widestEltTy an optimization hint indicating the widest (vector
+ /// element) type in the kernel. Must not be relied on for correctness.
+ /// @param[in] factor the vectorization width.
+ virtual llvm::Value *createVPKernelWidth(llvm::IRBuilder<> &builder,
+ llvm::Value *remainingIters,
+ unsigned widestEltTy,
+ llvm::ElementCount factor) const {
+ (void)builder;
+ (void)remainingIters;
+ (void)widestEltTy;
+ (void)factor;
+ return nullptr;
+ }
+
+ /// @brief Create a single-source vector shuffle with a general shuffle mask.
+ /// Can work with dynamic shuffle masks and scalable vectors, and can return
+ /// vectors of a different length to the source.
+ ///
+ /// @param[in] builder the builder to create the needed instructions
+ /// @param[in] src the source vector
+ /// @param[in] mask the shuffle mask
+ /// @param[in] evl 'Effective vector length' of the operation. Must be
+ /// pre-scaled for vector operations. If evl is null, the operation is not
+ /// length-predicated: it executes on all lanes.
+ ///
+ /// @return the result of the shuffle operation
+ virtual llvm::Value *createVectorShuffle(llvm::IRBuilder<> &builder,
+ llvm::Value *src, llvm::Value *mask,
+ llvm::Value *evl) const;
+
+ /// @brief Create a vector slide-up operation that moves all vector elements
+ /// up by one place, with the specified element inserted into the zeroth
+ /// position.
+ ///
+ /// @param[in] builder the builder to create the needed instructions
+ /// @param[in] src the source vector
+ /// @param[in] insert the value to slide into the vacant position
+ /// @param[in] evl 'Effective vector length' of the operation. Must be
+ /// pre-scaled for vector operations. If evl is null, the operation is not
+ /// length-predicated: it executes on all lanes.
+ ///
+ /// @return the result of the slide-up operation
+ virtual llvm::Value *createVectorSlideUp(llvm::IRBuilder<> &builder,
+ llvm::Value *src,
+ llvm::Value *insert,
+ llvm::Value *evl) const;
+
+ /// @brief Determine whether the specified group of interleaved memory
+ /// instructions can be optimized or not.
+ ///
+ /// @param[in] val Memory access operation.
+ /// @param[in] kind Kind of interleaved instructions.
+ /// @param[in] stride Stride of the interleaved memory operations.
+ /// @param[in] groupSize Number of interleaved operations in the group.
+ ///
+ /// @return true if the interleaved group can be optimized, false otherwise.
+ virtual bool canOptimizeInterleavedGroup(const llvm::Instruction &val,
+ InterleavedOperation kind,
+ int stride,
+ unsigned groupSize) const;
+
+ /// @brief Try to optimize a group of consecutive interleaved vector memory
+ /// instructions. These instructions collectively access a consecutive chunk
+ /// of memory and are sorted by increasing address.
+ ///
+ /// @note Pointers are scalar and need to be 'scalar aligned'.
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] Kind Kind of interleaved group to look for.
+ /// @param[in] group List of interleaved operations.
+ /// @param[in] masks List of mask operands.
+ /// @param[in] baseAddress Base pointer for the memory operation.
+ /// @param[in] stride Stride of the interleaved memory operations.
+ ///
+ /// @return true if the interleaved group was optimized, false otherwise.
+ virtual bool optimizeInterleavedGroup(llvm::IRBuilder<> &builder,
+ InterleavedOperation Kind,
+ llvm::ArrayRef<llvm::Value *> group,
+ llvm::ArrayRef<llvm::Value *> masks,
+ llvm::Value *baseAddress,
+ int stride) const;
+
+ /// @brief (De-)interleave a list of vectors.
+ ///
+ /// @param[in] builder Builder used to generate new instructions.
+ /// @param[in,out] vectors List of vectors to (de-)interleave.
+ /// @param[in] forward true to interleave, false to deinterleave.
+ ///
+ /// @return true if the vectors were (de-)interleaved, false otherwise.
+ virtual bool interleaveVectors(llvm::IRBuilder<> &builder,
+ llvm::MutableArrayRef<llvm::Value *> vectors,
+ bool forward) const;
+
+ /// @brief Estimates the widest SIMD width that will fit into registers for a
+ /// given set of values.
+ ///
+ /// @param[in] TTI the Target Transform Info
+ /// @param[in] vals Set of values to fit into registers
+ /// @param[in] width the widest SIMD width to consider
+ /// @return the widest SIMD width that is expected to fit into registers, or
+ /// zero if the set can never fit into registers.
+ virtual unsigned
+ estimateSimdWidth(const llvm::TargetTransformInfo &TTI,
+ const llvm::ArrayRef<const llvm::Value *> vals,
+ unsigned width) const;
+
+ /// @brief Get the preferred vector width for the given scalar type
+ ///
+ /// @param[in] TTI the Target Transform Info
+ /// @param[in] Ty the scalar type to get the width for
+ /// @return the preferred vector width
+ virtual unsigned getVectorWidthForType(const llvm::TargetTransformInfo &TTI,
+ const llvm::Type &Ty) const;
+
+ /// @brief Return whether the value can be packetized by the given width.
+ ///
+ /// @param[in] Val The value to be packetized
+ /// @param[in] Width The vectorization factor by which to packetize Val
+ /// @return true if the value can be packetized, false otherwise.
+ virtual bool canPacketize(const llvm::Value *Val,
+ llvm::ElementCount Width) const;
+
+ /// @return Whether a given vector type would be legal as the result of a
+ /// binary vp intrinsic.
+ virtual bool isVPVectorLegal(const llvm::Function &F, llvm::Type *Ty) const;
+
+protected:
+ /// @brief This type indicates legality of a VP/Masked memory operation in a
+ /// target.
+ class VPMemOpLegality {
+ public:
+ constexpr VPMemOpLegality() = default;
+ constexpr VPMemOpLegality(bool VPLegal, bool MaskLegal)
+ : VPLegal(VPLegal), MaskLegal(MaskLegal) {}
+
+ /// @brief States whether the operation is legal or not as a VP intrinsic.
+ void setVPLegality(bool Legal) { VPLegal = Legal; }
+
+ /// @brief States whether the operation is legal or not as a masked memory
+ /// operation.
+ void setMaskLegality(bool Legal) { MaskLegal = Legal; }
+
+ /// @brief Tests whether the operation is legal as a VP intrinsic.
+ constexpr bool isVPLegal() const { return VPLegal; }
+
+ /// @brief Tests whether the operation is legal as a masked memory
+ /// operation.
+ constexpr bool isMaskLegal() const { return MaskLegal; }
+
+ private:
+ bool VPLegal = false;
+ bool MaskLegal = false;
+ };
+
+ /// @brief Create an indices vector to be used in createScalableBroadcast()
+ ///
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] ty Type of the indices vector.
+ /// @param[in] factor Vectorization factor.
+ /// @param[in] URem Whether to broadcast a fixed-length vector to a scalable
+ /// one or a scalable-vector by a fixed amount.
+ /// @param[in] N Name of the value to produce.
+ static llvm::Value *createBroadcastIndexVector(llvm::IRBuilder<> &builder,
+ llvm::Type *ty,
+ llvm::ElementCount factor,
+ bool URem,
+ const llvm::Twine &N = "");
+
+ /// @return A VPMemOpLegality enum stating whether we can create a vp.load or
+ /// a masked.load intrinsic.
+ ///
+ /// @param[in] F The function in which the instruction will be created.
+ /// @param[in] Ty Type of the vector to load.
+ /// @param[in] Alignment Alignment of the operation.
+ /// @param[in] AddrSpace Address space of the operation.
+ virtual VPMemOpLegality isVPLoadLegal(const llvm::Function *F, llvm::Type *Ty,
+ unsigned Alignment,
+ unsigned AddrSpace) const;
+
+ /// @return A VPMemOpLegality enum stating whether we can create a vp.store or
+ /// a masked.store intrinsic.
+ ///
+ /// @param[in] F The function in which the instruction will be created.
+ /// @param[in] Ty Type of the vector to store.
+ /// @param[in] Alignment Alignment of the operation.
+ /// @param[in] AddrSpace Address space of the operation.
+ virtual VPMemOpLegality isVPStoreLegal(const llvm::Function *F,
+ llvm::Type *Ty, unsigned Alignment,
+ unsigned AddrSpace) const;
+
+ /// @return A VPMemOpLegality enum stating whether we can create a vp.gather
+ /// or a masked.gather intrinsic.
+ ///
+ /// @param[in] F The function in which the instruction will be created.
+ /// @param[in] Ty Type of the vector to gather.
+ /// @param[in] Alignment Alignment of the operation.
+ /// @param[in] AddrSpace Address space of the operation.
+ virtual VPMemOpLegality isVPGatherLegal(const llvm::Function *F,
+ llvm::Type *Ty, unsigned Alignment,
+ unsigned AddrSpace) const;
+
+ /// @return A VPMemOpLegality enum stating whether we can create a vp.scatter
+ /// or a masked.scatter intrinsic.
+ ///
+ /// @param[in] F The function in which the instruction will be created.
+ /// @param[in] Ty Type of the vector to scatter.
+ /// @param[in] Alignment Alignment of the operation.
+ /// @param[in] AddrSpace Address space of the operation.
+ virtual VPMemOpLegality isVPScatterLegal(const llvm::Function *F,
+ llvm::Type *Ty, unsigned Alignment,
+ unsigned AddrSpace) const;
+
+ /// @brief Function to check whether a given type is valid as the element type
+ /// of a scalable vector used in a VP intrinsic.
+ ///
+ /// @param[in] Ty The type to be checked.
+ virtual bool isLegalVPElementType(llvm::Type *Ty) const;
+
+ /// @brief LLVM target machine that will be used for compilation.
+ llvm::TargetMachine *TM_;
+
+private:
+ /// @brief Helper function to check legality of memory operations.
+ ///
+ /// @return Illegal in LLVM < 13; checks legality in LLVM >= 13.
+ VPMemOpLegality
+ checkMemOpLegality(const llvm::Function *F,
+ llvm::function_ref Checker,
+ llvm::Type *Ty, unsigned Alignment,
+ unsigned AddrSpace) const;
+
+ /// @brief Create a broadcast of a vector.
+ ///
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] vector Vector to broadcast.
+ /// @param[in] VL Vector length.
+ /// @param[in] factor Vectorization factor.
+ /// @param[in] URem Whether to broadcast a fixed-length vector to a scalable
+ /// one or a scalable-vector by a fixed amount.
+ llvm::Value *createScalableBroadcast(llvm::IRBuilder<> &builder,
+ llvm::Value *vector, llvm::Value *VL,
+ llvm::ElementCount factor,
+ bool URem) const;
+};
+
+/// @brief Caches and returns the TargetInfo for a Module.
+class TargetInfoAnalysis : public llvm::AnalysisInfoMixin<TargetInfoAnalysis> {
+ friend AnalysisInfoMixin<TargetInfoAnalysis>;
+
+public:
+ struct Result {
+ Result(std::unique_ptr<TargetInfo> &&I) : Info(std::move(I)) {}
+ /// Handle the invalidation of this information.
+ ///
+ /// When used as a result of TargetInfoAnalysis this method will be called
+ /// when the function this was computed for changes. When it returns false,
+ /// the information is preserved across those changes.
+ bool invalidate(llvm::Module &, const llvm::PreservedAnalyses &,
+ llvm::ModuleAnalysisManager::Invalidator &) {
+ return false;
+ }
+
+ operator TargetInfo *() { return Info.get(); }
+ operator const TargetInfo *() const { return Info.get(); }
+
+ std::unique_ptr<TargetInfo> Info;
+ };
+
+ using CallbackFn = std::function<Result(llvm::Module &)>;
+
+ TargetInfoAnalysis();
+
+ TargetInfoAnalysis(llvm::TargetMachine *TM);
+
+ TargetInfoAnalysis(CallbackFn TICallback) : TICallback(TICallback) {}
+
+ /// @brief Retrieve the TargetInfo for the requested module.
+ Result run(llvm::Module &M, llvm::ModuleAnalysisManager &) {
+ return TICallback(M);
+ }
+
+ /// @brief Return the name of the pass.
+ static llvm::StringRef name() { return "TargetInfo analysis"; }
+
+private:
+ /// @brief Unique pass identifier.
+ static llvm::AnalysisKey Key;
+
+ /// @brief Callback function producing a TargetInfo on demand.
+ CallbackFn TICallback;
+};
+
+std::unique_ptr<TargetInfo> createTargetInfoArm(llvm::TargetMachine *tm);
+
+std::unique_ptr<TargetInfo> createTargetInfoAArch64(llvm::TargetMachine *tm);
+
+std::unique_ptr<TargetInfo> createTargetInfoRISCV(llvm::TargetMachine *tm);
+
+/// @brief Create a new vector target info instance.
+/// @param[in] tm LLVM target machine that will be used for compilation, can
+/// be NULL if no target data is available.
+/// @return The new TargetInfo instance.
+std::unique_ptr<TargetInfo>
+createTargetInfoFromTargetMachine(llvm::TargetMachine *tm);
+
+/// @}
+} // namespace vecz
+
+#endif // VECZ_VECZ_TARGET_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp
new file mode 100644
index 0000000000000..cdeb01e71d77c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp
@@ -0,0 +1,99 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "analysis/control_flow_analysis.h"
+
+#include <llvm/ADT/PostOrderIterator.h>
+#include <llvm/Analysis/CFG.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Instructions.h>
+
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+
+#define DEBUG_TYPE "vecz-cf"
+
+using namespace llvm;
+using namespace vecz;
+
+////////////////////////////////////////////////////////////////////////////////
+
+llvm::AnalysisKey CFGAnalysis::Key;
+
+CFGResult CFGAnalysis::run(llvm::Function &F,
+ llvm::FunctionAnalysisManager &AM) {
+ CFGResult Res;
+
+ LLVM_DEBUG(dbgs() << "CONTROL FLOW ANALYSIS\n");
+
+ const UniformValueResult &UVR = AM.getResult<UniformValueAnalysis>(F);
+
+ bool mayDiverge = false;
+ for (BasicBlock &BB : F) {
+ // Update diverge information for a block which has varying branch.
+ auto *term = BB.getTerminator();
+ if (isa<ReturnInst>(term) || isa<UnreachableInst>(term)) {
+ // an "unreachable" terminator may be generated from an "optimization"
+ // of undefined behaviour in the IR; where a "trap" call has been
+ // introduced, the end of the Basic Block will never be reached.
+ // This should still be regarded as an exit block for our purposes.
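+ // (Illustrative: a block ending in
+ //   call void @llvm.trap()
+ //   unreachable
+ // has no successors and is treated here just like a 'ret' block.)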
+ if (Res.exitBB) {
+ emitVeczRemarkMissed(&F, &F,
+ "CFG should not have more than one exit block.");
+ Res.setFailed(true);
+ return Res;
+ }
+ Res.exitBB = &BB;
+ LLVM_DEBUG(dbgs() << BB.getName() << " returns\n");
+ } else if (BranchInst *B = dyn_cast<BranchInst>(term)) {
+ if (B->isConditional()) {
+ auto *const cond = B->getCondition();
+ if (cond && UVR.isVarying(cond)) {
+ mayDiverge = true;
+ }
+ }
+ } else if (isa<SwitchInst>(term)) {
+ // Control Flow Conversion Pass is not able to handle switch instructions.
+ emitVeczRemarkMissed(&F, &F, "Unexpected Switch instruction.");
+ Res.setFailed(true);
+ return Res;
+ }
+ }
+
+ if (!Res.getExitBlock()) {
+ emitVeczRemarkMissed(&F, &F, "Non-terminating CFG in");
+ Res.setFailed(true);
+ return Res;
+ }
+
+ const LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+ using RPOTraversal = ReversePostOrderTraversal<const Function *>;
+ const RPOTraversal FuncRPOT(&F);
+ if (containsIrreducibleCFG(FuncRPOT, LI)) {
+ emitVeczRemarkMissed(&F, &F, "Irreducible loop detected in");
+ Res.setFailed(true);
+ return Res;
+ }
+
+ if (mayDiverge) {
+ Res.setConversionNeeded(true);
+ }
+
+ return Res;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
new file mode 100644
index 0000000000000..39c78b01a2bf4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
@@ -0,0 +1,808 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "analysis/divergence_analysis.h"
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/PostOrderIterator.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Analysis/PostDominators.h>
+#include <llvm/IR/Dominators.h>
+#include <llvm/IR/Function.h>
+
+#include <algorithm>
+#include <memory>
+
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+using RPOT = ReversePostOrderTraversal<Function *>;
+} // namespace
+
+BlockQueue::BlockQueue(const DivergenceResult &dr,
+ const DenseSet<BasicBlock *> &blocks)
+ : DR(dr) {
+ indices.reserve(blocks.size());
+ for (auto *const BB : blocks) {
+ indices.push_back(DR.getTagIndex(BB));
+ }
+
+ // Note that make_heap builds a Max heap, so we use `std::greater` to get a
+ // Min heap.
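+ // (Illustrative: if tag indices {5, 2, 7} are pushed, pop() yields 2, then
+ // 5, then 7, so blocks come out in increasing dominance-compact order.)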
+ std::make_heap(indices.begin(), indices.end(), std::greater<size_t>());
+}
+
+const BasicBlockTag &BlockQueue::pop() {
+ assert(!indices.empty() && "Trying to pop from an empty BlockQueue");
+ std::pop_heap(indices.begin(), indices.end(), std::greater<size_t>());
+ const auto popped_index = indices.back();
+ indices.pop_back();
+
+ return DR.getBlockTag(popped_index);
+}
+
+void BlockQueue::push(size_t index) {
+ indices.push_back(index);
+ std::push_heap(indices.begin(), indices.end(), std::greater<size_t>());
+}
+
+void BlockQueue::push(const BasicBlock *bb) {
+ indices.push_back(DR.getTagIndex(bb));
+ std::push_heap(indices.begin(), indices.end(), std::greater<size_t>());
+}
+
+DivergenceResult::DivergenceResult(Function &F, FunctionAnalysisManager &AM)
+ : F(F), AM(AM) {}
+
+size_t DivergenceResult::getTagIndex(const llvm::BasicBlock *BB) const {
+ assert(BB && "Trying to get the tag of a null BasicBlock");
+ auto iter = BBMap.find(BB);
+ assert(iter != BBMap.end() && "BasicBlock tag is not defined");
+ return iter->second;
+}
+
+BasicBlockTag &DivergenceResult::getOrCreateTag(BasicBlock *BB) {
+ assert(BB && "Trying to get the tag of a null BasicBlock");
+ const auto &result = BBMap.try_emplace(BB, basicBlockTags.size());
+ if (result.second) {
+ // It's a new map entry, so create the new tag and return it.
+ basicBlockTags.emplace_back();
+ auto &tag = basicBlockTags.back();
+ tag.BB = BB;
+ return tag;
+ }
+ // Return the indexed tag.
+ return basicBlockTags[result.first->second];
+}
+
+LoopTag &DivergenceResult::getTag(const Loop *L) const {
+ assert(L && "Trying to get the tag of a null loop");
+ auto iter = LMap.find(L);
+ assert(iter != LMap.end() && "Loop tag is not defined");
+ return *iter->second;
+}
+
+LoopTag &DivergenceResult::getOrCreateTag(Loop *L) {
+ assert(L && "Trying to get or create the tag of a null loop");
+ auto &tag = LMap[L];
+ if (!tag) {
+ tag = std::make_unique<LoopTag>();
+ tag->loop = L;
+ }
+ return *tag;
+}
+
+bool DivergenceResult::hasFlag(const BasicBlock &BB,
+ BlockDivergenceFlag F) const {
+ return (getTag(&BB).divergenceFlag & F) == F;
+}
+
+BlockDivergenceFlag DivergenceResult::getFlag(const BasicBlock &BB) const {
+ return getTag(&BB).divergenceFlag;
+}
+
+void DivergenceResult::setFlag(const BasicBlock &BB, BlockDivergenceFlag F) {
+ auto &tag = getTag(&BB);
+ tag.divergenceFlag = static_cast<BlockDivergenceFlag>(tag.divergenceFlag | F);
+}
+
+void DivergenceResult::clearFlag(const BasicBlock &BB, BlockDivergenceFlag F) {
+ auto &tag = getTag(&BB);
+ tag.divergenceFlag =
+ static_cast<BlockDivergenceFlag>(tag.divergenceFlag & ~F);
+}
+
+bool DivergenceResult::isDivCausing(const BasicBlock &BB) const {
+ return (hasFlag(BB, BlockDivergenceFlag::eBlockHasDivergentBranch) ||
+ hasFlag(BB, BlockDivergenceFlag::eBlockHasDivergentBranchFake));
+}
+
+bool DivergenceResult::isDivergent(const BasicBlock &BB) const {
+ return hasFlag(BB, BlockDivergenceFlag::eBlockIsDivergent);
+}
+
+bool DivergenceResult::isOptional(const BasicBlock &BB) const {
+ return !isDivergent(BB);
+}
+
+bool DivergenceResult::isByAll(const BasicBlock &BB) const {
+ return hasFlag(BB, BlockDivergenceFlag::eBlockIsByAll);
+}
+
+bool DivergenceResult::isBlend(const BasicBlock &BB) const {
+ return hasFlag(BB, BlockDivergenceFlag::eBlockIsBlend);
+}
+
+bool DivergenceResult::isUniform(const BasicBlock &BB) const {
+ return hasFlag(BB, BlockDivergenceFlag::eBlockIsUniform);
+}
+
+bool DivergenceResult::hasFlag(const Loop &L, LoopDivergenceFlag F) const {
+ return (getTag(&L).divergenceFlag & F) == F;
+}
+
+LoopDivergenceFlag DivergenceResult::getFlag(const Loop
&L) const {
+ return getTag(&L).divergenceFlag;
+}
+
+void DivergenceResult::setFlag(const Loop &L, LoopDivergenceFlag F) {
+ auto &tag = getTag(&L);
+ tag.divergenceFlag = static_cast<LoopDivergenceFlag>(tag.divergenceFlag | F);
+}
+
+void DivergenceResult::clearFlag(const Loop &L, LoopDivergenceFlag F) {
+ auto &tag = getTag(&L);
+ tag.divergenceFlag = static_cast<LoopDivergenceFlag>(tag.divergenceFlag & ~F);
+}
+
+bool DivergenceResult::computeBlockOrdering(DominatorTree &DT) {
+ LLVM_DEBUG(dbgs() << "Divergence Analysis: COMPUTE BLOCK ORDERING\n");
+
+ // The DCBI (Dominance Compact Block Indexing) is a topological ordering of
+ // the basic blocks that is also dominance compact, that is, an ordering such
+ // that for any block A, every block that A dominates follows in a contiguous
+ // subsequence in the ordering. To construct this, we gather a reverse post-
+ // order traversal over the CFG, and then a depth-first traversal over the
+ // dominator tree, ordering each node's children according to the previously
+ // calculated reverse post-order. We need to take special care of loop exits,
+ // however, since where a loop exits from some block other than a latch,
+ // the dominator tree traversal can erroneously order it inside of the loop.
+ // To prevent this, we store up exit blocks until we have processed all
+ // the blocks at the current loop level.
+
+ struct DCnode {
+ BasicBlock *BB;
+ unsigned depth = 0;
+ };
+ std::vector<DCnode> graph;
+ llvm::DenseMap<const BasicBlock *, unsigned> indexMap;
+
+ indexMap.reserve(F.size());
+ {
+ // Note that a post-order traversal of the CFG does not include any blocks
+ // with no predecessors, other than the entry block.
+ unsigned index = 0;
+ for (auto *const BB : RPOT(&F)) {
+ indexMap[BB] = index++;
+ graph.emplace_back();
+ graph.back().BB = BB;
+
+ if (const auto *const LTag = getTag(BB).loop) {
+ graph.back().depth = LTag->loop->getLoopDepth();
+ }
+ }
+ }
+
+ // Do a depth-first traversal of the dominator tree
+ SmallVector<unsigned, 16> stack;
+ stack.push_back(0);
+ uint32_t pos = 0;
+ const SmallVector<unsigned, 16> children;
+ SmallVector<unsigned, 16> loopExits;
+ while (!stack.empty()) {
+ const auto u = stack.pop_back_val();
+ const auto &uNode = graph[u];
+
+ getTag(uNode.BB).pos = pos++;
+
+ // Children in the same loop or subloops get added back to the stack.
+ // Children outside of the current loop get stored up until we have
+ // processed everything in this loop. Note that we can accumulate exit
+ // blocks from multiple points within the loop, and across multiple depth
+ // levels.
+ auto *const DTNode = DT.getNode(uNode.BB);
+ unsigned stacked = 0;
+ for (auto *const childNode : make_range(DTNode->begin(), DTNode->end())) {
+ const auto child = indexMap[childNode->getBlock()];
+ auto &cNode = graph[child];
+ if (cNode.depth >= uNode.depth) {
+ stack.push_back(child);
+ ++stacked;
+ } else {
+ // Note that we can exit across more than one loop level, so we need to
+ // find the right place to insert it.
+ auto insert = loopExits.end();
+ while (insert != loopExits.begin()) {
+ auto scan = insert - 1;
+ if (cNode.depth < graph[*scan].depth) {
+ insert = scan;
+ } else {
+ break;
+ }
+ }
+ loopExits.insert(insert, child);
+ }
+ }
+ // Sort any children added to the stack into post-order
+ std::sort(stack.end() - stacked, stack.end(), std::greater<unsigned>());
+
+ if (!loopExits.empty()) {
+ const unsigned curDepth = stack.empty() ?
0 : graph[stack.back()].depth;
+ const unsigned depth = std::max(curDepth, graph[loopExits.back()].depth);
+ unsigned count = 0;
+ while (!loopExits.empty() && depth == graph[loopExits.back()].depth) {
+ stack.push_back(loopExits.pop_back_val());
+ ++count;
+ }
+
+ // Sort the loop exits into post-order
+ std::sort(stack.end() - count, stack.end(), std::greater<unsigned>());
+ }
+ }
+ assert(pos == graph.size() && "Incomplete DCBI");
+
+ reorderTags(pos);
+ return true;
+}
+
+void DivergenceResult::reorderTags(size_t n) {
+ numOrderedBlocks = n;
+
+ // This is a Cycle Sort. It re-orders the tags in the tag vector according to
+ // their calculated block index. Despite the two nested loops, it is O(n).
+ // Out-of-range indices (pos >= n) will be left where they are, but a later
+ // ordered tag might move them afterwards.
+ for (size_t i = 0, n = basicBlockTags.size(); i != n; ++i) {
+ auto &tag = basicBlockTags[i];
+ while (tag.pos < n && tag.pos != i) {
+ std::swap(tag, basicBlockTags[tag.pos]);
+ }
+ }
+
+ // Rebuild the index map after sorting. Note that we can't absorb this into
+ // the above loop, since an unordered tag might not be in its final position
+ // until all of the ordered tags are in their correct places.
+ for (size_t i = 0, n = basicBlockTags.size(); i != n; ++i) {
+ BBMap[basicBlockTags[i].BB] = i;
+ }
+}
+
+bool DivergenceResult::computeLoopOrdering() {
+ loopOrdering.clear();
+ for (const auto &pair : LMap) {
+ loopOrdering.push_back(pair.second.get());
+ }
+
+ std::sort(loopOrdering.begin(), loopOrdering.end(),
+ [](const LoopTag *LHS, const LoopTag *RHS) -> bool {
+ return LHS->loop->getLoopDepth() < RHS->loop->getLoopDepth();
+ });
+
+ return true;
+}
+
+void DivergenceResult::markDivCausing(BasicBlock &BB, DivergenceInfo &DI,
+ PostDominatorTree &PDT) {
+ if (isDivCausing(BB)) {
+ return;
+ }
+
+ divCausingBlocks.push_back(&BB);
+ setFlag(BB, BlockDivergenceFlag::eBlockHasDivergentBranch);
+ LLVM_DEBUG(dbgs() << "Block " << BB.getName() << " is div_causing\n");
+
+ for (BasicBlock *succ : successors(&BB)) {
+ markDivergent(*succ);
+ }
+
+ // If a block is a join point (blend) of `BB`, then it is divergent (unless
+ // it is the post-dominator of `BB`).
+ const auto &joins = joinPoints(BB);
+ for (BasicBlock *const join : joins) {
+ setFlag(*join, BlockDivergenceFlag::eBlockIsBlend);
+ LLVM_DEBUG(dbgs() << "\tBlock " << join->getName() << " is blend\n");
+
+ if (!PDT.dominates(join, &BB)) {
+ markDivergent(*join);
+ }
+
+ for (BasicBlock *const pred : predecessors(join)) {
+ // If at least 2 successors of `pred` are join points of `BB`, then mark
+ // `pred` as a fake div causing block because its successors may be
+ // executed by multiple work-items.
+ if (std::count_if(
+ succ_begin(pred), succ_end(pred),
+ [&joins](BasicBlock *succ) { return joins.count(succ); }) > 1) {
+ fakeDivCausingBlocks.insert(pred);
+ }
+ }
+
+ // Join points of divergent branches need their PHIs marked varying.
+ DI.insert(join);
+ }
+}
+
+void DivergenceResult::markDivLoopDivBlocks(BasicBlock &BB, Loop &L,
+ DivergenceInfo &DI) {
+ markDivergent(L);
+
+ // Find loop exits through which some work-items may leave the loop while
+ // others keep iterating over it. These exit blocks can be reached from the
+ // div_causing block before reaching the latch because the divergent path
+ // cannot fully reconverge before leaving the loop (since the loop is
+ // divergent).
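+ // (Illustrative: in 'while (i < n) { if (id < k) break; ... }' with a
+ // varying 'id', some work-items take the 'break' exit while others keep
+ // iterating, so that exit block is divergent.)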
+ SmallVector<BasicBlock *, 4> exits;
+ L.getExitBlocks(exits);
+ const auto &divergentExits = escapePoints(BB, L);
+ for (BasicBlock *E : exits) {
+ if (divergentExits.contains(E)) {
+ markDivergent(*E);
+ }
+ // All loop exits of a divergent loop need their PHIs marked varying.
+ DI.insert(E);
+ }
+
+ // The latch of a divergent loop is divergent.
+ markDivergent(*L.getLoopLatch());
+}
+
+void DivergenceResult::markDivergent(const BasicBlock &BB) {
+ if (!isDivergent(BB)) {
+ setFlag(BB, BlockDivergenceFlag::eBlockIsDivergent);
+ LLVM_DEBUG(dbgs() << "\tBlock " << BB.getName() << " is divergent\n");
+ }
+}
+
+void DivergenceResult::markDivergent(const Loop &L) {
+ if (!getTag(&L).isLoopDivergent()) {
+ setFlag(L, LoopDivergenceFlag::eLoopIsDivergent);
+ LLVM_DEBUG(dbgs() << "\tLoop " << L.getName() << " is divergent\n");
+ }
+}
+
+void DivergenceResult::markByAll(BasicBlock &src) {
+ Function &F = *src.getParent();
+ const DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ const PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+
+ BlockQueue queue(*this);
+ queue.push(&src);
+
+ while (!queue.empty()) {
+ auto &BBTag = queue.pop();
+ auto *const BB = BBTag.BB;
+
+ if (isByAll(*BB)) {
+ continue;
+ }
+
+ const bool isHeaderDivLoop =
+ BBTag.isLoopHeader() && BBTag.loop->isLoopDivergent();
+ // If BB is a loop header, it can only be marked by_all if its loop does not
+ // diverge.
+ if (!isHeaderDivLoop) {
+ setFlag(*BB, BlockDivergenceFlag::eBlockIsByAll);
+ LLVM_DEBUG(dbgs() << "Block " << BB->getName() << " is by_all\n");
+ }
+
+ SmallVector<BasicBlock *, 8> descendants;
+ DT.getDescendants(BB, descendants);
+
+ // For all descendants `D` of `BB` that post-dominate `BB`, `D` is by_all.
+ for (BasicBlock *D : descendants) {
+ if (D != BB) {
+ if (PDT.dominates(D, BB)) {
+ const auto DIndex = getTagIndex(D);
+ const auto *const DLoopTag = basicBlockTags[DIndex].loop;
+ // If we are not in a loop, or the loop we live in does not diverge,
+ // nor does the enclosing one if it exists, then mark by_all.
+ if (DLoopTag) {
+ if (DLoopTag->isLoopDivergent())
+ continue;
+ Loop *parentLoop = DLoopTag->loop->getParentLoop();
+ if (parentLoop && !isByAll(*parentLoop->getHeader()))
+ continue;
+ }
+ queue.push(DIndex);
+ }
+ }
+ }
+
+ // For all descendants `D` of `BB` that do not post-dominate `BB`, `D` is
+ // by_all if all predecessors of `D` are by_all.
+ //
+ // If BB is a divergent branch, it cannot propagate by_all to its
+ // successors.
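+ // (e.g. if BB ends in 'br i1 %varying, label %a, label %b', neither %a
+ // nor %b is necessarily executed by all work-items, even when BB itself
+ // is by_all.)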
+ if (!isHeaderDivLoop && !isDivCausing(*BB)) {
+ for (BasicBlock *D : descendants) {
+ if (D != BB) {
+ if (!PDT.dominates(D, BB)) {
+ if (std::all_of(
+ pred_begin(D), pred_end(D),
+ [this](BasicBlock *pred) { return isByAll(*pred); })) {
+ queue.push(D);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+bool DivergenceResult::isReachable(BasicBlock *src, BasicBlock *dst,
+ bool allowLatch) const {
+ DenseSet<BasicBlock *> visited;
+ std::vector<BasicBlock *> worklist;
+
+ worklist.push_back(src);
+ visited.insert(src);
+
+ while (!worklist.empty()) {
+ BasicBlock *BB = worklist.back();
+ worklist.pop_back();
+
+ if (BB == dst) {
+ return true;
+ }
+
+ const auto &BBTag = getTag(BB);
+ for (BasicBlock *succ : successors(BB)) {
+ if (!allowLatch && BBTag.isLoopBackEdge(succ)) {
+ continue;
+ }
+ if (visited.insert(succ).second) {
+ worklist.push_back(succ);
+ }
+ }
+ }
+
+ return false;
+}
+
+DenseSet<BasicBlock *> DivergenceResult::joinPoints(BasicBlock &src) const {
+ if (src.getTerminator()->getNumSuccessors() < 2) {
+ return {};
+ }
+
+ Function &F = *src.getParent();
+ const PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+
+ DenseMap<BasicBlock *, const BasicBlock *> defMap;
+ DenseSet<BasicBlock *> joins;
+
+ BlockQueue queue(*this);
+
+ auto schedule = [&defMap, &joins, &queue](BasicBlock *block,
+ const BasicBlock *defBlock) {
+ auto defIt = defMap.find(block);
+ // First time we meet this block; not a join (yet).
+ if (defIt == defMap.end()) {
+ queue.push(block);
+ defMap.insert({block, defBlock});
+ } else if (defIt->second != defBlock) {
+ // We've found a block that has two different incoming definitions; it is
+ // a join point.
+ joins.insert(block);
+ }
+ };
+
+ for (BasicBlock *const succ : successors(&src)) {
+ schedule(succ, succ);
+ }
+
+ auto *Node = PDT.getNode(&src);
+ assert(Node && "Could not get node");
+ auto *IDom = Node->getIDom();
+ assert(IDom && "Could not get IDom");
+ BasicBlock *PIDom = IDom->getBlock();
+ assert(PIDom && "Could not get block");
+
+ while (!queue.empty()) {
+ auto &curTag = queue.pop();
+ BasicBlock *cur = curTag.BB;
+
+ if (cur == PIDom) {
+ continue;
+ }
+
+ const BasicBlock *const defBlock = defMap.find(cur)->second;
+
+ const auto *const curLTag = curTag.loop;
+ // If the successor is the header of a nested loop, pretend it's a single
+ // node with the loop's exits as successors.
+ if (curLTag && curLTag->header == cur) {
+ SmallVector<BasicBlock *, 4> exits;
+ curLTag->loop->getUniqueExitBlocks(exits);
+ for (BasicBlock *const exit : exits) {
+ if (exit == &src) {
+ continue;
+ }
+ schedule(exit, defBlock);
+ }
+ } else {
+ // The successors are either on the same loop level, or loop exits.
+ for (BasicBlock *const succ : successors(cur)) {
+ if (succ == &src) {
+ continue;
+ }
+ schedule(succ, defBlock);
+ }
+ }
+ }
+
+ return joins;
+}
+
+DenseSet<BasicBlock *> DivergenceResult::escapePoints(const BasicBlock &src,
+ const Loop &L) const {
+ const LoopTag &LTag = getTag(&L);
+
+ DenseSet<BasicBlock *> divergentExits;
+
+ DenseSet<const BasicBlock *> visited;
+ BlockQueue queue(*this);
+
+ queue.push(&src);
+ visited.insert(&src);
+
+ while (!queue.empty()) {
+ const auto &BBTag = queue.pop();
+ auto *const BB = BBTag.BB;
+
+ // We found a divergent loop exit.
+ if (!L.contains(BB)) {
+ divergentExits.insert(BB);
+ continue;
+ }
+
+ bool allowLatch = true;
+ auto *const loopTag = BBTag.loop;
+ // 'BB' is a backedge
+ if (loopTag && loopTag->latch == BB) {
+ if (loopTag == &LTag) {
+ // `BB` is the latch of the current loop; forbid the backedge.
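+ // (Otherwise the walk could re-enter the loop body and reach every
+ // exit via a later iteration, making all exits look divergent.)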
+ allowLatch = false;
+ } else {
+ // Otherwise, forbid the backedge only if none of the remaining blocks
+ // in the queue belong to `L`, in which case no exit block starting
+ // from the header of the nested loop can be divergent.
+ allowLatch =
+ std::any_of(queue.begin(), queue.end(), [this, &L](size_t index) {
+ return L.contains(basicBlockTags[index].BB);
+ });
+ }
+ }
+
+ for (BasicBlock *succ : successors(BB)) {
+ if (BBTag.isLoopBackEdge(succ) && !allowLatch) {
+ continue;
+ }
+ if (visited.insert(succ).second) {
+ queue.push(succ);
+ }
+ }
+ }
+
+ return divergentExits;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+llvm::AnalysisKey DivergenceAnalysis::Key;
+
+DivergenceResult DivergenceAnalysis::run(llvm::Function &F,
+ llvm::FunctionAnalysisManager &AM) {
+ DivergenceResult Res(F, AM);
+
+ LLVM_DEBUG(dbgs() << "DIVERGENCE ANALYSIS\n");
+ Res.basicBlockTags.reserve(F.size() * 4);
+
+ // Prepare the BasicBlockTags.
+ const LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+ for (BasicBlock &BB : F) {
+ // Create BB info entries.
+ BasicBlockTag &BBTag = Res.getOrCreateTag(&BB);
+
+ // Update loop info.
+ if (Loop *L = LI.getLoopFor(&BB)) {
+ if (!BBTag.loop) {
+ BBTag.loop = &Res.getOrCreateTag(L);
+ BBTag.loop->latch = L->getLoopLatch();
+ BBTag.loop->header = L->getHeader();
+ BBTag.loop->preheader = L->getLoopPreheader();
+ }
+ }
+ }
+
+ // Find loop live values and update loop exit information.
+ Res.computeLoopOrdering();
+ for (auto *const LTag : Res.loopOrdering) {
+ SmallVector<BasicBlock *, 4> loopExitBlocks;
+ LTag->loop->getExitBlocks(loopExitBlocks);
+ for (BasicBlock *BB : loopExitBlocks) {
+ auto &BBTag = Res.getTag(BB);
+ // If BB already leaves a loop, update it if the previous loop is nested
+ // in the current.
+ if (BBTag.outermostExitedLoop) {
+ if (BBTag.outermostExitedLoop->loop->getLoopDepth() >
+ LTag->loop->getLoopDepth()) {
+ BBTag.outermostExitedLoop = LTag;
+ }
+ } else {
+ BBTag.outermostExitedLoop = LTag;
+ }
+
+ // The LoopSimplify pass has already converted SSA form to LCSSA form.
+ // Let's use LCSSA phi nodes to find loop live variables, like the LLVM
+ // loop vectorizer does.
+ // The LoopSimplify pass is added in the PreparationPass of vectorizer.cpp.
+ //
+ // See the head comment in lib/Transforms/Utils/LCSSA.cpp
+ for (Instruction &I : *BB) {
+ if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+ // An LCSSA phi has incoming values defined in the loop.
+ for (Value *incoming : PHI->incoming_values()) {
+ if (Instruction *incomingInst = dyn_cast<Instruction>(incoming)) {
+ if (LTag->loop->contains(incomingInst->getParent())) {
+ LTag->loopLiveValues.insert(incoming);
+ LLVM_DEBUG(dbgs() << *incoming << " is a loop live value of "
+ << LTag->loop->getName() << "\n");
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // From the UVA, we know which conditions are varying, which allows us to
+ // find divergent branches.
+ // Moreover, from divergent branches - and therefore from divergent paths -
+ // we can find more varying values that are computed on those divergent paths.
+ // The latter allows us to find more divergent branches, and so on...
+ // We take a local copy of the UVR because it is not good to modify one
+ // analysis result from another analysis. However, after Control Flow
+ // Conversion has been run, all control flow divergence is converted into
+ // non-uniform dataflow, so any subsequent run of the UVA is still correct.
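+ // The worklist loop below is a fixed-point iteration: marking PHIs varying
+ // can turn more branches varying, which can mark further PHIs, and so on
+ // until no update is made.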
+ auto UVR = AM.getResult<UniformValueAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+
+ Res.computeBlockOrdering(DT);
+
+ std::vector<std::pair<BasicBlock *, Value *>> uniformBranches;
+ uniformBranches.reserve(F.size() - 1u);
+ for (BasicBlock &BB : F) {
+ if (BranchInst *B = dyn_cast<BranchInst>(BB.getTerminator())) {
+ if (B->isConditional()) {
+ uniformBranches.push_back({&BB, B->getCondition()});
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) {
+ uniformBranches.push_back({&BB, SI->getCondition()});
+ }
+ }
+
+ while (!uniformBranches.empty()) {
+ // Partition the list so all the varying branches are grouped at the end.
+ const auto varyingBranches =
+ std::partition(uniformBranches.begin(), uniformBranches.end(),
+ [&UVR](std::pair<BasicBlock *, Value *> &p) -> bool {
+ return !UVR.isVarying(p.second);
+ });
+
+ // Process all the varying branches.
+ DivergenceInfo divergenceInfo;
+ for (auto it = varyingBranches; it != uniformBranches.end(); ++it) {
+ BasicBlock *BB = it->first;
+
+ // Find blocks diverged by varying branch block.
+ Res.markDivCausing(*BB, divergenceInfo, PDT);
+
+ if (const auto *const LTag = Res.getTag(BB).loop) {
+ Loop *L = LTag->loop;
+ while (L) {
+ // If BB is a varying branch, mark the loop as diverging if any two
+ // instances of a SIMD group can leave the loop over different exit
+ // edges and/or in different iterations. This means that BB cannot
+ // be postdominated by any block of L.
+ auto *Node = PDT.getNode(BB);
+ assert(Node && "Could not get node");
+ auto *IDom = Node->getIDom();
+ assert(IDom && "Could not get IDom");
+ BasicBlock *PIDom = IDom->getBlock();
+ if (!L->contains(PIDom)) {
+ Res.markDivLoopDivBlocks(*BB, *L, divergenceInfo);
+ } else {
+ // If the loop does not diverge because of `BB`, none of its
+ // parent loops can diverge either.
+ break;
+ }
+ L = L->getParentLoop();
+ }
+ }
+ }
+
+ // Remove all the varying branches from the end of the list.
+ uniformBranches.erase(varyingBranches, uniformBranches.end());
+
+ // PHIs defined in join points of divergent branches and in exit blocks of
+ // divergent loops are varying.
+ bool updated = false;
+ for (BasicBlock *BB : divergenceInfo) {
+ const bool exitedLoop = Res.getTag(BB).outermostExitedLoop;
+ for (Instruction &I : *BB) {
+ if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+ // Loop exits might have constant phi nodes (lcssa value).
+ if (exitedLoop || !PHI->hasConstantOrUndefValue()) {
+ if (!UVR.isVarying(&I)) {
+ updated = true;
+ UVR.markVaryingValues(&I);
+ LLVM_DEBUG(dbgs()
+ << I.getName() << " is a varying instruction\n");
+ }
+ }
+ } else {
+ break;
+ }
+ }
+ }
+ if (!updated) {
+ // We made no updates, so we processed all the varying branches.
+ break;
+ }
+ }
+
+ // All blocks that are predecessors of join points of div causing blocks and
+ // have a uniform condition must be marked as fake div causing blocks, because
+ // divergence may have occurred at the div causing block and we must make sure
+ // we execute all paths that lead to the join point.
+ for (BasicBlock *BB : Res.fakeDivCausingBlocks) {
+ if (BB->getTerminator()->getNumSuccessors() > 1 && !Res.isDivCausing(*BB)) {
+ Res.setFlag(*BB, BlockDivergenceFlag::eBlockHasDivergentBranchFake);
+ LLVM_DEBUG(dbgs() << "Found fake div causing block " << BB->getName()
+ << "\n");
+ // Because we have marked `BB` as a target for linearization, its join
+ // points must be marked as `blend` because they may lose some
+ // predecessors during the rewiring.
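+ // (e.g. in a diamond A -> {B, C} -> D where A is fake div causing, D
+ // may lose one of its predecessors when the CFG is rewired, so its PHI
+ // values must already be blended correctly.)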
+ for (BasicBlock *join : Res.joinPoints(*BB)) {
+ Res.setFlag(*join, BlockDivergenceFlag::eBlockIsBlend);
+ LLVM_DEBUG(dbgs() << "\tBlock " << join->getName() << " is blend\n");
+ }
+ }
+ }
+
+ // By definition, the entry block is by_all.
+ Res.markByAll(F.getEntryBlock());
+
+ return Res;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
new file mode 100644
index 0000000000000..f14239789e598
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
@@ -0,0 +1,135 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "analysis/instantiation_analysis.h"
+
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/Instructions.h>
+
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+#include "memory_operations.h"
+#include "vectorization_context.h"
+
+#define DEBUG_TYPE "vecz-instantiation"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+bool analyzeType(Type *Ty) {
+ return !Ty->isVoidTy() && !Ty->isVectorTy() &&
+ !FixedVectorType::isValidElementType(Ty);
+}
+
+bool analyzeMemOp(MemOp &Op) {
+ assert(Op.getPointerType()->isPointerTy() && "MemOp inconsistency");
+ return analyzeType(Op.getDataType());
+}
+
+bool analyzeCall(const VectorizationContext &Ctx, CallInst *CI) {
+ Function *Callee = CI->getCalledFunction();
+ VECZ_FAIL_IF(!Callee);
+
+ // Handle internal builtins.
+ if (Ctx.isInternalBuiltin(Callee)) {
+ if (auto Op = MemOp::get(CI)) {
+ return analyzeMemOp(*Op);
+ }
+ return false;
+ }
+
+ // Handle functions taking pointers as parameters.
+ if (any_of(Callee->args(),
+ [](const Argument &A) { return A.getType()->isPointerTy(); })) {
+ return true;
+ }
+
+ // Handle masked function calls.
+ if (Ctx.isMaskedFunction(Callee)) {
+ return true;
+ }
+
+ auto B = Ctx.builtins().analyzeBuiltin(*Callee);
+ const auto Props = B ? B->properties : 0;
+
+ // Intrinsics without side-effects can be safely instantiated.
+ if (Callee->isIntrinsic() &&
+ (Props & compiler::utils::eBuiltinPropertyNoSideEffects)) {
+ // If the intrinsic has a vector equivalent, then we can use it directly
+ // instead.
+ if (Props & compiler::utils::eBuiltinPropertyVectorEquivalent) {
+ return analyzeType(CI->getType());
+ }
+ return true;
+ }
+
+ // Functions returning void must have side-effects.
+ // We cannot vectorize them and instead we need to instantiate them.
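+ // (e.g. an OpenCL builtin like vstore4 returns void and only writes
+ // memory, so each lane's call has to be emitted individually.)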
+ const bool HasSideEffects =
+ Callee->getReturnType()->isVoidTy() ||
+ (Props & compiler::utils::eBuiltinPropertySideEffects);
+ if (HasSideEffects &&
+ (Props & compiler::utils::eBuiltinPropertySupportsInstantiation)) {
+ return true;
+ }
+
+ return analyzeType(CI->getType());
+}
+
+bool analyzeAlloca(const VectorizationContext &Ctx, AllocaInst *alloca) {
+ // Possibly, we could packetize by creating a wider array, but for now let's
+ // just let instantiation deal with it.
+ if (alloca->isArrayAllocation()) {
+ return true;
+ }
+
+ // We can create an array of anything; however, we need to be careful of
+ // alignment. In the case the alloca has a specific alignment requirement, we
+ // have to be sure it divides the type allocation size, otherwise only the
+ // first vector element would necessarily be correctly aligned.
+ auto *const dataTy = alloca->getAllocatedType();
+ const uint64_t memSize = Ctx.dataLayout()->getTypeAllocSize(dataTy);
+ const uint64_t align = alloca->getAlign().value();
+ return (align != 0 && (memSize % align) != 0);
+}
+} // namespace
+
+namespace vecz {
+bool needsInstantiation(const VectorizationContext &Ctx, Instruction &I) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ return analyzeCall(Ctx, CI);
+ } else if (LoadInst *Load = dyn_cast<LoadInst>(&I)) {
+ if (auto Op = MemOp::get(Load)) {
+ return analyzeMemOp(*Op);
+ }
+ // If it's not a MemOp, assume we don't need to instantiate.
+ return false;
+ } else if (StoreInst *Store = dyn_cast<StoreInst>(&I)) {
+ if (auto Op = MemOp::get(Store)) {
+ return analyzeMemOp(*Op);
+ }
+ // If it's not a MemOp, assume we don't need to instantiate.
+ return false;
+ } else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(&I)) {
+ return analyzeAlloca(Ctx, Alloca);
+ } else if (isa(&I) || isa(&I)) {
+ return true;
+ } else {
+ return analyzeType(I.getType());
+ }
+}
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
new file mode 100644
index 0000000000000..6bdcf9c295412
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
@@ -0,0 +1,253 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Implementation based on Section 5.2 of the paper:
+// Florian Brandner, Benoit Boissinot, Alain Darte, Benoît Dupont de Dinechin,
+// Fabrice Rastello.
+// Computing Liveness Sets for SSA-Form Programs.
+// [Research Report] RR-7503, INRIA. 2011, pp.25.
inria-00558509v2
+//
+// https://hal.inria.fr/inria-00558509v2
+
+#include "analysis/liveness_analysis.h"
+
+#include <llvm/ADT/SmallPtrSet.h>
+#include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/CFG.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Instructions.h>
+
+#include "vectorization_unit.h"
+
+using namespace llvm;
+using namespace vecz;
+
+llvm::AnalysisKey LivenessAnalysis::Key;
+
+namespace {
+
+// Returns true if V defines a variable and is likely to require a register
+bool definesVariable(const Value &V) {
+ // Constants are likely to be immediate values
+ if (isa<Constant>(V)) {
+ return false;
+ }
+
+ // If a value isn't used, it can't be live
+ if (V.user_empty()) {
+ return false;
+ }
+
+ const auto valueType = V.getType();
+ return !valueType->isVoidTy() && !valueType->isLabelTy() &&
+ !valueType->isTokenTy() && !valueType->isMetadataTy();
+}
+
+// Tries to push a value onto the set, if it is not there already.
+// Returns true if the value was pushed, false otherwise.
+//
+// Note that since the implementation completely processes every instruction
+// sequentially, only the last element needs to be checked.
+inline bool pushOnce(BlockLivenessInfo::LiveSet &s, Value *V) {
+ if (!s.empty() && s.back() == V) {
+ return false;
+ }
+ s.push_back(V);
+ return true;
+}
+
+} // namespace
+
+class LivenessResult::Impl {
+public:
+ Impl(LivenessResult &lr) : LR(lr) {}
+
+ void recalculate();
+
+private:
+ LivenessResult &LR;
+
+ void computeByVar(const BasicBlock &BB);
+
+ void computeVar(Value *V, const BasicBlock *BB);
+
+ void mark(Value *V, const BasicBlock *parent, const BasicBlock *BB);
+
+ void calculateMaxRegistersInBlock(const llvm::BasicBlock *BB);
+
+ // private utility method for code conciseness
+ BlockLivenessInfo &info(const BasicBlock *BB) const {
+ auto BIi = LR.BlockInfos.find(BB);
+ assert(BIi != LR.BlockInfos.end() && "Block Liveness Info does not exist!");
+ return BIi->second;
+ }
+};
+
+LivenessResult LivenessAnalysis::run(llvm::Function &F,
+ llvm::FunctionAnalysisManager &) {
+ Result R(F);
+ R.recalculate();
+ return R;
+}
+
+size_t LivenessResult::getMaxLiveVirtualRegisters() const {
+ return maxNumberOfLiveValues;
+}
+
+const BlockLivenessInfo &
+LivenessResult::getBlockInfo(const BasicBlock *BB) const {
+ auto found = BlockInfos.find(BB);
+ assert(found != BlockInfos.end() && "No liveness information for BasicBlock");
+ return found->second;
+}
+
+void LivenessResult::recalculate() {
+ maxNumberOfLiveValues = 0;
+
+ BlockInfos.clear();
+
+ Impl impl(*this);
+ impl.recalculate();
+}
+
+void LivenessResult::Impl::recalculate() {
+ auto &F = LR.F;
+
+ // Create infos in advance so things don't relocate under our feet.
+ for (auto &BB : F) {
+ (void)LR.BlockInfos[&BB];
+ }
+
+ // Arguments are always live-ins of the entry block (if they are used).
+ {
+ auto *BB = &F.getEntryBlock();
+ auto &BI = info(BB);
+ for (auto &arg : F.args()) {
+ if (!arg.use_empty()) {
+ BI.LiveIn.push_back(&arg);
+ computeVar(&arg, BB);
+ }
+ }
+ }
+
+ // Add all other variables to the live sets.
+ for (auto &BB : F) {
+ auto &BI = LR.BlockInfos[&BB];
+ for (auto &I : BB) {
+ if (definesVariable(I)) {
+ if (isa<PHINode>(I)) {
+ // PHI nodes are always live-ins.
+ BI.LiveIn.push_back(&I);
+ }
+ computeVar(&I, &BB);
+ }
+ }
+ }
+
+ // Calculate the maximum number of live values in every block.
+ for (auto &BB : F) {
+ calculateMaxRegistersInBlock(&BB);
+ }
+
+ // Store the largest number of live values in the function.
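+ // (This maximum over all blocks is a simple proxy for the kernel's peak
+ // register pressure, used when choosing a SIMD width.)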
+ for (const auto &entry : LR.BlockInfos) {
+ LR.maxNumberOfLiveValues = std::max(LR.maxNumberOfLiveValues,
+ entry.getSecond().MaxRegistersInBlock);
+ }
+}
+
+void LivenessResult::Impl::computeVar(Value *V, const BasicBlock *BB) {
+ SmallPtrSet<const BasicBlock *, 16> UseBlocks;
+ for (auto *User : V->users()) {
+ if (auto *UI = dyn_cast<Instruction>(User)) {
+ if (auto *PHI = dyn_cast<PHINode>(UI)) {
+ for (unsigned i = 0, n = PHI->getNumIncomingValues(); i != n; ++i) {
+ if (PHI->getIncomingValue(i) == V) {
+ const auto *Incoming = PHI->getIncomingBlock(i);
+
+ if (pushOnce(info(Incoming).LiveOut, V) && Incoming != BB) {
+ UseBlocks.insert(Incoming);
+ }
+ }
+ }
+ } else {
+ const auto *Parent = UI->getParent();
+ if (Parent != BB) {
+ UseBlocks.insert(Parent);
+ }
+ }
+ }
+ }
+
+ for (auto *UB : UseBlocks) {
+ if (pushOnce(info(UB).LiveIn, V)) {
+ mark(V, BB, UB);
+ }
+ }
+}
+
+void LivenessResult::Impl::mark(Value *V, const BasicBlock *parent,
+ const BasicBlock *BB) {
+ // Propagate backward
+ for (const auto *pred : predecessors(BB)) {
+ auto &PBI = info(pred);
+ if (pushOnce(PBI.LiveOut, V) && pred != parent && pushOnce(PBI.LiveIn, V)) {
+ mark(V, parent, pred);
+ }
+ }
+}
+
+void LivenessResult::Impl::calculateMaxRegistersInBlock(const BasicBlock *BB) {
+ auto &BI = LR.BlockInfos[BB];
+ const SmallPtrSet<const Value *, 16> liveOut(BI.LiveOut.begin(),
+ BI.LiveOut.end());
+ SmallPtrSet<const Value *, 16> seenButNotInLiveOut;
+
+ auto maxRegistersUsed = liveOut.size();
+ auto registersUsed = liveOut.size();
+
+ // Walk backwards through instructions in a block to count the maximum number
+ // of live values in that block.
+ for (auto &inst : make_range(BB->rbegin(), BB->rend())) {
+ // Phi nodes were in live out or were counted as operands. No need to
+ // decrement the registerCount, as one of the arguments used a register.
+ if (isa<PHINode>(&inst)) {
+ break;
+ }
+
+ // Operands are live so they use a register. Increment registerCount if not
+ // in live out or already counted.
+ for (const auto *operand : inst.operand_values()) {
+ if (definesVariable(*operand) && !liveOut.contains(operand) &&
+ !seenButNotInLiveOut.contains(operand)) {
+ registersUsed++;
+ seenButNotInLiveOut.insert(operand);
+ }
+ }
+
+ // If inst defines a variable, one less register was used before it
+ if (definesVariable(inst)) {
+ registersUsed--;
+ }
+
+ maxRegistersUsed = std::max(registersUsed, maxRegistersUsed);
+ }
+
+ assert(registersUsed == BI.LiveIn.size() &&
+ "Final number of live values inconsistent with live-in");
+
+ BI.MaxRegistersInBlock = maxRegistersUsed;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
new file mode 100644
index 0000000000000..d5230a303e3c3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
@@ -0,0 +1,176 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "analysis/packetization_analysis.h" + +#include +#include +#include +#include +#include + +#include "analysis/stride_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "memory_operations.h" +#include "offset_info.h" +#include "vectorization_context.h" +#include "vectorization_unit.h" + +#define DEBUG_TYPE "vecz" + +using namespace vecz; +using namespace llvm; + +namespace { +bool isDivergenceReduction(const Function &F) { + compiler::utils::Lexer L(F.getName()); + return (L.Consume(VectorizationContext::InternalBuiltinPrefix) && + L.Consume("divergence_")); +} +} // namespace + +llvm::AnalysisKey PacketizationAnalysis::Key; + +PacketizationAnalysisResult::PacketizationAnalysisResult( + llvm::Function &f, StrideAnalysisResult &sar) + : F(f), SAR(sar), UVR(sar.UVR) { + // Vectorize branch conditions. + for (BasicBlock &BB : F) { + auto *TI = BB.getTerminator(); + if (UVR.isVarying(TI)) { + markForPacketization(TI); + } + } + + // Then vectorize other instructions, starting at leaves. + std::vector Leaves; + UVR.findVectorLeaves(Leaves); + + // Traverse the function from the leaves to find instructions that need to be + // packetized. + for (Instruction *I : Leaves) { + markForPacketization(I); + } +} + +void PacketizationAnalysisResult::markForPacketization(Value *V) { + if (!toPacketize.insert(V).second) { + return; + } + + auto *const I = dyn_cast(V); + if (!I) { + return; + } + + if (auto *phi = dyn_cast(I)) { + for (unsigned i = 0, n = phi->getNumIncomingValues(); i < n; ++i) { + auto *const incoming = phi->getIncomingValue(i); + if (UVR.isVarying(incoming)) { + markForPacketization(incoming); + } + } + return; + } + + auto mo = MemOp::get(I); + if (UVR.isMaskVarying(I)) { + if (mo) { + markForPacketization(mo->getMaskOperand()); + return; + } + + if (auto *const CI = dyn_cast(I)) { + Function *Callee = CI->getCalledFunction(); + if (Callee && UVR.Ctx.isInternalBuiltin(Callee) && + isDivergenceReduction(*Callee)) { + markForPacketization(CI->getOperand(0)); + return; + } + } + } + + if (mo) { + auto *const ptr = mo->getPointerOperand(); + if (ptr && UVR.isVarying(ptr)) { + const auto *info = SAR.getInfo(ptr); + assert(info && "markForPacketization: Unable to obtain stride info"); + + bool hasValidStride = info->hasStride(); + + // Analyse the computed stride to see if the pointer will need to be + // packetized. No packetization is necessary where a contiguous or + // interleaved memop can be created, since only the pointer to the + // first element will be used. + if (hasValidStride) { + // Get the pointer stride as a number of elements + auto *const eltTy = mo->getDataType(); + if (eltTy->isVectorTy() || eltTy->isPointerTy()) { + // No interleaved memops exist for vector element types or pointer + // types. We can only vectorize pointer loads/stores or widen vector + // load/stores if they are contiguous. 
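markForPacketization() is essentially a depth-first walk over varying operands, with the insert into toPacketize doubling as the visited check. A stripped-down model of that core, on a hypothetical value graph:

```cpp
#include <cassert>
#include <set>
#include <vector>

struct Val { std::vector<int> ops; bool varying; };

// Mark v, then recurse into any of its operands that are varying; the
// set-insert also prevents revisiting, as in the real implementation.
void markForPacketization(int v, const std::vector<Val> &g,
                          std::set<int> &toPacketize) {
  if (!toPacketize.insert(v).second) return;   // already marked
  for (int op : g[v].ops)
    if (g[op].varying) markForPacketization(op, g, toPacketize);
}

int main() {
  // 0: get_global_id (varying), 1: uniform base pointer,
  // 2: gep(1, 0) (varying), 3: load(2) (varying) -- 3 plays the leaf.
  std::vector<Val> g = {{{}, true}, {{}, false}, {{1, 0}, true}, {{2}, true}};
  std::set<int> toPacketize;
  markForPacketization(3, g, toPacketize);
  assert(toPacketize.count(0) && toPacketize.count(2));
  assert(!toPacketize.count(1));               // uniform operand untouched
  return 0;
}
```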
+ const auto stride = info->getConstantMemoryStride( + eltTy, &F.getParent()->getDataLayout()); + if (stride != 1) { + hasValidStride = false; + } + } else if (!VectorType::isValidElementType(eltTy)) { + hasValidStride = false; + } + } + + // Only mark the pointer for packetization if it does not have a + // valid linear stride + if (!hasValidStride) { + markForPacketization(ptr); + } + } + + auto *const data = mo->getDataOperand(); + auto *const mask = mo->getMaskOperand(); + if (data && UVR.isVarying(data)) { + markForPacketization(data); + } + if (mask && UVR.isVarying(mask)) { + markForPacketization(mask); + } + return; + } + + if (auto *const intrinsic = dyn_cast(I)) { + const auto intrinsicID = intrinsic->getIntrinsicID(); + if (intrinsicID == llvm::Intrinsic::lifetime_end || + intrinsicID == llvm::Intrinsic::lifetime_start) { + // We don't trace through lifetime intrinsics. + return; + } + } + + // Mark any varying operands for packetization.. + for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) { + auto *const opI = I->getOperand(i); + if (UVR.isVarying(opI)) { + markForPacketization(opI); + } + } +} + +PacketizationAnalysisResult +PacketizationAnalysis::run(Function &F, llvm::FunctionAnalysisManager &AM) { + auto &SAR = AM.getResult(F); + return Result(F, SAR); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp new file mode 100644 index 0000000000000..9354efd65bb12 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp @@ -0,0 +1,199 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "analysis/simd_width_analysis.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "analysis/liveness_analysis.h" +#include "analysis/packetization_analysis.h" +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "vectorization_unit.h" +#include "vecz/vecz_target_info.h" + +#define DEBUG_TYPE "vecz-simd-width" + +using namespace llvm; +using namespace vecz; + +llvm::AnalysisKey SimdWidthAnalysis::Key; + +namespace { +bool definedOrUsedInLoop(Value *V, Loop *L) { + if (!L) { + // We're not in a loop, so consider everything. + return true; + } + + const auto *const I = dyn_cast(V); + if (I && L->contains(I)) { + // It's defined in the current loop. + return true; + } + + // If it's used in the current loop, return true, unless it is a PHI node. + // Values defined outwith the loop, but used only by a PHI node within it must + // be loop-carried variable initial values. 
If these are not otherwise used + directly within the loop, then they are not really live inside the loop. + for (const auto *const U : V->users()) { + const auto *const I = dyn_cast<Instruction>(U); + if (I && !isa<PHINode>(I) && L->contains(I)) { + return true; + } + } + return false; +} +} // namespace + +// Avoid Spill implementation. It focuses on avoiding register spill by +// optimizing register pressure. +unsigned SimdWidthAnalysis::avoidSpillImpl(Function &F, + FunctionAnalysisManager &AM, + unsigned MinWidth) { + VectorizationUnit &VU = AM.getResult<VectorizationUnitAnalysis>(F).getVU(); + const TargetTransformInfo TTI = VU.context().getTargetTransformInfo(F); + const auto &Liveness = AM.getResult<LivenessAnalysis>(F); + const auto &PAR = AM.getResult<PacketizationAnalysis>(F); + const LoopInfo &LI = AM.getResult<LoopAnalysis>(F); + // Determine the SIMD width based on a live values register usage estimation. + assert(!VU.width().isScalable() && "Can't handle scalable-vectors"); + unsigned SimdWidth = VU.width().getFixedValue(); + assert(SimdWidth != 0 && "SimdWidthAnalysis: SimdWidth == 0"); + + SmallSet OpenIntervals; + SmallVector IntervalArray; + + auto ShouldConsider = [&](const Value *V) -> bool { + // Filter out work item builtin calls such as get_local_id() + if (auto *const CI = dyn_cast<CallInst>(V)) { + if (const Function *Callee = CI->getCalledFunction()) { + if (auto B = VU.context().builtins().analyzeBuiltin(*Callee)) { + if (B->properties == compiler::utils::eBuiltinPropertyWorkItem) { + return false; + } + } + } + } + return true; + }; + + LLVM_DEBUG(dbgs() << "VEC(REG): Calculating max register usage:\n"); + for (const auto &BB : F) { + // Get the LiveIns for this Basic Block. + // The principle of the Loop Aware SIMD Width Analysis is that it is not + // acceptable to spill values in the middle of a loop; however, it may be + // acceptable to spill some values before entering a loop. + const auto &BI = Liveness.getBlockInfo(&BB); + OpenIntervals.clear(); + auto *const CurLoop = LI.getLoopFor(&BB); + for (auto *V : BI.LiveOut) { + if (ShouldConsider(V) && PAR.needsPacketization(V) && + definedOrUsedInLoop(V, CurLoop)) { + OpenIntervals.insert(V); + } + } + + // Walk backwards through instructions in a block to count the maximum + // number of live values in that block. + for (auto &inst : make_range(BB.rbegin(), BB.rend())) { + if (isa<PHINode>(&inst)) { + break; + } + + // The first instruction in the reverse range will be the terminator, + // so we don't really need to consider it. However we do need to consider + // the live set at the point before the last (i.e. first) instruction, so + // we deal with the operands first and then process the live set.
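estimateSimdWidth() is a target hook and not part of this patch; purely to show the shape of the decision being made with OpenIntervals, here is a hypothetical stand-in that narrows the candidate width until the packetized live values fit a made-up register file:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for the target hook; not the real interface.
unsigned estimateSimdWidth(const std::vector<unsigned> &liveBits,
                           unsigned width, unsigned numRegs,
                           unsigned regBits) {
  while (width > 1) {
    std::uint64_t needed = 0;
    for (unsigned bits : liveBits)   // registers each value needs once widened
      needed += (std::uint64_t(bits) * width + regBits - 1) / regBits;
    if (needed <= numRegs) break;    // everything fits at this width
    width /= 2;                      // otherwise try a narrower vector
  }
  return width;
}

int main() {
  // Eight live 32-bit values against 32 x 128-bit vector registers:
  // width 32 would need 64 registers, width 16 needs exactly 32.
  assert(estimateSimdWidth(std::vector<unsigned>(8, 32), 32, 32, 128) == 16);
  return 0;
}
```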
+ if (PAR.needsPacketization(&inst)) { + const bool isGEP = isa(&inst); + for (auto operand : inst.operand_values()) { + if (isa(operand) || isa(operand)) { + if (!isGEP || PAR.needsPacketization(operand)) { + OpenIntervals.insert(operand); + } + } + } + } + + OpenIntervals.erase(&inst); + IntervalArray.assign(OpenIntervals.begin(), OpenIntervals.end()); + SimdWidth = VU.context().targetInfo().estimateSimdWidth( + TTI, IntervalArray, SimdWidth); + LLVM_DEBUG(dbgs() << "VEC(REG): Interval # " << OpenIntervals.size() + << " at SIMD Width " << SimdWidth << '\n'); + LLVM_DEBUG( + for (auto OII = OpenIntervals.begin(), OIIE = OpenIntervals.end(); + OII != OIIE; OII++) { dbgs() << "inst:" << **OII << '\n'; }); + + if (SimdWidth < MinWidth) { + return 0; + } + } + } + + LLVM_DEBUG(dbgs() << "VEC(REG): Found widest fitting SIMD width: " + << SimdWidth << '\n'); + return SimdWidth; +} + +SimdWidthAnalysis::Result +SimdWidthAnalysis::run(Function &F, llvm::FunctionAnalysisManager &AM) { + const TargetTransformInfo &TTI = AM.getResult(F); + const VectorizationUnit &VU = + AM.getResult(F).getVU(); + + // If the target does not provide vector registers, return 0. + MaxVecRegBitWidth = + TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector) + .getFixedValue(); + + if (MaxVecRegBitWidth == 0) { + return 0; + } + + // If the vectorization factor is for scalable vectors, return 0. + if (VU.width().isScalable()) { + return 0; + } + + auto SimdWidth = avoidSpillImpl(F, AM, 1); + if (SimdWidth != 0 && SimdWidth < 4) { + // We only return 0 (i.e. don't vectorize) in the case that the packetized + // values wouldn't fit into vector registers even with a factor of 1. If + // the packetized values fit into vector registers for any width, we use + // a baseline factor of 4 since this is empirically better than 2. + SimdWidth = 4; + } + + return SimdWidth; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp new file mode 100644 index 0000000000000..b98a149c97b12 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp @@ -0,0 +1,124 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "analysis/stride_analysis.h" + +#include +#include +#include + +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "memory_operations.h" +#include "offset_info.h" +#include "vectorization_context.h" +#include "vectorization_unit.h" + +#define DEBUG_TYPE "vecz" + +using namespace vecz; +using namespace llvm; + +llvm::AnalysisKey StrideAnalysis::Key; + +OffsetInfo &StrideAnalysisResult::analyze(Value *V) { + const auto find = analyzed.find(V); + if (find != analyzed.end()) { + return find->second; + } + + // We construct it on the stack first, and copy it into the map, because + // the constructor itself can create more things in the map and constructing + // it in-place could result in the storage being re-allocated while the + // constructor is still running. + const auto OI = OffsetInfo(*this, V); + return analyzed.try_emplace(V, OI).first->second; +} + +StrideAnalysisResult::StrideAnalysisResult(llvm::Function &f, + UniformValueResult &uvr, + AssumptionCache &AC) + : F(f), UVR(uvr), AC(AC) { + for (auto &BB : F) { + for (auto &I : BB) { + if (!UVR.isVarying(&I)) { + continue; + } + + if (auto mo = MemOp::get(&I)) { + auto *const ptr = mo->getPointerOperand(); + analyze(ptr); + } + } + } +} + +void StrideAnalysisResult::manifestAll(IRBuilder<> &B) { + const auto saved = B.GetInsertPoint(); + for (auto &info : analyzed) { + info.second.manifest(B, *this); + } + B.SetInsertPoint(saved->getParent(), saved); +} + +Value *StrideAnalysisResult::buildMemoryStride(IRBuilder<> &B, llvm::Value *Ptr, + llvm::Type *EleTy) const { + if (auto *const info = getInfo(Ptr)) { + return info->buildMemoryStride(B, EleTy, &F.getParent()->getDataLayout()); + } + return nullptr; +} + +StrideAnalysisResult StrideAnalysis::run(llvm::Function &F, + llvm::FunctionAnalysisManager &AM) { + auto &AC = AM.getResult(F); + auto &UVR = AM.getResult(F); + return Result(F, UVR, AC); +} + +PreservedAnalyses StrideAnalysisPrinterPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &SAR = AM.getResult(F); + OS << "StrideAnalysis for function '" << F.getName() << "':\n"; + + for (auto &BB : F) { + for (auto &I : BB) { + if (auto MO = MemOp::get(&I)) { + auto *const Ptr = MO->getPointerOperand(); + if (!Ptr) { + continue; + } + if (const OffsetInfo *Info = SAR.getInfo(Ptr)) { + OS << "* Stride for " << *Ptr << "\n - "; + if (Info->mayDiverge()) { + OS << "divergent"; + } else if (Info->hasStride()) { + OS << "linear"; + } else if (Info->isUniform()) { + OS << "uniform"; + } else { + OS << "unknown"; + } + if (Info->isStrideConstantInt()) { + OS << " stride of " << Info->getStrideAsConstantInt(); + } + OS << "\n"; + } + } + } + } + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp new file mode 100644 index 0000000000000..0d24a43a81921 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp @@ -0,0 +1,563 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. 
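The comment in analyze() deserves emphasis: OffsetInfo's constructor can recursively call analyze() and so grow the `analyzed` map mid-construction, which with a relocating container (DenseMap here) would invalidate an in-place construction. A stand-alone illustration of the same construct-then-insert pattern, with a plain vector playing the relocating cache:

```cpp
#include <cassert>
#include <utility>
#include <vector>

struct Info { int depth = 0; };

// cache[v] is (computed?, Info). The recursive call can grow the cache, and a
// container that relocates on growth (DenseMap in LLVM, vector here) would
// leave a dangling slot if we wrote into it before recursing.
Info &analyze(int v, std::vector<std::pair<bool, Info>> &cache) {
  if (cache.size() <= unsigned(v)) cache.resize(v + 1);
  if (cache[v].first) return cache[v].second;
  Info local;                                   // build on the stack first
  if (v > 0) local.depth = analyze(v - 1, cache).depth + 1;
  cache[v] = {true, local};                     // then copy into the cache
  return cache[v].second;
}

int main() {
  std::vector<std::pair<bool, Info>> cache;
  assert(analyze(5, cache).depth == 5);
  return 0;
}
```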
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "analysis/uniform_value_analysis.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "analysis/instantiation_analysis.h" +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "memory_operations.h" +#include "vectorization_unit.h" + +#define DEBUG_TYPE "vecz" + +using namespace vecz; +using namespace llvm; + +namespace { + +// Find leaves by recursing through an instruction's uses +bool findStrayLeaves(UniformValueResult &UVR, Instruction &I, + DenseSet &Visited) { + for (const Use &U : I.uses()) { + auto *User = U.getUser(); + if (isa(User) || isa(User) || + isa(User)) { + if (UVR.isValueOrMaskVarying(User)) { + return true; + } + } else if (auto *CI = dyn_cast(User)) { + if (CI->use_empty()) { + // Any call instruction with no uses is counted as a leaf. This case + // should also cover any kind of masked stores, since masked stores are + // builtin calls with no uses, there is no need to explicitly check for + // masked stores. + if (UVR.isValueOrMaskVarying(CI)) { + return true; + } + } + } else if (auto *UI = dyn_cast(User)) { + if (isa(User)) { + // Don't trace through loads + } else if (Visited.insert(UI).second) { + if (findStrayLeaves(UVR, *UI, Visited)) { + return true; + } + } + } + } + return false; +} + +bool isDivergenceReduction(const Function &F) { + compiler::utils::Lexer L(F.getName()); + return (L.Consume(VectorizationContext::InternalBuiltinPrefix) && + L.Consume("divergence_")); +} + +bool isTrueUniformInternal(const Value *V, unsigned Depth) { + if (!V) { + return false; + } + + // Constants and Arguments that can't be undef/poison are truly uniform + if (isa(V) || isa(V)) { + return isGuaranteedNotToBePoison(V); + } + + constexpr unsigned DepthLimit = 6; + + if (Depth < DepthLimit) { + // For a specific subset of instructions, if all operands are truly + // uniform, then the instruction is too. + // FIXME: This is pessimistic. We could improve this by extending the list + // of instructions covered. We could also use flow-sensitive analysis in + // isGuaranteedNotToBePoison to enhance its capabilities. 
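isTrueUniformInternal() is a depth-limited structural recursion: provably non-poison literals are truly uniform, and a small whitelist of operations is truly uniform when every operand is. The same skeleton on a toy expression graph, where the whitelist is shrunk to a single Add kind for illustration:

```cpp
#include <cassert>
#include <vector>

enum Kind { Literal, Add, Load };              // Load: not on the whitelist
struct Node { Kind kind; std::vector<int> ops; };

bool trulyUniform(int v, const std::vector<Node> &g, unsigned depth = 0) {
  constexpr unsigned DepthLimit = 6;           // same guard as the real code
  const Node &n = g[v];
  if (n.kind == Literal) return true;
  if (depth >= DepthLimit || n.kind != Add) return false;
  for (int op : n.ops)
    if (!trulyUniform(op, g, depth + 1)) return false;
  return true;
}

int main() {
  std::vector<Node> g = {{Literal, {}}, {Literal, {}},
                         {Add, {0, 1}}, {Load, {}}, {Add, {2, 3}}};
  assert(trulyUniform(2, g));      // literal + literal
  assert(!trulyUniform(4, g));     // feeds on a load, which may vary
  return 0;
}
```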
+ if (const auto *I = dyn_cast(V)) { + if (isa(I) || isa(I) || isa(I) || + isa(I) || isa(I) || isa(I)) { + return isGuaranteedNotToBePoison(I) && + llvm::all_of(I->operands(), [Depth](Value *Op) { + return isTrueUniformInternal(Op, Depth + 1); + }); + } + } + } + + return false; +} + +} // namespace + +UniformValueResult::UniformValueResult(Function &F, VectorizationUnit &vu) + : F(F), VU(vu), Ctx(VU.context()), dimension(VU.dimension()) {} + +bool UniformValueResult::isVarying(const Value *V) const { + auto found = varying.find(V); + if (found == varying.end()) { + return false; + } + return found->second == VaryingKind::eValueVarying; +} + +bool UniformValueResult::isMaskVarying(const Value *V) const { + auto found = varying.find(V); + if (found == varying.end()) { + return false; + } + return found->second == VaryingKind::eMaskVarying; +} + +bool UniformValueResult::isValueOrMaskVarying(const Value *V) const { + auto found = varying.find(V); + if (found == varying.end()) { + return false; + } + return found->second != VaryingKind::eValueTrueUniform && + found->second != VaryingKind::eValueActiveUniform; +} + +bool UniformValueResult::isTrueUniform(const Value *V) { + auto found = varying.find(V); + if (found != varying.end()) { + return found->second == VaryingKind::eValueTrueUniform; + } + if (!isTrueUniformInternal(V, /*Depth=*/0)) { + return false; + } + // Cache this result to help speed up future queries + varying[V] = VaryingKind::eValueTrueUniform; + return true; +} + +/// @brief Utility function to check whether an instruction is a call to a +/// reduction or broadcast operaton. +/// +/// @param[in] I Instruction to check +/// @param[in] BI BuiltinInfo for platform-specific builtin IDs +/// @return true if the instruction is a call to a reduction or broadcast +/// builtin. +static bool +isGroupBroadcastOrReduction(const Instruction &I, + const compiler::utils::BuiltinInfo &BI) { + if (!isa(&I)) { + return false; + } + auto *const CI = cast(&I); + auto *const Callee = CI->getCalledFunction(); + if (!Callee) { + return false; + } + auto B = BI.analyzeBuiltin(*Callee); + if (!B) { + return false; + } + auto Info = BI.isMuxGroupCollective(B->ID); + return Info && (Info->isSubGroupScope() || Info->isWorkGroupScope()) && + (Info->isAnyAll() || Info->isReduction() || Info->isBroadcast()); +} + +void UniformValueResult::findVectorLeaves( + std::vector &Leaves) const { + const compiler::utils::BuiltinInfo &BI = Ctx.builtins(); + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + // Reductions and broadcasts are always vector leaves regardless of + // uniformity. + if (isGroupBroadcastOrReduction(I, BI)) { + Leaves.push_back(&I); + continue; + } + + if (!isVarying(&I)) { + if (isMaskVarying(&I)) { + // it's a leaf if only its mask operand is varying, since the value + // itself will be uniform and won't propagate "varying" to its users. 
+ Leaves.push_back(&I); + continue; + } + if (CallInst *CI = dyn_cast<CallInst>(&I)) { + Function *Callee = CI->getCalledFunction(); + if (!Callee) { + continue; + } + + // If it's a call to a user-defined function whose use list is empty, and + // which is uniform, then add it to the leaves + if (!Callee->isIntrinsic() && CI->use_empty()) { + // Try to identify the called function + const auto Builtin = BI.analyzeBuiltin(*Callee); + if (!Builtin) { + Leaves.push_back(CI); + } + } + } + continue; + } + + if (StoreInst *Store = dyn_cast<StoreInst>(&I)) { + Instruction *Ptr = dyn_cast<Instruction>(Store->getPointerOperand()); + if (Ptr && isVarying(Ptr)) { + Leaves.push_back(Store); + } + continue; + } + + if (ReturnInst *Ret = dyn_cast<ReturnInst>(&I)) { + Leaves.push_back(Ret); + continue; + } + + if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&I)) { + Leaves.push_back(RMW); + continue; + } else if (AtomicCmpXchgInst *CmpXchg = dyn_cast<AtomicCmpXchgInst>(&I)) { + Leaves.push_back(CmpXchg); + continue; + } + + // Functions that have no uses are leaves. + if (CallInst *CI = dyn_cast<CallInst>(&I)) { + bool IsCallLeaf = false; + if (CI->use_empty()) { + IsCallLeaf = true; + } else if (auto Op = MemOp::get(CI)) { + // Handle masked stores. + if (Op->isStore() && + (Op->isMaskedMemOp() || Op->isMaskedInterleavedMemOp() || + Op->isMaskedScatterGatherMemOp())) { + IsCallLeaf = true; + } + } else if (Ctx.isMaskedAtomicFunction(*CI->getCalledFunction())) { + IsCallLeaf = true; + } + if (IsCallLeaf) { + Leaves.push_back(CI); + continue; + } + } + } + } +} + +void UniformValueResult::findVectorRoots(std::vector<Value *> &Roots) const { + const compiler::utils::BuiltinInfo &BI = Ctx.builtins(); + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + CallInst *CI = dyn_cast<CallInst>(&I); + if (!CI || !CI->getCalledFunction()) { + continue; + } + const auto Builtin = BI.analyzeBuiltinCall(*CI, dimension); + if (!Builtin) { + continue; + } + const auto Uniformity = Builtin->uniformity; + if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID || + Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) { + // Calls to `get_global_id`/`get_local_id` are roots. + Roots.push_back(CI); + } else if ((Uniformity == compiler::utils::eBuiltinUniformityNever) && + !CI->getType()->isVoidTy()) { + // Non-void builtins with side-effects are also roots. + Roots.push_back(CI); + } + } + } + + // Add vectorized arguments to the list of roots. + for (const VectorizerTargetArgument &TargetArg : VU.arguments()) { + if (!TargetArg.IsVectorized && !TargetArg.PointerRetPointeeTy) { + continue; + } + + if (&F == VU.scalarFunction()) { + Roots.push_back(TargetArg.OldArg); + } else if (&F == VU.vectorizedFunction()) { + if (TargetArg.Placeholder) { + Roots.push_back(TargetArg.Placeholder); + } else { + Roots.push_back(TargetArg.NewArg); + } + } + } +} + +AllocaInst *UniformValueResult::findAllocaFromPointer(Value *Pointer) { + while (Pointer) { + if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Pointer)) { + return Alloca; + } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Pointer)) { + Pointer = GEP->getPointerOperand(); + } else if (BitCastInst *BC = dyn_cast<BitCastInst>(Pointer)) { + Pointer = BC->getOperand(0); + } else if (LoadInst *Load = dyn_cast<LoadInst>(Pointer)) { + Pointer = Load->getPointerOperand(); + } else { + return nullptr; + } + } + + return nullptr; +} + +void UniformValueResult::markVaryingValues(Value *V, Value *From) { + auto &vary = varying[V]; + // Do not visit values twice. + if (vary == VaryingKind::eValueVarying) { + return; + } + + if (CallInst *CI = dyn_cast<CallInst>(V)) { + // Some builtins produce a uniform value regardless of their inputs.
+ Function *Callee = CI->getCalledFunction(); + if (Callee) { + const compiler::utils::BuiltinInfo &BI = Ctx.builtins(); + if (const auto Builtin = BI.analyzeBuiltinCall(*CI, dimension)) { + const auto Uniformity = Builtin->uniformity; + if (Uniformity == compiler::utils::eBuiltinUniformityAlways) { + return; + } + } + if (auto Op = MemOp::get(CI)) { + // The mask cannot affect the MemOp value, even though we may still + // need to packetize the mask.. + auto *Mask = Op->getMaskOperand(); + if (Mask && From == Mask) { + vary = VaryingKind::eMaskVarying; + return; + } + } else if (Ctx.isInternalBuiltin(Callee)) { + // A divergence reduction builtin's value is uniform even though its + // argument is not, since it is a reduction over the SIMD width. + if (isDivergenceReduction(*Callee)) { + vary = VaryingKind::eMaskVarying; + return; + } + } + } + } + + // Mark V as being varying. + vary = VaryingKind::eValueVarying; + LLVM_DEBUG(dbgs() << "vecz: Needs packetization: " << *V << "\n"); + + // Visit all users of V, they are varying too. + for (const Use &Use : V->uses()) { + User *User = Use.getUser(); + markVaryingValues(User, V); + } + + // Mark uses of V for certain kinds of values. + Instruction *VIns = dyn_cast(V); + if (!VIns) { + return; + } + + if (StoreInst *Store = dyn_cast(VIns)) { + // Find the base address for the store. Storing varying values to an + // alloca location requires the alloca to be vectorized. + // We don't want to use extractMemOffset here because this requires the + // uniform value analysis to be finished. + AllocaInst *Alloca = findAllocaFromPointer(Store->getPointerOperand()); + if (Alloca) { + markVaryingValues(Alloca); + } + } else if (LoadInst *Load = dyn_cast(VIns)) { + AllocaInst *Alloca = findAllocaFromPointer(Load->getPointerOperand()); + if (Alloca) { + markVaryingValues(Alloca); + } + } else if (GetElementPtrInst *GEP = dyn_cast(VIns)) { + // We need to clear the flags because the initial address may be out of + // bounds but masked out. + GEP->setNoWrapFlags(GEPNoWrapFlags::none()); + + // Same as with the stores + AllocaInst *Alloca = findAllocaFromPointer(GEP->getPointerOperand()); + if (Alloca) { + markVaryingValues(Alloca); + } + } else if (BitCastInst *BC = dyn_cast(VIns)) { + // Same as with the stores + AllocaInst *Alloca = findAllocaFromPointer(BC->getOperand(0)); + if (Alloca) { + markVaryingValues(Alloca); + } + } else if (CallInst *CI = dyn_cast(VIns)) { + // Stores might be function calls as well + // Known MemOps have one known pointer operand which we can check. + if (auto Op = MemOp::get(CI)) { + if (auto *const Ptr = Op->getPointerOperand()) { + if (auto *Alloca = findAllocaFromPointer(Ptr)) { + markVaryingValues(Alloca); + } + } + } else { + // Check all parameters of unknown calls with pointer arguments. + for (auto &A : CI->args()) { + if (A->getType()->isPointerTy()) { + if (auto *Alloca = findAllocaFromPointer(A)) { + markVaryingValues(Alloca); + } + } + } + } + } +} + +Value *UniformValueResult::extractMemBase(Value *Address) { + if (BitCastInst *BCast = dyn_cast(Address)) { + return extractMemBase(BCast->getOperand(0)); + } else if (auto *ASCast = dyn_cast(Address)) { + return extractMemBase(ASCast->getOperand(0)); + } else if (isa(Address)) { + return Address; + } else if (isa(Address)) { + return Address; + } else if (isa(Address)) { + return Address; + } else if (isa(Address)) { + return Address; + } else if (auto *const Phi = dyn_cast(Address)) { + // If all the incoming values are the same, we can trace through it. 
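markVaryingValues() combines two propagations: a flood forwards through users, plus an escalation from varying memory accesses to the underlying alloca, which then re-floods the alloca's own users. A compressed model of that interplay on an invented node graph:

```cpp
#include <cassert>
#include <set>
#include <vector>

enum Op { Alloca, Gep, Store, Load, Id };
struct Node { Op op; int ptr = -1; std::vector<int> users; };

// Chase pointer operands back to an alloca, like findAllocaFromPointer().
int allocaOf(int v, const std::vector<Node> &g) {
  while (v != -1 && g[v].op != Alloca) v = g[v].ptr;
  return v;
}

void markVarying(int v, const std::vector<Node> &g, std::set<int> &varying) {
  if (!varying.insert(v).second) return;       // do not visit values twice
  for (int u : g[v].users) markVarying(u, g, varying);
  if (g[v].op == Store || g[v].op == Load) {   // escalate to the alloca
    const int a = allocaOf(g[v].ptr, g);
    if (a != -1) markVarying(a, g, varying);
  }
}

int main() {
  // 0: alloca, 1: gep(0), 2: store id -> gep, 3: load(0), 4: get_global_id.
  std::vector<Node> g(5);
  g[0] = {Alloca, -1, {1, 3}};
  g[1] = {Gep, 0, {2}};
  g[2] = {Store, 1, {}};
  g[3] = {Load, 0, {}};
  g[4] = {Id, -1, {2}};
  std::set<int> varying;
  markVarying(4, g, varying);                  // root: the work-item id
  assert(varying.count(0) && varying.count(3));  // alloca and its load too
  return 0;
}
```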
In + // the general case, it's not trivial to check that the stride is the same + // from every incoming block, and since incoming values may not dominate + // the IRBuilder insert point, we might not even be able to build the + // offset expression instructions there. + if (auto *const CVal = Phi->hasConstantValue()) { + return extractMemBase(CVal); + } + + // In the simple case of a loop-incremented pointer using a GEP, we can + // handle it thus: + auto NumIncoming = Phi->getNumIncomingValues(); + if (NumIncoming != 2) { + // Perhaps we can handle more than one loop latch, but not yet. + return nullptr; + } + + if (auto *const GEP = + dyn_cast(Phi->getIncomingValue(1))) { + // If it's a simple loop iterator, the base can be analyzed from the + // initial value. + if (GEP->getPointerOperand() == Phi) { + for (const auto &index : GEP->indices()) { + if (isVarying(index.get())) { + return nullptr; + } + } + return extractMemBase(Phi->getIncomingValue(0)); + } + } + + return nullptr; + } else if (auto *GEP = dyn_cast(Address)) { + // Try to recursively extract the base from the GEP base. + return extractMemBase(GEP->getPointerOperand()); + } else if (isVarying(Address)) { + // If it's varying we can't analyze it any further. + return nullptr; + } else { + // If it's uniform we can just return the uniform address. + return Address; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +llvm::AnalysisKey UniformValueAnalysis::Key; + +UniformValueResult +UniformValueAnalysis::run(llvm::Function &F, + llvm::FunctionAnalysisManager &AM) { + VectorizationUnit &VU = AM.getResult(F).getVU(); + UniformValueResult Res(F, VU); + std::vector Roots; + Res.findVectorRoots(Roots); + + // Mark all roots and their uses as being varying. + for (Value *Root : Roots) { + Res.markVaryingValues(Root); + } + + const compiler::utils::BuiltinInfo &BI = Res.Ctx.builtins(); + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + // Find atomic instructions, these are always varying + if (I.isAtomic()) { + Res.markVaryingValues(&I); + continue; + } + + // The same goes for the atomic builtins as well + if (CallInst *CI = dyn_cast(&I)) { + if (Function *Callee = CI->getCalledFunction()) { + const auto Builtin = BI.analyzeBuiltin(*Callee); + if (Builtin && + Builtin->properties & compiler::utils::eBuiltinPropertyAtomic) { + Res.markVaryingValues(&I); + continue; + } + } + } + } + } + + // If an alloca has been initialized with a uniform value, findVectorLeaves() + // will not pick up the store instruction as a leaf, even when that alloca is + // used by some other leaves. We have to go through all the allocas and mark + // them as varying if any varying instructions use them. This is the case + // also for masked stores where only the mask is varying. + bool Changed = true; + while (Changed) { + DenseSet Visited; + Changed = false; + bool Remaining = false; + for (Instruction &I : F.front()) { + if (isa(&I)) { + if (!Res.isVarying(&I)) { + if (findStrayLeaves(Res, I, Visited)) { + // We found a varying leaf, so this Alloca is non-uniform. + Res.markVaryingValues(&I); + + // Marking an alloca as varying could mark a leaf as varying that + // may also depend on a different alloca, so we have to go again. 
+ Changed = true; + } else { + Remaining = true; + } + } + } else { + break; + } + } + Changed &= Remaining; + } + + return Res; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp new file mode 100644 index 0000000000000..edf0101ba883a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp @@ -0,0 +1,133 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "analysis/vectorizable_function_analysis.h" + +#include +#include +#include +#include + +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "vectorization_context.h" + +#define DEBUG_TYPE "vecz-function-analysis" + +using namespace vecz; +using namespace llvm; + +llvm::AnalysisKey VectorizableFunctionAnalysis::Key; + +/// @brief Tell Vecz to go ahead and handle calls to declaration-only functions +/// +/// This flag is for testing and debugging purposes and it should not be used +/// for normal code as instantiating undefined functions is not always valid. +static cl::opt HandleDeclOnlyCalls( + "vecz-handle-declaration-only-calls", + cl::desc("Go ahead and handle calls to declaration-only functions")); + +namespace { + +/// @brief Determine whether the instruction can be vectorized or not. +/// +/// @param[in] I Instruction to check for vectorizability. +/// @param[in] Ctx VectorizationContext for BuiltinInfo. +/// +/// @return true if I can be vectorized, false otherwise. +bool canVectorize(const Instruction &I, const VectorizationContext &Ctx) { + // Certain instructions just cannot appear. + switch (I.getOpcode()) { + default: + break; + case Instruction::IndirectBr: + case Instruction::VAArg: + case Instruction::Invoke: + case Instruction::Resume: + case Instruction::LandingPad: + return false; + } + + // User function calls. + if (const CallInst *CI = dyn_cast(&I)) { + if (const Function *Callee = CI->getCalledFunction()) { + // We are going to assume that we can handle LLVM intrinsics for now and + // let the later passes deal with them + if (Callee->isIntrinsic()) { + return true; + } + + // All builtins should be vectorizable, in principle. "Invalid builtins" + // correspond to user functions. + const compiler::utils::BuiltinInfo &BI = Ctx.builtins(); + const auto Builtin = BI.analyzeBuiltin(*Callee); + if (!Builtin) { + // If it is a user function missing a definition, we cannot safely + // instantiate it. For example, what if it contains calls to + // get_global_id internally? + if (Callee->isDeclaration()) { + return HandleDeclOnlyCalls; + } + // The same goes for functions we cannot inline, at least until we have + // a way of determining if a function can be safely instantiated or not. 
+ if (Callee->hasFnAttribute(Attribute::NoInline)) { + return false; + } + } + } + } + + return true; +} + +/// @brief Determine whether the function can be vectorized or not. +/// +/// @param[in] F Function to check for vectorizability. +/// @param[in] Ctx VectorizationContext for BuiltinInfo. +/// +/// @return true if F can be vectorized, false otherwise. +bool canVectorize(const Function &F, const VectorizationContext &Ctx) { + // Do not vectorize functions with the OptNone attribute. Also do not + // vectorize functions with the NoInline attribute, since conceptually, the + // vectorized kernel calls the original kernel in a loop, and then that gets + // inlined and optimized. + if (F.hasFnAttribute(Attribute::OptimizeNone) || + F.hasFnAttribute(Attribute::NoInline)) { + return false; + } + + // Look for things that are not (yet?) supported. + for (const BasicBlock &BB : F) { + for (const Instruction &I : BB) { + if (!canVectorize(I, Ctx)) { + return false; + } + } + } + return true; +} + +} // namespace + +VectorizableFunctionAnalysis::Result +VectorizableFunctionAnalysis::run(llvm::Function &F, + llvm::FunctionAnalysisManager &AM) { + Result res; + auto &Ctx = AM.getResult(F).getContext(); + + res.canVectorize = canVectorize(F, Ctx); + return res; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp new file mode 100644 index 0000000000000..484da6c6c8eae --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp @@ -0,0 +1,40 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
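The two canVectorize() overloads reduce to a short opcode blacklist plus function-level attribute checks. Schematically, with toy enums standing in for LLVM opcodes and attributes:

```cpp
#include <cassert>
#include <vector>

enum Opcode { AddOp, LoadOp, IndirectBrOp, VAArgOp, InvokeOp };
struct Fn { bool optNone = false, noInline = false;
            std::vector<Opcode> body; };

// Per-instruction screen: certain opcodes simply cannot appear.
bool canVectorize(Opcode op) {
  switch (op) {
    case IndirectBrOp: case VAArgOp: case InvokeOp: return false;
    default: return true;
  }
}

// Per-function screen: attribute checks first, then scan the body.
bool canVectorize(const Fn &f) {
  if (f.optNone || f.noInline) return false;
  for (Opcode op : f.body)
    if (!canVectorize(op)) return false;
  return true;
}

int main() {
  assert(canVectorize(Fn{false, false, {AddOp, LoadOp}}));
  assert(!canVectorize(Fn{false, false, {AddOp, InvokeOp}}));
  assert(!canVectorize(Fn{true, false, {AddOp}}));
  return 0;
}
```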
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "analysis/vectorization_unit_analysis.h" + +#define DEBUG_TYPE "vecz-unit-analysis" + +using namespace vecz; + +llvm::AnalysisKey VectorizationUnitAnalysis::Key; + +VectorizationUnitAnalysis::Result +VectorizationUnitAnalysis::run(llvm::Function &F, + llvm::FunctionAnalysisManager &) { + return Result{Ctx.getActiveVU(&F)}; +} + +#undef DEBUG_TYPE +#define DEBUG_TYPE "vecz-context-analysis" + +llvm::AnalysisKey VectorizationContextAnalysis::Key; + +VectorizationContextAnalysis::Result +VectorizationContextAnalysis::run(llvm::Function &, + llvm::FunctionAnalysisManager &) { + return Result{Context}; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp new file mode 100644 index 0000000000000..b6099bad61731 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp @@ -0,0 +1,1401 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "control_flow_boscc.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "analysis/divergence_analysis.h" +#include "analysis/liveness_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "ir_cleanup.h" +#include "llvm_helpers.h" +#include "reachability.h" +#include "vectorization_context.h" +#include "vectorization_unit.h" +#include "vecz/vecz_choices.h" + +#define DEBUG_TYPE "vecz-cf" + +using namespace llvm; +using namespace vecz; + +namespace { +using RPOT = ReversePostOrderTraversal<Function *>; + +bool isUsedOutsideDefinitionBlock(Value *V) { + if (Instruction *I = dyn_cast<Instruction>(V)) { + return std::any_of(I->user_begin(), I->user_end(), [&I](User *U) { + return cast<Instruction>(U)->getParent() != I->getParent(); + }); + } + return false; +} + +/// @brief Check whether a block is "trivial" according to a heuristic +/// @param[in] BB the Basic Block to check +/// @return true if the block is trivial +bool isTrivialBlock(const BasicBlock &BB) { + if (BB.size() > 3) { + return false; + } + + for (const auto &I : BB) { + if (I.mayReadOrWriteMemory() || I.mayHaveSideEffects() || + isa(&I)) { + return false; + } + } + return true; +} + +} // namespace + +/// @brief Check whether a uniform region is viable and worth keeping. +/// @param[in] region the region to check +/// @param[in] noDuplicateBlocks blocks the region is not allowed to contain +/// @return false iff the region should be discarded. + +bool ControlFlowConversionState::BOSCCGadget::duplicateUniformRegions() { + LLVM_DEBUG(dbgs() << "DUPLICATE UNIFORM REGIONS\n"); + + // Keep track of blocks that contain NoDuplicate calls.
+ DenseSet noDuplicateBlocks; + SmallPtrSet noDuplicateLoops; + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + if (CallInst *CI = dyn_cast(&I)) { + if (CI->hasFnAttr(Attribute::NoDuplicate)) { + noDuplicateBlocks.insert(&BB); + auto *const loop = DR->getTag(&BB).loop; + if (loop) { + noDuplicateLoops.insert(loop->loop); + } + break; + } + } + } + } + + // First, create the regions. + VECZ_FAIL_IF(!createUniformRegions(noDuplicateBlocks)); + + // Keep track of blocks that belong to loops. If a whole loop is duplicated, + // then a new loop object should be created for the uniform version. + SmallVector duplicatedLoops; + SmallPtrSet duplicatedLoopSet; + + const size_t size = + std::accumulate(uniformRegions.begin(), uniformRegions.end(), 0, + [](size_t base, const UniformRegion ®ion) { + return base + region.predicatedBlocks.size(); + }); + std::vector newBlocks; + newBlocks.reserve(size); + + // Conserve the original edges of the CFG. + for (BasicBlock &BB : F) { + for (BasicBlock *succ : successors(&BB)) { + uniformEdges[&BB].push_back(succ); + } + } + + // Then duplicate them. + for (auto ®ion : uniformRegions) { + BasicBlock *entry = region.entryBlock; + + std::vector sortedNewRegionBlocks; + sortedNewRegionBlocks.reserve(region.predicatedBlocks.size()); + + // Process the region's predicated blocks in DCBI order. + // Gather the block indices, then sort them. + std::vector predicatedBlockIndices; + predicatedBlockIndices.reserve(region.predicatedBlocks.size()); + for (auto *const B : region.predicatedBlocks) { + predicatedBlockIndices.push_back(DR->getTagIndex(B)); + } + std::sort(predicatedBlockIndices.begin(), predicatedBlockIndices.end()); + + for (const auto index : predicatedBlockIndices) { + const auto &BTag = DR->getBlockTag(index); + auto *const B = BTag.BB; + auto *const LTag = BTag.loop; + + // If the block is the BOSCC entry block, we don't want to duplicate it + // unless it is part of a loop. + if (B == entry && !LTag) { + continue; + } + + BasicBlock *newB = nullptr; + // If we have already cloned 'B', then we can reuse the cloned version. + if (VMap.count(B)) { + continue; + } + + newB = CloneBasicBlock(B, VMap, ".uniform", &F); + VMap.insert({B, newB}); + region.uniformBlocks.insert(newB); + newBlocks.push_back(newB); + sortedNewRegionBlocks.push_back(newB); + + // The new blocks will remain uniform + BasicBlockTag &newBTag = DR->getOrCreateTag(newB); + DR->setFlag(*newB, eBlockIsUniform); + + if (LTag) { + auto *const loop = LTag->loop; + if (LTag->header == B) { + duplicatedLoopSet.insert(loop); + duplicatedLoops.push_back(loop); + } + + if (!duplicatedLoopSet.contains(loop)) { + newBTag.loop = LTag; + loop->addBasicBlockToLoop(newB, *LI); + } + } + } + + // Splice the newly inserted blocks into the function right before the + // first div_causing block. + if (!sortedNewRegionBlocks.empty() && + entry->getNextNode() != sortedNewRegionBlocks[0]) { + F.splice(entry->getNextNode()->getIterator(), &F, + sortedNewRegionBlocks[0]->getIterator(), F.end()); + } + } + + // Since we added all loops by their headers in DCBI order, inner loops will + // always follow outer loops, so there is no need to sort them. + for (Loop *L : duplicatedLoops) { + if (!LMap.contains(L) && !noDuplicateLoops.contains(L)) { + VECZ_FAIL_IF(!duplicateUniformLoops(L)); + } + } + + // Fix the duplicated instructions arguments. 
+ for (BasicBlock *B : newBlocks) { + const bool notHeader = !DR->getTag(B).isLoopHeader(); + + for (Instruction &I : *B) { + RemapInstruction(&I, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + + // Update the phi nodes if a uniform block has any incoming blocks* + // that are not div causing. In that case, the predicated incoming blocks + // will never be rewired to the uniform block so we can remove the + // incoming block from the phi node, unless 'B' is a loop header, in which + // case its predicated preheader (if any) will be rewired to it while we + // connect the regions). + // + // *NOTE a non-div-causing incoming block may or may not be a predicated + // block. A By All block with a non-varying branch can still branch into + // a BOSCC region, which would seem to break the SESE criteria. + if (notHeader) { + if (PHINode *PHI = dyn_cast(&I)) { + for (unsigned i = 0; i < PHI->getNumIncomingValues(); ++i) { + BasicBlock *PHIB = PHI->getIncomingBlock(i); + if (!DR->isUniform(*PHIB) && + !DR->hasFlag(*PHIB, + BlockDivergenceFlag::eBlockHasDivergentBranch)) { + PHI->removeIncomingValue(i--); + } + } + } + } + } + } + + return true; +} + +bool ControlFlowConversionState::BOSCCGadget::duplicateUniformLoops(Loop *L) { + const LoopTag <ag = DR->getTag(L); + Loop *const uniformL = LI->AllocateLoop(); + + // Either add 'uniformL' as a child of a loop or as a top level loop. + // If it is a child loop, either add it as a child of a uniform loop if it + // exists, otherwise as a child of a predicated loop. + if (Loop *parentL = L->getParentLoop()) { + auto it = LMap.find(parentL); + if (it != LMap.end()) { + it->second->addChildLoop(uniformL); + } else { + parentL->addChildLoop(uniformL); + } + } else { + LI->addTopLevelLoop(uniformL); + } + + LMap.insert({L, uniformL}); + + LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " has been duplicated\n"); + + // Fill the loop tag. + LoopTag *uniformLTag = &DR->getOrCreateTag(uniformL); + + // The preheader of the loop may not have been duplicated. + BasicBlock *preheader = LTag.preheader; + if (BasicBlock *uniformPreheader = getBlock(preheader)) { + preheader = uniformPreheader; + } + uniformLTag->preheader = preheader; + uniformLTag->header = getBlock(LTag.header); + uniformLTag->latch = getBlock(LTag.latch); + + LLVM_DEBUG(dbgs() << "\tPreheader: " << uniformLTag->preheader->getName() + << "\n"); + LLVM_DEBUG(dbgs() << "\tHeader: " << uniformLTag->header->getName() << "\n"); + LLVM_DEBUG(dbgs() << "\tLatch: " << uniformLTag->latch->getName() << "\n"); + + // Add all blocks to the uniform version. 
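The duplication above uses the usual LLVM clone-then-remap idiom: CloneBasicBlock() yields clones whose operands still reference the originals, and only once VMap is complete does RemapInstruction() rewire intra-region references, leaving unmapped ones (RF_IgnoreMissingLocals) alone. The same two-phase idea on a toy CFG:

```cpp
#include <cassert>
#include <map>
#include <vector>

struct Block { std::vector<int> succs; };

int main() {
  std::vector<Block> cfg = {{{1}}, {{2}}, {{}}};   // 0 -> 1 -> 2
  std::map<int, int> vmap;                         // old block -> clone

  // Phase 1: clone blocks 1 and 2 (the "predicated" region); the clones'
  // successor edges still point at the originals for now.
  for (int b : {1, 2}) {
    vmap[b] = int(cfg.size());
    cfg.push_back(cfg[b]);
  }

  // Phase 2: remap. References with no mapping (block 0 here) are left
  // untouched, analogous to RF_IgnoreMissingLocals.
  for (const auto &kv : vmap)
    for (int &s : cfg[kv.second].succs) {
      auto it = vmap.find(s);
      if (it != vmap.end()) s = it->second;
    }

  assert(cfg[vmap[1]].succs[0] == vmap[2]);        // clone edge 1' -> 2'
  assert(cfg[1].succs[0] == 2);                    // original untouched
  return 0;
}
```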
+ for (BasicBlock *blockL : L->blocks()) { + if (DR->getTag(blockL).loop->loop == L) { + BasicBlockTag &uniformBlockLTag = DR->getTag(getBlock(blockL)); + uniformL->addBasicBlockToLoop(uniformBlockLTag.BB, *LI); + uniformBlockLTag.loop = uniformLTag; + } + } + + return true; +} + +bool ControlFlowConversionState::BOSCCGadget::createUniformRegions( + const DenseSet<BasicBlock *> &noDuplicateBlocks) { + auto discardRegion = + [&noDuplicateBlocks](const UniformRegion &region) -> bool { + // To determine whether it is worth duplicating the uniform region, we must + // take several elements into account: + // - The length of the duplicated code + // - branch probability + // size_t cost = + // std::accumulate(Region->predicatedBlocks.begin(), + // Region->predicatedBlocks.end(), 0, + // [](int x, BasicBlock *B) { return x + + // B->size(); }); + // PercentageOfAllTrue = + // runTimeValuesOfVectorPredicateAllTrue / + // runTimeValuesOfVectorPredicate; + // + // It may not be worth duplicating the whole uniform region but still worth + // duplicating some of the divergent branches in it. + + if (region.predicatedBlocks.empty() /*|| cost > max*/) { + return true; + } + + // If the region we want to duplicate contains NoDuplicate + // function calls, then we cannot duplicate it. + if (std::any_of(region.predicatedBlocks.begin(), + region.predicatedBlocks.end(), + [&noDuplicateBlocks](BasicBlock *B) { + return noDuplicateBlocks.count(B); + })) { + LLVM_DEBUG(dbgs() << "Region of " << region.entryBlock->getName() + << " cannot be duplicated because of " + "NoDuplicate instructions\n"); + return true; + } + + // It's not worth BOSCCing if all the blocks are trivial + if (std::all_of(region.predicatedBlocks.begin(), + region.predicatedBlocks.end(), + [](BasicBlock *B) { return isTrivialBlock(*B); })) { + return true; + } + + return false; + }; + + // We wish to identify Single-Entry, Single-Exit regions of the CFG + // that contain divergence-causing branches. A SESE region is defined + // as a subgraph of the CFG with an entry point at A and an exit point + // at B such that: + // 1. A dominates B + // 2. B post-dominates A + // 3. Any loop containing A also contains B, and vice-versa. + // + // The properties of the Dominance-Compact Block Indexing also happen to + // imply SESE-compactness, so once we identify an entry point, we can + // construct a SESE region by finding the exit block that post-dominates + // everything in a subsequence of the DCBI starting from A. + // + // We had assumed initially that any divergence-causing block will be the + // start of a SESE region. However, certain edge cases have arisen during + // testing that demonstrate that this is not the case. In practice, this + // doesn't seem to matter, as long as we can fully identify the predicated + // subset of the SESE region, so we are really working with Multiple-Entry, + // Single-Exit regions here. This was the cause of the BOSCC Back Door bug + // that was encountered previously, where the entry block of a + // supposed SESE region did not actually dominate everything in the region, + // which in this case was caused by an additional non-divergent code path + // (the "back door" entry point), but it is equally possible for two + // divergence-causing branches to enter a predicated region. + // + // a) A* b) A c) A d) A . + // / \ / \ / \ / \ . + // B D B* D B* D* B* D* . + // / \ / \ / \ / \ / \ / \ / \ / \ . + // C F E C F E C F E C F E . + // \ | / \ | / \ | / \ / / . + // \ | / \ | / \ | / G / . + // \|/ \|/ \|/ \ / . + // X X X X .
+ // + // Figure 1. CFGs showing SESE regions. Divergence-causing blocks are marked + // with an asterisk. Blocks are labelled alphabetically in DCBI order. + // + // (1a) shows the case of a SESE region with a divergence-causing entry block. + // + // (1b) shows the "back door" case, where a block inside the predicated + // sub-region has a non-divergent predecessor outside of it. + // + // (1c) shows a SESE region with two divergence-causing entry points into the + // predicated sub-region. This will result in two overlapping regions. + // + // (1d) shows a case where the exit block of the SESE region is not the + // immediate post-dominator of B, the first-encountered divergence causing + // block. Therefore the two overlapping regions have different exit blocks. + // + // Another situation can arise where the SESE region can contain + // two completely unconnected predicated subregions. Although the DCBI is + // SESE compact, a SESE region can still contain other, nested SESE regions. + // Since an entry point into the predicated subregion is not necessarily the + // SESE entry point, all predicated blocks may not be reachable from every + // entry point. Because of these cases, it is necessary to consider each + // divergence causing block that is not part of the predicated subregion of + // any other divergence causing block as the entry point of their own SESE + // regions, even though this does not strictly satisfy the SESE criteria. + // + // a) A b) A Figure 2. + // / \ / \ . + // B* E* / D* (2a) shows a case of two independent regions + // / \ / \ / / \ sharing an exit block. + // C D F G B* E F . + // \ | | / / \ \ / (2b) shows a case where a SESE subregion will + // \| |/ C \ G appear in the middle of the DCBI of the + // \ / \ \ / subregion beginning with B. G post-dominates + // X \ H D, forming a complete nested SESE region. + // \ / . + // X . + + struct SESEInfo { + BasicBlock *BB = nullptr; + bool divCausing = false; + bool predicated = false; + }; + + // Collect all the blocks in the worklist + const auto &DCBI = DR->getBlockOrdering(); + const size_t numBlocks = DCBI.size(); + SmallVector SESE; + SESE.reserve(numBlocks); + for (const auto &BBTag : DCBI) { + SESE.emplace_back(); + SESE.back().BB = BBTag.BB; + } + + // Mark all the divergence-causing blocks + for (auto *const BB : DR->getDivCausingBlocks()) { + SESE[DR->getTagIndex(BB)].divCausing = true; + } + + // Create the BOSCC regions + for (size_t i = 0; i != numBlocks;) { + auto &info = SESE[i]; + if (!info.divCausing) { + ++i; + continue; + } + + uniformRegions.emplace_back(); + auto ®ion = uniformRegions.back(); + const size_t entryPos = i; + size_t exitPos = 0u; + size_t firstPredicated = numBlocks; + + region.entryBlock = info.BB; + region.divergentBranches.push_back(info.BB); + + SmallVector stack; + + // If we are in a divergent loop, then the whole loop needs a uniform + // version. 
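The traversal implemented a little further down can be previewed in miniature: flood the successors of a divergence-causing entry and treat the unique successor that post-dominates the entry as the region exit. A sketch over the CFG of figure (1a), with the post-dominance query hard-coded for the toy graph:

```cpp
#include <cassert>
#include <set>
#include <vector>

int main() {
  // Figure (1a): A=0, B=1, D=2, C=3, F=4, E=5, X=6; A is divergence-causing.
  std::vector<std::vector<int>> succs = {
      {1, 2}, {3, 4}, {4, 5}, {6}, {6}, {6}, {}};
  auto postDominatesEntry = [](int b) { return b == 6; };  // X pdom A

  const int entry = 0;
  int exitBlock = -1;
  std::set<int> predicated;
  std::vector<int> stack = {entry};
  while (!stack.empty()) {
    const int cur = stack.back();
    stack.pop_back();
    for (int s : succs[cur]) {
      if (postDominatesEntry(s)) { exitBlock = s; continue; }  // region exit
      if (predicated.insert(s).second) stack.push_back(s);     // predicate it
    }
  }

  assert(exitBlock == 6);
  assert(predicated == std::set<int>({1, 2, 3, 4, 5}));
  return 0;
}
```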
+ const auto *const entryLoopTag = DR->getTag(info.BB).loop; + if (entryLoopTag && entryLoopTag->isLoopDivergent()) { + auto *const loop = entryLoopTag->loop; + for (BasicBlock *loopB : loop->blocks()) { + const size_t pos = DR->getTagIndex(loopB); + firstPredicated = std::min(firstPredicated, pos); + SESE[pos].predicated = true; + region.predicatedBlocks.insert(loopB); + + if (loop->isLoopExiting(loopB)) { + stack.push_back(pos); + } + } + } + + // Traverse the CFG from the entry point, marking blocks for predication + stack.push_back(entryPos); + while (!stack.empty()) { + auto *const cur = SESE[stack.pop_back_val()].BB; + for (BasicBlock *succ : successors(cur)) { + const size_t succPos = DR->getTagIndex(succ); + + auto *const succLoopTag = DR->getBlockTag(succPos).loop; + if ((!succLoopTag || !succLoopTag->isLoopDivergent()) && + // The region 'entry' creates contains only blocks that are + // contained in its SESE region. + PDT->properlyDominates(succ, region.entryBlock)) { + VECZ_ERROR_IF(exitPos != 0u && succPos != exitPos, + "SESE region multiple exit blocks identified"); + exitPos = succPos; + continue; + } + + auto &succInfo = SESE[succPos]; + if (!succInfo.predicated) { + firstPredicated = std::min(firstPredicated, succPos); + stack.push_back(succPos); + region.predicatedBlocks.insert(succ); + succInfo.predicated = true; + } + } + } + VECZ_ERROR_IF(exitPos == 0u, "SESE region exit block not identified"); + region.exitBlock = SESE[exitPos].BB; + i = exitPos; + + // Collect any other divergent branches in the predicated region, and clear + // the predication flags so regions can overlap. + for (unsigned j = firstPredicated; j != exitPos; ++j) { + auto &ji = SESE[j]; + if (ji.divCausing && j > entryPos) { + if (ji.predicated) { + region.divergentBranches.push_back(ji.BB); + ji.divCausing = false; + } else if (j < i) { + // Found another unpredicated divergent branch between the entry + // point and the exit point. Reset the iterator so we can process it. + i = j; + } + } + ji.predicated = false; + } + + if (discardRegion(region)) { + // It's not worth keeping this region. + uniformRegions.pop_back(); + } + } + + return true; +} + +bool ControlFlowConversionState::BOSCCGadget::connectBOSCCRegions() { + LLVM_DEBUG(dbgs() << "CONNECT BOSCC REGIONS\n"); + + // If we have not duplicated a loop but we have duplicated the preheader, + // then the loop now has 2 preheaders. We thus need to blend them into one + // single preheader. + for (auto *const LTag : DR->getLoopOrdering()) { + if (!LTag->isLoopDivergent() && !LMap.contains(LTag->loop)) { + BasicBlock *predicatedPreheader = LTag->preheader; + if (BasicBlock *uniformPreheader = getBlock(predicatedPreheader)) { + BasicBlock *header = LTag->header; + + LLVM_DEBUG(dbgs() << "Loop " << header->getName() + << " has two preheaders\n"); + + // Create a new loop preheader that blends both the uniform and + // predicated preheaders, to keep well formed loops (with only one + // incoming preheader). + BasicBlock *newPreheader = BasicBlock::Create( + F.getContext(), predicatedPreheader->getName() + ".blend", &F, + header); + BranchInst::Create(header, newPreheader); + + // Set the successor of both preheaders to be the new preheader. 
+ auto *predicatedPreheaderT = predicatedPreheader->getTerminator(); + auto *uniformPreheaderT = uniformPreheader->getTerminator(); + VECZ_ERROR_IF(predicatedPreheaderT->getNumSuccessors() != 1, + "Preheader should have only one successor"); + VECZ_ERROR_IF(uniformPreheaderT->getNumSuccessors() != 1, + "Preheader should have only one successor"); + predicatedPreheaderT->setSuccessor(0, newPreheader); + uniformPreheaderT->setSuccessor(0, newPreheader); + + // Update the tags. + BasicBlockTag &newPreheaderTag = DR->getOrCreateTag(newPreheader); + newPreheaderTag.loop = DR->getTag(predicatedPreheader).loop; + LTag->preheader = newPreheader; + + DR->setFlag(*newPreheader, DR->getFlag(*predicatedPreheader)); + + addInRegions(newPreheader, predicatedPreheader); + } + } + } + + // We must make the outermost non duplicated loop's preheader target the + // outermost duplicated uniform and predicated loop's headers. The first + // iteration of the loop will necessarily have all lanes activated until it + // reaches the first divergent block. Also, once the loop starts diverging, + // there is no way to go back to a dynamically uniform loop, so there is no + // point allowing the loop to go back and forth between its uniform and + // predicated versions. Only going from the uniform to the predicated + // version makes sense. + for (const auto &pair : LMap) { + Loop *uniformL = pair.second; + const Loop *L = pair.first; + + if (Loop *parentL = L->getParentLoop()) { + if (LMap.contains(parentL)) { + continue; + } + } + + const auto <ag = DR->getTag(L); + BasicBlock *preheader = LTag.preheader; + if (!VMap.count(preheader)) { + auto *T = preheader->getTerminator(); + VECZ_ERROR_IF(T->getNumSuccessors() != 1, + "Preheader has more than one successor"); + + LLVM_DEBUG(dbgs() << "Non duplicated preheader " << preheader->getName() + << "must target uniform loop " << uniformL->getName() + << "\n"); + + // Add a path from 'preheader' to the uniform loop header and make it + // always branch to it. We want to keep the edge from 'preheader' to the + // predicated loop header (even though we will never branch to it) to ease + // some needed blendings later on. + IRCleanup::deleteInstructionNow(T); + BranchInst::Create(DR->getTag(uniformL).header, LTag.header, + ConstantInt::getTrue(F.getContext()), preheader); + } + } + + DenseSet connectedBlocks; + for (auto ®ion : uniformRegions) { + // Each uniform version of div causing blocks need an entry point to the + // predicated CFG. + for (BasicBlock *B : region.divergentBranches) { + if (connectedBlocks.insert(B).second) { + if (BasicBlock *uniformB = getBlock(B)) { + VECZ_FAIL_IF(!connectUniformRegion(region, B, uniformB)); + } else { + VECZ_FAIL_IF(!connectUniformRegion(region, B, B)); + } + } else { + // No other region should have connected the entry block. + BasicBlock *entry = region.entryBlock; + VECZ_FAIL_IF(B == entry); + } + } + } + + // If a uniform block targets a predicated block, the latter needs its + // operands that have a uniform and predicated version blended. + for (const auto &predicatedBTag : DR->getBlockOrdering()) { + if (BasicBlock *uniformB = getBlock(predicatedBTag.BB)) { + for (BasicBlock *succ : successors(uniformB)) { + // We've found a uniform block that targets a predicated block prior + // to connecting the regions. 
+        if (!DR->isUniform(*succ)) {
+          LLVM_DEBUG(dbgs() << "Uniform block " << uniformB->getName()
+                            << " targets predicated block " << succ->getName()
+                            << "\n");
+          VECZ_FAIL_IF(
+              !blendConnectionPoint(succ, {predicatedBTag.BB, uniformB}));
+        }
+      }
+    }
+  }
+
+  // Add all the uniform blocks into the worklist now that they have been
+  // connected.
+  DT->recalculate(F);
+  PDT->recalculate(F);
+  VECZ_ERROR_IF(!DT->verify(), "DominatorTree incorrectly updated");
+  VECZ_ERROR_IF(!PDT->verify(), "PostDominatorTree incorrectly updated");
+  VECZ_FAIL_IF(!computeBlockOrdering());
+
+  // NOTE doing the Liveness Analysis here is potentially dangerous, since we
+  // have yet to fully restore SSA form.
+  liveness = &AM.getResult<LivenessAnalysis>(F);
+  RC->recalculate(F);
+  VECZ_FAIL_IF(!blendFinalize());
+
+  // Sort URVBlender in a post order so that the replaced new values don't
+  // overlap with old ones.
+  if (!URVB.empty()) {
+    std::sort(URVB.begin(), URVB.end(),
+              [this](const URVBlender::value_type &LHS,
+                     const URVBlender::value_type &RHS) {
+                return DR->getTagIndex(LHS.first) > DR->getTagIndex(RHS.first);
+              });
+
+    // Now that the CFG has been fully rewired and every node is correctly
+    // connected, we can replace the blended values' uses with their new
+    // value.
+    DenseSet<Instruction *> toDelete;
+    for (const URVBlender::value_type &blender : URVB) {
+      BasicBlock *block = blender.first;
+      Value *from = blender.second.first;
+      Instruction *to = blender.second.second;
+      if (!isUsedOutsideDefinitionBlock(from)) {
+        toDelete.insert(to);
+      } else {
+        VECZ_ERROR_IF(!isa<Instruction>(from),
+                      "Trying to replace uses of a value");
+        VECZ_FAIL_IF(
+            !replaceReachableUses(*RC, cast<Instruction>(from), to, block));
+      }
+    }
+
+    for (Instruction *I : toDelete) {
+      IRCleanup::deleteInstructionNow(I);
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::BOSCCGadget::connectUniformRegion(
+    UniformRegion &region, BasicBlock *predicatedB, BasicBlock *uniformB) {
+  auto replaceIncomingBlock = [](BasicBlock *B, BasicBlock *from,
+                                 BasicBlock *to) {
+    for (Instruction &I : *B) {
+      if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+        const int fromIdx = PHI->getBasicBlockIndex(from);
+        if (fromIdx != -1) {
+          PHI->setIncomingBlock(fromIdx, to);
+        }
+      } else {
+        break;
+      }
+    }
+  };
+
+  LLVM_DEBUG(dbgs() << "\tConnect uniform region of " << predicatedB->getName()
+                    << "\n");
+
+  ConstantInt *trueCI = ConstantInt::getTrue(F.getContext());
+
+  auto *T = uniformB->getTerminator();
+
+  BasicBlock *target = predicatedB->getTerminator()->getSuccessor(0);
+
+  // 1. For each pair {taken, fallthrough} of successors of uniformB,
+  // a. 'taken' is taken if the exit mask towards that edge is full, i.e. if
+  // it contains all-true values.
+  // b. otherwise, we branch to a new block, 'boscc_indir'. If the exit mask
+  // towards 'fallthrough' is full, branch to the latter.
+  // c. Otherwise, it means the mask is not dynamically uniform, but varying,
+  // so we need to branch into the varying counterpart of the uniform
+  // region. The chosen block to branch to is the first successor of
+  // predicatedB.
+  // 2. When a latch is divergent, we make the uniform latch target the
+  // predicated header.
+  // 3. We need to feed the last computed uniform values when transitioning to
+  // the varying version.
+  BasicBlock *runtimeCheckerBlock = uniformB;
+  DR->setFlag(*uniformB, eBlockNeedsAllOfMask);
+
+  // 1.
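+  // As an illustration (block names hypothetical), for two successors
+  // {succ0, succ1} of 'uniformB', the code below emits a chain of the form:
+  //
+  //   uniformB:             br (exitMask(succ0) == all-true), succ0, indir
+  //   uniformB.boscc_indir: br (exitMask(succ1) == all-true), succ1, target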
+ SmallVector succs = uniformEdges[predicatedB]; + const size_t size = succs.size(); + VECZ_ERROR_IF(size == 0, "BasicBlock has no successors"); + for (size_t i = 0; i < size; ++i) { + // Not all successors of a BOSCC entry block may be duplicated. + if (BasicBlock *uniformSucc = getBlock(succs[i])) { + succs[i] = uniformSucc; + } + LLVM_DEBUG(dbgs() << "\tSuccessor " << i << ": " << succs[i]->getName() + << "\n"); + } + + for (size_t i = 0; i + 1 < size; ++i) { + BasicBlock *succ = succs[i]; + + BasicBlock *BOSCCIndir = BasicBlock::Create( + uniformB->getContext(), uniformB->getName() + ".boscc_indir", &F, + succ->getNextNode()); + + region.uniformBlocks.insert(BOSCCIndir); + + BasicBlockTag &BOSCCIndirTag = DR->getOrCreateTag(BOSCCIndir); + DR->setFlag(*BOSCCIndir, static_cast( + eBlockNeedsAllOfMask | eBlockIsUniform)); + BOSCCIndirTag.loop = DR->getTag(runtimeCheckerBlock).loop; + if (BOSCCIndirTag.loop) { + BOSCCIndirTag.loop->loop->addBasicBlockToLoop(BOSCCIndir, *LI); + } + + auto *cond = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + PassState.getMaskInfo(uniformB).exitMasks.lookup(succ), + trueCI, "", runtimeCheckerBlock); + BranchInst::Create(succ, BOSCCIndir, cond, runtimeCheckerBlock); + + if (i > 0) { + // Update the incoming block of the phi nodes in 'succ' from 'uniformB' + // to 'runtimeCheckerBlock'. + replaceIncomingBlock(succ, uniformB, runtimeCheckerBlock); + } + + runtimeCheckerBlock = BOSCCIndir; + } + + BasicBlock *succ = succs[size - 1]; + auto *cond = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + PassState.getMaskInfo(uniformB).exitMasks.lookup(succ), + trueCI, "", runtimeCheckerBlock); + + BasicBlock *connectionPoint = target; + + const auto *const LTag = DR->getTag(predicatedB).loop; + const bool needsStore = LTag && LMap.contains(LTag->loop); + if (needsStore) { + // 'store' is a block that will contain all the uniform versions of the + // live in instructions of the predicated target. + BasicBlock *store = BasicBlock::Create( + target->getContext(), uniformB->getName() + ".boscc_store", &F, + runtimeCheckerBlock->getNextNode()); + + region.uniformBlocks.insert(store); + + BasicBlockTag &storeTag = DR->getOrCreateTag(store); + DR->setFlag(*store, eBlockIsUniform); + + // 2. + auto *const uniformLTag = DR->getTag(uniformB).loop; + const bool isLoopLatch = uniformLTag && (uniformLTag->latch == uniformB); + if (isLoopLatch) { + BasicBlock *header = LTag->header; + PHINode *entryMask = + cast(PassState.getMaskInfo(header).entryMask); + Value *latchMask = + PassState.getMaskInfo(uniformB).exitMasks.lookup(uniformLTag->header); + VECZ_ERROR_IF(!latchMask, "Exit mask does not exist"); + entryMask->addIncoming(latchMask, store); + connectionPoint = header; + + if (succ == uniformLTag->header) { + uniformLTag->latch = runtimeCheckerBlock; + } + } + + BranchInst::Create(connectionPoint, store); + + // 'store' belongs in the first outer loop non duplicated. + Loop *parentLoop = LTag->loop->getParentLoop(); + while (parentLoop && LMap.contains(parentLoop)) { + parentLoop = parentLoop->getParentLoop(); + } + if (parentLoop) { + storeTag.loop = &DR->getTag(parentLoop); + parentLoop->addBasicBlockToLoop(store, *LI); + } + + target = store; + } + + // 1.c. 'uniformB' has a new runtime check, we can remove its old one. + IRCleanup::deleteInstructionNow(T); + BranchInst::Create(succ, target, cond, runtimeCheckerBlock); + + // Update the incoming block of the new successors of 'runTimeCheckerBlock'. 
+ replaceIncomingBlock(succ, uniformB, runtimeCheckerBlock); + + if (uniformB == predicatedB) { + replaceIncomingBlock(connectionPoint, predicatedB, runtimeCheckerBlock); + } else { + // 3. + VECZ_FAIL_IF(!blendConnectionPoint( + connectionPoint, + {predicatedB, needsStore ? target : runtimeCheckerBlock})); + + if (needsStore) { + region.storeBlocks.emplace_back(); + auto &sb = region.storeBlocks.back(); + sb.connectionPoint = connectionPoint; + sb.target = target; + sb.runtimeCheckerBlock = runtimeCheckerBlock; + } + } + + return true; +} + +bool ControlFlowConversionState::BOSCCGadget::blendConnectionPoint( + BasicBlock *CP, const std::pair &incoming) { + const auto *const CPLTag = DR->getTag(CP).loop; + for (auto ®ion : uniformRegions) { + // Create blend instructions at each blend point following 'CP'. + if (region.contains(CP) || (CP == region.exitBlock) || + (CP == region.entryBlock)) { + // Compute all the blend points that will need to have blend instructions + // because of 'CP'. These blocks are all the blocks that have more than + // one predecessor, that belong to the same region as 'CP', and that + // succeed it. + if (!region.blendPoints.contains(CP)) { + // The first blend point impacted by 'CP' is 'CP' itself. + region.blendPoints.insert({CP, {CP}}); + + DenseSet visited{CP}; + std::queue queue; + queue.push(CP); + while (!queue.empty()) { + BasicBlock *cur = queue.front(); + queue.pop(); + // The region exit block is the delimiter of the region. + if (cur == region.exitBlock) { + continue; + } + for (BasicBlock *succ : successors(cur)) { + if (visited.insert(succ).second) { + queue.push(succ); + if (std::distance(pred_begin(succ), pred_end(succ)) > 1) { + // Nested loops are dominated. + if (CPLTag == DR->getTag(succ).loop || + (CPLTag && !CPLTag->loop->contains(succ))) { + region.blendPoints[CP].push_back(succ); + } + } + } + } + } + } + + region.connections.push_back(UniformRegion::ConnectionInfo{CP, incoming}); + } + } + return true; +} + +bool ControlFlowConversionState::BOSCCGadget::blendFinalize() { + for (auto ®ion : uniformRegions) { + for (const auto &connection : region.connections) { + BasicBlock *CP = connection.connectionPoint; + auto &incoming = connection.incoming; + + // Create blend instructions at each blend point following 'CP'. + for (BasicBlock *blendPoint : region.blendPoints[CP]) { + LLVM_DEBUG(dbgs() << "BLEND CONNECTION POINT " << blendPoint->getName() + << "\n"); + + for (Instruction &I : *blendPoint) { + if (PHINode *PHI = dyn_cast(&I)) { + // Only add 'incoming' for 'CP' because for the other blend points + // we don't actually add a new edge. + if (blendPoint != CP || + PHI->getBasicBlockIndex(incoming.second) != -1) { + continue; + } + + unsigned idx = 0; + for (; idx < PHI->getNumIncomingValues(); ++idx) { + // If one incoming block of the phi node is the predicated version + // of the new, uniform, incoming block, use its uniform incoming + // value version if it exists. 
+ if (PHI->getIncomingBlock(idx) == incoming.first) { + if (Value *V = getUniformV(PHI->getIncomingValue(idx))) { + if (Instruction *VI = dyn_cast(V)) { + if (RC->isReachable(VI->getParent(), incoming.second)) { + PHI->addIncoming(VI, incoming.second); + break; + } + } + } + } + } + if (idx == PHI->getNumIncomingValues()) { + PHI->addIncoming(getDefaultValue(PHI->getType()), + incoming.second); + } + LLVM_DEBUG( + dbgs() + << "PHINode " << PHI->getName() << ": Add incoming value " + << PHI->getIncomingValueForBlock(incoming.second)->getName() + << " from " << incoming.second->getName() << " in " + << blendPoint->getName() << "\n"); + } else { + break; + } + } + } + } + region.connections.clear(); + } + + DenseSet blendBlocks; + for (const auto ®ion : uniformRegions) { + for (auto &CP : region.blendPoints) { + for (BasicBlock *blendPoint : CP.second) { + blendBlocks.insert(blendPoint); + } + } + } + + for (const auto &tag : DR->getBlockOrdering()) { + BasicBlock *blendPoint = tag.BB; + if (!blendBlocks.contains(blendPoint)) { + continue; + } + + DenseSet blendedValues; + for (Instruction &I : *blendPoint) { + if (PHINode *PHI = dyn_cast(&I)) { + if (PHI->getName().contains(".boscc_blend")) { + for (Value *v : PHI->incoming_values()) { + blendedValues.insert(v); + } + } + } else { + break; + } + } + + for (auto *liveInVal : liveness->getBlockInfo(blendPoint).LiveIn) { + if (blendedValues.contains(liveInVal)) { + continue; + } + + auto *liveIn = dyn_cast(liveInVal); + if (!liveIn) { + continue; + } + + BasicBlock *src = liveIn->getParent(); + + // Nothing to be done if the definition block has no uniform + // equivalent. + BasicBlock *uniformSrc = getBlock(src); + if (!uniformSrc) { + continue; + } + + // Nothing to be done if the instruction: + // - dominates the connection point, + // - cannot reach 'CP'. + if (DT->dominates(src, blendPoint)) { + continue; + } + + if (!RC->isReachable(src, blendPoint)) { + continue; + } + + Value *uniformLiveIn = getDefaultValue(liveIn->getType()); + if (Value *V = getUniformV(liveIn)) { + uniformLiveIn = V; + } + + LLVM_DEBUG(dbgs() << "Blend live in " << liveIn->getName() << " in " + << blendPoint->getName() << "\n"); + + PHINode *blend = PHINode::Create(liveIn->getType(), 2, + liveIn->getName() + ".boscc_blend"); + blend->insertBefore(blendPoint->begin()); + bool replaceUniform = false; + bool replacePredicate = false; + // For each predecessor, if it can reach the instruction, set the + // latter as the incoming value, otherwise set a default value. + for (BasicBlock *pred : predecessors(blendPoint)) { + if (DR->isUniform(*pred)) { + Instruction *uniformLiveInI = dyn_cast(uniformLiveIn); + if (uniformLiveInI && + !RC->isReachable(uniformLiveInI->getParent(), pred)) { + blend->addIncoming(getDefaultValue(uniformLiveInI->getType()), + pred); + } else { + replaceUniform = true; + blend->addIncoming(uniformLiveIn, pred); + } + } else if (DR->getTag(pred).isLoopBackEdge(blendPoint)) { + blend->addIncoming(blend, pred); + } else { + if (!RC->isReachable(liveIn->getParent(), pred)) { + blend->addIncoming(getDefaultValue(liveIn->getType()), pred); + } else { + replacePredicate = true; + blend->addIncoming(liveIn, pred); + } + } + LLVM_DEBUG(dbgs() << "\tAdd incoming value " + << blend->getIncomingValueForBlock(pred)->getName() + << " from " << pred->getName() << "\n"); + } + + // If we have blended 'liveIn' in 'CP', update the uses. 
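+      // Note that the use replacement itself is deferred: the URVB entries
+      // recorded below are only applied in connectBOSCCRegions(), once the
+      // CFG has been fully rewired.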
+ if (replacePredicate) { + URVB.push_back({blendPoint, {liveIn, blend}}); + addReference(blend, liveIn); + } + // If we have blended 'uniformLiveIn' in 'CP', update the uses. + if (replaceUniform && isa(uniformLiveIn)) { + URVB.push_back({blendPoint, {uniformLiveIn, blend}}); + } + + // Update the blend instructions in the loop header, if any. + VECZ_FAIL_IF( + !updateLoopBlendValues(DR->getTag(blendPoint).loop, liveIn, blend)); + blendedValues.insert(liveIn); + } + } + + for (const auto ®ion : uniformRegions) { + for (auto &sb : region.storeBlocks) { + BasicBlock *connectionPoint = sb.connectionPoint; + BasicBlock *target = sb.target; + BasicBlock *runtimeCheckerBlock = sb.runtimeCheckerBlock; + + // Create a bunch of lcssa instructions into 'store' so that the repair + // SSA doesn't have to look for the instructions inside the uniform loop. + for (Instruction &I : *connectionPoint) { + if (PHINode *PHI = dyn_cast(&I)) { + const int idx = PHI->getBasicBlockIndex(target); + VECZ_ERROR_IF(idx == -1, "Connection point PHIs must have incoming " + "block from the target"); + if (Instruction *incoming = + dyn_cast(PHI->getIncomingValue(idx))) { + LLVM_DEBUG(dbgs() + << "Create live-in lcssa of " << incoming->getName() + << " in " << target->getName() << "\n"); + + PHINode *blend = PHINode::Create( + incoming->getType(), 1, incoming->getName() + ".boscc_lcssa"); + blend->insertBefore(target->begin()); + blend->addIncoming(incoming, runtimeCheckerBlock); + PHI->setIncomingValue(idx, blend); + } + } else { + break; + } + } + } + } + return true; +} + +BasicBlock *ControlFlowConversionState::BOSCCGadget::getBlock(BasicBlock *B) { + auto BUniform = VMap.find(B); + if (BUniform != VMap.end()) { + return cast(BUniform->second); + } + return nullptr; +} + +Loop *ControlFlowConversionState::BOSCCGadget::getLoop(Loop *L) { + auto LUniform = LMap.find(L); + if (LUniform != LMap.end()) { + return LUniform->second; + } + return nullptr; +} + +void ControlFlowConversionState::BOSCCGadget::getUnduplicatedEntryBlocks( + SmallVectorImpl &blocks) const { + for (const auto ®ion : uniformRegions) { + if (VMap.count(region.entryBlock) == 0) { + blocks.push_back(region.entryBlock); + } + } +} + +void ControlFlowConversionState::BOSCCGadget::createReference( + Value *pred, Value *uni, bool needsMapping) { + if (!pred || !uni) { + return; + } + auto predIt = VMap.find(pred); + if (predIt != VMap.end()) { + predIt->second = uni; + } else { + VMap.insert({pred, uni}); + } + + if (needsMapping) { + if (Instruction *uniI = dyn_cast(uni)) { + RemapInstruction(uniI, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + } + } +} + +void ControlFlowConversionState::BOSCCGadget::addReference(Value *pred, + Value *old) { + auto uniformOldIt = VMap.find(old); + if (uniformOldIt != VMap.end()) { + VMap.insert({pred, uniformOldIt->second}); + } +} + +void ControlFlowConversionState::BOSCCGadget::addInRegions(BasicBlock *newB, + BasicBlock *refB) { + for (auto ®ion : uniformRegions) { + if (region.contains(refB)) { + if (region.predicatedBlocks.insert(newB).second) { + LLVM_DEBUG(dbgs() << "BasicBlock " << newB->getName() + << " added to BOSCC region: " + << region.entryBlock->getName() << "\n"); + } + } + } +} + +Value * +ControlFlowConversionState::BOSCCGadget::getUniformV(Value *predicatedV) { + auto uniformVIt = VMap.find(predicatedV); + if (uniformVIt != VMap.end()) { + return uniformVIt->second; + } + return nullptr; +} + +void ControlFlowConversionState::BOSCCGadget::updateValue(Value *from, + Value *to) { + auto 
fromIt = VMap.find(from);
+  if (fromIt != VMap.end()) {
+    Value *fromUniform = fromIt->second;
+    VMap.erase(from);
+    VMap.insert({to, fromUniform});
+  }
+}
+
+bool ControlFlowConversionState::BOSCCGadget::linkMasks() {
+  for (const auto &BTag : DR->getBlockOrdering()) {
+    auto *const BB = BTag.BB;
+    if (auto *const uniformB = getBlock(BB)) {
+      // Both sets of masks had better exist by this point.
+      auto &masks = PassState.getMaskInfo(BB);
+      auto &masksUniform = PassState.getMaskInfo(uniformB);
+      createReference(masks.entryMask, masksUniform.entryMask);
+
+      for (auto *const succ : successors(BB)) {
+        auto *const uniformSucc = getBlock(succ);
+        auto *const target = uniformSucc ? uniformSucc : succ;
+        createReference(masks.exitMasks.lookup(succ),
+                        masksUniform.exitMasks.lookup(target));
+      }
+    }
+  }
+  return true;
+}
+
+bool ControlFlowConversionState::BOSCCGadget::updateLoopBlendValues(
+    LoopTag *LTag, Instruction *from, Instruction *to) {
+  auto createLatchIncoming = [&from, &LTag, this] {
+    auto *ret =
+        PHINode::Create(from->getType(), 2, from->getName() + ".boscc_blend");
+    ret->insertBefore(LTag->latch->begin());
+    Value *uniform = getUniformV(from);
+    Value *default_val = getDefaultValue(from->getType());
+    for (BasicBlock *pred : predecessors(LTag->latch)) {
+      Value *incoming = default_val;
+      if (RC->isReachable(from->getParent(), pred)) {
+        incoming = from;
+      } else if (uniform) {
+        Instruction *uinst = dyn_cast<Instruction>(uniform);
+        if (!uinst || RC->isReachable(uinst->getParent(), pred)) {
+          incoming = uniform;
+        }
+      }
+      ret->addIncoming(incoming, pred);
+    }
+    URVB.push_back({LTag->latch, {from, ret}});
+    addReference(ret, from);
+    return ret;
+  };
+
+  while (LTag) {
+    PHINode *latchIncoming = nullptr;
+    // Try looking for an existing `boscc_blend` value for `from` to avoid
+    // creating a new one in the latch.
+    for (Instruction &latchI : *LTag->latch) {
+      if (PHINode *PHI = dyn_cast<PHINode>(&latchI)) {
+        if (PHI->getName().contains(".boscc_blend")) {
+          for (Value *incomingValue : PHI->incoming_values()) {
+            if (incomingValue == from) {
+              latchIncoming = PHI;
+              break;
+            }
+          }
+          if (latchIncoming) {
+            break;
+          }
+        }
+      } else {
+        break;
+      }
+    }
+    // Update all uses of `from` in the header with the blended value from the
+    // latch. Since the CFG is final now, this should cover everything.
+    for (Instruction &headerI : *LTag->header) {
+      if (PHINode *PHI = dyn_cast<PHINode>(&headerI)) {
+        const int latchIdx = PHI->getBasicBlockIndex(LTag->latch);
+        VECZ_ERROR_IF(latchIdx == -1,
+                      "Header has no incoming value from the latch");
+        if ((PHI == to) || (PHI->getIncomingValue(latchIdx) == from)) {
+          if (!latchIncoming) {
+            latchIncoming = createLatchIncoming();
+          }
+          PHI->setIncomingValue(latchIdx, latchIncoming);
+        }
+      } else {
+        break;
+      }
+    }
+
+    if (Loop *L = LTag->loop->getParentLoop()) {
+      LTag = &DR->getTag(L);
+    } else {
+      break;
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::BOSCCGadget::computeBlockOrdering() {
+  // Create a map from entry blocks to their uniform regions
+  DenseMap<BasicBlock *, const UniformRegion *> entryMap;
+  unsigned maxUBlocks = 0;
+  for (const auto &region : uniformRegions) {
+    if (!region.uniformBlocks.empty()) {
+      entryMap[region.entryBlock] = &region;
+    }
+    maxUBlocks = std::max(maxUBlocks, region.uniformBlocks.size());
+  }
+
+  // Gather the blocks outside of the uniform regions according to the already
+  // computed order, leaving gaps for the uniform regions to fill in.
+  // Note that uniform region blocks do not appear in the block ordering yet.
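+  // For example (illustrative): given the ordering [A, E, X], where E is the
+  // entry block of a region with two uniform blocks, 'filtered' is first
+  // built as [A, E, _, _, X]; the gaps are filled in further below.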
+ // Also note that we can't use pointers to BasicBlockTags here since + // `PassState.computeBlockOrdering()` re-orders the tags vector. + SmallVector filtered; + for (const auto &tag : DR->getBlockOrdering()) { + filtered.push_back(tag.BB); + const auto found = entryMap.find(tag.BB); + if (found != entryMap.end()) { + const auto *const region = found->second; + filtered.resize(filtered.size() + region->uniformBlocks.size()); + } + } + + // Recompute the ordering over the uniform regions + VECZ_FAIL_IF(!PassState.computeBlockOrdering()); + + // Filter by region and fill in the gaps + SmallVector uniformBlocks; + uniformBlocks.reserve(maxUBlocks); + for (auto it = filtered.begin(), ie = filtered.end(); it != ie;) { + auto *const BB = *it; + + const auto found = entryMap.find(BB); + if (found != entryMap.end()) { + // If the entry block of the region is NOT duplicated, add the uniform + // blocks after it. + const bool entryDupe = getBlock(BB); + if (!entryDupe) { + ++it; + } + + // Gather the indices of the uniform blocks and sort them. + const auto ®ion = *found->second; + uniformBlocks.clear(); + for (auto *const uBB : region.uniformBlocks) { + uniformBlocks.push_back(DR->getTagIndex(uBB)); + } + std::sort(uniformBlocks.begin(), uniformBlocks.end()); + + // Insert the uniform blocks into the gap. + for (const auto uBBi : uniformBlocks) { + (*it++) = DR->getBlockTag(uBBi).BB; + } + + // If the entry block of the region IS duplicated, add it after the + // uniform blocks. + if (entryDupe) { + (*it++) = BB; + } + } else { + ++it; + } + } + + uint32_t pos = 0; + for (auto *const BB : filtered) { + DR->getTag(BB).pos = pos++; + } + DR->reorderTags(filtered.size()); + + return true; +} + +bool ControlFlowConversionState::BOSCCGadget::cleanUp() { + // BOSCC can create a lot of PHI nodes that are not really necessary. + // LCSSA PHI nodes (in Store Blocks) are only required as an intermediate + // state and are trivially redundant, and sometimes blends are created that + // blend the same two values together. Also, sometimes values are blended + // even though they have no further uses and can be removed as dead code. + + const RPOT rpot(&F); + std::vector blends; + for (auto *BB : rpot) { + for (auto I = BB->begin(); I != BB->end();) { + auto *PHI = dyn_cast(&*(I++)); + if (!PHI) { + break; + } + if (!PHI->getName().contains(".boscc_")) { + continue; + } + + if (auto *V = PHI->hasConstantValue()) { + PHI->replaceAllUsesWith(V); + IRCleanup::deleteInstructionNow(PHI); + } else { + blends.push_back(PHI); + } + } + } + + while (!blends.empty()) { + PHINode *PHI = blends.back(); + if (PHI->use_empty()) { + IRCleanup::deleteInstructionNow(PHI); + } + blends.pop_back(); + } + + return true; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp new file mode 100644 index 0000000000000..02f6e9e68ca9b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp @@ -0,0 +1,150 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "control_flow_roscc.h" + +#include +#include +#include +#include + +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "ir_cleanup.h" + +#define DEBUG_TYPE "vecz-cf" + +// WHAT THIS DOES +// +// A common pattern in OpenCL kernels is a line near the start of the program +// like the following: +// +// if (some_condition) return; +// +// Where "some_condition" is non-uniform, the BOSCC control flow optimization +// can do very well with this. However, without BOSCC, the entire program will +// have been linearized and the early return will disappear entirely. It is +// desirable to maintain this sort of early exit branch in order to avoid +// doing unnecessary work. We can do this by inserting a uniform branch to the +// return block without the need to duplicate the rest of the kernel into +// uniform and non-uniform versions, as BOSCC does. This can improve the +// performance significantly without requiring complex CFG changes. + +using namespace llvm; +using namespace vecz; + +namespace { +/// @brief checks if the given block contains only a return instruction +bool isReturnBlock(const llvm::BasicBlock &BB) { + if (BB.size() != 1) { + return false; + } + + auto *T = BB.getTerminator(); + if (auto *const branch = dyn_cast(T)) { + if (branch->isUnconditional()) { + // We can see straight through a block that only contains a single + // unconditional branch. + return isReturnBlock(*branch->getSuccessor(0)); + } + } + + return isa(T); +} +} // namespace + +bool ControlFlowConversionState::ROSCCGadget::run(Function &F) { + bool changed = false; + + SmallVector RetBranches; + for (auto &BB : F) { + if (LI->getLoopFor(&BB)) { + // No need to do this transform on loop exits + continue; + } + + auto *T = BB.getTerminator(); + if (auto *Branch = dyn_cast(T)) { + if (Branch->isConditional() && Branch->getNumSuccessors() == 2) { + Value *cond = Branch->getCondition(); + if (UVR->isVarying(cond)) { + size_t countReturns = 0; + for (auto *succ : Branch->successors()) { + if (isReturnBlock(*succ)) { + ++countReturns; + } + } + + // Only consider ROSCC when there is exactly one returning successor. + if (countReturns == 1) { + RetBranches.push_back(Branch); + } + } + } + } + } + + ConstantInt *trueCI = ConstantInt::getTrue(F.getContext()); + ConstantInt *falseCI = ConstantInt::getFalse(F.getContext()); + + for (auto *Branch : RetBranches) { + BasicBlock *BB = Branch->getParent(); + + BasicBlock *newBB = SplitBlock(BB, Branch, DT, LI); + newBB->setName(Twine(BB->getName(), ".ROSCC")); + + // update the PostDominatorTree manually.. + auto *Node = PDT->getNode(BB); + assert(Node && "Could not get node"); + auto *IDom = Node->getIDom(); + assert(IDom && "Could not get IDom"); + auto *Block = IDom->getBlock(); + assert(Block && "Could not get Block"); + PDT->addNewBlock(newBB, Block); + + // Remove the unconditional branch created by splitting.. 
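+    // Net effect (illustrative), assuming the original branch was
+    // `br %cond, %ret.bb, %rest` with %ret.bb a return block:
+    //
+    //   BB:        %cond.ROSCC = icmp eq %cond, false
+    //              br %cond.ROSCC, %BB.ROSCC, %ret.bb
+    //   BB.ROSCC:  br %cond, %ret.bb, %rest  ; the original varying branch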
+ IRCleanup::deleteInstructionNow(BB->getTerminator()); + + // Create a new Uniform branch condition to the Return block.. + // Note that a conditional branch's successors are returned in reverse + // order, relative to how they appear in the IR, with the "true" target + // last. However, "getSuccessor(n)" also indexes backwards, from the end. + BasicBlock *SuccT = Branch->getSuccessor(0); + BasicBlock *SuccF = Branch->getSuccessor(1); + const bool Which = isReturnBlock(*SuccT); + + BasicBlock *ReturnBlock = Which ? SuccT : SuccF; + Value *Cond = Branch->getCondition(); + auto *newCond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Cond, + Which ? falseCI : trueCI, "", BB); + newCond->setName(Twine(Cond->getName(), ".ROSCC")); + BranchInst::Create(newBB, ReturnBlock, newCond, BB); + + // Update Dominator and PostDominator trees.. + DT->insertEdge(BB, ReturnBlock); + PDT->insertEdge(BB, ReturnBlock); + + changed = true; + } + + assert((!changed || DT->verify()) && + "ROSCC: Dominator Tree failed verification"); + + assert((!changed || PDT->verify()) && + "ROSCC: Post-Dominator Tree failed verification"); + + return changed; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp new file mode 100644 index 0000000000000..9d30786cf3d39 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp @@ -0,0 +1,88 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "debugging.h" + +#include + +using namespace llvm; + +namespace vecz { + +/// @brief Create the std::string containing the message for the remark +/// +/// @param[in] V The value (can be `nullptr`) to be included in the remark +/// @param[in] Msg The main remark message +/// @param[in] Note An optional additional note to provide more context/info. +/// @return The remark message as it is to be printed +static std::string createRemarkMessage(const Value *V, StringRef Msg, + StringRef Note = "") { + std::string helper_str("Vecz: "); + raw_string_ostream helper_stream(helper_str); + helper_stream << Msg; + if (V) { + if (isa(V)) { + // Instructions are already prefixed by two spaces when printed + V->print(helper_stream, /*IsForDebug=*/true); + } else if (const Function *F = dyn_cast(V)) { + // Printing a functions leads to its whole body being printed + helper_stream << " function \"" << F->getName() << "\""; + } else { + helper_stream << " "; + V->print(helper_stream, /*IsForDebug=*/true); + } + } + helper_stream << '\n'; + + // Provide extra context, if supplied + if (!Note.empty()) { + helper_stream << " note: " << Note << '\n'; + } + + return helper_stream.str(); +} + +void emitVeczRemarkMissed(const Function *F, const Value *V, StringRef Msg, + StringRef Note) { + const Instruction *I = V ? 
dyn_cast(V) : nullptr; + auto RemarkMsg = createRemarkMessage(V, Msg, Note); + OptimizationRemarkEmitter ORE(F); + if (I) { + ORE.emit(OptimizationRemarkMissed("vecz", "vecz", I) << RemarkMsg); + } else { + const DebugLoc D = I ? DebugLoc(I->getDebugLoc()) : DebugLoc(); + ORE.emit(OptimizationRemarkMissed("vecz", "vecz", D, &(F->getEntryBlock())) + << RemarkMsg); + } +} + +void emitVeczRemarkMissed(const Function *F, StringRef Msg, StringRef Note) { + emitVeczRemarkMissed(F, nullptr, Msg, Note); +} + +void emitVeczRemark(const Function *F, const Value *V, StringRef Msg) { + const Instruction *I = V ? dyn_cast(V) : nullptr; + const DebugLoc D = I ? DebugLoc(I->getDebugLoc()) : DebugLoc(); + + auto RemarkMsg = createRemarkMessage(V, Msg); + OptimizationRemarkEmitter ORE(F); + ORE.emit(OptimizationRemark("vecz", "vecz", F) << RemarkMsg); +} + +void emitVeczRemark(const Function *F, StringRef Msg) { + emitVeczRemark(F, nullptr, Msg); +} +} // namespace vecz diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h new file mode 100644 index 0000000000000..f538de0e6bed4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h @@ -0,0 +1,98 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @brief Analysis of control flow. + +#ifndef VECZ_ANALYSIS_CONTROL_FLOW_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_CONTROL_FLOW_ANALYSIS_H_INCLUDED + +#include + +namespace llvm { +class BasicBlock; +} // namespace llvm + +namespace vecz { + +/// @brief Holds the results and state for CFG analysis. +struct CFGResult { + /// @brief true if analysis failed, e.g. CFG conversion cannot be done. + bool failed = false; + /// @brief true if CFG conversion is needed to vectorize the function. + bool convNeeded = false; + /// @brief Single basic block that exits the function. + llvm::BasicBlock *exitBB = nullptr; + + /// @brief Create new analysis results for the given function. + CFGResult() = default; + + /// @brief Deleted copy constructor. + CFGResult(const CFGResult &) = delete; + + /// @brief Move constructor. + /// + /// @param[in,out] Res Existing results to move. + CFGResult(CFGResult &&Res) = default; + + /// @brief Access the failed flag. + /// @return true if analysis failed. + bool getFailed() const { return failed; } + + /// @brief Access the failed flag. + /// @param[in] newVal New value for the flag. + void setFailed(bool newVal) { failed = newVal; } + + /// @brief Determine whether CFG conversion is needed for the function or not. + bool isConversionNeeded() const { return convNeeded; } + /// @brief Set whether CFG conversion is needed for the function or not. 
+ /// @param[in] newVal Whether conversion is needed or not. + void setConversionNeeded(bool newVal) { convNeeded = newVal; } + + /// @brief Single block in the function that returns to the caller or null. + llvm::BasicBlock *getExitBlock() const { return exitBB; } +}; + +/// @brief Analysis that determines whether a function can have divergent +/// control flow and so whether CFG conversion is needed or not. +class CFGAnalysis : public llvm::AnalysisInfoMixin { +public: + /// @brief Create a new CFG analysis object. + CFGAnalysis() = default; + + /// @brief Type of the analaysis result. + using Result = CFGResult; + + /// @brief Perform CFG analysis on the function to determine whether control + /// flow conversion is required and possible or not. + /// + /// @param[in,out] F Function to analyze. + /// @param[in,out] AM FunctionAnalysisManager providing analyses + /// + /// @return CFG analysis result. + CFGResult run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); + + /// @brief Analysis name. + static llvm::StringRef name() { return "CFG analysis"; } + +private: + friend llvm::AnalysisInfoMixin; + /// @brief Unique identifier for the analysis. + static llvm::AnalysisKey Key; +}; + +} // namespace vecz + +#endif // VECZ_ANALYSIS_CONTROL_FLOW_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h new file mode 100644 index 0000000000000..cb66e38ba1bde --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h @@ -0,0 +1,480 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Divergence analysis. + +#ifndef VECZ_ANALYSIS_DIVERGENCE_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_DIVERGENCE_ANALYSIS_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace llvm { +class BasicBlock; +class Loop; +} // namespace llvm + +namespace vecz { +struct BasicBlockTag; +struct LoopTag; + +/// @brief Analysis flags that can be attached to LLVM basic blocks. +enum BlockDivergenceFlag { + /// @brief Flag value where no flag is set. + eBlockHasNoFlag = 0, + /// @brief True if the block has a divergent branch (different paths might be + /// taken by different work items. + eBlockHasDivergentBranch = (1 << 0), + /// @brief True if the block has no divergent branch but has all its + /// successors divergent. + eBlockHasDivergentBranchFake = (1 << 1), + /// @brief True if the block belongs in a diverged path. + eBlockIsDivergent = (1 << 2), + /// @brief True if the block is an introduced divergent conditional loop exit. + /// The operation is performed during the transformation of a divergent loop. 
+ eBlockIsVirtualDivergentLoopExit = (1 << 3), + /// @brief True if the block is a join point of a divergent branch. + eBlockIsBlend = (1 << 4), + /// @brief True if no divergence is present when reaching the block. + eBlockIsByAll = (1 << 5), + /// @brief True if the block is uniform (duplicated version of a predicated + /// block from BOSCC). + eBlockIsUniform = (1 << 6), + /// @brief True if the block needs an all-of mask. + eBlockNeedsAllOfMask = (1 << 7) +}; + +/// @brief Analysis flags that can be attached to LLVM loops. +enum LoopDivergenceFlag { + /// @brief Flag value where no flag is set. + eLoopNoFlag = 0, + /// @brief Whether or not the loop may diverge because of a diverging block. + eLoopIsDivergent = (1 << 0) +}; + +/// @brief Type that maps basic blocks to tags. +using DenseBBMap = llvm::DenseMap; +/// @brief Type that maps loops to tags. +using DenseLoopMap = + llvm::DenseMap>; +/// @brief Type that maps loop live values and their associated state from the +/// previous loop iteration. +using DenseLoopResultPHIsMap = + llvm::SmallDenseMap; +/// @brief Type that maps loop live values and updated value. +using DenseLoopResultUpdatesMap = + llvm::SmallDenseMap; + +class DivergenceResult; + +/// @brief Queue that orders blocks by their DCBI (smallest first). +struct BlockQueue { + using index_type = uint32_t; + using index_list = std::vector; + + const DivergenceResult &DR; + + /// @brief The DCBI indices of the blocks in the queue, in min-heap order. + /// Since we can easily retrieve the BasicBlockTag from the DCBI ordered + /// `blockOrdering` vector, and since the queue priority is entirly based on + /// the index, it is sufficient to store only the indices to perform the + /// queue operations. + index_list indices; + + /// @brief Constructs an empty BlockQueue + BlockQueue(const DivergenceResult &dr) : DR(dr) {}; + + /// @brief Constructs a BlockQueue from a set of blocks. + BlockQueue(const DivergenceResult &dr, + const llvm::DenseSet &blocks); + + /// @brief Returns the number of blocks in the queue. + size_t size() const { return indices.size(); } + + /// @brief Returns whether the queue is empty. + bool empty() const { return indices.empty(); } + + /// @brief Pushes a block on the queue by its DCBI index. + void push(size_t index); + + /// @brief Pushes a block on the queue by pointer. + /// Prefer `push(size_t)` if the tag index is available. + void push(const llvm::BasicBlock *bb); + + /// @brief Pops a block from the queue and returns it. + const BasicBlockTag &pop(); + + /// @brief Const iterator to beginning of index list, for inspection. + index_list::const_iterator begin() const { return indices.begin(); } + + /// @brief Const iterator to end of index list, for inspection. + index_list::const_iterator end() const { return indices.end(); } +}; + +/// @brief Describes a loop contained in the function to vectorize. +struct LoopTag { + /// @brief Compiler loop info. + llvm::Loop *loop = nullptr; + /// @brief Loop entering point. + llvm::BasicBlock *preheader = nullptr; + /// @brief Loop entry point. + llvm::BasicBlock *header = nullptr; + /// @brief Single block that jumps back to the loop header. + llvm::BasicBlock *latch = nullptr; + /// @brief Loop live values on the loop. + llvm::SmallPtrSet loopLiveValues; + /// @brief Map between loop live values and their associated state from the + /// previous loop iteration. + DenseLoopResultPHIsMap loopResultPrevs; + /// @brief Map between loop live values and their updated value. 
+ DenseLoopResultUpdatesMap loopResultUpdates; + /// @brief Loop exit that has been chosen during partial linearization. + llvm::BasicBlock *pureExit = nullptr; + + LoopDivergenceFlag divergenceFlag = LoopDivergenceFlag::eLoopNoFlag; + + bool isLoopDivergent() const { + return divergenceFlag & LoopDivergenceFlag::eLoopIsDivergent; + } +}; + +/// @brief Describes a basic block contained in the function to vectorize. +struct BasicBlockTag { + /// @brief Compiler basic block object. + llvm::BasicBlock *BB = nullptr; + /// @brief Inner most loop this block belongs to, if any. + LoopTag *loop = nullptr; + /// @brief Outermost loop left by this block. + LoopTag *outermostExitedLoop = nullptr; + + /// @brief Unique sorted block index. + uint32_t pos = ~0u; + + /// @brief Create a new basic block tag. + BasicBlockTag() = default; + /// @brief Deleted address-of operator + BasicBlockTag *operator&() = delete; + /// @brief Deleted const address-of operator + const BasicBlockTag *operator&() const = delete; + + BlockDivergenceFlag divergenceFlag = BlockDivergenceFlag::eBlockHasNoFlag; + + /// @brief Convenience function for finding the varying property of the branch + /// without having to query the Uniform Value Analysis + bool hasVaryingBranch() const { + return divergenceFlag & BlockDivergenceFlag::eBlockHasDivergentBranch; + } + + /// @brief Determine whether there is a backedge from this tag's basic block + /// to the target basic block. + /// + /// @param[in] toBB Potential target for the backedge. + /// + /// @return true if there is a backedge, false otherwise. + bool isLoopBackEdge(llvm::BasicBlock *toBB) const { + return loop && (loop->latch == BB) && (loop->header == toBB); + } + + /// @brief Determine whether this block is the header of its loop (if any). + /// @return true iff the block is the loop header for its loop + bool isLoopHeader() const { return loop && loop->header == BB; } +}; + +/// @brief Divergent blocks whose PHI nodes may vary. +using DivergenceInfo = llvm::DenseSet; + +/// @brief Holds the result of Divergence Analysis for a given function. +class DivergenceResult { +public: + /// @brief Create a new DA result for the given unit. + /// @param[in] AM FunctionAnalysisManager providing analyses. + DivergenceResult(llvm::Function &F, llvm::FunctionAnalysisManager &AM); + + /// @brief Generate a block ordering. + /// + /// This is based on a dominance-compact block indexing (DCBI) where we + /// topologically order blocks that belong to the same dominator tree. + /// + /// @returns true if no errors occurred. + bool computeBlockOrdering(llvm::DominatorTree &DT); + + /// @brief Reorders the tags in the tags vector according to their DBCI + /// indices. + /// @param[in] n the number of tags in the DCBI + void reorderTags(size_t n); + + /// @brief Generate a loop ordering. + /// + /// This populates the `loopOrdering` vector with loop tags sorted by depth. + /// + /// @returns true if no errors occurred. + bool computeLoopOrdering(); + + /// @brief Gets a BasicBlockTag by its DCBI index + /// @param[in] index the DCBI index + /// @returns reference to the BasicBlockTag + const BasicBlockTag &getBlockTag(size_t index) const { + return basicBlockTags[index]; + } + + /// @brief Gets the DCBI ordered range of BasicBlockTags. 
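+  /// Note that only the first `numOrderedBlocks` tags form the range; tags
+  /// created after the ordering was last computed are not included.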
+ llvm::ArrayRef getBlockOrdering() const { + return llvm::ArrayRef(basicBlockTags.data(), + numOrderedBlocks); + } + + llvm::ArrayRef getLoopOrdering() { return loopOrdering; } + + size_t getTagIndex(const llvm::BasicBlock *BB) const; + + /// @brief Retrieve a tag for the given basic block. + /// + /// @param[in] BB Basic block to retrieve a tag for. + /// + /// @return Basic block tag. + BasicBlockTag &getTag(const llvm::BasicBlock *BB) { + return basicBlockTags[getTagIndex(BB)]; + } + + const BasicBlockTag &getTag(const llvm::BasicBlock *BB) const { + return basicBlockTags[getTagIndex(BB)]; + } + + /// @brief Retrieve or create a tag for the given basic block. + /// + /// @param[in] BB Basic block to retrieve or create a tag for. + /// + /// @return Basic block tag. + BasicBlockTag &getOrCreateTag(llvm::BasicBlock *BB); + + /// @brief Try to retrieve a tag for the given loop. + /// + /// @param[in] L Loop to retrieve a tag for. + /// + /// @return Loop tag. + LoopTag &getTag(const llvm::Loop *L) const; + + /// @brief Retrieve or create a tag for the given loop. + /// + /// @param[in] L Loop to retrieve a tag for. + /// + /// @return Loop tag. + LoopTag &getOrCreateTag(llvm::Loop *L); + + /// @brief Determine whether the tag contains the given flags or not. + /// + /// @param[in] BB Basic block whose flag we check. + /// @param[in] F Flags to test. + /// + /// @return true if the tag contains all the given flags, false otherwise. + bool hasFlag(const llvm::BasicBlock &BB, BlockDivergenceFlag F) const; + /// @brief Get the given flags for the tag. + /// + /// @param[in] BB Basic block whose flag we want to get. + BlockDivergenceFlag getFlag(const llvm::BasicBlock &BB) const; + /// @brief Set the given flags for the tag. + /// + /// @param[in] BB Basic block whose flag we set. + /// @param[in] F Flags to set for the tag. + void setFlag(const llvm::BasicBlock &BB, BlockDivergenceFlag F); + /// @brief Clear the given flags for the tag. + /// + /// @param[in] BB Basic block whose flag we clear. + /// @param[in] F Flags to clear for the tag. + void clearFlag(const llvm::BasicBlock &BB, BlockDivergenceFlag F); + /// @brief Check whether the basic block contains a div causing flag. + /// + /// @param[in] BB Basic block whose flag we check. + /// + /// @return true if the tag is div causing, false otherwise. + bool isDivCausing(const llvm::BasicBlock &BB) const; + /// @brief Check whether the basic block contains a divergent flag. + /// + /// @param[in] BB Basic block whose flag we check. + /// + /// @return true if the tag is divergent, false otherwise. + bool isDivergent(const llvm::BasicBlock &BB) const; + /// @brief Check whether the basic block contains an optional flag. + /// + /// @param[in] BB Basic block whose flag we check. + /// + /// @return true if the tag is optional, false otherwise. + bool isOptional(const llvm::BasicBlock &BB) const; + /// @brief Check whether the basic block contains a by_all flag. + /// + /// @param[in] BB Basic block whose flag we check. + /// + /// @return true if the tag is by_all, false otherwise. + bool isByAll(const llvm::BasicBlock &BB) const; + /// @brief Check whether the basic block contains a blend flag. + /// + /// @param[in] BB Basic block whose flag we check. + /// + /// @return true if the tag is blend, false otherwise. + bool isBlend(const llvm::BasicBlock &BB) const; + /// @brief Check whether the basic block contains a uniform flag. + /// + /// @param[in] BB Basic block whose flag we check. 
+ /// + /// @return true if the tag is uniform, false otherwise. + bool isUniform(const llvm::BasicBlock &BB) const; + + /// @brief Determine whether the tag contains the given flags or not. + /// + /// @param[in] L Loop whose flag we check. + /// @param[in] F Flags to test. + /// + /// @return true if the tag contains all the given flags, false otherwise. + bool hasFlag(const llvm::Loop &L, LoopDivergenceFlag F) const; + /// @brief Get the given flags for the tag. + /// + /// @param[in] L Loop whose flag we want to get. + LoopDivergenceFlag getFlag(const llvm::Loop &L) const; + /// @brief Set the given flags for the tag. + /// + /// @param[in] L Loop whose flag we set. + /// @param[in] F Flags to set for the tag. + void setFlag(const llvm::Loop &L, LoopDivergenceFlag F); + /// @brief Clear the given flags for the tag. + /// + /// @param[in] L Loop whose flag we clear. + /// @param[in] F Flags to clear for the tag. + void clearFlag(const llvm::Loop &L, LoopDivergenceFlag F); + + /// @brief Check if a block Src can reach a block Dst, either within the same + /// SESE region, or outside too. + /// @param[in] src Source node. + /// @param[in] dst Destination node. + /// @param[in] allowLatch Whether reachability is computed with latches or + /// not. + /// @return Whether or not dst is reachable from src. + bool isReachable(llvm::BasicBlock *src, llvm::BasicBlock *dst, + bool allowLatch = false) const; + + /// @brief List of blocks having a divergent branch. + const std::vector &getDivCausingBlocks() const { + return divCausingBlocks; + } + +private: + friend class DivergenceAnalysis; + + /// @brief Mark a block div causing and mark blocks that are control dependent + /// to be divergent + /// @param[in] BB Div causing block. + /// @param[in,out] DI Divergence information of the function. + /// @param[in,out] PDT PostDominatorTree of the function. + void markDivCausing(llvm::BasicBlock &BB, DivergenceInfo &DI, + llvm::PostDominatorTree &PDT); + /// @brief Mark divergent blocks in a loop (loop exits and latch) that are + /// control dependent of a divergent branch. + /// @param[in] BB Div causing block. + /// @param[in] L Loop that BB diverges. + /// @param[in,out] DI Divergence information of the function. + void markDivLoopDivBlocks(llvm::BasicBlock &BB, llvm::Loop &L, + DivergenceInfo &DI); + /// @brief Mark a block to be divergent. + /// @param[in] BB Block to mark. + void markDivergent(const llvm::BasicBlock &BB); + /// @brief Mark a loop to be divergent. + /// @param[in] L Loop to mark. + void markDivergent(const llvm::Loop &L); + /// @brief Recursively mark a block by_all. + /// @param[in] BB Block to mark. + void markByAll(llvm::BasicBlock &BB); + + /// @brief Find join points of a block. + /// @param[in] src Starting block + /// @return List of blocks that have a disjoint path from the starting block. + llvm::DenseSet joinPoints(llvm::BasicBlock &src) const; + /// @brief Find escape points of a divergent loop. + /// + /// Escape points are loop exit blocks from which some work-items may leave + /// through because of a divergent branch. + /// @param[in] src Divergent branch + /// @param[in] L Divergent loop + /// @return List of exit blocks some work-item may leave through. + llvm::DenseSet escapePoints(const llvm::BasicBlock &src, + const llvm::Loop &L) const; + + /// @brief the Function the analysis was run on + llvm::Function &F; + /// @brief AM FunctionAnalysisManager providing analyses. + llvm::FunctionAnalysisManager &AM; + + /// @brief Basic block tag mappings. 
+ DenseBBMap BBMap; + /// @brief Loop tag mappings. + DenseLoopMap LMap; + + /// @brief Storage for the Basic Block Tags + std::vector basicBlockTags; + /// @brief The number of blocks in the DCBI ordering. + size_t numOrderedBlocks = 0; + + /// @brief List of Loop Tags ordered by loop depth + llvm::SmallVector loopOrdering; + + /// @brief Blocks that have a divergent branch. + std::vector divCausingBlocks; + + /// @brief Blocks with uniform conditions that must be considered div causing + /// because they have a join point of a div causing block as their + /// successor. + llvm::DenseSet fakeDivCausingBlocks; +}; + +/// @brief Analysis that determines divergent blocks, i.e. program points +/// that must not be skipped during SIMD execution. +class DivergenceAnalysis : public llvm::AnalysisInfoMixin { + friend llvm::AnalysisInfoMixin; + +public: + /// @brief Create a new analysis object. + DivergenceAnalysis() = default; + + /// @brief Type of result produced by the analysis. + using Result = DivergenceResult; + + /// @brief Determine which values in the function are uniform and which are + /// potentially varying. + /// + /// @param[in] F Function to analyze. + /// @param[in] AM FunctionAnalysisManager providing analyses. + /// + /// @return Analysis result for the function. + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Divergence analysis"; } + +private: + /// @brief Unique identifier for the pass. + static llvm::AnalysisKey Key; +}; +} // namespace vecz + +#endif // VECZ_ANALYSIS_DIVERGENCE_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h new file mode 100644 index 0000000000000..daf31e624a35d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h @@ -0,0 +1,36 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef VECZ_ANALYSIS_INSTANTIATION_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_INSTANTIATION_ANALYSIS_H_INCLUDED + +namespace llvm { +class Instruction; +} // namespace llvm + +namespace vecz { +class VectorizationContext; + +/// @brief Determine whether the given instruction needs to be instantiated. +/// +/// @param[in] CTx the vectorization context +/// @param[in] I Instruction to analyze. +/// +/// @return true iff the instruction requires instantiation. 
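+///
+/// (Instantiation here means executing a scalar copy of the instruction once
+/// per SIMD lane, rather than widening it into a single vector operation.)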
+bool needsInstantiation(const VectorizationContext &Ctx, llvm::Instruction &I); +} // namespace vecz + +#endif // VECZ_ANALYSIS_INSTANTIATION_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h new file mode 100644 index 0000000000000..e36188b41dff5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h @@ -0,0 +1,100 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file liveness_analysis.h +/// +/// @brief Live Variable Set Analysis + +#ifndef VECZ_ANALYSIS_LIVENESS_ANALYSIS_H +#define VECZ_ANALYSIS_LIVENESS_ANALYSIS_H + +#include +#include +#include +#include + +namespace llvm { +class Loop; +class LoopInfo; +class Function; +class BasicBlock; +class Value; +} // namespace llvm + +namespace vecz { +class VectorizationUnit; + +struct BlockLivenessInfo { + using LiveSet = llvm::SmallVector; + + LiveSet LiveIn; + LiveSet LiveOut; + size_t MaxRegistersInBlock = 0; +}; + +class LivenessResult { +public: + LivenessResult(llvm::Function &F) : F(F) {} + + LivenessResult() = delete; + LivenessResult(const LivenessResult &) = delete; + LivenessResult(LivenessResult &&) = default; + ~LivenessResult() = default; + + void recalculate(); + + size_t getMaxLiveVirtualRegisters() const; + const BlockLivenessInfo &getBlockInfo(const llvm::BasicBlock *) const; + +private: + class Impl; + + llvm::Function &F; + + size_t maxNumberOfLiveValues; + + llvm::DenseMap BlockInfos; +}; + +/// Analysis pass to perform liveness analysis and estimate register pressure by +/// counting the number of live virtual registers in a function. +/// +/// Values in a basic block's live set are guaranteed to be in program order. +class LivenessAnalysis : public llvm::AnalysisInfoMixin { + friend llvm::AnalysisInfoMixin; + +public: + using Result = LivenessResult; + + LivenessAnalysis() = default; + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Liveness analysis"; } + + /// Estimate the number of registers needed by F by counting the number of + /// live values. + /// + /// Assumes a reducible CFG. In OpenCL 1.2 whether or not irreducible control + /// flow is illegal is implementation defined. + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &); + + /// @brief Unique pass identifier. 
+ static llvm::AnalysisKey Key; +}; + +} // namespace vecz + +#endif // VECZ_ANALYSIS_LIVENESS_ANALYSIS_H diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h new file mode 100644 index 0000000000000..ddd8c97d0c8f2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h @@ -0,0 +1,106 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Stride analysis. + +#ifndef VECZ_ANALYSIS_PACKETIZATION_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_PACKETIZATION_ANALYSIS_H_INCLUDED + +#include +#include +#include +#include + +namespace llvm { +class Function; +class Value; +} // namespace llvm + +namespace vecz { + +class StrideAnalysisResult; +struct UniformValueResult; + +/// @brief Holds the result of Packetization Analysis for a given function. +class PacketizationAnalysisResult { +public: + /// @brief The function being analyzed + llvm::Function &F; + /// @brief The Stride Analysis Result to use during analysis + StrideAnalysisResult &SAR; + /// @brief The Uniform Value Result to use during analysis + UniformValueResult &UVR; + + /// @brief Traverse the function, starting from the vector leaves, and mark + /// instructions for packetization where needed. Note that the resulting set + /// MAY not be exhaustive, since it is not always easy to predict where the + /// packetizer might fail and fall back on instantiation, in which case + /// pointers will need to be packetized regardless of linear stride. + PacketizationAnalysisResult(llvm::Function &f, StrideAnalysisResult &sar); + + /// @brief Returns whether the packetization set is empty or not. + bool isEmpty() const { return toPacketize.empty(); } + + /// @brief query whether the given value has been marked for packetization. + /// + /// @param[in] V the value to query + /// @return true if the value was marked for packetization, false otherwise. + bool needsPacketization(const llvm::Value *V) const { + return toPacketize.contains(V); + } + +private: + void markForPacketization(llvm::Value *V); + + /// @brief The set of instructions that need to be packetized. + /// This equates to all non-uniform values except for values used only in + /// address computations with constant linear strides. + llvm::DenseSet toPacketize; +}; + +/// @brief Analysis that determines whether pointer operands of memory +/// operations have a linear dependence on the work item ID. +class PacketizationAnalysis + : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + /// @brief Create a new analysis object. 
+ PacketizationAnalysis() {} + + using Result = PacketizationAnalysisResult; + + /// @brief Run the Packetization Analysis + /// + /// @param[in] F Function to analyze. + /// @param[in] AM FunctionAnalysisManager providing analyses. + /// + /// @return Analysis result for the function. + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Packetization analysis"; } + +private: + /// @brief Unique identifier for the pass. + static llvm::AnalysisKey Key; +}; + +} // namespace vecz + +#endif // VECZ_ANALYSIS_PACKETIZATION_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h new file mode 100644 index 0000000000000..bee7f0f1c0046 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h @@ -0,0 +1,68 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief SIMD width analysis. + +#ifndef VECZ_ANALYSIS_SIMD_WIDTH_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_SIMD_WIDTH_ANALYSIS_H_INCLUDED + +#include +#include + +#include "vectorization_unit.h" + +namespace vecz { + +class LivenessResult; + +/// @brief Choose a good SIMD width for the given function. +class SimdWidthAnalysis : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + /// @brief Create a new instance of the pass. + SimdWidthAnalysis() = default; + + /// @brief Type of result produced by the analysis. + struct Result { + Result(unsigned value) : value(value) {} + unsigned value; + }; + + /// @brief Run the SIMD width analysis pass on the given function. + /// @param[in] F Function to analyze. + /// @param[in] AM FunctionAnalysisManager providing analyses. + /// @return Preferred SIMD vectorization factor for the function or zero. + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "SIMD width analysis"; } + +private: + unsigned avoidSpillImpl(llvm::Function &, llvm::FunctionAnalysisManager &, + unsigned MinWidth = 2); + + /// @brief Vector register width from TTI, if available. + unsigned MaxVecRegBitWidth; + + /// @brief Unique pass identifier. 
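Per the `run` documentation above, a result of zero means the analysis expresses no preference, so a caller needs its own fallback. A sketch of that pattern (`chooseWidth` and `DefaultWidth` are illustrative, not part of the original API):

```cpp
#include <llvm/IR/Function.h>
#include <llvm/IR/PassManager.h>

// Illustrative only: pick a vectorization factor, falling back to a
// caller-supplied default when the analysis returns 0 ("no preference").
unsigned chooseWidth(llvm::Function &F, llvm::FunctionAnalysisManager &FAM,
                     unsigned DefaultWidth = 4) {
  const unsigned Preferred = FAM.getResult<vecz::SimdWidthAnalysis>(F).value;
  return Preferred ? Preferred : DefaultWidth;
}
```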
+ static llvm::AnalysisKey Key; +}; +} // namespace vecz + +#endif // VECZ_ANALYSIS_SIMD_WIDTH_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h new file mode 100644 index 0000000000000..8b6d641e29681 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h @@ -0,0 +1,141 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Stride analysis. + +#ifndef VECZ_ANALYSIS_STRIDE_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_STRIDE_ANALYSIS_H_INCLUDED + +#include +#include +#include +#include +#include + +#include "offset_info.h" + +namespace llvm { +class Function; +class Value; +} // namespace llvm + +namespace vecz { + +struct UniformValueResult; + +/// @brief Holds the result of Stride Analysis for a given function. +class StrideAnalysisResult { +public: + /// @brief The function being analyzed + llvm::Function &F; + /// @brief The Uniform Value Result to use during analysis + UniformValueResult &UVR; + /// @brief AssumptionCache for computing live bits of uniform values + llvm::AssumptionCache &AC; + + StrideAnalysisResult(llvm::Function &f, UniformValueResult &uvr, + llvm::AssumptionCache &AC); + + /// @brief generate stride `ConstantInt`s or `Instruction`s for all analyzed + /// values. + void manifestAll(llvm::IRBuilder<> &B); + + /// @brief gets a pointer to the info struct for this value's analysis. + OffsetInfo *getInfo(llvm::Value *V) { + const auto find = analyzed.find(V); + return (find != analyzed.end()) ? &find->second : nullptr; + } + + /// @brief gets a pointer to the info struct for this value's analysis. + const OffsetInfo *getInfo(llvm::Value *V) const { + const auto find = analyzed.find(V); + return (find != analyzed.end()) ? &find->second : nullptr; + } + + /// @brief construct the offset info for the given value. + OffsetInfo &analyze(llvm::Value *V); + + /// @brief build the strides as `Instructions` or `ConstantInts`. + /// Strides may be needed as `llvm::Values` by transform passes, but we are + /// not allowed to construct them during an analysis pass. However, note that + /// information about manifested stride `Value`s will survive until the + /// analysis is invalidated. + const OffsetInfo &manifest(llvm::IRBuilder<> &B, llvm::Value *V) { + const auto find = analyzed.find(V); + assert(find != analyzed.end() && + "Trying to manifest unanalyzed OffsetInfo"); + return find->second.manifest(B, *this); + } + + /// @brief gets the manifested memory stride for this value, if present. 
+ /// + /// @param[in] B IRBuilder for creating new instructions/values + /// @param[in] Ptr the pointer to calculate the stride for + /// @param[in] EleTy the type that the pointer points to + /// @returns the stride of the memory operation, in number of elements + llvm::Value *buildMemoryStride(llvm::IRBuilder<> &B, llvm::Value *Ptr, + llvm::Type *EleTy) const; + +private: + /// @brief A map of values onto OffsetInfos that were already analyzed. + llvm::DenseMap analyzed; +}; + +/// @brief Analysis that determines whether pointer operands of memory +/// operations have a linear dependence on the work item ID. +class StrideAnalysis : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + /// @brief Create a new analysis object. + StrideAnalysis() {} + + using Result = StrideAnalysisResult; + + /// @brief Run the Stride Analysis + /// + /// @param[in] F Function to analyze. + /// @param[in] AM FunctionAnalysisManager providing analyses. + /// + /// @return Analysis result for the function. + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Stride analysis"; } + +private: + /// @brief Unique identifier for the pass. + static llvm::AnalysisKey Key; +}; + +/// @brief Helper pass to print out the contents of the StrideAnalysis +/// analysis. +class StrideAnalysisPrinterPass + : public llvm::PassInfoMixin { + llvm::raw_ostream &OS; + +public: + explicit StrideAnalysisPrinterPass(llvm::raw_ostream &OS) : OS(OS) {} + + llvm::PreservedAnalyses run(llvm::Function &F, + llvm::FunctionAnalysisManager &AM); +}; + +} // namespace vecz + +#endif // VECZ_ANALYSIS_STRIDE_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h new file mode 100644 index 0000000000000..a221e6cba1447 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h @@ -0,0 +1,200 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Uniform Value analysis. + +#ifndef VECZ_ANALYSIS_UNIFORM_VALUE_RANGE_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_UNIFORM_VALUE_RANGE_ANALYSIS_H_INCLUDED + +#include +#include +#include +#include + +#include + +namespace llvm { +class Value; +class Instruction; +} // namespace llvm + +namespace vecz { + +class VectorizationContext; +class VectorizationUnit; + +/// @brief Holds the result of Uniform Value Analysis for a given function. +struct UniformValueResult { + enum class VaryingKind { + /// @brief The value is truly uniform on all active and inactive lanes. + eValueTrueUniform, + /// @brief The value is uniform on active lanes. 
May be poison or undefined
+    /// on inactive lanes.
+    eValueActiveUniform,
+    /// @brief The value is varying and lanes may see different values.
+    eValueVarying,
+    /// @brief The value is uniform, but its mask is not.
+    /// Used for masked memory operations with a uniform address but varying
+    /// mask.
+    eMaskVarying,
+  };
+
+  /// @brief The function the analysis was run on.
+  llvm::Function &F;
+  /// @brief Vectorization unit the analysis was run on.
+  VectorizationUnit &VU;
+  /// @brief The Vectorization Context of the analysis.
+  VectorizationContext &Ctx;
+  /// @brief The vectorization dimension
+  unsigned dimension;
+  /// @brief The actual results of the analysis.
+  llvm::DenseMap varying;
+
+  /// @brief Create a new UVA result for the given unit.
+  /// @param[in] F Function to analyze.
+  /// @param[in] VU Vectorization unit to analyze.
+  UniformValueResult(llvm::Function &F, VectorizationUnit &VU);
+
+  /// @brief Determine whether the given value needs to be packetized or not.
+  ///
+  /// @param[in] V Value to analyze.
+  ///
+  /// @return true if the value needs to be packetized, false otherwise.
+  bool isVarying(const llvm::Value *V) const;
+
+  /// @brief Determine whether the given value has a varying mask or not.
+  ///
+  /// @param[in] V Value to analyze.
+  ///
+  /// @return true if the value has a varying mask, false otherwise.
+  bool isMaskVarying(const llvm::Value *V) const;
+
+  /// @brief Determine whether the given value is varying or has a varying
+  /// mask.
+  ///
+  /// @param[in] V Value to analyze.
+  ///
+  /// @return true if the value is varying or has a varying mask, false
+  /// otherwise.
+  bool isValueOrMaskVarying(const llvm::Value *V) const;
+
+  /// @brief Determine (on demand) whether the given value is a true uniform
+  /// value.
+  ///
+  /// @param[in] V Value to analyze.
+  ///
+  /// @return true if the value is true uniform, false otherwise. Caches the
+  /// result for future queries.
+  bool isTrueUniform(const llvm::Value *V);
+
+  /// @brief Remove the value from the analysis.
+  ///
+  /// @param[in] V Value to remove.
+  void remove(const llvm::Value *V) { varying.erase(V); }
+
+  /// @brief Uncritically set a value to varying.
+  /// This can be used to keep the result valid after expression transforms.
+  /// Use with care, since it does not recursively update value users.
+  ///
+  /// @param[in] V Value to set.
+  void setVarying(const llvm::Value *V) {
+    varying[V] = VaryingKind::eValueVarying;
+  }
+
+  /// @brief Look for vector roots in the function.
+  ///
+  /// Roots are values which are scalar in the original function but are
+  /// defined to be vector in the vectorized function.
+  ///
+  /// Users of roots need to be vectorized too but are not considered roots.
+  /// As such they will not be returned in Roots.
+  ///
+  /// Examples:
+  /// * Calls to get_global_id()
+  /// * Calls to get_local_id()
+  ///
+  /// @param[in,out] Roots List of roots to update.
+  void findVectorRoots(std::vector &Roots) const;
+
+  /// @brief Look for vector leaves in the function.
+  ///
+  /// Leaves are instructions that allow vectorized values to 'escape' from
+  /// the function.
+  ///
+  /// Examples:
+  /// * Store instructions (when the value to store is vectorized)
+  /// * Operands of call instructions (when the call needs to be vectorized)
+  /// * Return instructions
+  ///
+  /// @param[in,out] Leaves List of leaves to update.
+ void findVectorLeaves(std::vector &Leaves) const; + + /// @brief Find the alloca that this pointer points to + /// + /// @param[in] Pointer The pointer that is (potentially) pointing in an alloca + /// + /// @return the alloca if found, or nullptr otherwise + static llvm::AllocaInst *findAllocaFromPointer(llvm::Value *Pointer); + + /// @brief Try to extract the base pointer of the address. + /// + /// @param[in] Address Address to split into base and offset. + /// + /// @return Base address. + llvm::Value *extractMemBase(llvm::Value *Address); + + // private: + /// @brief Mark any value in the function that depends on V as being varying. + /// + /// @param[in] V Value used to start the vectorization search. + /// @param[in] From Optional value being used by `V`. + void markVaryingValues(llvm::Value *V, llvm::Value *From = nullptr); +}; + +/// @brief Analysis that determine whether values in a function are uniform or +/// varying. +class UniformValueAnalysis + : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + /// @brief Create a new analysis object. + UniformValueAnalysis() {} + + /// @brief Type of result produced by the analysis. + using Result = UniformValueResult; + + /// @brief Determine which values in the function are uniform and which are + /// potentially varying. + /// + /// @param[in] F Function to analyze. + /// @param[in] AM FunctionAnalysisManager providing analyses. + /// + /// @return Analysis result for the function. + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Uniform value analysis"; } + +private: + /// @brief Unique identifier for the pass. + static llvm::AnalysisKey Key; +}; + +} // namespace vecz + +#endif // VECZ_ANALYSIS_UNIFORM_VALUE_RANGE_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h new file mode 100644 index 0000000000000..6bc813caeea0e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h @@ -0,0 +1,71 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Vectorizable Function analysis. + +#ifndef VECZ_ANALYSIS_VECTORIZABLE_FUNCTION_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_VECTORIZABLE_FUNCTION_ANALYSIS_H_INCLUDED + +#include +#include + +namespace llvm { +class Value; +} + +namespace vecz { + +/// @brief Determines whether vectorization of a function is possible. +class VectorizableFunctionAnalysis + : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + /// @brief Create a new instance of the pass. 
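A sketch of gating on this analysis (illustrative only; `tryVectorize` is a hypothetical driver, and the real pipeline wires the check up differently):

```cpp
#include <llvm/IR/Function.h>
#include <llvm/IR/PassManager.h>

// Illustrative only: bail out early when the function cannot be vectorized
// at all (see Result::canVectorize below).
bool tryVectorize(llvm::Function &F, llvm::FunctionAnalysisManager &FAM) {
  if (!FAM.getResult<vecz::VectorizableFunctionAnalysis>(F).canVectorize) {
    return false;
  }
  // ... run the remaining vecz passes on F ...
  return true;
}
```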
+ VectorizableFunctionAnalysis() = default; + + /// @brief Type of result produced by the analysis. + struct Result { + /// @brief Whether the function can be vectorized. + bool canVectorize = false; + + /// @brief Handle invalidation events from the new pass manager. + /// + /// @return false, as this analysis can never be invalidated. + bool invalidate(llvm::Function &, const llvm::PreservedAnalyses &, + llvm::FunctionAnalysisManager::Invalidator &) { + return false; + } + }; + + /// @brief Determine whether vectorization of a function is possible. + /// @param[in] F Function to analyze. + /// @return VectorizationUnit corresponding to this function + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Vectorizable Function analysis"; } + +private: + /// @brief Unique pass identifier. + static llvm::AnalysisKey Key; +}; + +} // namespace vecz + +#endif // VECZ_ANALYSIS_VECTORIZABLE_FUNCTION_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h new file mode 100644 index 0000000000000..7244236587d2f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h @@ -0,0 +1,121 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file vectorization_unit_analysis.h +/// +/// @brief VectorizationUnit analysis. + +#ifndef VECZ_ANALYSIS_VECTORIZATION_UNIT_H_INCLUDED +#define VECZ_ANALYSIS_VECTORIZATION_UNIT_H_INCLUDED + +#include +#include + +#include + +#include "vectorization_context.h" +#include "vectorization_unit.h" + +namespace vecz { + +/// @brief Caches and returns the VectorizationUnit for a Function. +class VectorizationUnitAnalysis + : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + /// @brief Create a new instance of the pass. + VectorizationUnitAnalysis(const VectorizationContext &Ctx) : Ctx(Ctx) {} + + /// @brief Type of result produced by the analysis. + class Result { + VectorizationUnit *VU = nullptr; + + public: + Result() = default; + Result(VectorizationUnit *VU) : VU(VU) {} + VectorizationUnit &getVU() { + assert(hasResult()); + return *VU; + } + bool hasResult() { return VU; } + + /// @brief Handle invalidation events from the new pass manager. + /// + /// @return false, as this analysis can never be invalidated. + bool invalidate(llvm::Function &, const llvm::PreservedAnalyses &, + llvm::FunctionAnalysisManager::Invalidator &) { + return false; + } + }; + + /// @brief Retrieve the VectorizationUnit for the requested function. + /// @param[in] F Function to analyze. 
+ /// @return VectorizationUnit corresponding to this function + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "VectorizationUnit analysis"; } + +private: + const VectorizationContext &Ctx; + /// @brief Unique pass identifier. + static llvm::AnalysisKey Key; +}; + +/// @brief Caches and returns the VectorizationContext for a Function. +class VectorizationContextAnalysis + : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + /// @brief Create a new instance of the pass. + VectorizationContextAnalysis(VectorizationContext &Ctx) : Context(Ctx) {} + + /// @brief Type of result produced by the analysis. + class Result { + VectorizationContext &Ctx; + + public: + Result(VectorizationContext &Ctx) : Ctx(Ctx) {} + VectorizationContext &getContext() { return Ctx; } + const VectorizationContext &getContext() const { return Ctx; } + + /// @brief Handle invalidation events from the new pass manager. + /// + /// @return false, as this analysis can never be invalidated. + bool invalidate(llvm::Function &, const llvm::PreservedAnalyses &, + llvm::FunctionAnalysisManager::Invalidator &) { + return false; + } + }; + + /// @brief Retrieve the VectorizationContext for the requested function. + /// @param[in] F Function to analyze. + /// @return VectorizationContext corresponding to this function + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "VectorizationContext analysis"; } + +private: + VectorizationContext &Context; + /// @brief Unique pass identifier. + static llvm::AnalysisKey Key; +}; +} // namespace vecz + +#endif // VECZ_ANALYSIS_VECTORIZATION_UNIT_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h new file mode 100644 index 0000000000000..cad9caaa7bead --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h @@ -0,0 +1,267 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief BOSCC control flow transformation. +/// +/// Style guideline 004 exemption note: This inner class declaration is in its +/// own header file, because it's quite large. 
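The transform this header implements can be pictured with a scalar analogy: keep an unpredicated clone of each region and branch to it whenever the entry mask turns out to be dynamically uniform. A sketch of the idea only, not of the vecz code (`uniformRegion` and `predicatedRegion` are placeholders):

```cpp
#include <algorithm>
#include <array>

// Scalar analogy of BOSCC ("branch on superword condition code"): when all
// lanes agree on the condition, run the original unpredicated region; only
// genuinely divergent masks pay for the predicated version.
template <size_t N>
void bosccRegion(const std::array<bool, N> &Mask) {
  const bool AllTrue =
      std::all_of(Mask.begin(), Mask.end(), [](bool B) { return B; });
  const bool AnyTrue =
      std::any_of(Mask.begin(), Mask.end(), [](bool B) { return B; });
  if (AllTrue) {
    // uniformRegion();        // duplicated, unpredicated blocks
  } else if (AnyTrue) {
    // predicatedRegion(Mask); // original blocks, executed under the mask
  }
  // connection/blend points afterwards merge values from both versions
}
```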
+
+#ifndef VECZ_CONTROL_FLOW_BOSCC_H_INCLUDED
+#define VECZ_CONTROL_FLOW_BOSCC_H_INCLUDED
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "transform/control_flow_conversion_pass.h"
+
+namespace llvm {
+class Instruction;
+class BasicBlock;
+class Function;
+class Loop;
+} // namespace llvm
+
+namespace vecz {
+
+class LivenessResult;
+
+class ControlFlowConversionState::BOSCCGadget final {
+public:
+  BOSCCGadget(ControlFlowConversionState &Pass)
+      : PassState(Pass), F(Pass.F), AM(Pass.AM), DT(Pass.DT), PDT(Pass.PDT),
+        LI(Pass.LI), DR(Pass.DR), RC(Pass.RC.get()) {}
+
+  /// @brief Region of code that will remain uniform after vectorization.
+  ///
+  /// Such regions won't have their instructions predicated. A UniformRegion
+  /// is delimited by a single-entry-single-exit region and is represented
+  /// by the blocks it contains.
+  struct UniformRegion final {
+    /// @brief Predicated blocks duplicated in the region.
+    llvm::DenseSet predicatedBlocks;
+    /// @brief Uniform blocks created in the region.
+    llvm::DenseSet uniformBlocks;
+    /// @brief Divergent branches that need a connection from the uniform
+    /// region.
+    std::vector divergentBranches;
+    /// @brief The entry block of the uniform region.
+    llvm::BasicBlock *entryBlock;
+    /// @brief The exit block of the uniform region.
+    llvm::BasicBlock *exitBlock;
+
+    /// @brief Mapping between a connection point of a predicated region
+    /// and the blend points of that region impacted by the former.
+    ///
+    /// Said "impacted blocks" are blocks with more than one predecessor that
+    /// need to have blend instructions because instructions defined within
+    /// that region may no longer dominate said "impacted blocks".
+    llvm::DenseMap blendPoints;
+
+    /// @brief Stores information about the connection points while
+    /// the CFG is being updated, to be applied afterwards.
+    struct ConnectionInfo {
+      llvm::BasicBlock *connectionPoint;
+      std::pair incoming;
+    };
+
+    /// @brief The list of ConnectionInfos to be applied at finalization.
+    std::vector connections;
+
+    /// @brief Stores information about new blocks created to contain
+    /// blend LCSSA PHI nodes, so they can be created after the CFG
+    /// has been updated.
+    struct StoreBlock {
+      llvm::BasicBlock *connectionPoint;
+      llvm::BasicBlock *target;
+      llvm::BasicBlock *runtimeCheckerBlock;
+    };
+
+    /// @brief The list of blend `StoreBlocks` to be applied at finalization.
+    llvm::SmallVector storeBlocks;
+
+    /// @brief Find if a predicated block belongs to this region.
+    /// @param[in] B Block to look for in the region
+    /// @return Whether the block belongs to the region or not.
+    bool contains(llvm::BasicBlock *B) const {
+      return predicatedBlocks.contains(B);
+    }
+  };
+  /// @brief List of all duplicated uniform regions.
+  using UniformRegions = std::vector;
+
+  /// @brief Create uniform regions to duplicate the blocks within such
+  /// regions.
+  ///
+  /// This allows us to retain their uniform version to skip divergent
+  /// branches when the entry mask of a div causing block is dynamically
+  /// uniform (i.e. all true or all false). Nested uniform regions need not
+  /// be duplicated multiple times.
+  ///
+  /// @return true if no problem occurred, false otherwise.
+  bool duplicateUniformRegions();
+
+  /// @brief Connect the BOSCC regions.
+  /// @return true if no problem occurred, false otherwise.
+  bool connectBOSCCRegions();
+
+  /// @brief Get the uniform version of 'B'.
+  /// @param[in] B The predicated block whose uniform version we want.
+  /// @return A uniform block if it exists, nullptr otherwise.
+  llvm::BasicBlock *getBlock(llvm::BasicBlock *B);
+  /// @brief Get the uniform version of 'L'.
+  /// @param[in] L The predicated loop whose uniform version we want.
+  /// @return A uniform loop if it exists, nullptr otherwise.
+  llvm::Loop *getLoop(llvm::Loop *L);
+
+  /// @brief Get the region entry blocks that have not been duplicated.
+  /// @param[out] blocks SmallVector to hold the result
+  void getUnduplicatedEntryBlocks(
+      llvm::SmallVectorImpl &blocks) const;
+
+  /// @brief Create an entry in the VMap so that 'uni' becomes a uniform
+  /// equivalent of 'pred'.
+  /// @param[in] pred Predicated value
+  /// @param[in] uni Uniform value
+  /// @param[in] needsMapping Whether 'uni' needs to be remapped
+  void createReference(llvm::Value *pred, llvm::Value *uni,
+                       bool needsMapping = false);
+  /// @brief Add an entry in the VMap so that the uniform equivalent of
+  /// 'old' becomes the uniform equivalent of 'pred' as well.
+  /// @param[in] pred Predicated value
+  /// @param[in] old Predicated value whose uniform equivalent we want
+  void addReference(llvm::Value *pred, llvm::Value *old);
+  /// @brief Add a new block to all the regions the reference block is part
+  /// of.
+  /// @param[in] newB New block
+  /// @param[in] refB Reference block
+  void addInRegions(llvm::BasicBlock *newB, llvm::BasicBlock *refB);
+
+  /// @brief Link the masks of the predicated regions to the uniform regions.
+  /// @return true on success, false on failure.
+  bool linkMasks();
+
+  /// @brief Retrieve the uniform version of predicatedV, if one exists.
+  /// @param[in] predicatedV The predicated value whose uniform version we
+  /// want to get.
+  /// @return the uniform version if it exists, null otherwise.
+  llvm::Value *getUniformV(llvm::Value *predicatedV);
+  /// @brief Update the value a uniform value should be a duplicate of.
+  /// @param[in] from The old value
+  /// @param[in] to The new value
+  void updateValue(llvm::Value *from, llvm::Value *to);
+
+  /// @brief Clean up redundant PHI nodes created by BOSCC.
+  /// @return true if no problem occurred, false otherwise.
+  bool cleanUp();
+
+private:
+  ControlFlowConversionState &PassState;
+  llvm::Function &F;
+  llvm::FunctionAnalysisManager &AM;
+  llvm::DominatorTree *DT = nullptr;
+  llvm::PostDominatorTree *PDT = nullptr;
+  llvm::LoopInfo *LI = nullptr;
+  DivergenceResult *DR = nullptr;
+  Reachability *RC = nullptr;
+
+  /// @brief Mapping between the uniform version and the predicated version
+  /// of the BOSCC. This is useful to keep information between both
+  /// versions shared, such as exit masks.
+  llvm::ValueToValueMapTy VMap;
+
+  /// @brief Mapping between the predicated version and the uniform version
+  /// of the BOSCC loops.
+  llvm::DenseMap LMap;
+
+  UniformRegions uniformRegions;
+
+  /// @brief Original edges of the CFG. Used to connect the uniform regions
+  /// to their predicated version.
+  llvm::DenseMap uniformEdges;
+
+  /// @brief Mapping from a block to a value that should be replaced by its
+  /// blended value.
+  using URVBlender = std::vector;
+
+  URVBlender URVB;
+
+  LivenessResult *liveness = nullptr;
+
+  /// @brief Create uniform regions
+  /// @return true if no problem occurred, false otherwise.
+  bool createUniformRegions(
+      const llvm::DenseSet &noDuplicateBlocks);
+  /// @brief Duplicate a loop, creating a new LoopTag and updating all the
+  /// relevant information.
+  /// @param[in] L The loop to duplicate
+  /// @return true if no problem occurred, false otherwise.
+  bool duplicateUniformLoops(llvm::Loop *L);
+
+  /// @brief Connect the uniform blocks that belong to the uniform region
+  /// @param[in] region Uniform region we are connecting
+  /// @param[in] predicatedB Div causing block in the predicated version
+  /// @param[in] uniformB Div causing block in the uniform version
+  /// @return true if no problem occurred, false otherwise.
+  bool connectUniformRegion(UniformRegion &region,
+                            llvm::BasicBlock *predicatedB,
+                            llvm::BasicBlock *uniformB);
+
+  /// @brief Blend uniform region instructions into the predicated region
+  /// connection point 'CP'.
+  /// @param[in] CP Connection point between a uniform and predicated region.
+  /// @param[in] incoming Predicated and uniform incoming block of 'CP'.
+  /// @return true if no problem occurred, false otherwise.
+  bool blendConnectionPoint(
+      llvm::BasicBlock *CP,
+      const std::pair &incoming);
+
+  /// @brief Apply all the changes stored up by `connectUniformRegion`
+  /// and `blendConnectionPoint` once the CFG has been fully updated.
+  /// @return true if no problem occurred, false otherwise.
+  bool blendFinalize();
+
+  /// @brief Update blend values in loop headers.
+  /// @param[in] LTag Loop whose blend values we update
+  /// @param[in] from The value we want to update
+  /// @param[in] to The value we update 'from' with.
+  /// @return true if no problem occurred, false otherwise.
+  bool updateLoopBlendValues(LoopTag *LTag, llvm::Instruction *from,
+                             llvm::Instruction *to);
+
+  /// @brief Generate a block ordering.
+  ///
+  /// This ordering differs slightly from the one in
+  /// ControlFlowConversionPass as we must process all the blocks that belong
+  /// to the same uniform region at once.
+  ///
+  /// @returns true if no errors occurred.
+  bool computeBlockOrdering();
+};
+} // namespace vecz
+
+#endif // VECZ_CONTROL_FLOW_BOSCC_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h
new file mode 100644
index 0000000000000..187299c997307
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h
@@ -0,0 +1,56 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief ROSCC control flow transformation.
+///
+/// Style guideline 004 exemption note: This inner class declaration is in its
+/// own header to match `control_flow_boscc.h`.
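As with the BOSCC header, a scalar analogy may help: a varying branch to a return block can only actually return once every lane has taken it; until then the exiting lanes are simply masked off. A sketch of the idea only (illustrative names, not vecz code):

```cpp
#include <array>

// Scalar analogy of ROSCC ("return on superword condition code"): lanes
// that want to leave are removed from the mask; the function may only
// return once no active lane remains.
template <size_t N>
bool rosccExit(std::array<bool, N> &Mask,
               const std::array<bool, N> &WantExit) {
  bool AnyLive = false;
  for (size_t I = 0; I != N; ++I) {
    Mask[I] = Mask[I] && !WantExit[I]; // exiting lanes become inactive
    AnyLive |= Mask[I];
  }
  return !AnyLive; // true: all lanes have exited, the return may execute
}
```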
+ +#ifndef VECZ_CONTROL_FLOW_ROSCC_H_INCLUDED +#define VECZ_CONTROL_FLOW_ROSCC_H_INCLUDED + +#include "transform/control_flow_conversion_pass.h" + +namespace llvm { +class Instruction; +class BasicBlock; +class Loop; +} // namespace llvm + +namespace vecz { + +/// @brief class that encapsulates the ROSCC transformation, which stands for +/// "Return On Superword Condition Code" and optimizes non-uniform +/// branches to the function return block(s). +class ControlFlowConversionState::ROSCCGadget final { +public: + ROSCCGadget(ControlFlowConversionState &Pass) + : UVR(Pass.UVR), DT(Pass.DT), PDT(Pass.PDT), LI(Pass.LI) {} + + /// @brief perform the ROSCC transformation + bool run(llvm::Function &F); + +private: + UniformValueResult *UVR = nullptr; + llvm::DominatorTree *DT = nullptr; + llvm::PostDominatorTree *PDT = nullptr; + llvm::LoopInfo *LI = nullptr; +}; +} // namespace vecz + +#endif // VECZ_CONTROL_FLOW_ROSCC_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h new file mode 100644 index 0000000000000..0be9fa33de99a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h @@ -0,0 +1,200 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Functions, macros, etc used for debugging + +#ifndef VECZ_DEBUGGING_H_INCLUDED +#define VECZ_DEBUGGING_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace vecz { + +/// @brief Namespace used for vecz utils that we don't want to pollute the whole +/// vecz namespace +namespace internal { +/// @brief Helper type for signaling a failure from functions that return either +/// a pointer or a boolean to indicate if vectorization was successful or not +struct VeczFailResult { + /// @brief For functions that return a boolean value + operator bool() const { return false; } + /// @brief For functions that return a pointer + template operator T *() const { return nullptr; } + /// @brief For functions that return an std::shared_ptr + template operator std::shared_ptr() const { return nullptr; } + /// @brief For functions that return an std::unique_ptr + template operator std::unique_ptr() const { return nullptr; } + /// @brief For functions that return an llvm::Optional + template operator std::optional() const { + return std::nullopt; + } + + /// @brief For functions that return an llvm::Error + operator llvm::Error() const { + return llvm::make_error("Unknown VeczFailResult", + llvm::inconvertibleErrorCode()); + } +}; + +struct AnalysisFailResult : public internal::VeczFailResult { + AnalysisFailResult() = default; + ~AnalysisFailResult() = default; + // If an optimization failed we'd better not have altered the validity of any + // analysis... + operator llvm::PreservedAnalyses() const { + return llvm::PreservedAnalyses::all(); + } +}; + +/* + * The following macros are available: + * + * VECZ_FAIL: Return from the function with a failure value (e.g. `false` or + * `nullptr`). + * + * VECZ_FAIL_IF(cond): If (cond == true) then VECZ_FAIL + * + * VECZ_STAT_FAIL_IF(cond, stat): If (cond == true) then VECZ_FAIL and increment + * stat + * + * VECZ_ERROR_IF(cond, message): Similar to VECZ_FAIL_IF, but when NDEBUG is not + * set it aborts instead of returning a failure value. + * + * VECZ_ERROR(message): Similar to VECZ_ERROR_IF(true, message) + * + * VECZ_WARN_IF(cond, message): Similar to VECZ_ERROR_IF, but it doesn't abort + * but warns and carries on. + * + * VECZ_UNREACHABLE(message): Unconditionally terminate with an error message. + * + * For all the macros, the message is <<'d to llvm::errs(), so it is possible to + * print llvm Values etc. For example, this works: + * VECZ_WARN_IF(cond, "Warning: Value = " << *V) + */ + +#define VECZ_FAIL() return vecz::internal::VeczFailResult() + +#define VECZ_FAIL_IF(cond) \ + do { \ + if (cond) { \ + VECZ_FAIL(); \ + } \ + } while (false) + +#define VECZ_STAT_FAIL_IF(cond, stat) \ + do { \ + if (cond) { \ + ++stat; \ + VECZ_FAIL(); \ + } \ + } while (false) + +#define VECZ_ERROR_IF(cond, message) \ + do { \ + if (cond) { \ + VECZ_ERROR(message); \ + } \ + } while (false) + +#ifdef NDEBUG + +#define VECZ_ERROR(message) \ + do { \ + llvm::errs() << "!! Vecz: ERROR in " << __FILE__ << ":" << __LINE__ \ + << "\n"; \ + llvm::errs() << "!! Reason: " << message << "\n"; \ + VECZ_FAIL(); \ + } while (false) + +#define VECZ_WARN_IF(cond, message) /* Nothing */ +#define VECZ_UNREACHABLE(message) /* Nothing */ + +#else /* !NDEBUG */ + +#define VECZ_ERROR(message) \ + do { \ + llvm::errs() << "!! Vecz: ERROR in " << __FILE__ << ":" << __LINE__ \ + << "\n"; \ + llvm::errs() << "!! 
Reason: " << (message) << "\n"; \ + std::abort(); \ + } while (false) + +#define VECZ_WARN_IF(cond, message) \ + do { \ + if (cond) { \ + llvm::errs() << "!! Vecz: WARNING in " << __FILE__ << ":" << __LINE__ \ + << "\n"; \ + llvm::errs() << "!! Reason: " << (message) << "\n"; \ + } \ + } while (false) + +#define VECZ_UNREACHABLE(message) \ + do { \ + llvm::errs() << "!! Vecz: UNREACHABLE reached in " << __FILE__ << ":" \ + << __LINE__ << "\n"; \ + llvm::errs() << "!! Message: " << (message) << "\n"; \ + std::abort(); \ + } while (false) +#endif /* NDEBUG */ +} // namespace internal + +#define VECZ_UNUSED(x) ((void)(x)) + +/// @brief Emit a RemarkMissed message +/// +/// @param[in] F The function in which we are currently working +/// @param[in] V The value (can be `nullptr`) to be included in the message +/// @param[in] Msg The main remark message text +/// @param[in] Note An optional additional note to provide more context/info. +void emitVeczRemarkMissed(const llvm::Function *F, const llvm::Value *V, + llvm::StringRef Msg, llvm::StringRef Note = ""); +/// @brief Emit a RemarkMissed message +/// +/// @param[in] F The function in which we are currently working +/// @param[in] Msg The main remark message text +/// @param[in] Note An optional additional note to provide more context/info. +void emitVeczRemarkMissed(const llvm::Function *F, llvm::StringRef Msg, + llvm::StringRef Note = ""); +/// @brief Emit a Remark message +/// +/// @param[in] F The function in which we are currently working +/// @param[in] V The value (can be `nullptr`) to be included in the message +/// @param[in] Msg The main remark message text +void emitVeczRemark(const llvm::Function *F, const llvm::Value *V, + llvm::StringRef Msg); +/// @brief Emit a Remark message +/// +/// @param[in] F The function in which we are currently working +/// @param[in] Msg The main remark message text +void emitVeczRemark(const llvm::Function *F, llvm::StringRef Msg); + +} // namespace vecz + +#endif // VECZ_DEBUGGING_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h new file mode 100644 index 0000000000000..1321237311322 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h @@ -0,0 +1,52 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef VECZ_IR_CLEANUP_H_INCLUDED +#define VECZ_IR_CLEANUP_H_INCLUDED + +#include + +namespace llvm { +class Instruction; +} + +namespace vecz { +class IRCleanup { +public: + /// @brief Mark the instruction as needing deletion. It will only be deleted + /// if it is unused. This is used to mark instructions with side-effects + /// (e.g. call, load, store and leaves) that have been replaced and are no + /// longer needed. 
Dead Code Elimination will not touch such instructions. + /// + /// @param[in] I Instruction to mark as needing deletion. + void deleteInstructionLater(llvm::Instruction *I); + + /// @brief Get rid of instructions that have been marked for deletion. + void deleteInstructions(); + + /// @brief Immediately delete an instruction, and replace all uses with undef + /// + /// @param[in] I Instruction to delete. + static void deleteInstructionNow(llvm::Instruction *I); + +private: + /// @brief Instructions that have been marked for deletion. + llvm::SmallPtrSet InstructionsToDelete; +}; + +} // namespace vecz + +#endif // VECZ_VECTORIZATION_UNIT_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h new file mode 100644 index 0000000000000..d4aafaa610cc5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h @@ -0,0 +1,54 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief LLVM helper methods. + +#ifndef VECZ_LLVM_HELPERS_H_INCLUDED +#define VECZ_LLVM_HELPERS_H_INCLUDED + +#include +#include +#include +#include + +namespace vecz { + +/// @brief Determine if the value has vector type, and return it. +/// +/// @param[in] V Value to analyze. +/// +/// @return Vector type of V or null. +llvm::FixedVectorType *getVectorType(llvm::Value *V); + +/// @brief Get the default value for a type. +/// +/// @param[in] T Type to get default value of. +/// @param[in] V Default value to use for numeric type +/// +/// @return Default value, which will be poison for non-numeric types +llvm::Value *getDefaultValue(llvm::Type *T, uint64_t V = 0UL); + +/// @brief Get the shuffle mask as sequence of integers. +/// +/// @param[in] Shuffle Instruction +/// +/// @return Array of integers representing the Shuffle mask +llvm::ArrayRef getShuffleVecMask(llvm::ShuffleVectorInst *Shuffle); +} // namespace vecz + +#endif // VECZ_LLVM_HELPERS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h new file mode 100644 index 0000000000000..a02bb446174d4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h @@ -0,0 +1,615 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Manipulation of memory operations like loads and stores. + +#ifndef VECZ_MEMORY_OPERATIONS_H_INCLUDED +#define VECZ_MEMORY_OPERATIONS_H_INCLUDED + +#include +#include +#include +#include + +#include + +namespace llvm { +class CallInst; +class LoadInst; +class StoreInst; +class Argument; +class Function; +class Instruction; +class Value; +class Type; +} // namespace llvm + +namespace vecz { + +class VectorizationContext; +struct UniformValueResult; + +/// @brief Return or declare a masked memory operation builtin function. +/// +/// @param[in] Ctx Context used to manipulate internal builtins. +/// @param[in] DataTy Loaded type or stored value type. +/// @param[in] PtrTy Pointer type. Must either be opaque or have its pointee +/// type match DataTy. +/// @param[in] Alignment Alignment of the operation. +/// @param[in] IsLoad true if defined a masked load, false if a masked store. +/// @param[in] IsVP true if defining a vector-predicated operation +/// +/// @return Masked builtin function. +llvm::Function *getOrCreateMaskedMemOpFn(VectorizationContext &Ctx, + llvm::Type *DataTy, + llvm::PointerType *PtrTy, + unsigned Alignment, bool IsLoad, + bool IsVP); + +/// @brief Create a call to a masked load operation builtin function. +/// +/// @param[in] Ctx Context used to retrieve the builtin function. +/// @param[in] Ty Type to load from memory. +/// @param[in] Ptr Pointer. Internally bitcast to point to Ty. +/// @param[in] Mask Mask. +/// @param[in] EVL vector length as i32, else null (full width operation). +/// @param[in] Alignment Alignment +/// @param[in] Name Name to give to the call instruction. +/// +/// @return Call instruction or null on error. +llvm::CallInst *createMaskedLoad(VectorizationContext &Ctx, llvm::Type *Ty, + llvm::Value *Ptr, llvm::Value *Mask, + llvm::Value *EVL, unsigned Alignment, + llvm::Twine Name = ""); + +/// @brief Create a call to a masked store operation builtin function. +/// +/// @param[in] Ctx Context used to retrieve the builtin function. +/// @param[in] Data Stored value. +/// @param[in] Ptr Pointer. Internally bitcast to pointer to Data's type. +/// @param[in] Mask Mask. +/// @param[in] EVL vector length as i32, else null (full width operation). +/// @param[in] Alignment Alignment +/// @param[in] Name Name to give to the call instruction. +/// +/// @return Call instruction or null on error. +llvm::CallInst *createMaskedStore(VectorizationContext &Ctx, llvm::Value *Data, + llvm::Value *Ptr, llvm::Value *Mask, + llvm::Value *EVL, unsigned Alignment, + llvm::Twine Name = ""); + +/// @brief Return or declare a (masked) interleaved memory operation builtin +/// function. + +/// @param[in] Ctx Context used to manipulate internal builtins. +/// @param[in] DataTy Loaded type or stored value type. +/// @param[in] PtrTy Pointer type. Must either be opaque or have its pointee +/// type match DataTy's element type. +/// @param[in] Stride The stride of the access. May be null in which case the +/// default stride is used. 
+/// @param[in] MaskTy The mask type. May be null for an unmasked operation. +/// @param[in] Alignment Alignment of the operation. +/// @param[in] IsLoad true if defining a load, false if defining a store. +/// @param[in] IsVP true if defining a vector-predicated operation +/// +/// @return (Masked) interleaved builtin function. +llvm::Function * +getOrCreateInterleavedMemOpFn(VectorizationContext &Ctx, llvm::Type *DataTy, + llvm::PointerType *PtrTy, llvm::Value *Stride, + llvm::Type *MaskTy, unsigned Alignment, + bool IsLoad, bool IsVP); + +/// @brief Create a call to a (masked) interleaved load builtin function. Also +/// known as a strided load. +/// +/// @param[in] Ctx Vectorization Context used to retrieve the builtin info. +/// @param[in] Ty Type to load from memory +/// @param[in] Ptr Pointer. Internally bitcast to a pointer to Ty's element +/// type. +/// @param[in] Stride The stride of the operation. May be null in which case +/// the default stride is used. +/// @param[in] Mask The mask controlling the operation. May be null in which +/// case an unmasked builtin is called. +/// @param[in] Alignment Alignment of the operation. +/// @param[in] Name Name to give to the call instruction. +/// +/// @return Call instruction or null on error. +llvm::CallInst *createInterleavedLoad(VectorizationContext &Ctx, llvm::Type *Ty, + llvm::Value *Ptr, llvm::Value *Stride, + llvm::Value *Mask, llvm::Value *EVL, + unsigned Alignment, + llvm::Twine Name = ""); + +/// @brief Create a call to a (masked) interleaved store builtin function. Also +/// known as a strided store. +/// +/// @param[in] Ctx Vectorization Context used to retrieve the builtin info. +/// @param[in] Data Data value to store to memory. +/// @param[in] Ptr Pointer. Internally bitcast to a pointer to Data's element +/// type. +/// @param[in] Stride The stride of the operation. May be null in which case +/// the default stride is used. +/// @param[in] Mask The mask controlling the operation. May be null in which +/// case an unmasked builtin is called. +/// @param[in] Alignment Alignment of the operation. +/// @param[in] Name Name to give to the call instruction. +/// +/// @return Call instruction or null on error. +llvm::CallInst *createInterleavedStore(VectorizationContext &Ctx, + llvm::Value *Data, llvm::Value *Ptr, + llvm::Value *Stride, llvm::Value *Mask, + llvm::Value *EVL, unsigned Alignment, + llvm::Twine Name = ""); + +/// @brief Return or declare a (masked) scatter/gather memory operation builtin +/// function. +/// +/// @param[in] Ctx Context used to manipulate internal builtins. +/// @param[in] DataTy Loaded type or stored value type. +/// @param[in] VecPtrTy Pointer type. Must be a vector of pointers, each of +/// which are either opaque or have a pointee type matching DataTy's element +/// type. +/// @param[in] MaskTy The mask type. May be null for an unmasked operation. +/// @param[in] Alignment Alignment of the operation. +/// @param[in] IsGather true if defining a gather (load), false if defining a +/// scatter (store). +/// @param[in] IsVP true if defining a vector-predicated operation +/// +/// @return Scatter/gather builtin function. +llvm::Function *getOrCreateScatterGatherMemOpFn(vecz::VectorizationContext &Ctx, + llvm::Type *DataTy, + llvm::VectorType *VecPtrTy, + llvm::Type *MaskTy, + unsigned Alignment, + bool IsGather, bool IsVP); + +/// @brief Create a call to a (masked) gather memory operation builtin +/// function. +/// +/// @param[in] Ctx Context used to retrieve the builtin function. 
+/// @param[in] Ty Type to load from memory. +/// @param[in] VecPtr Pointer value. Must be a vector of pointers, each of +/// which are either opaque or have a pointee type matching DataTy's element +/// type. +/// @param[in] Mask The predicate of the masked instruction. May be null in +/// which case an unmasked builtin is created. +/// @param[in] Alignment Alignment of the operation. +/// @param[in] EVL vector length as i32, else null (full width operation). +/// @param[in] Name Name to give to the call instruction. +/// +/// @return Call instruction or null on error. +llvm::CallInst *createGather(VectorizationContext &Ctx, llvm::Type *Ty, + llvm::Value *VecPtr, llvm::Value *Mask, + llvm::Value *EVL, unsigned Alignment, + llvm::Twine Name = ""); + +/// @brief Create a call to a (masked) scatter memory operation builtin +/// function. +/// +/// @param[in] Ctx Context used to retrieve the builtin function. +/// @param[in] VecData Value to store to memory. +/// @param[in] VecPtr Pointer value. Must be a vector of pointers, each of +/// which are either opaque or have a pointee type matching DataTy's element +/// type. +/// @param[in] Mask The predicate of the masked instruction. May be null in +/// which case an unmasked builtin is created. +/// @param[in] Alignment Alignment of the operation. +/// @param[in] EVL vector length as i32, else null (full width operation). +/// @param[in] Name Name to give to the call instruction. +/// +/// @return Call instruction or null on error. +llvm::CallInst *createScatter(VectorizationContext &Ctx, llvm::Value *VecData, + llvm::Value *VecPtr, llvm::Value *Mask, + llvm::Value *EVL, unsigned Alignment, + llvm::Twine Name = ""); + +/// @brief an enum to distinguish between loads and stores, and between builtin +/// memop calls and native IR memop instructions. +enum class MemOpKind : int { + /// @brief The object does not contain a valid memory operation. + Invalid = 0, + /// @brief The object contains a LLVM load instruction. + LoadInstruction, + /// @brief The object contains a LLVM store instruction. + StoreInstruction, + /// @brief The object contains a 'load-like' function call. + LoadCall, + /// @brief The object contains a 'store-like' function call. + StoreCall, +}; + +/// @brief an enum to distinguish between different memory access patterns +enum class MemOpAccessKind : int { + /// @brief The object does not represent a vecz memop call + Native = 0, + /// @brief The object represents a masked memory operation + Masked, + /// @brief The object represents an interleaved memory operation + Interleaved, + /// @brief The object represents a masked interleaved memory operation + MaskedInterleaved, + /// @brief The object represents a scatter/gather memory operation + ScatterGather, + /// @brief The object represents a masked scatter/gather memory operation + MaskedScatterGather, +}; + +struct MemOp; + +/// @brief Describes a memory operation such as a load or a store. +class MemOpDesc { + /// @brief Type of the data operand for stores, or memory type for loads. + llvm::Type *DataTy; + /// @brief Type of the pointer used to access memory. + llvm::Type *PtrTy; + /// @brief In the case of masked operations, type of the mask operand. + llvm::Type *MaskTy; + /// @brief Identifies the kind of memory operation which is performed. + MemOpKind Kind; + /// @brief Idenfities the kind of memory access pattern + MemOpAccessKind AccessKind; + /// @brief Whether or not the memory access is vector-length predicated. + bool IsVLOp; + /// @brief Memory alignment. 
+  unsigned Alignment;
+  /// @brief Distance between consecutive elements in memory, in number of
+  /// elements. Zero means uniform access, one means sequential access.
+  /// Negative values mean the access is done in reverse order.
+  llvm::Value *Stride;
+  /// @brief Index of the data operand, for stores, or negative value.
+  int8_t DataOpIdx;
+  /// @brief Index of the pointer operand.
+  int8_t PtrOpIdx;
+  /// @brief Index of the mask operand, for masked operations, or negative
+  /// value.
+  int8_t MaskOpIdx;
+  /// @brief Index of vector length operand, or negative value.
+  int8_t VLOpIdx;
+
+  friend struct MemOp;
+
+public:
+  /// @brief Create an invalid memory operation.
+  MemOpDesc();
+
+  bool isMaskedMemOp() const { return AccessKind == MemOpAccessKind::Masked; }
+  bool isInterleavedMemOp() const {
+    return AccessKind == MemOpAccessKind::Interleaved;
+  }
+  bool isMaskedInterleavedMemOp() const {
+    return AccessKind == MemOpAccessKind::MaskedInterleaved;
+  }
+  bool isScatterGatherMemOp() const {
+    return AccessKind == MemOpAccessKind::ScatterGather;
+  }
+  bool isMaskedScatterGatherMemOp() const {
+    return AccessKind == MemOpAccessKind::MaskedScatterGather;
+  }
+
+  /// @brief In the case of stores, return the data element being stored.
+  llvm::Value *getDataOperand(llvm::Function *F) const {
+    return getOperand(F, DataOpIdx);
+  }
+
+  /// @brief Return the pointer used by the memory operation.
+  llvm::Value *getPointerOperand(llvm::Function *F) const {
+    return getOperand(F, PtrOpIdx);
+  }
+
+  /// @brief In the case of a masked memory operation, return the mask.
+  llvm::Value *getMaskOperand(llvm::Function *F) const {
+    return getOperand(F, MaskOpIdx);
+  }
+
+  /// @brief In the case of a vector-length-predicated memory operation, return
+  /// the vector length.
+  llvm::Value *getVLOperand(llvm::Function *F) const {
+    return getOperand(F, VLOpIdx);
+  }
+
+  /// @brief Index of the data operand of the MemOp
+  /// @return The index, or -1 if no data operand
+  int8_t getDataOperandIndex() const { return DataOpIdx; }
+  /// @brief Index of the pointer operand of the MemOp
+  /// @return The index, or -1 if no pointer operand
+  int8_t getPointerOperandIndex() const { return PtrOpIdx; }
+  /// @brief Index of the mask operand of the MemOp
+  /// @return The index, or -1 if no mask operand
+  int8_t getMaskOperandIndex() const { return MaskOpIdx; }
+  /// @brief Index of the vector-length operand of the MemOp
+  /// @return The index, or -1 if no vector-length operand
+  int8_t getVLOperandIndex() const { return VLOpIdx; }
+
+  /// @brief Get what kind of memory operation this is.
+  /// @return The kind of the memory operation
+  MemOpKind getKind() const { return Kind; }
+
+  /// @brief Get the alignment of the memory operation.
+  /// @return The alignment in bytes
+  unsigned getAlignment() const { return Alignment; }
+
+  /// @brief In the case of an interleaved memory operation, return the stride.
+  /// @return The Value determining the stride
+  llvm::Value *getStride() const { return Stride; }
+  /// @brief Determine if the stride is an integer whose value can be determined
+  /// at compile time.
+  /// @return True if the stride is a compile-time integer constant
+  bool isStrideConstantInt() const;
+  /// @brief Get the stride as a constant int. It assumes that it is possible
+  /// and valid to do so.
+  /// @return The stride in elements
+  int64_t getStrideAsConstantInt() const;
+
+  /// @brief Return the type of data element being accessed in memory.
+  /// @return The type of the data element being accessed in memory.
+  llvm::Type *getDataType() const { return DataTy; }
+
+  /// @brief Return the type of the pointer operand.
+  /// @return The type of the pointer operand
+  llvm::Type *getPointerType() const { return PtrTy; }
+
+  /// @brief Return the specified operand from the function.
+  ///
+  /// @param[in] F Function to retrieve the operand from.
+  /// @param[in] OpIdx Index of the operand to retrieve.
+  ///
+  /// @return Operand or null.
+  llvm::Argument *getOperand(llvm::Function *F, int OpIdx) const;
+
+  /// @brief Determine whether the given function is a memory operation.
+  /// If that's the case, the descriptor is populated and returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is a memory operation.
+  /// std::nullopt otherwise.
+  static std::optional<MemOpDesc> analyzeMemOpFunction(llvm::Function &F);
+
+  /// @brief Determine whether the given function is a masked memory operation.
+  /// If that's the case, the descriptor is populated and returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is a masked memory operation.
+  /// std::nullopt otherwise.
+  static std::optional<MemOpDesc> analyzeMaskedMemOp(llvm::Function &F);
+
+  /// @brief Determine whether the given function is an interleaved memory
+  /// operation or not. If that's the case, the descriptor is populated and
+  /// returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is an interleaved memory
+  /// operation. std::nullopt otherwise.
+  static std::optional<MemOpDesc> analyzeInterleavedMemOp(llvm::Function &F);
+
+  /// @brief Determine whether the given function is a masked interleaved memory
+  /// operation or not. If that's the case, the descriptor is populated and
+  /// returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is a masked interleaved memory
+  /// operation. std::nullopt otherwise.
+  static std::optional<MemOpDesc>
+  analyzeMaskedInterleavedMemOp(llvm::Function &F);
+
+  /// @brief Determine whether the given function is a scatter/gather memory
+  /// operation or not. If that's the case, the descriptor is populated and
+  /// returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is a scatter/gather operation.
+  /// std::nullopt otherwise.
+  static std::optional<MemOpDesc> analyzeScatterGatherMemOp(llvm::Function &F);
+
+  /// @brief Determine whether the given function is a masked scatter/gather
+  /// memory operation or not. If that's the case, the descriptor is populated
+  /// and returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is a masked scatter/gather
+  /// operation. std::nullopt otherwise.
+  static std::optional<MemOpDesc>
+  analyzeMaskedScatterGatherMemOp(llvm::Function &F);
+
+  /// @brief Determine whether the operation is a load or not.
+  bool isLoad() const {
+    switch (Kind) {
+    default:
+      return false;
+    case MemOpKind::LoadInstruction:
+    case MemOpKind::LoadCall:
+      return true;
+    }
+  }
+
+  /// @brief Determine whether the operation is a store or not.
+  bool isStore() const {
+    switch (Kind) {
+    default:
+      return false;
+    case MemOpKind::StoreInstruction:
+    case MemOpKind::StoreCall:
+      return true;
+    }
+  }
+
+  /// @brief Determine whether the operation is an instruction or not.
+  bool isLoadStoreInst() const {
+    switch (Kind) {
+    default:
+      return false;
+    case MemOpKind::LoadInstruction:
+    case MemOpKind::StoreInstruction:
+      return true;
+    }
+  }
+
+  bool isVLOp() const { return IsVLOp; }
+};
+
+/// @brief Wrapper that combines a memory operation descriptor and instruction.
+/// This allows manipulating different kinds of memory operations (load and
+/// store instructions, vecz builtins) in the same way.
+struct MemOp {
+  /// @brief Create an invalid memory operation.
+  MemOp() {}
+  /// @brief Create a memory operation from an instruction and an existing
+  /// memory operation descriptor.
+  ///
+  /// @param[in] I Memory instruction.
+  /// @param[in] Desc Memory operation descriptor.
+  MemOp(llvm::Instruction *I, const MemOpDesc &Desc);
+  /// @brief Create a memory operation from an instruction.
+  /// @param[in] I Instruction that may be a memory operation.
+  static std::optional<MemOp> get(llvm::Instruction *I);
+  /// @brief Create a memory operation from a memory builtin call instruction,
+  /// analyzed according to the given access kind.
+  ///
+  /// @param[in] CI Memory builtin call instruction.
+  /// @param[in] AccessKind the kind of access to consider
+  static std::optional<MemOp> get(llvm::CallInst *CI,
+                                  MemOpAccessKind AccessKind);
+
+  /// @brief Access the memory operation descriptor.
+  const MemOpDesc &getDesc() const { return Desc; }
+
+  /// @brief Access the memory operation descriptor.
+  MemOpDesc &getDesc() { return Desc; }
+
+  /// @brief Return the instruction that performs the memory operation.
+  llvm::Instruction *getInstr() const { return Ins; }
+
+  /// @brief Return the alignment in bytes.
+  unsigned getAlignment() const { return Desc.getAlignment(); }
+
+  /// @brief In the case of an interleaved memory operation, return the stride.
+  llvm::Value *getStride() const { return Desc.getStride(); }
+
+  /// @brief Return the type of data element being accessed in memory.
+  llvm::Type *getDataType() const { return Desc.getDataType(); }
+
+  /// @brief Return the type of the pointer operand.
+  llvm::Type *getPointerType() const { return Desc.getPointerType(); }
+
+  /// @brief Determine whether the operation is a load or not.
+  bool isLoad() const { return Desc.isLoad(); }
+
+  /// @brief Determine whether the operation is a store or not.
+  bool isStore() const { return Desc.isStore(); }
+
+  /// @brief Determine whether the operation is an instruction or not.
+  bool isLoadStoreInst() const { return Desc.isLoadStoreInst(); }
+
+  /// @brief Determine whether the operation is a masked memop call
+  bool isMaskedMemOp() const { return Desc.isMaskedMemOp(); }
+
+  /// @brief Determine whether the operation is a masked scatter/gather memop
+  /// call
+  bool isMaskedScatterGatherMemOp() const {
+    return Desc.isMaskedScatterGatherMemOp();
+  }
+
+  /// @brief Determine whether the operation is a masked interleaved memop call
+  bool isMaskedInterleavedMemOp() const {
+    return Desc.isMaskedInterleavedMemOp();
+  }
+
+  /// @brief In the case of stores, return the data element being stored.
+  /// @return Data operand or null.
+  llvm::Value *getDataOperand() const;
+  /// @brief Return the pointer used by the memory operation.
+  /// @return Pointer used by the memory operation or null for invalid
+  /// operations.
+  llvm::Value *getPointerOperand() const;
+  /// @brief In the case of a masked memory operation, return the mask.
+  /// @return Mask operand or null.
+  llvm::Value *getMaskOperand() const;
+
+  /// @brief In the case of stores, set the data element being stored.
+  /// @return true on success.
+  bool setDataOperand(llvm::Value *V);
+  /// @brief Set the pointer used by the memory operation.
+  /// @return true on success.
+  bool setPointerOperand(llvm::Value *V);
+  /// @brief In the case of a masked memory operation, set the mask.
+  /// @return true on success.
+  bool setMaskOperand(llvm::Value *V);
+
+  /// @brief In the case of a builtin memory operation, return the call.
+  /// @return Call instruction or null.
+  llvm::CallInst *getCall() const;
+
+  /// @brief Determine if the stride is an integer whose value can be determined
+  /// at compile time.
+  /// @return True if the stride is a compile-time integer constant
+  bool isStrideConstantInt() const { return Desc.isStrideConstantInt(); }
+  /// @brief Get the stride as a constant int. It assumes that it is possible
+  /// and valid to do so.
+  /// @return The stride in elements
+  int64_t getStrideAsConstantInt() const {
+    return Desc.getStrideAsConstantInt();
+  }
+
+private:
+  /// @brief Access an operand of the call instruction.
+  ///
+  /// @param[in] OpIdx Index of the operand to access.
+  ///
+  /// @return Specified operand of the call instruction.
+  llvm::Value *getCallOperand(int OpIdx) const;
+
+  /// @brief Set an operand of the call instruction.
+  ///
+  /// @param[in] OpIdx Index of the operand to access.
+  /// @param[in] V the Value to set
+  ///
+  /// @return true on success.
+  bool setCallOperand(int OpIdx, llvm::Value *V);
+
+  /// @brief Instruction that performs the memory operation.
+  llvm::Instruction *Ins = nullptr;
+  /// @brief Describes the memory operation.
+  MemOpDesc Desc;
+};
+
+namespace {
+inline llvm::ConstantInt *getSizeInt(llvm::IRBuilder<> &B, int64_t val) {
+  if (B.GetInsertBlock()->getModule()->getDataLayout().getPointerSize() == 4) {
+    return B.getInt32(val);
+  }
+  return B.getInt64(val);
+}
+
+inline llvm::IntegerType *getSizeTy(llvm::Module &M) {
+  if (M.getDataLayout().getPointerSize() == 4) {
+    return llvm::Type::getInt32Ty(M.getContext());
+  }
+  return llvm::Type::getInt64Ty(M.getContext());
+}
+
+inline llvm::IntegerType *getSizeTy(llvm::IRBuilder<> &B) {
+  return getSizeTy(*(B.GetInsertBlock()->getModule()));
+}
+} // namespace
+} // namespace vecz
+
+#endif // VECZ_MEMORY_OPERATIONS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
new file mode 100644
index 0000000000000..2ad2d60a3a78c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
@@ -0,0 +1,268 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Analysis of memory pointer offsets.
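As an illustration of the offset categories this analysis distinguishes (a
sketch for orientation only, not part of the imported sources; `gid` stands in
for the work-item ID and `n` for an arbitrary uniform kernel argument):

```cpp
// How the analysis would classify typical pointer offsets.
void kernel(const float *p, float *q, int n, int gid) {
  float a = p[7];          // eOffsetConstant: compile-time constant
  float b = p[n];          // eOffsetUniformVariable: same for every work-item
  float c = p[3 * gid];    // eOffsetLinear: stride 3 in the work-item ID
  float d = p[gid * gid];  // eOffsetMayDiverge: no linear model exists
  q[gid] = a + b + c + d;  // eOffsetLinear: stride 1, sequential access
}
```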
+
+#ifndef VECZ_OFFSET_INFO_H_INCLUDED
+#define VECZ_OFFSET_INFO_H_INCLUDED
+
+#include 
+#include 
+
+namespace llvm {
+class CallInst;
+class Value;
+class Type;
+} // namespace llvm
+
+namespace vecz {
+
+struct UniformValueResult;
+class ValueTagMap;
+
+/// @brief Item ID dependence kinds that an expression can have.
+/// Note that these are all mutually exclusive.
+enum OffsetKind {
+  /// @brief The offset may diverge in unmodelled ways when vectorized. This
+  /// state is to be assumed unless it can be proved otherwise.
+  eOffsetMayDiverge,
+  /// @brief The offset is a compile-time constant.
+  eOffsetConstant,
+  /// @brief The offset is a uniform variable.
+  eOffsetUniformVariable,
+  /// @brief The offset has a work-item ID dependence. The ID might be scaled
+  /// by some stride != 1, in which case loads or stores dependent on it will
+  /// be interleaved.
+  eOffsetLinear
+};
+
+class StrideAnalysisResult;
+
+/// @brief Describes an offset used by a load or store instruction we want to
+/// vectorize.
+struct OffsetInfo {
+  /// @brief Properties of the offset, which may prevent vectorization.
+  OffsetKind Kind;
+  /// @brief The actual value of the analyzed expression.
+  llvm::Value *const ActualValue;
+  /// @brief The difference in this value between two consecutive work items,
+  /// as a constant integer.
+  /// When the stride is a pointer, the difference is in bytes.
+  int64_t StrideInt;
+  /// @brief The difference in this value between two consecutive work items,
+  /// as a uniform value.
+  /// When the stride is a pointer, the difference is in bytes.
+  /// This is nullptr after analysis and is set upon calling `manifest()`.
+  llvm::Value *ManifestStride;
+
+  /// @brief A bit mask indicating which bits of the value can possibly be set,
+  /// based on the expressions it depends on.
+  uint64_t BitMask;
+
+  /// @brief Construct a new offset information object from a general value.
+  /// @param[in] SAR The StrideAnalysisResult used to retrieve other
+  /// OffsetInfos.
+  /// @param[in] V Offset value to analyze.
+  OffsetInfo(StrideAnalysisResult &SAR, llvm::Value *V);
+
+  OffsetInfo() = delete;
+  OffsetInfo(const OffsetInfo &) = default;
+
+  /// @brief Return whether the offset has a non-analytical dependence on work
+  /// item ID.
+  bool mayDiverge() const { return Kind == eOffsetMayDiverge; }
+
+  /// @brief Return whether the offset has a linear dependence on work item ID.
+  bool hasStride() const { return Kind == eOffsetLinear; }
+
+  /// @brief Return whether the offset is a compile-time constant.
+  bool isConstant() const { return Kind == eOffsetConstant; }
+
+  /// @brief Return whether the offset has no dependence on work item ID.
+  bool isUniform() const {
+    return Kind == eOffsetConstant || Kind == eOffsetUniformVariable;
+  }
+
+  /// @brief Returns the actual value of the analyzed offset if it is uniform.
+  ///
+  /// @return The uniform Value or nullptr otherwise
+  llvm::Value *getUniformValue() const;
+  /// @brief Get the offset as a constant int. It assumes that it is possible to
+  /// do so.
+  /// @return The offset as an integer
+  int64_t getValueAsConstantInt() const;
+  /// @brief Get the Stride of the analyzed and manifested value.
+  /// @return The stride in number of elements
+  llvm::Value *getStride() const { return ManifestStride; }
+  /// @brief Determine whether the stride is simply a constant compile time
+  /// integer.
+  /// @return true if the stride is linear and constant, false otherwise.
+  bool isStrideConstantInt() const;
+  /// @brief Get the stride as a constant int.
+  /// @return The stride as an integer, or zero if the stride is not constant.
+  int64_t getStrideAsConstantInt() const;
+
+  /// @brief Convert the bytewise stride into an element-wise stride based on
+  /// the data type and data layout, as an integer.
+  ///
+  /// @param[in] PtrEleTy The element data type.
+  /// @param[in] DL The Data Layout.
+  /// @return The memory stride as number of elements.
+  uint64_t getConstantMemoryStride(llvm::Type *PtrEleTy,
+                                   const llvm::DataLayout *DL) const;
+
+  /// @brief Convert the bytewise stride into an element-wise stride based on
+  /// the data type and data layout, building instructions where needed. Note
+  /// that the stride must be manifested first.
+  ///
+  /// @param[in] B an IRBuilder used for creating constants or instructions.
+  /// @param[in] PtrEleTy The element data type.
+  /// @param[in] DL The Data Layout.
+  /// @return The memory stride as number of elements.
+  llvm::Value *buildMemoryStride(llvm::IRBuilder<> &B, llvm::Type *PtrEleTy,
+                                 const llvm::DataLayout *DL) const;
+
+  /// @brief Create Values that represent or compute strides.
+  ///
+  /// @param[in] B an IRBuilder used for creating constants or instructions.
+  /// @param[in] SAR The StrideAnalysisResult used to retrieve other
+  /// OffsetInfos.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &manifest(llvm::IRBuilder<> &B, StrideAnalysisResult &SAR);
+
+private:
+  /// @brief Mark this offset with the given flag.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &setKind(OffsetKind Kind);
+  /// @brief Mark this offset as having a stride component.
+  /// @param[in] Stride Stride component applied to the item ID.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &setStride(llvm::Value *Stride);
+  /// @brief Mark this offset as having a stride component.
+  /// @param[in] Stride Stride component applied to the item ID.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &setStride(int64_t Stride);
+  /// @brief Mark this offset as possibly diverging.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &setMayDiverge();
+
+  /// @brief Analyze the given integer offset for properties that we need to
+  /// know in order to vectorize loads and stores. In particular we are
+  /// interested in knowing whether the offset can diverge (be different for
+  /// different items) or not. We can handle divergence in several cases but not
+  /// all.
+  ///
+  /// @param[in] Offset Offset value to analyze.
+  /// @param[in] SAR Result of the stride analysis.
+  ///
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &analyze(llvm::Value *Offset, StrideAnalysisResult &SAR);
+
+  /// @brief Analyze the given pointer for properties that we need to
+  /// know in order to vectorize loads and stores. In particular we are
+  /// interested in knowing whether the offset can diverge (be different for
+  /// different items) or not. We can handle divergence in several cases but not
+  /// all.
+  ///
+  /// @param[in] Address Pointer to analyze.
+  /// @param[in] SAR Result of the stride analysis.
+  ///
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &analyzePtr(llvm::Value *Address, StrideAnalysisResult &SAR);
+
+  /// @brief Combine the offset info of LHS and RHS operands of an add
+  /// operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineAdd(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestAdd(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of a sub operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineSub(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestSub(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of an and
+  /// operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineAnd(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestAnd(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of an or operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineOr(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestOr(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                         const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of an xor
+  /// operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineXor(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestXor(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of a shl operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineShl(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestShl(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of an ashr
+  /// operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineAShr(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestAShr(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                           const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of a mul operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineMul(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestMul(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Copies the stride information from another OffsetInfo into this
+  /// one.
+  /// @param[in] Other the other OffsetInfo to copy from
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &copyStrideFrom(const OffsetInfo &Other);
+
+  /// @brief Copies the stride and bitmask information from another OffsetInfo
+  /// into this one.
+  /// @param[in] Other the other OffsetInfo to copy from
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &copyStrideAndBitMaskFrom(const OffsetInfo &Other);
+};
+
+} // namespace vecz
+
+#endif // VECZ_OFFSET_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h
new file mode 100644
index 0000000000000..2506c79921928
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h
@@ -0,0 +1,116 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief A utility class to speed up reachability queries on a CFG
+
+#ifndef VECZ_REACHABILITY_H_INCLUDED
+#define VECZ_REACHABILITY_H_INCLUDED
+
+#include 
+#include 
+
+#include 
+
+namespace llvm {
+class BasicBlock;
+class DominatorTree;
+class Function;
+class LoopInfo;
+class PostDominatorTree;
+} // namespace llvm
+
+namespace vecz {
+
+/// @brief A data structure to handle reachability queries
+class Reachability {
+public:
+  /// @brief Construct the Reachability computation from a Dominator Tree
+  /// and a Post-Dominator Tree, which are used to speed up the queries.
+  /// @param[in] DT the Dominator Tree
+  /// @param[in] PDT the Post-Dominator Tree
+  /// @param[in] LI the Loop Info
+  Reachability(llvm::DominatorTree &DT, llvm::PostDominatorTree &PDT,
+               llvm::LoopInfo &LI);
+
+  /// @brief Destructor
+  ~Reachability() = default;
+
+  /// @brief Computes a new data structure from the given function's CFG,
+  /// overwriting any data that was already present.
+  ///
+  /// Back edges are disregarded during this process.
+  void recalculate(llvm::Function &F);
+
+  /// @brief Computes a new data structure from the given function's CFG,
+  /// only if the structure is currently empty. Otherwise, does nothing.
+  void update(llvm::Function &F);
+
+  /// @brief Clears the data structure.
+  ///
+  /// Updating the underlying CFG invalidates the Reachability computations,
+  /// so it is required to clear the data ready to accept a new CFG.
+  void clear();
+
+  /// @brief Checks the internal consistency of the computed data structure.
+  bool validate() const;
+
+  /// @brief Check if a block is reachable from another.
+  ///
+  /// @param[in] from the BasicBlock to start from
+  /// @param[in] to the BasicBlock we are trying to reach
+  ///
+  /// @return True if "to" is reachable from "from"
+  bool isReachable(llvm::BasicBlock *from, llvm::BasicBlock *to) const;
+
+private:
+  /// @brief Internal implementation of isReachable
+  ///
+  /// @param[in] from the graph node index to start from
+  /// @param[in] to the graph node index we are trying to reach
+  ///
+  /// @return True if "to" is reachable from "from"
+  bool isReachableImpl(size_t from, size_t to) const;
+
+  /// @brief The Dominator Tree
+  llvm::DominatorTree &DT;
+  /// @brief The Post-Dominator Tree
+  llvm::PostDominatorTree &PDT;
+  /// @brief The Loop Info, used to determine back-edges
+  llvm::LoopInfo &LI;
+
+  /// @brief Node structure containing implementation details
+  /// computed and used by the algorithm.
+  struct Rnode {
+    size_t X = 0;
+    size_t Y = 0;
+    size_t dom = 0;
+    size_t postDom = 0;
+    unsigned predTmp = 0;
+    unsigned predecessors = 0;
+    llvm::SmallVector<size_t> successors;
+  };
+
+  /// @brief The list of graph nodes that encode the graph.
+  std::vector<Rnode> graph;
+
+  /// @brief A mapping between BasicBlock pointers and graph node indices.
+  llvm::DenseMap<llvm::BasicBlock *, size_t> indexMap;
+};
+} // namespace vecz
+
+#endif // VECZ_REACHABILITY_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
new file mode 100644
index 0000000000000..40acd42336a0e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
@@ -0,0 +1,99 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief SIMD packets hold a value for each lane.
+
+#ifndef VECZ_SIMD_PACKET_H_INCLUDED
+#define VECZ_SIMD_PACKET_H_INCLUDED
+
+#include "debugging.h"
+
+namespace llvm {
+class Value;
+}
+
+namespace vecz {
+
+/// @brief Represents the status of lanes within a packet. The most common
+/// status would be that a lane can be either enabled or disabled.
+struct PacketMask {
+  /// @brief Create a new mask where all lanes are disabled.
+  explicit PacketMask() : Value(0) {}
+  /// @brief Create a new mask using an existing bit field.
+  explicit PacketMask(uint64_t Mask) : Value(Mask) {}
+
+  /// @brief Determine whether the given lane is enabled or not.
+  /// @param[in] Lane Index of the lane to test.
+  /// @return true if the lane is enabled, false otherwise.
+  bool isEnabled(unsigned Lane) const {
+    assert(Lane < CHAR_BIT * sizeof(Value) &&
+           "Invalid lane, possible mask overflow");
+    return (Value & (1ull << Lane)) != 0ull;
+  }
+
+  /// @brief Enable the given lane.
+  /// @param[in] Lane Index of the lane to enable.
+  void enable(unsigned Lane) {
+    assert(Lane < CHAR_BIT * sizeof(Value) &&
+           "Invalid lane, possible mask overflow");
+    Value |= (1ull << Lane);
+  }
+
+  /// @brief Disable the given lane.
+  /// @param[in] Lane Index of the lane to disable.
+  void disable(unsigned Lane) {
+    assert(Lane < CHAR_BIT * sizeof(Value) &&
+           "Invalid lane, possible mask overflow");
+    Value &= ~(1ull << Lane);
+  }
+  /// @brief Enable multiple lanes [0: NumLanes)
+  /// @param[in] NumLanes Number of lanes to enable.
+  void enableAll(unsigned NumLanes);
+
+  /// @brief Bit field that describes which lanes are enabled.
+  /// NOTE: The number of lanes is limited to sizeof(uint64_t) * CHAR_BIT (64).
+  uint64_t Value;
+};
+
+/// @brief Packet of LLVM values (e.g. instructions), one for each SIMD lane.
+struct SimdPacket : public llvm::SmallVector<llvm::Value *> {
+  using SmallVector::SmallVector;
+
+  /// @brief Return the value at the given index.
+  /// @param[in] Index Index of the value to return.
+  /// @return Value at the given index or null.
+  llvm::Value *at(unsigned Index) const;
+  /// @brief Set the value at the given index and enable the corresponding lane.
+  /// @param[in] Index Index of the value to set.
+  /// @param[in] V Value to store at the given index.
+  void set(unsigned Index, llvm::Value *V);
+  /// @brief Copy all enabled lanes from the other packet and update the mask.
+  /// @param[in] Other Packet to copy values from.
+  /// @return Reference to the current packet.
+  SimdPacket &update(const SimdPacket &Other);
+
+  /// @brief Bitmask of lanes that are 'enabled' in this packet.
+  /// This can mean different things depending on the context:
+  /// * By default, only lanes that are 'enabled' have a valid value.
+  /// * When scalarizing, only lanes that are 'enabled' will be scalarized.
+  PacketMask Mask;
+};
+
+} // namespace vecz
+
+#endif // VECZ_SIMD_PACKET_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h
new file mode 100644
index 0000000000000..15f848257d446
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h
@@ -0,0 +1,56 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Remove duplicate GEP instructions.
+
+#ifndef VECZ_TRANSFORM_COMMON_GEP_ELIMINATION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_COMMON_GEP_ELIMINATION_PASS_H_INCLUDED
+
+#include 
+#include 
+
+namespace vecz {
+
+class VectorizationUnit;
+
+/// @brief This pass removes every duplicate GEP instruction before the
+/// packetization pass.
+class CommonGEPEliminationPass
+    : public llvm::PassInfoMixin<CommonGEPEliminationPass> {
+public:
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Remove duplicate GEP instructions.
+  ///
+  /// @param[in] F Function to optimize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() { return "Common GEP Elimination pass"; }
+
+private:
+  /// @brief Identifier for the pass.
+  static char PassID;
+};
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_COMMON_GEP_ELIMINATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
new file mode 100644
index 0000000000000..9cffc83720217
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
@@ -0,0 +1,155 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Control flow partial linearization transform.
+
+#ifndef VECZ_TRANSFORM_CONTROL_FLOW_CONVERSION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_CONTROL_FLOW_CONVERSION_PASS_H_INCLUDED
+
+#include 
+#include 
+
+#include 
+
+namespace llvm {
+class BasicBlock;
+class Function;
+class Instruction;
+class Value;
+class DominatorTree;
+class PostDominatorTree;
+class PreservedAnalyses;
+class LoopInfo;
+} // namespace llvm
+
+namespace vecz {
+struct BasicBlockTag;
+struct LoopTag;
+struct UniformValueResult;
+class DivergenceResult;
+class VectorizationUnit;
+class VectorizationContext;
+class Reachability;
+
+/// \addtogroup cfg-conversion Control Flow Conversion Stage
+/// @{
+/// \ingroup vecz
+
+/// @brief Pass that performs control-flow to data-flow conversion for
+/// a function.
+class ControlFlowConversionPass
+    : public llvm::PassInfoMixin<ControlFlowConversionPass> {
+public:
+  /// @brief Unique identifier for the pass.
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Perform control-flow to data-flow conversion on the function's CFG.
+  ///
+  /// @param[in] F Function to convert.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() {
+    return "Control flow to data flow conversion";
+  }
+
+private:
+  /// @brief Unique identifier for the pass.
+  static char PassID;
+};
+
+class ControlFlowConversionState {
+public:
+  /// @brief The actual implementation of this pass
+  class Impl;
+
+protected:
+  ControlFlowConversionState(llvm::Function &,
+                             llvm::FunctionAnalysisManager &AM);
+
+  /// @brief BOSCC (Branch On Superword Condition Code) data structure that
+  /// encloses regions of the CFG that contain blocks that need to be
+  /// duplicated.
+  class BOSCCGadget;
+
+  /// @brief ROSCC (Return On Superword Condition Code) utility class to
+  /// optimize conditional function return branches.
+  class ROSCCGadget;
+
+  llvm::Function &F;
+  llvm::FunctionAnalysisManager &AM;
+  VectorizationUnit &VU;
+  VectorizationContext &Ctx;
+  llvm::DominatorTree *DT = nullptr;
+  llvm::PostDominatorTree *PDT = nullptr;
+  llvm::LoopInfo *LI = nullptr;
+  DivergenceResult *DR = nullptr;
+  UniformValueResult *UVR = nullptr;
+  std::unique_ptr<BOSCCGadget> BOSCC;
+  std::unique_ptr<Reachability> RC;
+
+private:
+  struct MaskInfo {
+    /// @brief Mask that describes which lanes have exited the block.
+    llvm::SmallDenseMap exitMasks;
+    /// @brief Mask that describes which lanes are active at the start of the
+    /// basic block.
+    llvm::Instruction *entryMask = nullptr;
+  };
+  llvm::DenseMap<llvm::BasicBlock *, MaskInfo> MaskInfos;
+
+  /// @brief get the Mask Info struct for a Basic Block.
+  /// Note that the returned reference may be invalidated by subsequent calls.
+  ///
+  /// @param[in] BB the BasicBlock
+  /// @returns a reference to the MaskInfo
+  const MaskInfo &getMaskInfo(llvm::BasicBlock *BB) const {
+    const auto found = MaskInfos.find(BB);
+    assert(found != MaskInfos.end() &&
+           "Mask Info not constructed for Basic Block!");
+    return found->second;
+  }
+
+  /// @brief replaces reachable uses of a value
+  ///
+  /// @param[in] RC the reachability computation to use
+  /// @param[in] from the value to replace
+  /// @param[in] to the value to substitute
+  /// @param[in] src the basic block from which the value must be reachable
+  ///
+  /// @returns true
+  static bool replaceReachableUses(Reachability &RC, llvm::Instruction *from,
+                                   llvm::Value *to, llvm::BasicBlock *src);
+
+  /// @brief Generate a block ordering.
+  ///
+  /// This is based on a dominance-compact block indexing (DCBI) where we
+  /// topologically order blocks that belong to the same dominator tree.
+  ///
+  /// @returns true if no errors occurred.
+  bool computeBlockOrdering();
+};
+
+/// @}
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_CONTROL_FLOW_CONVERSION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h
new file mode 100644
index 0000000000000..bcd63aa00cac6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h
@@ -0,0 +1,49 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Replace calls to certain builtins with an inline implementation after
+/// vectorization.
+
+#ifndef VECZ_TRANSFORM_INLINE_POST_VECTORIZATION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_INLINE_POST_VECTORIZATION_PASS_H_INCLUDED
+
+#include 
+
+namespace vecz {
+
+/// @brief This pass replaces calls to builtins that require special attention
+/// after vectorization.
+class InlinePostVectorizationPass
+    : public llvm::PassInfoMixin<InlinePostVectorizationPass> {
+public:
+  /// @brief Create a new pass object.
+  InlinePostVectorizationPass() {}
+
+  /// @brief The entry point to the pass.
+  /// @param[in,out] F Function to optimize.
+  /// @param[in,out] AM FunctionAnalysisManager providing analyses.
+  /// @returns The analyses preserved by the pass.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+  /// @brief Retrieve the pass's name.
+  /// @return The pass name.
+  static llvm::StringRef name() { return "Inline Post Vectorization pass"; }
+};
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_INLINE_POST_VECTORIZATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h
new file mode 100644
index 0000000000000..ce9140ad64586
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h
@@ -0,0 +1,113 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function instantiator.
+
+#ifndef VECZ_TRANSFORM_INSTANTIATION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_INSTANTIATION_PASS_H_INCLUDED
+
+#include 
+#include 
+
+namespace vecz {
+
+class Packetizer;
+class VectorizationContext;
+class PacketRange;
+struct MemOp;
+
+/// @brief Instantiation pass where instructions that need it (vector or not)
+/// are instantiated (i.e. duplicated with lane ID substitution), starting from
+/// the leaves.
+class InstantiationPass {
+public:
+  /// @brief Create a new instantiation pass.
+  ///
+  /// @param[in] PP The packetizer object to call back to when required.
+  InstantiationPass(Packetizer &PP);
+
+  /// @brief Instantiate the given value from the function.
+  /// The returned value is equivalent to a clone of the V 'expression' with any
+  /// work-item ID (e.g. from get_global_id) adjusted with the lane's ID.
+  ///
+  /// @param[in] V Value to instantiate.
+  ///
+  /// @return Instantiated value.
+  PacketRange instantiate(llvm::Value *V);
+
+private:
+  /// @brief Duplicates an instruction across all SIMD Lanes.
+  ///
+  /// @param[in] I The instruction to duplicate across lanes
+  ///
+  /// @return The SIMD Packet
+  PacketRange instantiateByCloning(llvm::Instruction *I);
+  /// @brief Broadcasts an instruction across all SIMD Lanes.
+  ///
+  /// @param[in] I The instruction to extract elements from
+  ///
+  /// @return The SIMD Packet
+  PacketRange simdBroadcast(llvm::Instruction *I);
+  /// @brief Instantiate the given value from the function.
+  /// The returned value is equivalent to a clone of the V 'expression' with any
+  /// work-item ID (e.g. from get_global_id) adjusted with the lane's ID.
+  ///
+  /// @param[in] V Value to instantiate.
+  ///
+  /// @return Instantiated value.
+  PacketRange instantiateInternal(llvm::Value *V);
+  /// @brief Instantiate the given instruction from the function.
+  /// The returned value is equivalent to a clone of the V 'expression' with any
+  /// work-item ID (e.g. from get_global_id) adjusted with the lane's ID.
+  ///
+  /// @param[in] Ins instruction to instantiate.
+  ///
+  /// @return Instantiated value.
+  PacketRange instantiateInstruction(llvm::Instruction *Ins);
+  /// @brief Perform post-instantiation tasks.
+  ///
+  /// @param[in] P Packet that is the result of instantiation or null.
+  /// @param[in] V Value that was instantiated.
+  ///
+  /// @return Instantiated packet or null.
+  PacketRange assignInstance(const PacketRange P, llvm::Value *V);
+  /// @brief Create a packet where all lanes contain the same value.
+  ///
+  /// @param[in] V Value to broadcast.
+  ///
+  /// @return Packet with the broadcasted value.
+  PacketRange broadcast(llvm::Value *V);
+  /// @brief Instantiate a call instruction.
+  ///
+  /// @param[in] CI Instruction to instantiate.
+  ///
+  /// @return Instantiated packet for the given instruction.
+  PacketRange instantiateCall(llvm::CallInst *CI);
+  /// @brief Instantiate an alloca instruction.
+  ///
+  /// @param[in] Alloca Instruction to instantiate.
+  ///
+  /// @return Instantiated packet for the given instruction.
+  PacketRange instantiateAlloca(llvm::AllocaInst *Alloca);
+
+  VectorizationContext &Ctx;
+  Packetizer &packetizer;
+};
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_INSTANTIATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h
new file mode 100644
index 0000000000000..ae6deb613826c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h
@@ -0,0 +1,94 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Combine groups of interleaved memory operations.
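The access pattern this pass targets can be modelled in plain scalar code. In
the illustrative sketch below (not from the sources), four stride-4 loads with
constant offsets 0..3 form a group that collectively reads a contiguous block,
so the pass can replace them with one wide load plus element shuffles:

```cpp
#include <cstddef>

struct RGBA {
  float r, g, b, a;
};

// Four interleaved (stride-4) loads forming a combinable group: together
// they read the contiguous range p[4*i] .. p[4*i + 3].
void deinterleave(const float *p, std::size_t i, RGBA &out) {
  out.r = p[4 * i + 0];  // stride 4, offset 0
  out.g = p[4 * i + 1];  // stride 4, offset 1
  out.b = p[4 * i + 2];  // stride 4, offset 2
  out.a = p[4 * i + 3];  // stride 4, offset 3
}
```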
+
+#ifndef VECZ_TRANSFORM_INTERLEAVED_GROUP_COMBINE_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_INTERLEAVED_GROUP_COMBINE_PASS_H_INCLUDED
+
+#include 
+
+#include "analysis/uniform_value_analysis.h"
+#include "vecz/vecz_target_info.h"
+
+namespace llvm {
+class ScalarEvolution;
+}
+
+namespace vecz {
+
+class VectorizationUnit;
+
+/// @brief Combine groups of interleaved memory operations.
+class InterleavedGroupCombinePass
+    : public llvm::PassInfoMixin<InterleavedGroupCombinePass> {
+public:
+  /// @brief Create a new pass object.
+  ///
+  /// @param[in] kind Kind of interleaved operation to combine.
+  InterleavedGroupCombinePass(InterleavedOperation kind)
+      : Kind(kind), scalarEvolution(nullptr) {}
+
+  /// @brief Unique identifier for the pass.
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Combine groups of interleaved operations.
+  ///
+  /// @param[in] F Function to analyze.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() {
+    return "Combine interleaved memory instructions";
+  }
+
+private:
+  /// @brief Information about an interleaved operation.
+  struct InterleavedOpInfo;
+
+  /// @brief Information about a group of interleaved operations.
+  struct InterleavedGroupInfo;
+
+  /// @brief Try to find a group of interleaved instructions that have the same
+  /// stride and collectively access a consecutive chunk of memory.
+  ///
+  /// @param[in] Ops List of interleaved operations to analyze.
+  /// @param[in] UVR Result of uniform value analysis.
+  /// @param[out] Info information about a group of interleaved instructions.
+  ///
+  /// @return true if a group was found or false otherwise.
+  bool findGroup(const std::vector<InterleavedOpInfo> &Ops,
+                 UniformValueResult &UVR, InterleavedGroupInfo &Info);
+
+  /// @brief Unique identifier for the pass.
+  static char PassID;
+  /// @brief Kind of interleaved operation to combine.
+  InterleavedOperation Kind;
+
+  /// @brief Scalar Evolution Analysis that allows us to subtract two pointers
+  /// to find any constant offset between them.
+  llvm::ScalarEvolution *scalarEvolution;
+};
+
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_INTERLEAVED_GROUP_COMBINE_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
new file mode 100644
index 0000000000000..1c9cfe79dac53
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
@@ -0,0 +1,261 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function packetizer helper classes.
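One of the helpers declared below, createSubSplats, produces the
"AAAABBBBCCCCDDDD" expansion documented on it. A minimal sketch of the same
effect for a single fixed-width vector, written with a plain shufflevector
(the standalone function is an assumption for illustration; the real helper
also applies target-specific optimizations and operates on whole packets):

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"

// Sub-splat a <4 x T> value "ABCD" into a <16 x T> "AAAABBBBCCCCDDDD" by
// building the shuffle mask {0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3}.
llvm::Value *subSplat4x4(llvm::IRBuilder<> &B, llvm::Value *src) {
  llvm::SmallVector<int, 16> mask;
  for (int elt = 0; elt < 4; ++elt)
    for (int rep = 0; rep < 4; ++rep)
      mask.push_back(elt);  // each element repeated subWidth (4) times
  return B.CreateShuffleVector(src, mask, "subsplat");
}
```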
+
+#ifndef VECZ_TRANSFORM_PACKETIZATION_HELPERS_H_INCLUDED
+#define VECZ_TRANSFORM_PACKETIZATION_HELPERS_H_INCLUDED
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include "debugging.h"
+
+namespace llvm {
+class Value;
+class ShuffleVectorInst;
+class Twine;
+} // namespace llvm
+
+namespace vecz {
+class TargetInfo;
+struct SimdPacket;
+
+/// @brief Determines the insertion point after the value V. If V has a position
+/// in the function (e.g., an Instruction), this method will return an
+/// IRBuilder set to the next point after that. If V has no position (e.g., a
+/// Constant or an Argument) then this method will return an IRBuilder set to a
+/// suitable insertion point at the beginning of the function.
+///
+/// @param[in] V Value to insert instructions after, if an llvm::Instruction.
+/// @param[in] F Function to insert instructions into, if V is not an
+/// llvm::Instruction.
+/// @param[in] IsPhi true if the instructions to insert are phis, false if the
+/// insertion point should be after all phis in the basic block.
+///
+/// @return IRBuilder set to a suitable insertion point.
+llvm::IRBuilder<> buildAfter(llvm::Value *V, llvm::Function &F,
+                             bool IsPhi = false);
+
+/// @brief Utility function for building a shufflevector instruction, absorbing
+/// its operands where possible.
+///
+/// @param[in] B IRBuilder to build any new instruction created
+/// @param[in] srcA the first vector operand of the new shuffle
+/// @param[in] srcB the second vector operand of the new shuffle
+/// @param[in] mask the shuffle mask
+/// @param[in] name the name of the new instruction
+///
+/// @return a value identical to the requested shufflevector
+llvm::Value *createOptimalShuffle(llvm::IRBuilder<> &B, llvm::Value *srcA,
+                                  llvm::Value *srcB,
+                                  const llvm::SmallVectorImpl<int> &mask,
+                                  const llvm::Twine &name = llvm::Twine());
+
+/// @brief Utility function for splatting a vector of scalars to create a
+/// "vector of vectors", being the concatenation of vector splats of its
+/// elements, e.g. subSplat("ABCD", 4) == "AAAABBBBCCCCDDDD"
+///
+/// Only works on fixed vector types.
+///
+/// @param[in] TI TargetInfo for target-dependent optimizations
+/// @param[in] B IRBuilder to build any new instructions created
+/// @param[in,out] srcs The packet of vectors to sub-splat
+/// @param[in] subWidth The width of the individual splats
+///
+/// @return true on success
+bool createSubSplats(const vecz::TargetInfo &TI, llvm::IRBuilder<> &B,
+                     llvm::SmallVectorImpl<llvm::Value *> &srcs,
+                     unsigned subWidth);
+
+/// @brief Utility function for creating a reduction operation.
+///
+/// The value must be a vector.
+///
+/// If VL is passed and is non-null, it is assumed to be the i32 value
+/// representing the active vector length. The reduction will be
+/// vector-predicated according to this length.
+///
+/// Only works on RecurKind::And, Or, Xor, Add, Mul, FAdd, FMul, {S,U,F}Min,
+/// {S,U,F}Max.
+llvm::Value *createMaybeVPReduction(llvm::IRBuilderBase &B, llvm::Value *Val,
+                                    llvm::RecurKind Kind,
+                                    llvm::Value *VL = nullptr);
+
+/// @brief Utility function to obtain an indices vector to be used in a gather
+/// operation.
+///
+/// When accessing a vector using an indices vector, this must be
+/// modified taking into account the SIMD width.
+///
+/// @return An indices vector to be used in a gather operation; nullptr for LLVM
+/// version < 13.
+///
+/// @param[in] B IRBuilder to build any new instructions created
+/// @param[in] Indices Original indices vector
+/// @param[in] Ty Type of the output vector
+/// @param[in] FixedVecElts Original vector length
+/// @param[in] N Name of the output variable
+llvm::Value *getGatherIndicesVector(llvm::IRBuilder<> &B, llvm::Value *Indices,
+                                    llvm::Type *Ty, unsigned FixedVecElts,
+                                    const llvm::Twine &N = "");
+
+/// @brief Returns a boolean vector with all elements set to 'true'.
+llvm::Value *createAllTrueMask(llvm::IRBuilderBase &B, llvm::ElementCount EC);
+
+/// @brief Returns an integer step vector, representing the sequence 0 ... N-1.
+llvm::Value *createIndexSequence(llvm::IRBuilder<> &Builder,
+                                 llvm::VectorType *VecTy,
+                                 const llvm::Twine &Name = "");
+
+/// @brief Class that represents a range in a vector of Value pointers.
+/// The range is represented by its integer starting index and length, so that
+/// it remains valid if the vector re-allocates its storage.
+class PacketRange {
+public:
+  using value_type = llvm::Value *;
+  using iterator = value_type *;
+  using const_iterator = const value_type *;
+  using reference = value_type &;
+  using const_reference = const value_type &;
+
+  /// @brief Construct an empty range
+  constexpr PacketRange(std::vector<llvm::Value *> &d)
+      : data(d), start(0), length(0) {}
+  /// @brief Construct a range with given start index and length
+  constexpr PacketRange(std::vector<llvm::Value *> &d, size_t s, size_t l)
+      : data(d), start(s), length(l) {}
+
+  /// @brief Copy constructor
+  constexpr PacketRange(const PacketRange &) = default;
+  /// @brief Move constructor
+  constexpr PacketRange(PacketRange &&) = default;
+  /// @brief Destructor
+  ~PacketRange() = default;
+
+  /// @brief Return the length of the range
+  size_t size() const { return length; }
+  /// @brief Standard container begin iterator
+  iterator begin() { return &*data.begin() + start; }
+  /// @brief Standard container begin const iterator
+  const_iterator begin() const { return &*data.begin() + start; }
+  /// @brief Standard container end iterator
+  iterator end() { return begin() + length; }
+  /// @brief Standard container end const iterator
+  const_iterator end() const { return begin() + length; }
+  /// @brief Return a reference to the element at given index
+  reference at(size_t i) { return data[start + i]; }
+  /// @brief Return a const reference to the element at given index
+  const_reference at(size_t i) const { return data[start + i]; }
+  /// @brief Return a reference to the element at given index
+  reference operator[](size_t i) { return at(i); }
+  /// @brief Return a const reference to the element at given index
+  const_reference operator[](size_t i) const { return at(i); }
+  /// @brief Return a reference to the first element in the range
+  reference front() { return data[start]; }
+  /// @brief Return a const reference to the first element in the range
+  const_reference front() const { return data[start]; }
+  /// @brief Return a reference to the last element in the range
+  reference back() { return data[start + length - 1]; }
+  /// @brief Return a const reference to the last element in the range
+  const_reference back() const { return data[start + length - 1]; }
+
+  /// @brief Convert to bool
+  /// @returns false if length is zero, true otherwise
+  operator bool() const { return length != 0; }
+
+private:
+  std::vector<llvm::Value *> &data;
+  const size_t start;
+  const size_t length;
+};
+
+/// @brief Structure to hold the strategy-agnostic result of packetizing an
+/// instruction (i.e. can represent either a vectorized or an instantiated
can represent either a vectorized or an instantiated +/// value) that enables the result to be converted on demand. +struct PacketInfo { + /// @brief The number of instances created during packetization + unsigned numInstances = 0; + + /// @brief Vectorized value. Each element in the vector represents a scalar + /// instance (SIMD lane). + llvm::Value *vector = nullptr; + + /// @brief Map of vector widths to packet range start indices + llvm::SmallDenseMap packets; + + /// @brief Default constructor + PacketInfo() = default; + /// @brief Deleted copy constructor + PacketInfo(const PacketInfo &) = delete; + /// @brief Move constructor + PacketInfo(PacketInfo &&) = default; + /// @brief Destructor + ~PacketInfo() = default; + /// @brief Deleted copy assignment operator + PacketInfo &operator=(const PacketInfo &) = delete; + /// @brief Move assignment operator + PacketInfo &operator=(PacketInfo &&) = default; + + /// @brief get the range of values for a given packet width + PacketRange getRange(std::vector &d, unsigned width) const; + + /// @brief get the range of values for the originally created packet. + PacketRange getRange(std::vector &d) const { + return getRange(d, numInstances); + } +}; + +inline llvm::Type *getWideType(llvm::Type *ty, llvm::ElementCount factor) { + if (!ty->isVectorTy()) { + // The wide type of a struct literal is the wide type of each of its + // elements. + if (auto *structTy = llvm::dyn_cast(ty); + structTy && structTy->isLiteral()) { + llvm::SmallVector wideElts(structTy->elements()); + for (unsigned i = 0, e = wideElts.size(); i != e; i++) { + wideElts[i] = getWideType(wideElts[i], factor); + } + return llvm::StructType::get(ty->getContext(), wideElts); + } else if (structTy) { + VECZ_ERROR("Can't create wide type for structure type"); + } + return llvm::VectorType::get(ty, factor); + } + const bool isScalable = llvm::isa(ty); + assert((!factor.isScalable() || !isScalable) && + "Can't widen a scalable vector by a scalable amount"); + auto *vecTy = llvm::cast(ty); + const unsigned elts = vecTy->getElementCount().getKnownMinValue(); + // If we're widening a scalable type then set the fixed factor to scalable + // here. + if (isScalable && !factor.isScalable()) { + factor = llvm::ElementCount::getScalable(factor.getKnownMinValue()); + } + ty = vecTy->getElementType(); + return llvm::VectorType::get(ty, factor * elts); +} +} // namespace vecz + +#endif // VECZ_TRANSFORM_PACKETIZATION_HELPERS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h new file mode 100644 index 0000000000000..fb5b49bc106ba --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h @@ -0,0 +1,77 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function packetizer.
+
+#ifndef VECZ_TRANSFORM_PACKETIZATION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_PACKETIZATION_PASS_H_INCLUDED
+
+#include
+
+namespace vecz {
+
+class VectorizationUnit;
+
+/// \addtogroup packetization Packetization Stage
+/// @{
+/// \ingroup vecz
+
+/// @brief Vectorization pass where scalar instructions that need it are
+/// packetized, starting from leaves.
+class PacketizationPass : public llvm::PassInfoMixin<PacketizationPass> {
+public:
+  /// @brief Create a new packetization pass object.
+  PacketizationPass() = default;
+
+  /// @brief Move-construct a packetization pass object.
+  ///
+  /// @param[in] P Pass to move.
+  PacketizationPass(PacketizationPass &&P) = default;
+
+  // Mark the copy constructor as deleted.
+  PacketizationPass(const PacketizationPass &) = delete;
+
+  /// @brief Deleted move assignment operator.
+  ///
+  /// Also deletes the copy assignment operator.
+  PacketizationPass &operator=(PacketizationPass &&) = delete;
+
+  /// @brief Unique identifier for the pass.
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Packetize the given function, duplicating its behaviour (defined
+  /// values and side effects) for each lane of a SIMD packet.
+  ///
+  /// @param[in] F Function to packetize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() { return "Function packetization"; }
+
+  /// @brief Unique identifier for the pass.
+  static char PassID;
+};
+
+/// @}
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_PACKETIZATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h
new file mode 100644
index 0000000000000..4e9ff96a07e56
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h
@@ -0,0 +1,234 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function packetizer.
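+///
+/// An illustrative usage sketch (not part of the original header): the static
+/// entry point Packetizer::packetize declared below might be driven like
+/// this, assuming a populated llvm::FunctionAnalysisManager `FAM`:
+/// @code{.cpp}
+/// // Packetize F by a fixed factor of 4 along dimension x (0).
+/// bool Changed = vecz::Packetizer::packetize(
+///     F, FAM, llvm::ElementCount::getFixed(4), /*Dim=*/0);
+/// @endcode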
+ +#ifndef VECZ_TRANSFORM_PACKETIZER_H_INCLUDED +#define VECZ_TRANSFORM_PACKETIZER_H_INCLUDED + +#include +#include +#include +#include +#include +#include + +#include + +#include "ir_cleanup.h" +#include "transform/packetization_helpers.h" + +namespace vecz { + +struct MemOp; +class InstantiationPass; +class PacketizationAnalysisResult; +class StrideAnalysisResult; +struct UniformValueResult; +class VectorizationUnit; +class VectorizationContext; +class VectorizationChoices; + +/// \addtogroup packetization Packetization Stage +/// @{ +/// \ingroup vecz + +/// @brief The implementation of the packetization process +class Packetizer { +public: + class Result { + friend class Packetizer; + + public: + Result() = delete; + Result(const Result &) = default; + constexpr Result(Result &&) = default; + + Result(Packetizer &p) : packetizer(p), scalar(nullptr), info(nullptr) {} + Result(Packetizer &p, llvm::Value *s, PacketInfo *i) + : packetizer(p), scalar(s), info(i) {} + + operator bool() const { return info; } + + /// @brief Get a packetized/instantiated instruction as a vector value. + /// If the value was instantiated, this will construct and return a gather + /// of the SIMD lanes. + /// + /// @return Packetized value + llvm::Value *getAsValue() const; + + /// @brief Get a packetized/instantiated instruction as a SIMD packet. + /// If the value was packetized, this will construct a new packet by + /// extracting the elements. + /// + /// @param[in] width the width of the packet to get. + /// + /// @return Instantiated packet + PacketRange getAsPacket(unsigned width) const; + + /// @brief Get a copy of all the Values from the vector or packet, as + /// the width it was originally packetized to. + /// + /// @param[out] vals a vector of Values representing the result. + void getPacketValues(llvm::SmallVectorImpl &vals) const; + + /// @brief Get a copy of all the Values from the vector or packet. + /// When `width == 1` this will return a length-1 result containing the + /// vector valued result. Otherwise, it copies the values from the + /// packet of the requested width. + /// + /// @param[in] width the width of the packet to get. + /// @param[out] vals a vector of Values representing the result. + void getPacketValues(unsigned width, + llvm::SmallVectorImpl &vals) const; + + private: + Packetizer &packetizer; + llvm::Value *const scalar; + PacketInfo *const info; + + PacketRange createPacket(unsigned width) const; + PacketRange getRange(unsigned width) const; + PacketRange widen(unsigned width) const; + PacketRange narrow(unsigned width) const; + const Result &broadcast(unsigned width) const; + }; + + /// @brief Packetize the given function, duplicating its behaviour (defined + /// values and side effects) for each lane of a SIMD packet. + /// + /// @param[in] F Function to packetize. + /// @param[in] AM FunctionAnalysisManager providing analyses. + /// @param[in] Width the vectorization factor + /// @param[in] Dim the vectorization dimension + /// + /// @return true if the function was packetized, false otherwise. + static bool packetize(llvm::Function &F, llvm::FunctionAnalysisManager &AM, + llvm::ElementCount Width, unsigned Dim); + + /// @brief Packetize the given value from the function. + /// + /// @param[in] V Value to packetize. + /// + /// @return Packetized value. + Result packetize(llvm::Value *V); + + /// @brief Return an already packetized value. + /// + /// @param[in] V Value to query. + /// + /// @return Packetized value or nullptr. 
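+///
+/// A hypothetical caller-side sketch (illustrative, not from the original
+/// source; `P` is an assumed Packetizer instance), using the Result
+/// conversion helpers declared above:
+/// @code{.cpp}
+/// if (Packetizer::Result R = P.getPacketized(V)) {
+///   llvm::Value *Wide = R.getAsValue();  // vector form of the scalar V
+/// }
+/// @endcode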
+  Result getPacketized(llvm::Value *V);
+
+  /// @brief Create a new SIMD packet to hold an instantiated value.
+  ///
+  /// @param[in] V the value the packet will represent
+  /// @param[in] width the SIMD width of the packet
+  ///
+  /// @returns a new packet
+  PacketRange createPacket(llvm::Value *V, unsigned width);
+
+  /// @brief Get the Uniform Value Result
+  ///
+  /// @return the Uniform Value Result
+  const UniformValueResult &uniform() const { return UVR; }
+
+  /// @brief get the vectorization factor.
+  llvm::ElementCount width() const { return SimdWidth; }
+
+  /// @brief get the vectorization dimension.
+  unsigned dimension() const { return Dimension; }
+
+  /// @brief get the function being packetized
+  llvm::Function &function() { return F; }
+
+  /// @brief get the Vectorization Context
+  VectorizationContext &context() { return Ctx; }
+
+  /// @brief get the Vectorization Choices
+  const VectorizationChoices &choices() const { return Choices; }
+
+  /// @brief get an empty packet range
+  PacketRange getEmptyRange() { return PacketRange(packetData); }
+
+  /// @brief mark the instruction for deletion when packetization finishes
+  void deleteInstructionLater(llvm::Instruction *I) {
+    IC.deleteInstructionLater(I);
+  }
+
+private:
+  Packetizer(llvm::Function &, llvm::FunctionAnalysisManager &AM,
+             llvm::ElementCount Width, unsigned Dim);
+  Packetizer() = delete;
+  Packetizer(const Packetizer &) = delete;
+  Packetizer(Packetizer &&) = delete;
+  ~Packetizer() = default;
+
+  llvm::FunctionAnalysisManager &AM;
+  VectorizationUnit &VU;
+  VectorizationContext &Ctx;
+  const VectorizationChoices &Choices;
+  UniformValueResult &UVR;
+  StrideAnalysisResult &SAR;
+  PacketizationAnalysisResult &PAR;
+  llvm::Function &F;
+  IRCleanup IC;
+
+  /// @brief Vectorization factor
+  llvm::ElementCount SimdWidth;
+
+  /// @brief Vectorization dimension
+  unsigned Dimension;
+
+  /// @brief Map onto packetized versions of scalar values
+  llvm::DenseMap<llvm::Value *, PacketInfo> packets;
+
+  /// @brief Central storage for all the packetized values
+  ///
+  /// This vector is a contiguous storage for all the wide packets created
+  /// during the packetization process. New packets get allocated to a
+  /// range at the end of the vector, and are referenced by index so that
+  /// they are not invalidated when the storage is re-allocated. Vector
+  /// elements will never be erased during packetization, and the data will
+  /// not be cleared until the packetizer itself is destroyed.
+  /*
+               /^ ^\
+    "No take" / 0 0 \
+              V\ Y /V  */
+  std::vector<llvm::Value *> packetData;
+  /*             |  \
+                || (__V  "ONLY GROW"
+  */
+
+  /// @brief The value representing the current (dynamic) active vector length
+  /// for this kernel. This value is the *base* vector length for one scalar
+  /// work-item; vector operations must be scaled according to their vector
+  /// width.
+  /// If non-null, packetized operations are required to respect this active
+  /// length if they would produce side effects.
+  llvm::Value *VL = nullptr;
+
+  /// @brief This class contains the private implementation of the packetizer.
+  /// Declaring it as an inner class of the Packetizer class allows it access
+  /// to its private members (including its constructor).
+  class Impl;
+};
+
+/// @}
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_PACKETIZER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
new file mode 100644
index 0000000000000..bbc9cd6428a2c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
@@ -0,0 +1,209 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Factory functions for some Vecz support passes
+
+#ifndef VECZ_TRANSFORM_PASSES_H_INCLUDED
+#define VECZ_TRANSFORM_PASSES_H_INCLUDED
+
+#include
+#include
+
+namespace compiler {
+namespace utils {
+class BuiltinInfo;
+} // namespace utils
+} // namespace compiler
+
+namespace vecz {
+class SimplifyInfiniteLoopPass
+    : public llvm::PassInfoMixin<SimplifyInfiniteLoopPass> {
+public:
+  SimplifyInfiniteLoopPass() = default;
+
+  llvm::PreservedAnalyses run(llvm::Loop &L, llvm::LoopAnalysisManager &,
+                              llvm::LoopStandardAnalysisResults &,
+                              llvm::LPMUpdater &);
+};
+
+/// @brief This pass replaces calls to builtins that require special attention
+/// (e.g. there is no scalar or vector equivalent) with inline implementations.
+class BuiltinInliningPass : public llvm::PassInfoMixin<BuiltinInliningPass> {
+public:
+  /// @brief Create a new pass object.
+  BuiltinInliningPass() = default;
+
+  /// @brief The entry point to the pass.
+  /// @param[in,out] M Module to optimize.
+  /// @param[in,out] AM ModuleAnalysisManager providing analyses.
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
+
+  /// @brief Retrieve the pass's name.
+  /// @return pointer to text description.
+  static llvm::StringRef name() { return "OpenCL builtin inlining pass"; }
+
+private:
+  /// @brief Process a call site, inlining it or marking it as needing inlining
+  /// if required.
+  ///
+  /// @param[in] CI Call site to inspect.
+  /// @param[out] NeedLLVMInline Whether the call site needs LLVM inlining.
+  ///
+  /// @return New return value for the call instruction.
+  llvm::Value *processCallSite(llvm::CallInst *CI, bool &NeedLLVMInline);
+};
+
+/// @brief This pass tries to remove unnecessary allocas that are not optimized
+/// away by LLVM's Mem2Reg pass, for example in the presence of bitcasts. It
+/// is, however, much simpler than LLVM's.
+class BasicMem2RegPass : public llvm::PassInfoMixin<BasicMem2RegPass> {
+public:
+  BasicMem2RegPass() = default;
+
+  /// @brief The entry point to the pass.
+  /// @param[in,out] F Function to optimize.
+  /// @param[in,out] AM FunctionAnalysisManager providing analyses.
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+  /// @brief Retrieve the pass's name.
+  /// @return pointer to text description.
+  static llvm::StringRef name() { return "Basic Mem2Reg Pass"; }
+
+private:
+  /// @brief Determine whether the alloca can be promoted or not.
+  ///
+  /// This is the case when it is inside the entry block, there is at most one
+  /// store to it and all other users are loads (possibly through bitcasts).
+  /// The store must also be in the entry block and precede all loads.
+  ///
+  /// @param[in] Alloca Alloca instruction to analyze.
+  /// @return true if the alloca can be promoted, false otherwise.
+  bool canPromoteAlloca(llvm::AllocaInst *Alloca) const;
+  /// @brief Try to promote the alloca, removing store users and replacing
+  /// load users with the stored values. The alloca itself isn't touched.
+  /// @param[in] Alloca Alloca instruction to promote.
+  /// @return true if the alloca was promoted, false otherwise.
+  bool promoteAlloca(llvm::AllocaInst *Alloca) const;
+};
+
+class PreLinearizePass : public llvm::PassInfoMixin<PreLinearizePass> {
+public:
+  PreLinearizePass() = default;
+
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  static llvm::StringRef name() { return "Prepare for SPMD linearization"; }
+};
+
+/// @brief Wraps LLVM's LoopRotatePass but restricts the range of loops on
+/// which it works.
+class VeczLoopRotatePass : public llvm::PassInfoMixin<VeczLoopRotatePass> {
+public:
+  VeczLoopRotatePass() = default;
+
+  llvm::PreservedAnalyses run(llvm::Loop &L, llvm::LoopAnalysisManager &,
+                              llvm::LoopStandardAnalysisResults &,
+                              llvm::LPMUpdater &);
+
+  static llvm::StringRef name() { return "Vecz Loop Rotation Wrapper"; }
+};
+
+class RemoveIntPtrPass : public llvm::PassInfoMixin<RemoveIntPtrPass> {
+public:
+  RemoveIntPtrPass() = default;
+
+  static llvm::StringRef name() { return "Remove IntPtr instructions"; }
+
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &);
+};
+
+class SquashSmallVectorsPass
+    : public llvm::PassInfoMixin<SquashSmallVectorsPass> {
+public:
+  SquashSmallVectorsPass() = default;
+
+  static llvm::StringRef name() { return "Squash Small Vectors"; }
+
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &);
+};
+
+/// @brief Try to replace or remove masked memory operations that are trivially
+/// not needed or can be converted to non-masked operations.
+class SimplifyMaskedMemOpsPass
+    : public llvm::PassInfoMixin<SimplifyMaskedMemOpsPass> {
+public:
+  /// @brief Create a new pass object.
+  SimplifyMaskedMemOpsPass() = default;
+
+  /// @brief Replace masked memory operations that use 'all true' masks by
+  /// regular memory operations, and remove masked operations that use 'all
+  /// false' masks.
+  ///
+  /// @param[in] F Function to optimize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() { return "Simplify masked memory operations"; }
+};
+
+/// @brief reassociate uniform binary operators and split branches
+class UniformReassociationPass
+    : public llvm::PassInfoMixin<UniformReassociationPass> {
+public:
+  UniformReassociationPass() = default;
+
+  static llvm::StringRef name() { return "Reassociate uniform binops"; }
+
+  llvm::PreservedAnalyses run(llvm::Function &,
+                              llvm::FunctionAnalysisManager &);
+};
+
+/// @brief Removes uniform divergence reductions created by CFG conversion
+class DivergenceCleanupPass
+    : public llvm::PassInfoMixin<DivergenceCleanupPass> {
+public:
+  /// @brief Create a new pass object.
+  DivergenceCleanupPass() = default;
+
+  /// @brief Remove uniform divergence reductions.
+  ///
+  /// @param[in] F Function to optimize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() {
+    return "Remove uniform divergence reductions";
+  }
+};
+
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_PASSES_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h
new file mode 100644
index 0000000000000..2d4885059b3db
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h
@@ -0,0 +1,117 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+
+#ifndef VECZ_TRANSFORM_PRINTF_SCALARIZER_H_INCLUDED
+#define VECZ_TRANSFORM_PRINTF_SCALARIZER_H_INCLUDED
+
+#include
+
+namespace llvm {
+class Module;
+class User;
+class Instruction;
+template <typename T, unsigned N> class SmallVector;
+class GlobalVariable;
+class Value;
+class CallInst;
+} // namespace llvm
+
+namespace vecz {
+
+/// @brief An enumeration of errors that can occur when processing a format
+/// string.
+enum EnumPrintfError {
+  kPrintfError_success,
+  kPrintfError_fail,
+  kPrintfError_invalidFormatString
+};
+
+/// @brief Retrieves a module-level global variable for a printf format string
+/// from a Value.
+/// @param[in] op The value that uses a global variable representing a printf
+/// format string.
+/// @return The module-level global variable for the printf format string.
+llvm::GlobalVariable *GetFormatStringAsValue(llvm::Value *op);
+
+/// @brief Extracts the raw string contents from a module-level global variable
+/// containing a printf format string.
+///
+/// The @p op parameter must be a GlobalVariable with an initializer.
+///
+/// @param[in] op The module-level global variable for a printf format string.
+/// @return The raw string contents of the format string global variable, or ""
+/// if there was an error.
+std::string GetFormatStringAsString(llvm::Value *op);
+
+/// @brief Creates a global variable for a scalarized format string.
+/// @param[in,out] module The parent module given to the pass.
+/// @param[in] string_value The GlobalVariable for the old format string,
+/// used to copy attributes over.
+/// @param[in] new_format_string The scalarized format string to create a
+/// global variable from.
+/// @return The newly created global variable for the format string.
+llvm::GlobalVariable *
+GetNewFormatStringAsGlobalVar(llvm::Module &module,
+                              llvm::GlobalVariable *const string_value,
+                              const std::string &new_format_string);
+
+/// @brief This function transforms an OpenCL printf format string into a
+/// C99-conformant one.
+ +/// Its main job is to scalarize vector format specifiers into scalarized form. +/// It does this by taking a vector specifier and determining the specifier +/// corresponding to each vector element. It then emits the element specifier +/// into the new format string for each element in the vector, separated by a +/// comma. +/// +/// Special care needs to be taken for modifiers that aren't supported by C99 +/// such as the 'hl' length modifier. The new format string will have 'hl' +/// stripped out. +/// +/// Examples: +/// @code{.cpp} +/// // vector 2, 8-bit sized hexadecimal integers +/// "%v2hhx" --> "%hhx,%hhx" +/// // vector 4, 32-bit sized floats +/// "%v4hlf" --> "%f,%f,%f,%f" +/// @endcode +/// +/// It also does some checking to ensure the printf string is conformant to the +/// OpenCL 1.2 specification, and returns an error if it is not. +/// @param[in] str The format string to scalarize and check. +/// @param[out] new_str The new, scalarized, format string. +/// @return The status of the scalarization (kPrintfError_success on success, +/// otherwise kPrintfError_invalidFormatString if we detected an illegal OpenCL +/// printf format string). +EnumPrintfError ScalarizeAndCheckFormatString(const std::string &str, + std::string &new_str); + +/// @brief Builds a new scalarized printf call given an existing call and a new +/// format string. +/// +/// @param[in,out] module The parent module given to the pass. +/// @param[in] old_inst The old call to the printf function. +/// @param[in] new_format_string_gvar The module-level global variable for the +/// new format string. +/// @return A new call instruction to the new printf function. +llvm::Instruction * +BuildNewPrintfCall(llvm::Module &module, llvm::CallInst *const old_inst, + llvm::GlobalVariable *const new_format_string_gvar); +} // namespace vecz + +#endif // VECZ_TRANSFORM_PRINTF_SCALARIZER_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h new file mode 100644 index 0000000000000..a494a1945e0a6 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h @@ -0,0 +1,68 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Function scalarizer. + +#ifndef VECZ_TRANSFORM_SCALARIZATION_PASS_H_INCLUDED +#define VECZ_TRANSFORM_SCALARIZATION_PASS_H_INCLUDED + +#include +#include + +namespace llvm { +class Function; +} // namespace llvm + +namespace vecz { + +class VectorizationUnit; + +/// \addtogroup scalarization Scalarization Stage +/// @{ +/// \ingroup vecz + +/// @brief Scalarization pass where vector instructions that need it are +/// scalarized, starting from leaves. 
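+///
+/// A minimal scheduling sketch (illustrative, not from the original source),
+/// assuming a configured llvm::FunctionPassManager `FPM`:
+/// @code{.cpp}
+/// FPM.addPass(vecz::ScalarizationPass());
+/// @endcode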
+class ScalarizationPass : public llvm::PassInfoMixin<ScalarizationPass> {
+public:
+  /// @brief Create a new scalarization pass.
+  ScalarizationPass();
+
+  /// @brief Unique identifier for the pass.
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Scalarize the given function.
+  ///
+  /// @param[in] F Function to scalarize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Name of the pass.
+  static llvm::StringRef name() { return "Function scalarization"; }
+
+private:
+  static char PassID;
+};
+
+/// @}
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_SCALARIZATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
new file mode 100644
index 0000000000000..ecb2136c6b73d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
@@ -0,0 +1,323 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function scalarizer.
+
+#ifndef VECZ_TRANSFORM_SCALARIZER_H_INCLUDED
+#define VECZ_TRANSFORM_SCALARIZER_H_INCLUDED
+
+#include
+#include
+#include
+
+#include
+
+#include "debugging.h"
+#include "ir_cleanup.h"
+#include "simd_packet.h"
+
+namespace llvm {
+class Instruction;
+class LoadInst;
+class StoreInst;
+class CastInst;
+class BitCastInst;
+class BinaryOperator;
+class FreezeInst;
+class GetElementPtrInst;
+class UnaryOperator;
+class ICmpInst;
+class FCmpInst;
+class SelectInst;
+class CallInst;
+class ShuffleVectorInst;
+class InsertElementInst;
+class PHINode;
+class ExtractElementInst;
+class IntrinsicInst;
+} // namespace llvm
+
+namespace vecz {
+
+class VectorizationChoices;
+class VectorizationContext;
+struct MemOp;
+struct PacketMask;
+struct SimdPacket;
+
+/// \addtogroup scalarization Scalarization Stage
+/// @{
+/// \ingroup vecz
+
+/// @brief Holds the result of scalarization analysis for a given function.
+class Scalarizer {
+public:
+  /// @brief Create new scalarization results for the function.
+  ///
+  /// @param[in] F Function to scalarize.
+  /// @param[in] Ctx VectorizationContext for this Function.
+  /// @param[in] DoubleSupport True if double-precision floating point is
+  /// supported
+  Scalarizer(llvm::Function &F, VectorizationContext &Ctx, bool DoubleSupport);
+
+  /// @brief Mark the value as needing scalarization.
+  /// @param[in] V Value that needs scalarization.
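+  ///
+  /// A hypothetical driver sketch (illustrative, not from the original
+  /// source; `VecVal` is an assumed vector-typed llvm::Value*): values are
+  /// marked first, then scalarized in one batch:
+  /// @code{.cpp}
+  /// Scalarizer S(F, Ctx, /*DoubleSupport=*/true);
+  /// S.setNeedsScalarization(VecVal);
+  /// const bool Changed = S.scalarizeAll();
+  /// @endcode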
+  void setNeedsScalarization(llvm::Value *V);
+
+  /// @brief Scalarize everything that has been marked for scalarization
+  bool scalarizeAll();
+
+  /// @brief A container type for instructions that failed to scalarize
+  using FailureSet = llvm::DenseSet<llvm::Instruction *>;
+
+  /// @brief Get the set of instructions that failed to scalarize
+  const FailureSet &failures() const { return Failures; }
+
+private:
+  /// @brief Vectorization context for the function to scalarize.
+  VectorizationContext &Ctx;
+  llvm::Function &F;
+  IRCleanup IC;
+  bool DoubleSupport;
+
+  /// @brief The values to scalarize, in order
+  std::vector<llvm::Value *> ToScalarize;
+
+  /// @brief The un-ordered set of values to scalarize for fast lookup
+  llvm::DenseSet<llvm::Value *> ScalarizeSet;
+
+  /// @brief Map of values to a gather of their scalarized elements
+  llvm::DenseMap<llvm::Value *, llvm::Value *> Gathers;
+
+  /// @brief Map onto packetized versions of scalar values
+  llvm::DenseMap<const llvm::Value *, std::unique_ptr<SimdPacket>> packets;
+
+  /// @brief The set of instructions that failed to scalarize
+  FailureSet Failures;
+
+  /// @brief Transform values that have non-vector types and vector operands
+  /// by scalarizing their operands.
+  ///
+  /// @param[in] I Instruction whose operands to scalarize.
+  ///
+  /// @return A different value than I if the operands were scalarized; null if
+  /// scalarization failed; or I if the value has no vector operand.
+  llvm::Value *scalarizeOperands(llvm::Instruction *I);
+
+  /// @brief Scalarize the given value from the function. Multiple calls to
+  /// this function with the same value should return a cached result.
+  ///
+  /// @param[in] V Value to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarize(llvm::Value *V, PacketMask PM);
+
+  /// @brief Get or create a packet for the given value.
+  ///
+  /// @param[in] V Value to retrieve a packet for.
+  /// @param[in] SimdWidth Number of lanes in the packet.
+  /// @param[in] Create true if a packet should be created if not present.
+  ///
+  /// @return SIMD packet for the given value.
+  SimdPacket *getPacket(const llvm::Value *V, unsigned SimdWidth,
+                        bool Create = true);
+
+  /// @brief Get a gather of the scalarized elements of the given value.
+  llvm::Value *getGather(llvm::Value *V);
+
+  /// @brief Perform post-scalarization tasks for the given value.
+  ///
+  /// @param[in] P Packet resulting from scalarization or null.
+  /// @param[in] V Value to scalarize.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *assignScalar(SimdPacket *P, llvm::Value *V);
+  /// @brief Extract an element's values, for use by scalarized users
+  ///
+  /// @param[in] V Value to extract.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *extractLanes(llvm::Value *V, PacketMask PM);
+  /// @brief Scalarize a load instruction.
+  ///
+  /// @param[in] Load Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeLoad(llvm::LoadInst *Load, PacketMask PM);
+  /// @brief Scalarize a store instruction.
+  ///
+  /// @param[in] Store Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeStore(llvm::StoreInst *Store, PacketMask PM);
+  /// @brief Scalarize a cast instruction.
+  ///
+  /// @param[in] CastI Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeCast(llvm::CastInst *CastI, PacketMask PM);
+  /// @brief Scalarize a bitcast instruction.
+  ///
+  /// @param[in] BC Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeBitCast(llvm::BitCastInst *BC, PacketMask PM);
+  /// @brief Scalarize a binary operation instruction.
+  ///
+  /// @param[in] BinOp Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeBinaryOp(llvm::BinaryOperator *BinOp, PacketMask PM);
+  // The freeze instruction was not available in LLVM versions prior to 10.0
+  // and not used in LLVM versions prior to 11.0.
+  /// @brief Scalarize a freeze instruction.
+  ///
+  /// @param[in] FreezeI Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeFreeze(llvm::FreezeInst *FreezeI, PacketMask PM);
+  /// @brief Scalarize a unary operation instruction.
+  ///
+  /// @param[in] UnOp Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeUnaryOp(llvm::UnaryOperator *UnOp, PacketMask PM);
+  /// @brief Scalarize an integer compare instruction.
+  ///
+  /// @param[in] ICmp Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeICmp(llvm::ICmpInst *ICmp, PacketMask PM);
+  /// @brief Scalarize a floating-point compare instruction.
+  ///
+  /// @param[in] FCmp Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeFCmp(llvm::FCmpInst *FCmp, PacketMask PM);
+  /// @brief Scalarize a select instruction.
+  ///
+  /// @param[in] Select Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeSelect(llvm::SelectInst *Select, PacketMask PM);
+  /// @brief Scalarize a call instruction.
+  ///
+  /// @param[in] CI Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeCall(llvm::CallInst *CI, PacketMask PM);
+  /// @brief Scalarize a call instruction to a masked mem op.
+  ///
+  /// @param[in] CI Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  /// @param[in] MaskedOp Masked memory operation to scalarize.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeMaskedMemOp(llvm::CallInst *CI, PacketMask PM,
+                                   MemOp &MaskedOp);
+  /// @brief Scalarize a shuffle vector instruction.
+  ///
+  /// @param[in] Shuffle Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeShuffleVector(llvm::ShuffleVectorInst *Shuffle,
+                                     PacketMask PM);
+  /// @brief Scalarize an insert element instruction.
+  ///
+  /// @param[in] Insert Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeInsertElement(llvm::InsertElementInst *Insert,
+                                     PacketMask PM);
+  /// @brief Scalarize GEPs with vector arguments
+  ///
+  /// @param[in] GEP The GEP to scalarize
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return The packet containing the scalarized values or null
+  SimdPacket *scalarizeGEP(llvm::GetElementPtrInst *GEP, PacketMask PM);
+  /// @brief Scalarize Phi nodes with vector arguments
+  ///
+  /// @param[in] Phi The Phi node to scalarize
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return The packet containing the scalarized values or null
+  SimdPacket *scalarizePHI(llvm::PHINode *Phi, PacketMask PM);
+  /// @brief Preserves debug information attached to an old instruction
+  /// we have just scalarized, before it is removed.
+  ///
+  /// @param[in] Original Vector instruction which has been scalarized.
+  /// @param[in] Packet Packet of scalarized instructions.
+  /// @param[in] Width SIMD width of packet.
+  void scalarizeDI(llvm::Instruction *Original, const SimdPacket *Packet,
+                   unsigned Width);
+
+  // These functions work on scalar values that use vector values.
+
+  /// @brief Scalarize the operands of an extract element instruction.
+  ///
+  /// @param[in] Extr Instruction to scalarize.
+  ///
+  /// @return A different value than Extr if the operands were scalarized; null
+  /// if scalarization failed; or Extr if the value has no vector operand.
+  llvm::Value *scalarizeOperandsExtractElement(llvm::ExtractElementInst *Extr);
+  /// @brief Scalarize the operands of a bitcast instruction.
+  ///
+  /// @param[in] BC Instruction to scalarize.
+  ///
+  /// @return A different value than BC if the operands were scalarized; null
+  /// if scalarization failed; or BC if the value has no vector operand.
+  llvm::Value *scalarizeOperandsBitCast(llvm::BitCastInst *BC);
+
+  /// @brief Scalarize the operands of a printf call.
+  ///
+  /// @param[in] CI Instruction to scalarize.
+  ///
+  /// @return A different value than CI if the operands were scalarized;
+  /// null if scalarization failed; or CI if the value has no vector
+  /// operand.
+  llvm::Value *scalarizeOperandsPrintf(llvm::CallInst *CI);
+
+  /// @brief Scalarize the operands of a vector-reduce intrinsic call.
+  ///
+  /// @param[in] Intrin Instruction to scalarize.
+  ///
+  /// @return A different value than Intrin if the operands were scalarized;
+  /// null if scalarization failed; or Intrin if the value has no vector
+  /// operand.
+  llvm::Value *scalarizeReduceIntrinsic(llvm::IntrinsicInst *Intrin);
+};
+
+/// @}
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_SCALARIZER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h
new file mode 100644
index 0000000000000..a428b84ba9aa9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h
@@ -0,0 +1,49 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Transform the pattern generated by ternary operators to a
+/// vectorizable instruction set
+
+#ifndef VECZ_TRANSFORM_TERNARY_TRANSFORM_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_TERNARY_TRANSFORM_PASS_H_INCLUDED
+
+#include
+
+namespace vecz {
+
+/// @brief This pass tries to transform selects with pointer operands into
+/// individual GEPs followed by masked memory operations.
+class TernaryTransformPass
+    : public llvm::PassInfoMixin<TernaryTransformPass> {
+public:
+  TernaryTransformPass() = default;
+
+  /// @brief The entry point to the pass.
+  ///
+  /// @param[in] F Function to optimize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return The preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() { return "Ternary transform pass"; }
+};
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_TERNARY_TRANSFORM_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
new file mode 100644
index 0000000000000..9d231c8b7b1d7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
@@ -0,0 +1,388 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file vectorization_context.h
+///
+/// @brief Hold global state and objects used for vectorization.
+
+#ifndef VECZ_VECTORIZATION_CONTEXT_H_INCLUDED
+#define VECZ_VECTORIZATION_CONTEXT_H_INCLUDED
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace llvm {
+class TargetTransformInfo;
+} // namespace llvm
+
+namespace compiler {
+namespace utils {
+class BuiltinInfo;
+} // namespace utils
+} // namespace compiler
+
+namespace vecz {
+class MemOpDesc;
+class TargetInfo;
+struct UniformValueResult;
+class VectorizationChoices;
+struct VectorizationResult;
+class VectorizationUnit;
+
+using ActiveUnitMap = llvm::DenseMap, VectorizationUnit *>;
+
+/// @brief Holds global (per-module) vectorization state.
+class VectorizationContext {
+public:
+  /// @brief Create a new vectorization context object.
+  ///
+  /// @param[in] target Module in which vectorization happens.
+  /// @param[in] vti Target information.
+  /// @param[in] bi Builtins information.
+  VectorizationContext(llvm::Module &target, TargetInfo &vti,
+                       compiler::utils::BuiltinInfo &bi);
+
+  /// @brief Access the public vectorizer API.
+
+  /// @brief Module in which vectorization happens.
+  llvm::Module &module() const { return Module; }
+
+  /// @brief Data layout for the target.
+  const llvm::DataLayout *dataLayout() const { return DL; }
+
+  /// @brief Information about the target.
+  TargetInfo &targetInfo() { return VTI; }
+
+  /// @brief Information about the target.
+  const TargetInfo &targetInfo() const { return VTI; }
+
+  /// @brief Get the TargetTransformInfo for the given function.
+  llvm::TargetTransformInfo getTargetTransformInfo(llvm::Function &F) const;
+
+  /// @brief Construct and initialize the PassManager to be used for
+  /// vectorizing.
+  /// @return true if no problem occurred, false otherwise.
+  bool buildPassPipeline();
+
+  /// @brief Get the VectorizationUnit currently governing the vectorization
+  /// of the given function, if any.
+  VectorizationUnit *getActiveVU(const llvm::Function *F) const;
+
+  /// @brief Log the Function's VectorizationUnit as the one governing the
+  /// current vectorization.
+  void setActiveVU(llvm::Function *F, VectorizationUnit *VU) {
+    ActiveVUs[F] = VU;
+  }
+  /// @brief Remove the Function's VectorizationUnit from the set governing
+  /// the current vectorization.
+  void clearActiveVU(llvm::Function *F) { ActiveVUs.erase(F); }
+
+  /// @brief Builtin database.
+  compiler::utils::BuiltinInfo &builtins();
+
+  /// @brief Builtin database.
+  const compiler::utils::BuiltinInfo &builtins() const;
+
+  /// @brief Determine whether the function is an internal builtin or not.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return true if F is an internal builtin function, false otherwise.
+  static bool isInternalBuiltin(const llvm::Function *F);
+  /// @brief Create a new function with the given name and type, unless it
+  /// already exists in the module. Mark it as an internal builtin.
+  ///
+  /// @param[in] Name Name of the builtin function.
+  /// @param[in] FT Function type for the builtin.
+  ///
+  /// @return Internal builtin function with the given Name.
+  llvm::Function *getOrCreateInternalBuiltin(llvm::StringRef Name,
+                                             llvm::FunctionType *FT = nullptr);
+  /// @brief Define the internal builtin function, i.e. generate its body.
+  ///
+  /// @param[in] F Function declaration to emit a body for.
+  ///
+  /// @return true if the body of the builtin was emitted, false otherwise.
+  bool defineInternalBuiltin(llvm::Function *F);
+  /// @brief Given a scalar builtin function, return a vector equivalent if it
+  /// is an internal builtin.
+  ///
+  /// @param[in] ScalarFn Scalar builtin to map to a vector equivalent.
+  /// @param[in] SimdWidth SIMD width used to determine which vector equivalent
+  /// to select.
+  ///
+  /// @return Equivalent vector builtin function on success, or null.
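+  ///
+  /// An illustrative caller-side sketch (assumed usage, not from the
+  /// original source):
+  /// @code{.cpp}
+  /// // Try to map a scalar internal builtin onto its 4-wide counterpart.
+  /// if (llvm::Function *VecFn =
+  ///         Ctx.getInternalVectorEquivalent(ScalarFn, /*SimdWidth=*/4)) {
+  ///   // ... rewrite the call site to use VecFn ...
+  /// }
+  /// @endcode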
+  llvm::Function *getInternalVectorEquivalent(llvm::Function *ScalarFn,
+                                              unsigned SimdWidth);
+
+  /// @brief Check if the given function is a masked version of another
+  /// function
+  ///
+  /// @param[in] F The function to check
+  /// @return true if the function is a masked version, or false otherwise
+  bool isMaskedFunction(const llvm::Function *F) const;
+  /// @brief Get the original non-masked function from a masked function
+  ///
+  /// @param[in] F The masked function
+  /// @return The original non-masked function if it exists, or null
+  llvm::Function *getOriginalMaskedFunction(llvm::Function *F);
+  /// @brief Get (if it exists already) or create the masked version of a
+  /// function
+  ///
+  /// @param[in] CI Call to the function to be masked
+  /// @return The masked version of the function
+  llvm::Function *getOrCreateMaskedFunction(llvm::CallInst *CI);
+
+  /// @brief Represents either an atomicrmw or cmpxchg operation.
+  ///
+  /// Most fields are shared, with the exception of CmpXchgFailureOrdering and
+  /// IsWeak, which are only to be set for cmpxchg, and BinOp, which is only to
+  /// be set to a valid value for atomicrmw.
+  struct MaskedAtomic {
+    llvm::Type *PointerTy;
+    llvm::Type *ValTy;
+    /// @brief Must be set to BAD_BINOP for cmpxchg instructions
+    llvm::AtomicRMWInst::BinOp BinOp;
+    llvm::Align Align;
+    bool IsVolatile = false;
+    llvm::SyncScope::ID SyncScope;
+    llvm::AtomicOrdering Ordering;
+    /// @brief Must be set for cmpxchg instructions
+    std::optional<llvm::AtomicOrdering> CmpXchgFailureOrdering = std::nullopt;
+    /// @brief Must only be set for cmpxchg instructions
+    bool IsWeak = false;
+    // Vectorization info
+    llvm::ElementCount VF;
+    bool IsVectorPredicated = false;
+
+    /// @brief Returns true if this MaskedAtomic represents a cmpxchg
+    /// operation.
+    bool isCmpXchg() const {
+      if (CmpXchgFailureOrdering.has_value()) {
+        // 'binop' only applies to atomicrmw
+        assert(BinOp == llvm::AtomicRMWInst::BAD_BINOP &&
+               "Invalid MaskedAtomic state");
+        return true;
+      }
+      // 'weak' only applies to cmpxchg
+      assert(!IsWeak && "Invalid MaskedAtomic state");
+      return false;
+    }
+  };
+
+  /// @brief Check if the given function is a masked version of an atomicrmw
+  /// or cmpxchg operation.
+  ///
+  /// @param[in] F The function to check
+  /// @return A MaskedAtomic instance detailing the atomic operation if the
+  /// function is a masked atomic, or std::nullopt otherwise
+  std::optional<MaskedAtomic>
+  isMaskedAtomicFunction(const llvm::Function &F) const;
+  /// @brief Get (if it exists already) or create the function representing
+  /// the masked version of an atomicrmw/cmpxchg operation.
+  ///
+  /// @param[in] I Atomic to be masked
+  /// @param[in] Choices Choices to mangle into the function name
+  /// @param[in] VF The vectorization factor of the atomic operation
+  /// @return The masked version of the function
+  llvm::Function *
+  getOrCreateMaskedAtomicFunction(MaskedAtomic &I,
+                                  const VectorizationChoices &Choices,
+                                  llvm::ElementCount VF);
+
+  /// @brief Create a VectorizationUnit to use to vectorize the given scalar
+  /// function.
+  ///
+  /// The lifetime of the returned VectorizationUnit is managed by the
+  /// VectorizationContext.
+  ///
+  /// @param[in] F Function to vectorize.
+  /// @param[in] VF vectorization factor to use.
+  /// @param[in] Dimension SIMD dimension to use (0 => x, 1 => y, 2 => z).
+  /// @param[in] Ch Vectorization Choices for the vectorization.
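+  ///
+  /// A hypothetical sketch of the expected call sequence (illustrative only,
+  /// not from the original source):
+  /// @code{.cpp}
+  /// vecz::VectorizationChoices Ch;
+  /// vecz::VectorizationUnit *VU = Ctx.createVectorizationUnit(
+  ///     F, llvm::ElementCount::getFixed(8), /*Dimension=*/0, Ch);
+  /// Ctx.vectorize();  // vectorizes all units registered with the context
+  /// @endcode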
+  VectorizationUnit *createVectorizationUnit(llvm::Function &F,
+                                             llvm::ElementCount VF,
+                                             unsigned Dimension,
+                                             const VectorizationChoices &Ch);
+
+  /// @brief Vectorizes all Vectorization Units in the context
+  void vectorize();
+
+  /// @brief Try to get a vectorization result for the scalar builtin function.
+  ///
+  /// @param[in] F Builtin function to create or retrieve a unit for.
+  /// @param[in] SimdWidth Vectorization factor to use.
+  ///
+  /// @return a VectorizationResult representing the vectorized function.
+  VectorizationResult &getOrCreateBuiltin(llvm::Function &F,
+                                          unsigned SimdWidth);
+
+  /// @brief Vectorize a builtin function by a given factor
+  ///
+  /// @param[in] F the function to vectorize.
+  /// @param[in] factor the vectorization factor.
+  ///
+  /// @return a VectorizationResult representing the vectorized function.
+  VectorizationResult getVectorizedFunction(llvm::Function &F,
+                                            llvm::ElementCount factor);
+
+  /// @brief Determine whether I is a vector instruction or not, i.e. it has
+  /// any vector operand.
+  ///
+  /// @param[in] I Instruction to analyze.
+  ///
+  /// @return true if I is a vector instruction.
+  static bool isVector(const llvm::Instruction &I);
+
+  static const char *InternalBuiltinPrefix;
+
+private:
+  /// @brief Determine whether this scalar builtin function can be safely
+  /// expanded at vector call sites, i.e. it has no side effects.
+  ///
+  /// @param[in] ScalarFn Builtin function to analyze.
+  ///
+  /// @return true if the function can be expanded.
+  bool canExpandBuiltin(const llvm::Function *ScalarFn) const;
+
+  /// @brief Emit the body for the masked load or store internal builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] Desc The MemOpDesc for the memory operation
+  /// @returns true on success, false otherwise
+  bool emitMaskedMemOpBody(llvm::Function &F, const MemOpDesc &Desc) const;
+  /// @brief Emit the body for the interleaved load or store internal builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] Desc The MemOpDesc for the memory operation
+  /// @returns true on success, false otherwise
+  bool emitInterleavedMemOpBody(llvm::Function &F, const MemOpDesc &Desc) const;
+  /// @brief Emit the body for the masked interleaved load/store internal
+  /// builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] Desc The MemOpDesc for the memory operation
+  /// @returns true on success, false otherwise
+  bool emitMaskedInterleavedMemOpBody(llvm::Function &F,
+                                      const MemOpDesc &Desc) const;
+  /// @brief Emit the body for the scatter or gather internal builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] Desc The MemOpDesc for the memory operation
+  /// @returns true on success, false otherwise
+  bool emitScatterGatherMemOpBody(llvm::Function &F,
+                                  const MemOpDesc &Desc) const;
+  /// @brief Emit the body for the masked scatter or gather internal builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] Desc The MemOpDesc for the memory operation
+  /// @returns true on success, false otherwise
+  bool emitMaskedScatterGatherMemOpBody(llvm::Function &F,
+                                        const MemOpDesc &Desc) const;
+  /// @brief Add the masked function to the tracking set
+  ///
+  /// @param[in] F The function to add
+  /// @param[in] WrappedF The original function being masked
+  /// @return false if the function was already in the set, or true otherwise
+  bool insertMaskedFunction(llvm::Function *F, llvm::Function *WrappedF);
+
+  /// @brief Emit the body for the subgroup scan builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] IsInclusive whether the scan should be inclusive (on true) or
+  /// exclusive (on false).
+  /// @param[in] OpKind the kind of scan to emit. Note: not all values of
+  /// llvm::RecurKind are supported scan operations.
+  /// @param[in] IsVP whether the scan is vector-predicated.
+  /// @returns true on success, false otherwise
+  bool emitSubgroupScanBody(llvm::Function &F, bool IsInclusive,
+                            llvm::RecurKind OpKind, bool IsVP) const;
+
+  /// @brief Emit the body for a masked atomic builtin
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] MA The MaskedAtomic information
+  /// @returns true on success, false otherwise
+  bool emitMaskedAtomicBody(llvm::Function &F, const MaskedAtomic &MA) const;
+
+  /// @brief Helper for non-vectorization tasks.
+  TargetInfo &VTI;
+  /// @brief Module in which the vectorization happens.
+  llvm::Module &Module;
+  /// @brief Builtins database.
+  compiler::utils::BuiltinInfo &BI;
+  /// @brief Data layout object used to determine the size and alignment of
+  /// types.
+  const llvm::DataLayout *DL;
+  /// @brief Persistent storage for Kernel Vectorization Units
+  std::vector<std::unique_ptr<VectorizationUnit>> KernelUnits;
+  /// @brief Mapping between functions in the module and vectorization units.
+  llvm::DenseMap> VectorizedBuiltins;
+  /// @brief Maps vector functions to their VectorizationUnits
+  ActiveUnitMap ActiveVUs;
+  /// @brief Map of masked functions used in the module to their original
+  /// non-masked function.
+  llvm::ValueToValueMapTy MaskedFunctionsMap;
+  /// @brief All the masked versions of functions generated by Vecz
+  ///
+  /// Keeps track of all the functions we already have masked versions of. We
+  /// use the name of the masked function instead of just the Function pointer
+  /// because vararg functions have different masked versions for different
+  /// argument types.
+  std::map<std::string, llvm::Function *> MaskedVersions;
+};
+
+/// \addtogroup passes Passes
+/// @{
+/// \ingroup vecz
+
+/// @brief Implement internal builtins.
+class DefineInternalBuiltinsPass
+    : public llvm::PassInfoMixin<DefineInternalBuiltinsPass> {
+public:
+  /// @brief Create a new pass object.
+  DefineInternalBuiltinsPass() {}
+
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Define all used internal builtins in the module, expanding bodies
+  /// for declaration only references.
+  ///
+  /// @param[in] M Module in which to define internal builtins.
+  /// @param[in] AM ModuleAnalysisManager providing analyses.
+  ///
+  /// @return Set of preserved analyses (all analyses).
+  llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
+
+  static llvm::StringRef name() { return "Define internal builtins"; }
+
+private:
+  /// @brief Identifier for the DefineInternalBuiltin pass.
+  static char PassID;
+};
+
+/// @}
+} // namespace vecz
+
+#endif // VECZ_VECTORIZATION_CONTEXT_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
new file mode 100644
index 0000000000000..c865601b90a55
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
@@ -0,0 +1,82 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef VECZ_VECTORIZATION_HELPERS_H_INCLUDED
+#define VECZ_VECTORIZATION_HELPERS_H_INCLUDED
+
+#include
+
+#include
+
+namespace llvm {
+class Function;
+class StringRef;
+} // namespace llvm
+
+namespace vecz {
+class VectorizationUnit;
+class VectorizationChoices;
+
+/// @brief Generate a name for the vectorized function, which depends on the
+/// original function name and SIMD width.
+///
+/// @param[in] ScalarName Name of the original function.
+/// @param[in] VF vectorization factor of the vectorized function.
+/// @param[in] Choices choices used for vectorization.
+/// @param[in] IsBuiltin True if this is an internal builtin.
+///
+/// @return Name for the vectorized function.
+std::string getVectorizedFunctionName(llvm::StringRef ScalarName,
+                                      llvm::ElementCount VF,
+                                      VectorizationChoices Choices,
+                                      bool IsBuiltin = false);
+
+/// @brief Parses a name generated for a vectorized function
+///
+/// @see getVectorizedFunctionName.
+///
+/// @param[in] Name Name of the vectorized function.
+///
+/// @return A tuple containing the original name of the function, and the
+/// element count and choices it was encoded with. Returns std::nullopt on
+/// failure.
+std::optional<std::tuple<std::string, llvm::ElementCount, VectorizationChoices>>
+decodeVectorizedFunctionName(llvm::StringRef Name);
+
+/// @brief Clone the scalar function's body into the function to vectorize,
+/// vectorizing function argument types where required.
+///
+/// @param[in] VU the Vectorization Unit of the scalar function to clone.
+///
+/// @return The cloned function.
+llvm::Function *cloneFunctionToVector(const VectorizationUnit &VU);
+
+/// @brief Create a copy of the scalar function's debug info metadata
+/// nodes and set the scope of the copied DI to the vectorized
+/// function.
+void cloneDebugInfo(const VectorizationUnit &VU);
+
+/// @brief Clone OpenCL related metadata from the scalar kernel to the
+/// vectorized one.
+///
+/// This function will copy any 'opencl.kernels' or
+/// 'opencl.kernel_wg_size_info' metadata from the scalar kernel to the
+/// vectorized one. Obviously, the kernel itself has to be cloned before
+/// calling this function.
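+///
+/// As an illustrative sketch only (the exact operands depend on the
+/// frontend that produced the module), the metadata concerned has this
+/// general shape:
+///
+///   !opencl.kernels = !{!0}
+///   !0 = !{ptr @scalar_kernel, ...}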
+void cloneOpenCLMetadata(const VectorizationUnit &VU); +} // namespace vecz + +#endif // VECZ_VECTORIZATION_HELPERS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h new file mode 100644 index 0000000000000..e80949be23143 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h @@ -0,0 +1,43 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef VECZ_VECTORIZATION_HEURISTICS_H_INCLUDED +#define VECZ_VECTORIZATION_HEURISTICS_H_INCLUDED + +#include + +namespace llvm { +class Function; +} // namespace llvm + +namespace vecz { +class VectorizationContext; + +/// @brief Decide whether a function is worth vectorizing for a given +/// vectorization factor. +/// +/// @param[in] F the function to analyze +/// @param[in] Ctx the vectorization context +/// @param[in] VF the vectorization factor +/// @param[in] SimdDimIdx the vectorization dimension +/// +/// @return Whether we should vectorize the function or not. +bool shouldVectorize(llvm::Function &F, VectorizationContext &Ctx, + llvm::ElementCount VF, unsigned SimdDimIdx); + +} // namespace vecz + +#endif // VECZ_VECTORIZATION_HEURISTICS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h new file mode 100644 index 0000000000000..820b83d53ad86 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h @@ -0,0 +1,258 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef VECZ_VECTORIZATION_UNIT_H_INCLUDED +#define VECZ_VECTORIZATION_UNIT_H_INCLUDED + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace llvm { +class Function; +class FunctionType; +class Module; +class Instruction; +class Argument; +} // namespace llvm + +namespace vecz { +namespace internal { +struct VeczFailResult; +struct AnalysisFailResult; +} // namespace internal + +struct SimdPacket; +struct UniformValueResult; +class ValueTagMap; +class VectorizationContext; +class VectorizationChoices; + +template class AnalysisWrapper; + +/// @brief Describe an argument of a function that needs to be vectorized. +struct VectorizerTargetArgument { + /// @brief Argument of the scalar function. + llvm::Argument *OldArg; + /// @brief Argument of the vectorized function. Might be scalar or vector. + llvm::Argument *NewArg; + /// @brief Whether the argument needs to be vectorized or not. + bool IsVectorized; + /// @brief If the argument is a 'byref' pointer used to return a value, this + /// is the type of that value. Else it is null. + llvm::Type *PointerRetPointeeTy; + /// @brief Placeholder instruction for arguments needing vectorization. + llvm::Instruction *Placeholder; +}; + +/// @brief Analysis flags that can be attached to LLVM functions. +enum FunctionFlags { + eFunctionNoFlag = 0, + /// @brief The function has been analyzed. + /// Set by the preliminary vectorization analysis (canVectorize). Set once. + eFunctionAnalysisDone = (1 << 0), + /// @brief The function can be vectorized. + /// Set by the preliminary vectorization analysis (canVectorize). Set once. + eFunctionVectorizable = (1 << 1), + /// @brief Vectorization of the function failed. + /// Can be set by any pass. Set once. + eFunctionVectorizationFailed = (1 << 2), +}; + +/// @brief struct to hold only the data needed to use a vectorized function +struct VectorizationResult { + struct Arg { + enum Kind { SCALAR, VECTORIZED, POINTER_RETURN } kind; + llvm::Type *type; + llvm::Type *pointerRetPointeeTy = nullptr; + constexpr Arg(Kind k, llvm::Type *ty, llvm::Type *ptrRetTy) + : kind(k), type(ty), pointerRetPointeeTy(ptrRetTy) {} + }; + + llvm::Function *func = nullptr; + llvm::SmallVector args; + + operator bool() const { return func; } + llvm::Function *get() const { return func; } +}; + +/// @brief Describe a function that needs to be vectorized. +class VectorizationUnit { +public: + /// @brief Create a new vectorization unit for the given scalar function. + /// + /// @param[in] F Function to vectorize. + /// @param[in] Width SIMD width (i.e. vectorization factor) to use. + /// @param[in] Dimension SIMD dimension to use (0 => x, 1 => y, 2 => z). + /// @param[in] Ctx Context for vectorization. + /// @param[in] Ch Vectorization Choices for the vectorization. + VectorizationUnit(llvm::Function &F, llvm::ElementCount Width, + unsigned Dimension, VectorizationContext &Ctx, + const VectorizationChoices &Ch); + /// @brief Free up any resource used by the function. + ~VectorizationUnit(); + + /// @brief Access the vectorization context linked to this function. + VectorizationContext &context() { return Ctx; } + + /// @brief Access the vectorization context linked to this function. + const VectorizationContext &context() const { return Ctx; } + + /// @brief Number of available SIMD lanes, i.e. vectorization factor. 
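+  ///
+  /// For example, llvm::ElementCount::getFixed(8) describes a fixed 8-wide
+  /// vectorization, while llvm::ElementCount::getScalable(4) describes a
+  /// scalable factor of vscale x 4.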
+ llvm::ElementCount width() const { return SimdWidth; } + + /// @brief Get the work group size along the vectorization dimension. + uint64_t getLocalSize() const { return LocalSize; } + + /// @brief Whether to run the SIMD Width Analysis during vectorization. + bool autoWidth() const { return AutoSimdWidth; } + + /// @brief Index of SIMD dimension used in vectorization. + unsigned dimension() const { return SimdDimIdx; } + + /// @brief Set the SIMD width, i.e. vectorization factor. After changing this + /// value a possible existing vectorized function is looked up in the module. + /// + /// @param[in] NewWidth New SIMD width. + void setWidth(llvm::ElementCount NewWidth); + + /// @brief Set the work group size along the vectorization dimension. + /// + /// @param[in] LS the local work group size + void setLocalSize(uint64_t LS) { LocalSize = LS; } + + /// @brief Set whether to use the SIMD width analysis + /// + /// @param[in] Auto true to use auto SIMD width, false otherwise + void setAutoWidth(bool Auto) { AutoSimdWidth = Auto; } + + /// @brief Determine whether vectorizing the function failed or not. + bool failed() const { return hasFlag(eFunctionVectorizationFailed); } + + /// @brief Mark this function as failing vectorization. + /// @param[in] Remark Message to print into the optimization remarks + /// @param[in] F Function to pass to emitVeczRemarkMissed + /// @param[in] V Value to pass to emitVeczRemarkMissed + /// @return unconditionally returns a VeczFailResult which can be safely + /// ignored. This can help cut down on some boilerplate in contexts where + /// we'll immediately return, via the following idiom: + /// ``` + /// if (!thing) { + /// return setFailed("thing wasn't"); + /// } + /// ``` + internal::AnalysisFailResult setFailed(const char *Remark, + const llvm::Function *F = nullptr, + const llvm::Value *V = nullptr); + + /// @brief Check whether the function has the given flag or not. + /// + /// @param[in] Flag Flag to check. + /// + /// @return true if the function has the given flag, false otherwise. + bool hasFlag(FunctionFlags Flag) const { return (FnFlags & Flag) == Flag; } + + /// @brief Set the given flag to the function. + /// + /// @param[in] Flag Flag to set. + void setFlag(FunctionFlags Flag) { + FnFlags = (FunctionFlags)(FnFlags | Flag); + } + + /// @brief Clear the given flag from the function. + /// + /// @param[in] Flag Flag to set. + void clearFlag(FunctionFlags Flag) { + FnFlags = (FunctionFlags)(FnFlags & ~Flag); + } + + /// @brief Access the arguments of the function to vectorize. + const llvm::SmallVectorImpl &arguments() const { + return Arguments; + } + + /// @brief Return the vectorized function if it exists, otherwise the original + /// function. + llvm::Function &function(); + + /// @brief Return the vectorized function if it exists, otherwise the original + /// function. + const llvm::Function &function() const; + + /// @brief Original function to vectorize. + llvm::Function *scalarFunction() const { return ScalarFn; } + + /// @brief Set the function to vectorize. This updates the function arguments. + /// + /// @param[in] NewFunction Original function. + void setScalarFunction(llvm::Function *NewFunction); + + /// @brief Vectorized function. + llvm::Function *vectorizedFunction() const { return VectorizedFn; } + + /// @brief Set the vectorized function. This updates the function arguments. + /// + /// @param[in] NewFunction Vectorized function. 
+  void setVectorizedFunction(llvm::Function *NewFunction);
+
+  /// @brief Name of the current function.
+  llvm::StringRef getName() const { return function().getName(); }
+
+  /// @brief Get the result of the vectorization
+  /// @return The VectorizationResult representing the vectorized function
+  VectorizationResult getResult() const;
+
+  /// @brief Get the Vecz optimizations tracker class
+  /// @return The Choices
+  const VectorizationChoices &choices() const { return Choices; }
+
+private:
+  /// @brief Context this function is vectorized in.
+  VectorizationContext &Ctx;
+  /// @brief Which Vecz code generation choices are enabled and which are not
+  const VectorizationChoices &Choices;
+  /// @brief Function to vectorize.
+  llvm::Function *ScalarFn;
+  /// @brief Target (vectorized) function.
+  llvm::Function *VectorizedFn;
+  /// @brief Arguments of the function to vectorize.
+  llvm::SmallVector<VectorizerTargetArgument, 4> Arguments;
+  /// @brief Vectorization factor to use.
+  llvm::ElementCount SimdWidth;
+  /// @brief The work group size along the vectorization dimension, if known,
+  /// zero otherwise. For our purposes, this need only be an upper bound.
+  uint64_t LocalSize;
+  /// @brief Use the SIMD Width Analysis to determine the SIMD width
+  bool AutoSimdWidth;
+  /// @brief Index of the vectorization dimension to use.
+  unsigned SimdDimIdx;
+  /// @brief Name of the builtin function, if the function to vectorize is one.
+  std::string BuiltinName;
+  /// @brief Per-function analysis flags.
+  FunctionFlags FnFlags;
+  /// @brief Placeholder instructions for arguments that will be vectorized.
+  llvm::SmallPtrSet<llvm::Instruction *, 4> ArgumentPlaceholders;
+};
+
+} // namespace vecz
+
+#endif // VECZ_VECTORIZATION_UNIT_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h
new file mode 100644
index 0000000000000..483a46af5c681
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h
@@ -0,0 +1,74 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file vectorizer.h
+///
+/// @brief Entry point for the kernel vectorizer.
+
+#ifndef VECZ_VECTORIZER_H_INCLUDED
+#define VECZ_VECTORIZER_H_INCLUDED
+
+#include
+#include
+
+namespace llvm {
+class Function;
+} // namespace llvm
+
+namespace vecz {
+
+/// @brief The maximum number of vectorization dimensions that Vecz can handle.
+///
+/// The current limitation is due to the assumption that work groups are
+/// being represented as 1-, 2- or 3-dimensional arrays of work items.
+const unsigned MAX_SIMD_DIM = 3;
+
+class VectorizationContext;
+class VectorizationUnit;
+struct VeczPassOptions;
+
+/// @brief Try to create a vectorization unit for the given kernel function,
+/// with the given vectorization factor and vectorization options.
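+///
+/// A minimal usage sketch (illustrative; assumes a configured context,
+/// options struct and function analysis manager):
+///
+///   if (auto *VU = createVectorizationUnit(Ctx, &Kernel, Opts, FAM,
+///                                          /*Check=*/true)) {
+///     // Kernel is vectorizable with the requested options.
+///   }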
+///
+/// @param[in] Ctx VectorizationContext used to perform the vectorization.
+/// @param[in] Kernel kernel function to vectorize.
+/// @param[in] Opts Vecz Pass Options struct for this vectorization.
+/// @param[in] FAM Function Analysis Manager for running analyses
+/// @param[in] Check check for vectorizability before creating the VU
+///
+/// @return Pointer to a vectorization unit on success, or nullptr on failure.
+VectorizationUnit *createVectorizationUnit(VectorizationContext &Ctx,
+                                           llvm::Function *Kernel,
+                                           const VeczPassOptions &Opts,
+                                           llvm::FunctionAnalysisManager &FAM,
+                                           bool Check);
+
+/// @brief Create metadata for the vectorization unit relating the vectorized
+/// function to the scalar function.
+///
+/// @param[in] VU the vectorization unit to create metadata for
+/// @returns true iff vectorization succeeded.
+bool createVectorizedFunctionMetadata(VectorizationUnit &VU);
+
+/// @brief Register failure or success, and update statistics, for the given
+/// VectorizationUnit.
+///
+/// @param[in] VU the vectorization unit to register and update statistics for
+void trackVeczSuccessFailure(VectorizationUnit &VU);
+} // namespace vecz
+
+#endif // VECZ_VECTORIZER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h
new file mode 100644
index 0000000000000..a51e66c4ec024
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h
@@ -0,0 +1,68 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file vecz_pass_builder.h
+///
+/// @brief Class to initialize a Module Pass Manager to perform vectorization.
+
+#ifndef VECZ_VECZ_PASS_BUILDER_H_INCLUDED
+#define VECZ_VECZ_PASS_BUILDER_H_INCLUDED
+
+#include
+#include
+
+namespace llvm {
+class Module;
+class TargetTransformInfo;
+class TargetMachine;
+} // namespace llvm
+
+namespace vecz {
+class VectorizationContext;
+
+/// @brief A class that manages the lifetime and initialization of all
+/// components required to set up an LLVM pass manager to run Vecz passes.
+class VeczPassMachinery final : public compiler::utils::PassMachinery {
+public:
+  /// @brief Construct the pass machinery.
+  /// The base class method `initialize(TargetInfo)` must also be called.
+  ///
+  /// @param[in] llvmCtx the LLVM context to use.
+  /// @param[in] TM TargetMachine to be used for passes. May be nullptr
+  /// @param[in] ctx the vectorization context object for the module.
+  /// @param[in] verifyEach true if each pass should be verified
+  /// @param[in] debugLogLevel debug logging verbosity.
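+  ///
+  /// A minimal construction sketch (illustrative; assumes an existing
+  /// VectorizationContext `VeczCtx` for module `M`):
+  ///
+  ///   VeczPassMachinery Machinery(M.getContext(), /*TM=*/nullptr, VeczCtx,
+  ///                               /*verifyEach=*/false);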
+  VeczPassMachinery(llvm::LLVMContext &llvmCtx, llvm::TargetMachine *TM,
+                    VectorizationContext &ctx, bool verifyEach,
+                    compiler::utils::DebugLogging debugLogLevel =
+                        compiler::utils::DebugLogging::None);
+
+  virtual void registerPasses() override;
+
+private:
+  virtual void addClassToPassNames() override;
+  virtual void registerPassCallbacks() override;
+
+  VectorizationContext &Ctx;
+};
+
+/// @brief Add the full Vecz pass pipeline to the given pass manager.
+///
+/// @param[in] PM The Module Pass Manager to build.
+/// @return true on success.
+bool buildPassPipeline(llvm::ModulePassManager &PM);
+} // namespace vecz
+
+#endif // VECZ_VECZ_PASS_BUILDER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
new file mode 100644
index 0000000000000..28ec40fc6f7c7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
@@ -0,0 +1,143 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "ir_cleanup.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "memory_operations.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+
+/// @brief Determine whether all users of the instruction are dead. A user is
+/// dead if it has no uses, if it is present in the 'to delete' list, or if it
+/// is a phi node whose only remaining use is the loop 'backedge'.
+///
+/// @param[in] I Instruction to check for deletion.
+/// @param[in] DeadList Instructions marked for deletion.
+/// @param[in,out] WorkList Newly detected Instructions marked for deletion.
+/// @param[in,out] Visited Instructions visited for deletion.
+///
+/// @return true if all users of the instruction are dead, false otherwise.
+bool AreUsersDead(Instruction *I,
+                  const SmallPtrSetImpl<Instruction *> &DeadList,
+                  SmallPtrSetImpl<Instruction *> &WorkList,
+                  SmallPtrSetImpl<Instruction *> &Visited) {
+  for (User *U : I->users()) {
+    // Ignore non-instructions.
+    Instruction *UserI = dyn_cast<Instruction>(U);
+    if (!UserI) {
+      continue;
+    }
+
+    // Trivially dead users can be removed, even if we haven't explicitly marked
+    // them for deletion. The DCE pass would have removed these later on anyway,
+    // and by marking them for deletion here we can be more aggressive about
+    // what we delete.
+    if (isInstructionTriviallyDead(UserI)) {
+      WorkList.insert(UserI);
+    }
+
+    // I is held by a non-dead user.
+    if (!DeadList.contains(UserI) && !WorkList.contains(UserI)) {
+      return false;
+    }
+
+    // Recurse over the user's users.
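+    // Note: the Visited set ensures each user is explored at most once, so
+    // cyclic use chains (e.g. phi nodes in loops) cannot cause infinite
+    // recursion here.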
+    if (!UserI->user_empty() && Visited.insert(UserI).second &&
+        !AreUsersDead(UserI, DeadList, WorkList, Visited)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+} // namespace
+
+void IRCleanup::deleteInstructionLater(llvm::Instruction *I) {
+  if (InstructionsToDelete.insert(I).second) {
+    LLVM_DEBUG(dbgs() << "Marking for deletion: " << *I << "\n");
+  }
+}
+
+void IRCleanup::deleteInstructions() {
+  SmallPtrSet<Instruction *, 16> WorkList;
+  SmallPtrSet<Instruction *, 16> VisitedForCycles;
+  bool progress = true;
+  while (progress && !InstructionsToDelete.empty()) {
+    progress = false;
+    for (Instruction *I : InstructionsToDelete) {
+      WorkList.erase(I);
+      if (I->use_empty()) {
+        I->eraseFromParent();
+        progress = true;
+      } else if (PHINode *Phi = dyn_cast<PHINode>(I)) {
+        if (AreUsersDead(Phi, InstructionsToDelete, WorkList,
+                         VisitedForCycles)) {
+          Phi->replaceAllUsesWith(PoisonValue::get(Phi->getType()));
+          Phi->eraseFromParent();
+          progress = true;
+        } else {
+          WorkList.insert(Phi);
+        }
+        VisitedForCycles.clear();
+      } else if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        // MemOps make deleting unnecessary instructions harder, because they
+        // cannot be trivially dead instructions, thus breaking our recursive
+        // deletion. However, if we have packetized a load or a store, we
+        // definitely want to remove the scalar one, as it will be
+        // reading/writing to invalid pointers. To make things simpler, here we
+        // detect internal builtins that perform memory operations and erase
+        // them. Since stores have no users, they will be removed earlier on and
+        // we do not need to check here.
+        auto Op = MemOp::get(CI);
+        if (Op && Op->isLoad()) {
+          // We need to replace loads with nops, as we need to have a value for
+          // their users, which will be removed later on.
+          I->replaceAllUsesWith(PoisonValue::get(Op->getDataType()));
+          I->eraseFromParent();
+        } else {
+          WorkList.insert(I);
+        }
+      } else {
+        WorkList.insert(I);
+      }
+    }
+    InstructionsToDelete = std::move(WorkList);
+    WorkList.clear();
+  }
+
+  // Remove remaining instructions from the list.
+  LLVM_DEBUG(for (Instruction *I : InstructionsToDelete) {
+    dbgs() << "vecz: could not delete " << *I << "\n";
+  });
+  InstructionsToDelete.clear();
+}
+
+void IRCleanup::deleteInstructionNow(Instruction *I) {
+  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
+  I->eraseFromParent();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp
new file mode 100644
index 0000000000000..a6252e834ad43
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp
@@ -0,0 +1,73 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "llvm_helpers.h"
+
+#include
+#include
+#include
+
+#include "debugging.h"
+#include "memory_operations.h"
+
+using namespace llvm;
+
+/// @brief Determine if the value has vector type, and return it.
+///
+/// @param[in] V Value to analyze.
+///
+/// @return Vector type of V or null.
+FixedVectorType *vecz::getVectorType(Value *V) {
+  if (StoreInst *Store = dyn_cast<StoreInst>(V)) {
+    auto *VO = Store->getValueOperand();
+    assert(VO && "Could not get value operand");
+    return dyn_cast<FixedVectorType>(VO->getType());
+  } else if (CallInst *Call = dyn_cast<CallInst>(V)) {
+    if (auto MaskedOp = MemOp::get(Call, MemOpAccessKind::Masked)) {
+      if (MaskedOp->isMaskedMemOp() && MaskedOp->isStore()) {
+        return dyn_cast<FixedVectorType>(MaskedOp->getDataType());
+      }
+    }
+  }
+  return dyn_cast<FixedVectorType>(V->getType());
+}
+
+/// @brief Get the default value for a type.
+///
+/// @param[in] T Type to get default value of.
+/// @param[in] V Default value to use for numeric type
+///
+/// @return Default value, which will be poison for non-numeric types
+Value *vecz::getDefaultValue(Type *T, uint64_t V) {
+  if (T->isIntegerTy()) {
+    return ConstantInt::get(T, V);
+  }
+
+  if (T->isFloatTy() || T->isDoubleTy()) {
+    return ConstantFP::get(T, V);
+  }
+
+  return PoisonValue::get(T);
+}
+
+/// @brief Get the shuffle mask as sequence of integers.
+///
+/// @param[in] Shuffle Instruction
+///
+/// @return Array of integers representing the Shuffle mask
+ArrayRef<int> vecz::getShuffleVecMask(ShuffleVectorInst *Shuffle) {
+  return Shuffle->getShuffleMask();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
new file mode 100644
index 0000000000000..aedcd49128678
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
@@ -0,0 +1,966 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "memory_operations.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "analysis/instantiation_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+#include "vectorization_context.h"
+#include "vectorization_unit.h"
+
+using namespace vecz;
+using namespace llvm;
+
+static std::string getMaskedMemOpName(Type *DataTy, PointerType *PtrTy,
+                                      Type *MaskTy, unsigned Alignment,
+                                      bool IsLoad, bool IsVP) {
+  if (!DataTy) {
+    return std::string();
+  }
+  compiler::utils::NameMangler Mangler(&DataTy->getContext());
+  const char *BaseName = IsLoad ?
"masked_load" : "masked_store"; + const compiler::utils::TypeQualifiers DataQuals( + compiler::utils::eTypeQualNone); + const compiler::utils::TypeQualifiers PtrQuals( + compiler::utils::eTypeQualNone, compiler::utils::eTypeQualNone); + const compiler::utils::TypeQualifiers MaskQuals( + compiler::utils::eTypeQualNone); + std::string Name; + raw_string_ostream O(Name); + O << VectorizationContext::InternalBuiltinPrefix << BaseName << Alignment + << "_"; + if (IsVP) { + O << "vp_"; + } + if (!Mangler.mangleType(O, DataTy, DataQuals) || + !Mangler.mangleType(O, PtrTy, PtrQuals) || + !Mangler.mangleType(O, MaskTy, MaskQuals)) { + return std::string(); + } + if (IsVP) { + const compiler::utils::TypeQualifiers VLQuals( + compiler::utils::eTypeQualNone); + if (!Mangler.mangleType(O, IntegerType::getInt32Ty(DataTy->getContext()), + VLQuals)) { + return std::string(); + } + } + O.flush(); + return Name; +} + +Function *vecz::getOrCreateMaskedMemOpFn(VectorizationContext &Ctx, + Type *DataTy, PointerType *PtrTy, + unsigned Alignment, bool IsLoad, + bool IsVP) { + const Module &M = Ctx.module(); + LLVMContext &LLVMCtx = M.getContext(); + Type *MaskTy = IntegerType::getInt1Ty(LLVMCtx); + if (auto *VecTy = dyn_cast(DataTy)) { + MaskTy = VectorType::get(MaskTy, multi_llvm::getVectorElementCount(VecTy)); + } + + // Try to retrieve the builtin if it already exists. + const std::string Name = + getMaskedMemOpName(DataTy, PtrTy, MaskTy, Alignment, IsLoad, IsVP); + VECZ_FAIL_IF(Name.empty()); + Function *F = Ctx.getOrCreateInternalBuiltin(Name, nullptr); + if (!F) { + // Declare it if it doesn't exist. + SmallVector Tys; + if (!IsLoad) { + Tys.push_back(DataTy); + } + Tys.push_back(PtrTy); + Tys.push_back(MaskTy); + if (IsVP) { + Tys.push_back(IntegerType::getInt32Ty(LLVMCtx)); + } + + Type *RetTy = IsLoad ? DataTy : Type::getVoidTy(LLVMCtx); + FunctionType *FT = FunctionType::get(RetTy, Tys, false); + F = Ctx.getOrCreateInternalBuiltin(Name, FT); + } + return F; +} + +static CallInst *createMaskedMemOp(VectorizationContext &Ctx, Value *Data, + Type *DataTy, Value *Ptr, Value *Mask, + Value *EVL, unsigned Alignment, Twine Name) { + VECZ_FAIL_IF(!DataTy); + VECZ_FAIL_IF(!Ptr || !Ptr->getType()->isPointerTy()); + VECZ_FAIL_IF(!Mask); + assert(!Data || Data->getType() == DataTy); + auto *PtrTy = cast(Ptr->getType()); + Function *F = + getOrCreateMaskedMemOpFn(Ctx, DataTy, PtrTy, Alignment, + /*IsLoad*/ Data == nullptr, EVL != nullptr); + VECZ_FAIL_IF(!F); + SmallVector Ops; + if (Data) { + Ops.push_back(Data); + } + Ops.push_back(Ptr); + Ops.push_back(Mask); + if (EVL) { + Ops.push_back(EVL); + } + return CallInst::Create(F, Ops, Name); +} + +CallInst *vecz::createMaskedLoad(VectorizationContext &Ctx, Type *Ty, + Value *Ptr, Value *Mask, Value *EVL, + unsigned Alignment, Twine Name) { + return createMaskedMemOp(Ctx, /*Data*/ nullptr, Ty, Ptr, Mask, EVL, Alignment, + Name); +} + +CallInst *vecz::createMaskedStore(VectorizationContext &Ctx, Value *Data, + Value *Ptr, Value *Mask, Value *EVL, + unsigned Alignment, Twine Name) { + return createMaskedMemOp(Ctx, Data, Data->getType(), Ptr, Mask, EVL, + Alignment, Name); +} + +static std::string getInterleavedMemOpName(Type *DataTy, PointerType *PtrTy, + Value *Stride, Type *MaskTy, + unsigned Alignment, bool IsLoad, + bool IsVP) { + if (!DataTy) { + return std::string(); + } + compiler::utils::NameMangler Mangler(&DataTy->getContext()); + const char *BaseName = IsLoad ? 
"interleaved_load" : "interleaved_store"; + std::string Name; + const compiler::utils::TypeQualifiers VecQuals( + compiler::utils::eTypeQualNone, compiler::utils::eTypeQualNone); + const compiler::utils::TypeQualifiers PtrQuals( + compiler::utils::eTypeQualNone, compiler::utils::eTypeQualNone); + raw_string_ostream O(Name); + O << VectorizationContext::InternalBuiltinPrefix; + if (MaskTy) { + O << "masked_"; + } + O << BaseName << Alignment << "_"; + if (IsVP) { + O << "vp_"; + } + if (auto *CVal = dyn_cast(Stride)) { + O << CVal->getSExtValue(); + } else { + O << "V"; + } + O << "_"; + if (!Mangler.mangleType(O, DataTy, VecQuals) || + !Mangler.mangleType(O, PtrTy, PtrQuals)) { + return std::string(); + } + if (MaskTy) { + const compiler::utils::TypeQualifiers MaskQuals( + compiler::utils::eTypeQualNone); + if (!Mangler.mangleType(O, MaskTy, MaskQuals)) { + return std::string(); + } + } + if (IsVP) { + const compiler::utils::TypeQualifiers VLQuals( + compiler::utils::eTypeQualNone); + if (!Mangler.mangleType(O, IntegerType::getInt32Ty(DataTy->getContext()), + VLQuals)) { + return std::string(); + } + } + O.flush(); + return Name; +} + +Function *vecz::getOrCreateInterleavedMemOpFn(VectorizationContext &Ctx, + Type *DataTy, PointerType *PtrTy, + Value *Stride, Type *MaskTy, + unsigned Alignment, bool IsLoad, + bool IsVP) { + Module &M = Ctx.module(); + LLVMContext &LLVMCtx = M.getContext(); + + // Try to retrieve the builtin if it already exists. + const std::string Name = getInterleavedMemOpName( + DataTy, PtrTy, Stride, MaskTy, Alignment, IsLoad, IsVP); + VECZ_FAIL_IF(Name.empty()); + Function *F = Ctx.getOrCreateInternalBuiltin(Name, nullptr); + if (!F) { + // Declare it if it doesn't exist. + SmallVector Tys; + if (!IsLoad) { + VECZ_FAIL_IF(!DataTy); + Tys.push_back(DataTy); + } + VECZ_FAIL_IF(!PtrTy); + Tys.push_back(PtrTy); + if (MaskTy) { + Tys.push_back(MaskTy); + } + if (IsVP) { + Tys.push_back(IntegerType::getInt32Ty(LLVMCtx)); + } + if (!isa(Stride)) { + Tys.push_back(getSizeTy(M)); + } + Type *RetTy = IsLoad ? DataTy : Type::getVoidTy(LLVMCtx); + FunctionType *FT = FunctionType::get(RetTy, Tys, false); + F = Ctx.getOrCreateInternalBuiltin(Name, FT); + } + return F; +} + +static CallInst *createInterleavedMemOp(VectorizationContext &Ctx, Value *Data, + Type *DataTy, Value *Ptr, Value *Stride, + Value *Mask, Value *EVL, + unsigned Alignment, llvm::Twine Name) { + VECZ_FAIL_IF(!DataTy); + VECZ_FAIL_IF(!Ptr || !Ptr->getType()->isPointerTy()); + assert(!Data || Data->getType() == DataTy); + auto *PtrTy = cast(Ptr->getType()); + Type *MaskTy = Mask ? 
Mask->getType() : nullptr; + Function *F = getOrCreateInterleavedMemOpFn( + Ctx, DataTy, PtrTy, Stride, MaskTy, Alignment, + /*IsLoad*/ Data == nullptr, EVL != nullptr); + VECZ_FAIL_IF(!F); + SmallVector Ops; + if (Data) { + Ops.push_back(Data); + } + Ops.push_back(Ptr); + if (Mask) { + Ops.push_back(Mask); + } + if (EVL) { + Ops.push_back(EVL); + } + if (!isa(Stride)) { + Ops.push_back(Stride); + } + return CallInst::Create(F, Ops, Name); +} + +CallInst *vecz::createInterleavedLoad(VectorizationContext &Ctx, Type *Ty, + Value *Ptr, Value *Stride, Value *Mask, + Value *EVL, unsigned Alignment, + Twine Name) { + return createInterleavedMemOp(Ctx, /*Data*/ nullptr, Ty, Ptr, Stride, Mask, + EVL, Alignment, Name); +} + +CallInst *vecz::createInterleavedStore(VectorizationContext &Ctx, Value *Data, + Value *Ptr, Value *Stride, Value *Mask, + Value *EVL, unsigned Alignment, + Twine Name) { + return createInterleavedMemOp(Ctx, Data, Data->getType(), Ptr, Stride, Mask, + EVL, Alignment, Name); +} + +static std::string getScatterGatherMemOpName(Type *DataTy, VectorType *VecPtrTy, + Type *MaskTy, unsigned Alignment, + bool IsGather, bool IsVP) { + if (!DataTy) { + return std::string(); + } + compiler::utils::NameMangler Mangler(&DataTy->getContext()); + const char *BaseName = IsGather ? "gather_load" : "scatter_store"; + std::string Name; + const compiler::utils::TypeQualifiers VecQuals( + compiler::utils::eTypeQualNone, compiler::utils::eTypeQualNone); + compiler::utils::TypeQualifiers PtrQuals(compiler::utils::eTypeQualNone, + compiler::utils::eTypeQualNone); + const compiler::utils::TypeQualifiers MaskQuals( + compiler::utils::eTypeQualNone); + PtrQuals.push_back(compiler::utils::eTypeQualNone); + raw_string_ostream O(Name); + O << VectorizationContext::InternalBuiltinPrefix; + if (MaskTy) { + O << "masked_"; + } + O << BaseName << Alignment << "_"; + if (IsVP) { + O << "vp_"; + } + if (!Mangler.mangleType(O, DataTy, VecQuals) || + !Mangler.mangleType(O, VecPtrTy, PtrQuals)) { + return std::string(); + } + if (MaskTy && !Mangler.mangleType(O, MaskTy, MaskQuals)) { + return std::string(); + } + if (IsVP) { + const compiler::utils::TypeQualifiers VLQuals( + compiler::utils::eTypeQualNone); + if (!Mangler.mangleType(O, IntegerType::getInt32Ty(DataTy->getContext()), + VLQuals)) { + return std::string(); + } + } + O.flush(); + return Name; +} + +Function *vecz::getOrCreateScatterGatherMemOpFn(vecz::VectorizationContext &Ctx, + llvm::Type *DataTy, + llvm::VectorType *VecPtrTy, + llvm::Type *MaskTy, + unsigned Alignment, + bool IsGather, bool IsVP) { + const Module &M = Ctx.module(); + LLVMContext &LLVMCtx = M.getContext(); + assert(VecPtrTy); + assert(!MaskTy || multi_llvm::getVectorElementCount(MaskTy) == + multi_llvm::getVectorElementCount(DataTy)); + + // Try to retrieve the builtin if it already exists. + const std::string Name = getScatterGatherMemOpName(DataTy, VecPtrTy, MaskTy, + Alignment, IsGather, IsVP); + VECZ_FAIL_IF(Name.empty()); + Function *F = Ctx.getOrCreateInternalBuiltin(Name, nullptr); + if (!F) { + // Declare it if it doesn't exist. + SmallVector Tys; + if (!IsGather) { + VECZ_FAIL_IF(!DataTy); + Tys.push_back(DataTy); + } + Tys.push_back(VecPtrTy); + if (MaskTy) { + Tys.push_back(MaskTy); + } + if (IsVP) { + Tys.push_back(IntegerType::getInt32Ty(LLVMCtx)); + } + + Type *RetTy = IsGather ? 
DataTy : Type::getVoidTy(LLVMCtx); + FunctionType *FT = FunctionType::get(RetTy, Tys, false); + F = Ctx.getOrCreateInternalBuiltin(Name, FT); + } + return F; +} + +static CallInst *createScatterGatherMemOp(VectorizationContext &Ctx, + Value *VecData, Type *DataTy, + Value *VecPtr, Value *Mask, + Value *EVL, unsigned Alignment, + Twine Name) { + VECZ_FAIL_IF(!DataTy); + VECZ_FAIL_IF(!VecPtr || !VecPtr->getType()->isVectorTy() || + !VecPtr->getType()->getScalarType()->isPointerTy()); + Type *MaskTy = Mask ? Mask->getType() : nullptr; + Function *F = getOrCreateScatterGatherMemOpFn( + Ctx, DataTy, cast(VecPtr->getType()), MaskTy, Alignment, + /*IsGather*/ VecData == nullptr, EVL != nullptr); + VECZ_FAIL_IF(!F); + SmallVector Ops; + if (VecData) { + Ops.push_back(VecData); + } + Ops.push_back(VecPtr); + if (Mask) { + Ops.push_back(Mask); + } + if (EVL) { + Ops.push_back(EVL); + } + return CallInst::Create(F, Ops, Name); +} + +llvm::CallInst *vecz::createGather(VectorizationContext &Ctx, llvm::Type *Ty, + llvm::Value *VecPtr, llvm::Value *Mask, + llvm::Value *EVL, unsigned Alignment, + llvm::Twine Name) { + return createScatterGatherMemOp(Ctx, /*Data*/ nullptr, Ty, VecPtr, Mask, EVL, + Alignment, Name); +} + +llvm::CallInst *vecz::createScatter(VectorizationContext &Ctx, + llvm::Value *VecData, llvm::Value *VecPtr, + llvm::Value *Mask, llvm::Value *EVL, + unsigned Alignment, llvm::Twine Name) { + return createScatterGatherMemOp(Ctx, VecData, VecData->getType(), VecPtr, + Mask, EVL, Alignment, Name); +} + +MemOpDesc::MemOpDesc() + : DataTy(nullptr), PtrTy(nullptr), MaskTy(nullptr), + Kind(MemOpKind::Invalid), AccessKind(MemOpAccessKind::Native), + IsVLOp(false), Alignment(1), Stride(nullptr), DataOpIdx(-1), PtrOpIdx(-1), + MaskOpIdx(-1), VLOpIdx(-1) {} + +bool MemOpDesc::isStrideConstantInt() const { + return Stride && isa(Stride); +} + +int64_t MemOpDesc::getStrideAsConstantInt() const { + return cast(Stride)->getSExtValue(); +} + +Argument *MemOpDesc::getOperand(Function *F, int OpIdx) const { + VECZ_FAIL_IF(!F || (OpIdx < 0) || ((size_t)OpIdx >= F->arg_size())); + return F->getArg(OpIdx); +} + +std::optional MemOpDesc::analyzeMemOpFunction(Function &F) { + if (auto Op = MemOpDesc::analyzeMaskedMemOp(F)) { + return Op; + } + if (auto Op = MemOpDesc::analyzeInterleavedMemOp(F)) { + return Op; + } + if (auto Op = MemOpDesc::analyzeMaskedInterleavedMemOp(F)) { + return Op; + } + if (auto Op = MemOpDesc::analyzeScatterGatherMemOp(F)) { + return Op; + } + if (auto Op = MemOpDesc::analyzeMaskedScatterGatherMemOp(F)) { + return Op; + } + return std::nullopt; +} + +std::optional MemOpDesc::analyzeMaskedMemOp(Function &F) { + const StringRef MangledName = F.getName(); + compiler::utils::Lexer L(MangledName); + if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) { + return std::nullopt; + } + + MemOpDesc Desc; + if (L.Consume("masked_store")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + Desc.IsVLOp = L.Consume("vp_"); + if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.DataTy = Arg->getType(); + ++Arg; + Desc.PtrTy = Arg->getType(); + Desc.Kind = MemOpKind::StoreCall; + Desc.DataOpIdx = 0; + Desc.PtrOpIdx = 1; + Desc.MaskOpIdx = 2; + Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType(); + Desc.VLOpIdx = Desc.IsVLOp ? 
Desc.MaskOpIdx + 1 : -1; + Desc.AccessKind = MemOpAccessKind::Masked; + return Desc; + } + + if (L.Consume("masked_load")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + Desc.IsVLOp = L.Consume("vp_"); + if (F.arg_size() != 2 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.PtrTy = Arg->getType(); + Desc.DataTy = F.getReturnType(); + Desc.Kind = MemOpKind::LoadCall; + Desc.DataOpIdx = -1; + Desc.PtrOpIdx = 0; + Desc.MaskOpIdx = 1; + Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType(); + Desc.VLOpIdx = Desc.IsVLOp ? Desc.MaskOpIdx + 1 : -1; + Desc.AccessKind = MemOpAccessKind::Masked; + return Desc; + } + return std::nullopt; +} + +std::optional MemOpDesc::analyzeInterleavedMemOp(Function &F) { + const StringRef MangledName = F.getName(); + compiler::utils::Lexer L(MangledName); + if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) { + return std::nullopt; + } + MemOpDesc Desc; + int ConstantStride{}; + if (L.Consume("interleaved_store")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + if (L.ConsumeSignedInteger(ConstantStride)) { + VECZ_ERROR_IF(F.arg_size() != 2, + "Wrong argument list size for interleaved store"); + Desc.Stride = ConstantInt::get(getSizeTy(*F.getParent()), ConstantStride); + } else if (L.Consume("V")) { + VECZ_ERROR_IF(F.arg_size() != 3, + "Wrong argument list size for interleaved store"); + auto ArgIt = F.arg_begin(); + std::advance(ArgIt, 2); + Desc.Stride = &*ArgIt; + } else { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.DataTy = Arg->getType(); + ++Arg; + Desc.PtrTy = Arg->getType(); + Desc.Kind = MemOpKind::StoreCall; + Desc.DataOpIdx = 0; + Desc.PtrOpIdx = 1; + Desc.AccessKind = MemOpAccessKind::Interleaved; + return Desc; + } + + if (L.Consume("interleaved_load")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + if (L.ConsumeSignedInteger(ConstantStride)) { + VECZ_ERROR_IF(F.arg_size() != 1, + "Wrong argument list size for interleaved load"); + Desc.Stride = ConstantInt::get(getSizeTy(*F.getParent()), ConstantStride); + } else if (L.Consume("V")) { + VECZ_ERROR_IF(F.arg_size() != 2, + "Wrong argument list size for interleaved load"); + auto ArgIt = F.arg_begin(); + std::advance(ArgIt, 1); + Desc.Stride = &*ArgIt; + } else { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.PtrTy = Arg->getType(); + Desc.DataTy = F.getReturnType(); + Desc.Kind = MemOpKind::LoadCall; + Desc.DataOpIdx = -1; + Desc.PtrOpIdx = 0; + Desc.AccessKind = MemOpAccessKind::Interleaved; + return Desc; + } + + return std::nullopt; +} + +std::optional MemOpDesc::analyzeMaskedInterleavedMemOp(Function &F) { + const StringRef MangledName = F.getName(); + compiler::utils::Lexer L(MangledName); + if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) { + return std::nullopt; + } + MemOpDesc Desc; + if (L.Consume("masked_interleaved_store")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + Desc.IsVLOp = L.Consume("vp_"); + // KLOCWORK "UNINIT.STACK.MUST" possible false positive + // Initialization of ConstantStride looks like an uninitialized access to + 
// Klocwork + int ConstantStride; + if (L.ConsumeSignedInteger(ConstantStride)) { + if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + Desc.Stride = ConstantInt::get(getSizeTy(*F.getParent()), ConstantStride); + } else if (L.Consume("V")) { + if (F.arg_size() != 4 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + auto ArgIt = F.arg_begin(); + std::advance(ArgIt, 3 + Desc.IsVLOp); + Desc.Stride = &*ArgIt; + } else { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.DataTy = Arg->getType(); + ++Arg; + Desc.PtrTy = Arg->getType(); + Desc.Kind = MemOpKind::StoreCall; + Desc.DataOpIdx = 0; + Desc.PtrOpIdx = 1; + Desc.MaskOpIdx = 2; + Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType(); + Desc.VLOpIdx = Desc.IsVLOp ? Desc.MaskOpIdx + 1 : -1; + Desc.AccessKind = MemOpAccessKind::MaskedInterleaved; + return Desc; + } + if (L.Consume("masked_interleaved_load")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + Desc.IsVLOp = L.Consume("vp_"); + // KLOCWORK "UNINIT.STACK.MUST" possible false positive + // Initialization of ConstantStride looks like an uninitialized access to + // Klocwork + int ConstantStride; + if (L.ConsumeSignedInteger(ConstantStride)) { + if (F.arg_size() != 2 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + Desc.Stride = ConstantInt::get(getSizeTy(*F.getParent()), ConstantStride); + } else if (L.Consume("V")) { + if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + auto ArgIt = F.arg_begin(); + std::advance(ArgIt, 2 + Desc.IsVLOp); + Desc.Stride = &*ArgIt; + } else { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.PtrTy = Arg->getType(); + Desc.DataTy = F.getReturnType(); + Desc.Kind = MemOpKind::LoadCall; + Desc.DataOpIdx = -1; + Desc.PtrOpIdx = 0; + Desc.MaskOpIdx = 1; + Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType(); + Desc.VLOpIdx = Desc.IsVLOp ? 
Desc.MaskOpIdx + 1 : -1; + Desc.AccessKind = MemOpAccessKind::MaskedInterleaved; + return Desc; + } + + return std::nullopt; +} + +std::optional MemOpDesc::analyzeScatterGatherMemOp(Function &F) { + const StringRef MangledName = F.getName(); + compiler::utils::Lexer L(MangledName); + if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) { + return std::nullopt; + } + MemOpDesc Desc; + if (L.Consume("scatter_store")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + if (F.arg_size() != 2) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.DataTy = Arg->getType(); + ++Arg; + Desc.PtrTy = Arg->getType(); + Desc.Kind = MemOpKind::StoreCall; + Desc.DataOpIdx = 0; + Desc.PtrOpIdx = 1; + Desc.AccessKind = MemOpAccessKind::ScatterGather; + return Desc; + } + + if (L.Consume("gather_load")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + if (F.arg_size() != 1) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.PtrTy = Arg->getType(); + Desc.DataTy = F.getReturnType(); + Desc.Kind = MemOpKind::LoadCall; + Desc.DataOpIdx = -1; + Desc.PtrOpIdx = 0; + Desc.AccessKind = MemOpAccessKind::ScatterGather; + return Desc; + } + + return std::nullopt; +} + +std::optional +MemOpDesc::analyzeMaskedScatterGatherMemOp(Function &F) { + const StringRef MangledName = F.getName(); + compiler::utils::Lexer L(MangledName); + if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) { + return std::nullopt; + } + + MemOpDesc Desc; + if (L.Consume("masked_scatter_store")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + Desc.IsVLOp = L.Consume("vp_"); + if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.DataTy = Arg->getType(); + ++Arg; + Desc.PtrTy = Arg->getType(); + Desc.Kind = MemOpKind::StoreCall; + Desc.DataOpIdx = 0; + Desc.PtrOpIdx = 1; + Desc.MaskOpIdx = 2; + Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType(); + Desc.VLOpIdx = Desc.IsVLOp ? Desc.MaskOpIdx + 1 : -1; + Desc.AccessKind = MemOpAccessKind::MaskedScatterGather; + return Desc; + } + + if (L.Consume("masked_gather_load")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + Desc.IsVLOp = L.Consume("vp_"); + if (F.arg_size() != 2 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.PtrTy = Arg->getType(); + Desc.DataTy = F.getReturnType(); + Desc.Kind = MemOpKind::LoadCall; + Desc.DataOpIdx = -1; + Desc.PtrOpIdx = 0; + Desc.MaskOpIdx = 1; + Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType(); + Desc.VLOpIdx = Desc.IsVLOp ? 
Desc.MaskOpIdx + 1 : -1; + Desc.AccessKind = MemOpAccessKind::MaskedScatterGather; + return Desc; + } + + return std::nullopt; +} + +//////////////////////////////////////////////////////////////////////////////// + +std::optional MemOp::get(llvm::Instruction *I) { + if (LoadInst *LI = dyn_cast(I)) { + MemOpDesc Desc; + Desc.Kind = MemOpKind::LoadInstruction; + Desc.Alignment = LI->getAlign().value(); + Desc.DataTy = LI->getType(); + auto *PO = LI->getPointerOperand(); + assert(PO && "Could not get pointer operand"); + Desc.PtrTy = PO->getType(); + return MemOp(I, Desc); + } + if (StoreInst *SI = dyn_cast(I)) { + MemOpDesc Desc; + Desc.Kind = MemOpKind::StoreInstruction; + Desc.Alignment = SI->getAlign().value(); + assert(SI->getValueOperand() && "Could not get value operand"); + Desc.DataTy = SI->getValueOperand()->getType(); + auto *PO = SI->getPointerOperand(); + assert(PO && "Could not get pointer operand"); + Desc.PtrTy = PO->getType(); + return MemOp(I, Desc); + } + if (CallInst *CI = dyn_cast(I)) { + if (Function *Caller = CI->getCalledFunction()) { + if (auto FnOp = MemOpDesc::analyzeMemOpFunction(*Caller)) { + return MemOp(I, *FnOp); + } + } + } + return std::nullopt; +} + +std::optional MemOp::get(llvm::CallInst *CI, + MemOpAccessKind AccessKind) { + if (!CI->getCalledFunction()) { + return std::nullopt; + } + std::optional Desc; + if (Function *Caller = CI->getCalledFunction()) { + switch (AccessKind) { + default: + return std::nullopt; + case MemOpAccessKind::Masked: + Desc = MemOpDesc::analyzeMaskedMemOp(*Caller); + break; + case MemOpAccessKind::Interleaved: + Desc = MemOpDesc::analyzeInterleavedMemOp(*Caller); + break; + case MemOpAccessKind::MaskedInterleaved: + Desc = MemOpDesc::analyzeMaskedInterleavedMemOp(*Caller); + break; + case MemOpAccessKind::ScatterGather: + Desc = MemOpDesc::analyzeScatterGatherMemOp(*Caller); + break; + case MemOpAccessKind::MaskedScatterGather: + Desc = MemOpDesc::analyzeMaskedScatterGatherMemOp(*Caller); + break; + } + } + if (!Desc) { + return std::nullopt; + } + return MemOp(CI, *Desc); +} + +MemOp::MemOp(Instruction *I, const MemOpDesc &desc) { + Ins = I; + Desc = desc; +} + +llvm::Value *MemOp::getCallOperand(int OpIdx) const { + VECZ_FAIL_IF((Desc.getKind() != MemOpKind::LoadCall) && + (Desc.getKind() != MemOpKind::StoreCall)); + CallInst *CI = dyn_cast(Ins); + VECZ_FAIL_IF(!CI || (OpIdx < 0) || ((unsigned)OpIdx >= CI->arg_size())); + return CI->getArgOperand((unsigned)OpIdx); +} + +bool MemOp::setCallOperand(int OpIdx, Value *V) { + VECZ_FAIL_IF((Desc.getKind() != MemOpKind::LoadCall) && + (Desc.getKind() != MemOpKind::StoreCall)); + CallInst *CI = dyn_cast(Ins); + VECZ_FAIL_IF(!CI || (OpIdx < 0) || ((unsigned)OpIdx >= CI->arg_size())); + CI->setArgOperand((unsigned)OpIdx, V); + return true; +} + +llvm::Value *MemOp::getDataOperand() const { + if (Desc.getKind() == MemOpKind::StoreInstruction) { + return cast(Ins)->getValueOperand(); + } else if (Desc.getKind() == MemOpKind::StoreCall) { + return getCallOperand(Desc.getDataOperandIndex()); + } else { + return nullptr; + } +} + +llvm::Value *MemOp::getPointerOperand() const { + switch (Desc.getKind()) { + default: + return nullptr; + case MemOpKind::LoadInstruction: + return cast(Ins)->getPointerOperand(); + case MemOpKind::StoreInstruction: + return cast(Ins)->getPointerOperand(); + case MemOpKind::LoadCall: + case MemOpKind::StoreCall: + return getCallOperand(Desc.getPointerOperandIndex()); + } +} + +llvm::Value *MemOp::getMaskOperand() const { + switch (Desc.getKind()) { + default: + 
return nullptr; + case MemOpKind::LoadCall: + case MemOpKind::StoreCall: + return getCallOperand(Desc.getMaskOperandIndex()); + } +} + +bool MemOp::setDataOperand(Value *V) { + if (Desc.getKind() == MemOpKind::StoreInstruction) { + cast(Ins)->setOperand(0, V); + return true; + } else if (Desc.getKind() == MemOpKind::StoreCall) { + return setCallOperand(Desc.getDataOperandIndex(), V); + } else { + return false; + } +} + +bool MemOp::setPointerOperand(Value *V) { + switch (Desc.getKind()) { + default: + return false; + case MemOpKind::LoadInstruction: + cast(Ins)->setOperand(0, V); + return true; + case MemOpKind::StoreInstruction: + cast(Ins)->setOperand(1, V); + return true; + case MemOpKind::LoadCall: + case MemOpKind::StoreCall: + return setCallOperand(Desc.getPointerOperandIndex(), V); + } +} + +bool MemOp::setMaskOperand(Value *V) { + switch (Desc.getKind()) { + default: + return false; + case MemOpKind::LoadCall: + case MemOpKind::StoreCall: + return setCallOperand(Desc.getMaskOperandIndex(), V); + } +} + +CallInst *MemOp::getCall() const { + VECZ_FAIL_IF((Desc.getKind() != MemOpKind::LoadCall) && + (Desc.getKind() != MemOpKind::StoreCall)); + return dyn_cast(Ins); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp new file mode 100644 index 0000000000000..a93fdacbdc9a2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp @@ -0,0 +1,1070 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "offset_info.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "analysis/instantiation_analysis.h" +#include "analysis/stride_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "memory_operations.h" +#include "vectorization_context.h" +#include "vectorization_unit.h" + +using namespace vecz; +using namespace llvm; + +namespace { +inline uint64_t SizeOrZero(TypeSize &&T) { + return T.isScalable() ? 0 : T.getFixedValue(); +} + +uint8_t highbit(const uint32_t x) { + assert(isPowerOf2_32(x) && "Value must be a power of two"); + // This is a De Bruijn hash table, it returns the index of the highest + // bit, which works when x is a power of 2. For details, see + // https://en.wikipedia.org/wiki/De_Bruijn_sequence#Uses + static const uint32_t deBruijn_magic = 0x06EB14F9U; + static const uint8_t tab[32] = { + 0, 1, 16, 2, 29, 17, 3, 22, 30, 20, 18, 11, 13, 4, 7, 23, + 31, 15, 28, 21, 19, 10, 12, 6, 14, 27, 9, 5, 26, 8, 25, 24, + }; + return tab[(uint32_t)(x * deBruijn_magic) >> 27]; +} + +// Returns a value extended or truncated to match the size type of the target. +// This will return the original value if it is already the correct size. 
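+// For instance, with a 64-bit size type an i32 offset is sign- or
+// zero-extended to i64 (depending on the `sext` flag), while an i64 offset
+// is returned unchanged.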
+Value *matchSizeType(IRBuilder<> &B, Value *V, bool sext) {
+  auto *const sizeTy = getSizeTy(B);
+
+  if (sext) {
+    return B.CreateSExtOrTrunc(V, sizeTy, "stride_conv");
+  } else {
+    return B.CreateZExtOrTrunc(V, sizeTy, "stride_conv");
+  }
+}
+
+uint64_t getTypeMask(Type *Ty) {
+  const auto bits = Ty->getIntegerBitWidth();
+  return bits < 64 ? ((uint64_t(1) << bits) - 1) : ~uint64_t(0);
+}
+
+// The index size potentially depends on the address space of the pointer,
+// but let's just use the pointer size for now.
+uint64_t getSizeTypeMask(const DataLayout &DL) {
+  const auto bits = DL.getPointerSizeInBits();
+  return bits < 64 ? ((uint64_t(1) << bits) - 1) : ~uint64_t(0);
+}
+
+OffsetKind combineKinds(OffsetKind LHS, OffsetKind RHS) {
+  assert(LHS != eOffsetLinear && RHS != eOffsetLinear &&
+         "OffsetInfo analysis functions should handle all linear cases");
+
+  if (LHS == RHS) {
+    return LHS;
+  }
+
+  if (LHS == eOffsetMayDiverge || RHS == eOffsetMayDiverge) {
+    return eOffsetMayDiverge;
+  }
+
+  // Uniform values are all that's left.
+  return eOffsetUniformVariable;
+}
+} // namespace
+
+OffsetInfo::OffsetInfo(StrideAnalysisResult &SAR, Value *V)
+    : Kind(eOffsetMayDiverge), ActualValue(V), StrideInt(0),
+      ManifestStride(nullptr), BitMask(~uint64_t(0)) {
+  auto *const ty = V->getType();
+  if (ty->isIntegerTy()) {
+    analyze(V, SAR);
+  } else if (ty->isPointerTy()) {
+    analyzePtr(V, SAR);
+  } else {
+    setMayDiverge();
+  }
+}
+
+Value *OffsetInfo::getUniformValue() const {
+  return isUniform() ? ActualValue : nullptr;
+}
+
+int64_t OffsetInfo::getValueAsConstantInt() const {
+  ConstantInt *CInt = cast<ConstantInt>(ActualValue);
+  return CInt->getSExtValue();
+}
+
+bool OffsetInfo::isStrideConstantInt() const {
+  return (Kind == eOffsetLinear && StrideInt != 0);
+}
+
+int64_t OffsetInfo::getStrideAsConstantInt() const { return StrideInt; }
+
+OffsetInfo &OffsetInfo::setMayDiverge() { return setKind(eOffsetMayDiverge); }
+
+OffsetInfo &OffsetInfo::setStride(Value *Stride) {
+  if (auto *const CInt = dyn_cast_or_null<ConstantInt>(Stride)) {
+    StrideInt = CInt->getSExtValue();
+  } else {
+    StrideInt = 0;
+  }
+  ManifestStride = Stride;
+  Kind = eOffsetLinear;
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::setStride(int64_t Stride) {
+  if (Stride == 0) {
+    Kind = eOffsetUniformVariable;
+  } else {
+    StrideInt = Stride;
+    ManifestStride = nullptr;
+    Kind = eOffsetLinear;
+  }
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::setKind(OffsetKind K) {
+  Kind = K;
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
+  Type *OffsetTy = Offset->getType();
+  if (!OffsetTy->isIntegerTy() || OffsetTy->isVectorTy()) {
+    return setMayDiverge();
+  }
+
+  if (auto *const CInt = dyn_cast<ConstantInt>(Offset)) {
+    BitMask = CInt->getZExtValue();
+    return setKind(eOffsetConstant);
+  }
+  BitMask = getTypeMask(OffsetTy);
+
+  if (isa<Argument>(Offset)) {
+    return setKind(eOffsetUniformVariable);
+  }
+
+  Instruction *Ins = dyn_cast<Instruction>(Offset);
+  if (!Ins) {
+    return setMayDiverge();
+  }
+
+  // If we have a uniform value here we don't need to analyse any further.
+  if (!SAR.UVR.isVarying(Ins)) {
+    SimplifyQuery SQ(SAR.F.getParent()->getDataLayout());
+    SQ.AC = &SAR.AC;
+    const WithCache<const Value *> InsWithCache(Ins);
+    const auto &KB = InsWithCache.getKnownBits(SQ);
+    const auto bitWidth = OffsetTy->getIntegerBitWidth();
+
+    // We are interested in the bits that are not known to be zero.
+    BitMask &= ~KB.Zero.extractBitsAsZExtValue(bitWidth, 0);
+    return setKind(eOffsetUniformVariable);
+  }
+
+  // Analyse binary instructions.
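+  // Illustrative example (not from the original source):
+  //   %mul = mul i64 %gid, 4       ; work-item ID scaled by a constant
+  //   %off = add i64 %mul, %c      ; %c is uniform
+  // combineMul gives %mul a constant stride of 4, and combineAdd of a linear
+  // and a uniform value keeps that stride, so %off is linear with stride 4.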
+ if (BinaryOperator *BOp = dyn_cast(Offset)) { + // Copy these values into local variables, because `SAR.analyze()` can + // invalidate any previously obtained references. + const auto LHS = SAR.analyze(BOp->getOperand(0)); + const auto RHS = SAR.analyze(BOp->getOperand(1)); + if (LHS.mayDiverge() || RHS.mayDiverge()) { + return setMayDiverge(); + } + + if (isa(BOp) && !BOp->hasNoUnsignedWrap()) { + // This operation can over/underflow, therefore all bets are off on + // which bits are on. We set it to all ones so a ZExt will catch it. + // SExt does not care since overflow is UB. + BitMask = ~uint64_t(0); + } + + switch (BOp->getOpcode()) { + default: + return setMayDiverge(); + case Instruction::Add: + return combineAdd(LHS, RHS); + case Instruction::Sub: + return combineSub(LHS, RHS); + case Instruction::And: + return combineAnd(LHS, RHS); + case Instruction::Or: + return combineOr(LHS, RHS); + case Instruction::Xor: + return combineXor(LHS, RHS); + case Instruction::Mul: + return combineMul(LHS, RHS); + case Instruction::Shl: + return combineShl(LHS, RHS); + case Instruction::AShr: + return combineAShr(LHS, RHS); + } + } + + // Consider that integer casts cannot scale item IDs. + if (CastInst *Cast = dyn_cast(Offset)) { + const auto &Src = SAR.analyze(Cast->getOperand(0)); + if (Src.mayDiverge()) { + return setMayDiverge(); + } + + // However, a Zero-extended offset can underflow. + if (isa(Cast)) { + // A zero-extended offset could underflow and result in an invalid base + // address, rendering the entire strided MemOp invalid, even when masked + // such that the read from the base address is not meant to execute. + // Note that we don't care about overflowing the index type. + const auto typeMask = getTypeMask(Cast->getSrcTy()); + const auto bitMaskSized = + Src.BitMask & getSizeTypeMask(Cast->getModule()->getDataLayout()); + if ((bitMaskSized & typeMask) != bitMaskSized) { + return setMayDiverge(); + } + BitMask = Src.BitMask & typeMask; + } else if (isa(Cast)) { + const uint64_t widthMask = getTypeMask(Cast->getSrcTy()); + const uint64_t signMask = (widthMask >> 1) + 1; + if (Src.BitMask & signMask) { + // If it's possible for the source value to be negative, all of the + // bits in the extended value might be set. + BitMask = Src.BitMask | ~widthMask; + } else { + BitMask = Src.BitMask & widthMask; + } + } else { + // We don't truncate the bitmask here, since we don't know if it's going + // to be sign extended or zero extended later, which affects whether we + // can ignore overflow or not. + BitMask = Src.BitMask; + } + return copyStrideFrom(Src); + } + + if (auto *Select = dyn_cast(Offset)) { + if (SAR.UVR.isVarying(Select->getCondition())) { + return setMayDiverge(); + } + + // If the condition isn't varying and both operands have the same + // constant stride, the result will also have the same constant stride. + const auto LHS = SAR.analyze(Select->getOperand(1)); + const auto RHS = SAR.analyze(Select->getOperand(2)); + if (LHS.hasStride() && RHS.hasStride() && LHS.StrideInt == RHS.StrideInt && + LHS.isStrideConstantInt()) { + // Merge the bitmasks from either source - we are selecting one of them. + BitMask = LHS.BitMask | RHS.BitMask; + return copyStrideFrom(LHS); + } + return setMayDiverge(); + } + + if (auto *Phi = dyn_cast(Offset)) { + if (auto *const CVal = Phi->hasConstantValue()) { + return copyStrideAndBitMaskFrom(SAR.analyze(CVal)); + } + + auto NumIncoming = Phi->getNumIncomingValues(); + if (NumIncoming == 1) { + // LCSSA Phi, just go right through it.. 
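+      // (An LCSSA phi has exactly one incoming value, so its stride and
+      // bitmask are simply those of that single operand.)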
+ return copyStrideAndBitMaskFrom(SAR.analyze(Phi->getIncomingValue(0))); + } else if (NumIncoming == 2) { + auto identifyIncrement = [&](Value *incoming) -> bool { + if (auto *BOp = dyn_cast(incoming)) { + auto Opcode = BOp->getOpcode(); + // If it's a simple loop iterator, the stride can be analyzed from the + // initial value. + return ((Opcode == Instruction::Add || Opcode == Instruction::Sub) && + BOp->getOperand(0) == Phi && + !SAR.UVR.isVarying(BOp->getOperand(1))) || + (Opcode == Instruction::Add && BOp->getOperand(1) == Phi && + !SAR.UVR.isVarying(BOp->getOperand(0))); + } + return false; + }; + + // Try the PHI node's incoming values both ways round. + if (identifyIncrement(Phi->getIncomingValue(1))) { + return copyStrideAndBitMaskFrom(SAR.analyze(Phi->getIncomingValue(0))); + } else if (identifyIncrement(Phi->getIncomingValue(0))) { + return copyStrideAndBitMaskFrom(SAR.analyze(Phi->getIncomingValue(1))); + } + } + return setMayDiverge(); + } + + // Analyse function calls. + if (CallInst *CI = dyn_cast(Offset)) { + const auto &BI = SAR.UVR.Ctx.builtins(); + if (const auto Builtin = BI.analyzeBuiltinCall(*CI, SAR.UVR.dimension)) { + switch (Builtin->uniformity) { + default: + case compiler::utils::eBuiltinUniformityMaybeInstanceID: + case compiler::utils::eBuiltinUniformityNever: + return setMayDiverge(); + case compiler::utils::eBuiltinUniformityLikeInputs: + break; + case compiler::utils::eBuiltinUniformityAlways: + return setKind(eOffsetUniformVariable); + case compiler::utils::eBuiltinUniformityInstanceID: + if (Builtin->properties & compiler::utils::eBuiltinPropertyLocalID) { + // If the local size is unknown (represented by zero), the resulting + // mask will be ~0ULL (all ones). Potentially, it is possible to use + // the CL_DEVICE_MAX_WORK_ITEM_SIZES property as an upper bound in + // this case. + uint64_t LocalBitMask = SAR.UVR.VU.getLocalSize() - 1; + LocalBitMask |= LocalBitMask >> 32; + LocalBitMask |= LocalBitMask >> 16; + LocalBitMask |= LocalBitMask >> 8; + LocalBitMask |= LocalBitMask >> 4; + LocalBitMask |= LocalBitMask >> 2; + LocalBitMask |= LocalBitMask >> 1; + BitMask = LocalBitMask; + } + return setStride(1); + } + } + } + + return setMayDiverge(); +} + +OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) { + if (BitCastInst *BCast = dyn_cast(Address)) { + return copyStrideAndBitMaskFrom(SAR.analyze(BCast->getOperand(0))); + } else if (auto *ASCast = dyn_cast(Address)) { + return copyStrideAndBitMaskFrom(SAR.analyze(ASCast->getOperand(0))); + } else if (auto *IntPtr = dyn_cast(Address)) { + return copyStrideAndBitMaskFrom(SAR.analyze(IntPtr->getOperand(0))); + } else if (auto *Arg = dyn_cast(Address)) { + // 'Pointer return' arguments should be treated as having an implicit ItemID + // offset. This allows memory operations to be packetized instead of + // instantiated. + if (Arg->getType()->isPointerTy()) { + for (const VectorizerTargetArgument &VUArg : SAR.UVR.VU.arguments()) { + if (((VUArg.OldArg == Arg) || (VUArg.NewArg == Arg)) && + VUArg.PointerRetPointeeTy) { + Type *MemTy = VUArg.PointerRetPointeeTy; + const uint64_t MemSize = + SAR.UVR.Ctx.dataLayout()->getTypeAllocSize(MemTy); + return setStride(MemSize); + } + } + } + return setKind(eOffsetUniformVariable); + } else if (isa(Address)) { + return setKind(eOffsetUniformVariable); + } else if (!SAR.UVR.isVarying(Address)) { + // If it's uniform we can just return the uniform address. 
+ // Check this condition before bothering to descend into Phi nodes or GEPs, + // since we know stride is zero anyway. + return setKind(eOffsetUniformVariable); + } else if (auto *const Alloca = dyn_cast(Address)) { + if (needsInstantiation(SAR.UVR.Ctx, *Alloca)) { + // Instantiated allocas result in scatter/gather + return setMayDiverge(); + } + + Type *MemTy = Alloca->getAllocatedType(); + const uint64_t MemSize = SAR.UVR.Ctx.dataLayout()->getTypeAllocSize(MemTy); + return setStride(MemSize); + } else if (auto *const Phi = dyn_cast(Address)) { + // If all the incoming values are the same, we can trace through it. In + // the general case, it's not trivial to check that the stride is the same + // from every incoming block, and since incoming values may not dominate + // the IRBuilder insert point, we might not even be able to build the + // offset expression instructions there. + if (auto *const CVal = Phi->hasConstantValue()) { + return copyStrideAndBitMaskFrom(SAR.analyze(CVal)); + } + + // In the simple case of a loop-incremented pointer using a GEP, we can + // handle it thus: + auto NumIncoming = Phi->getNumIncomingValues(); + if (NumIncoming != 2) { + // Perhaps we can handle more than one loop latch, but not yet. + return setMayDiverge(); + } + + if (auto *const GEP = + dyn_cast(Phi->getIncomingValue(1))) { + // If it's a simple loop iterator, the stride can be analyzed from the + // initial value. + if (GEP->getPointerOperand() == Phi) { + for (const auto &index : GEP->indices()) { + if (SAR.UVR.isVarying(index.get())) { + return setMayDiverge(); + } + } + return copyStrideAndBitMaskFrom(SAR.analyze(Phi->getIncomingValue(0))); + } + } else if (auto *const GEP = + dyn_cast(Phi->getIncomingValue(0))) { + // If it's a simple loop iterator, the stride can be analyzed from the + // initial value. + if (GEP->getPointerOperand() == Phi) { + for (const auto &index : GEP->indices()) { + if (SAR.UVR.isVarying(index.get())) { + return setMayDiverge(); + } + } + return copyStrideAndBitMaskFrom(SAR.analyze(Phi->getIncomingValue(1))); + } + } + + return setMayDiverge(); + } else if (auto *GEP = dyn_cast(Address)) { + { + auto *const Ptr = GEP->getPointerOperand(); + const auto &PtrInfo = SAR.analyze(Ptr); + if (PtrInfo.mayDiverge()) { + if (isa(Ptr)) { + // For the benefit of the Ternary Transform Pass + for (Value *idx : GEP->indices()) { + SAR.analyze(idx); + } + } + return setMayDiverge(); + } + copyStrideFrom(PtrInfo); + } + + PointerType *GEPPtrTy = dyn_cast(GEP->getPointerOperandType()); + if (!GEPPtrTy) { + // A GEP base can be a vector of pointers, for instance. (Unexpected!) + return setMayDiverge(); + } + + int64_t GEPStrideInt = StrideInt; + bool StrideVariable = (hasStride() && StrideInt == 0); + SmallVector Indices; + for (unsigned i = 0; i < GEP->getNumIndices(); i++) { + // Analyze each GEP offset. + Value *GEPIndex = GEP->getOperand(1 + i); + assert(GEPIndex && "Could not get operand from GEP"); + + const auto &idxOffset = SAR.analyze(GEPIndex); + if (idxOffset.mayDiverge()) { + return setMayDiverge(); + } + + Indices.push_back(GEPIndex); + if (!idxOffset.hasStride()) { + continue; + } + + Type *MemTy = GetElementPtrInst::getIndexedType( + GEP->getSourceElementType(), Indices); + if (!MemTy) { + // A somewhat unlikely scenario...? 
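+        // (GetElementPtrInst::getIndexedType returns a null type when the
+        // index list doesn't describe a valid element, so bail out
+        // conservatively.)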
+ return setMayDiverge(); + } + + if (idxOffset.isStrideConstantInt()) { + // Add all the strides together, + // since `Base + (A * X) + (B * X) == Base + (A + B) * X` + const uint64_t MemSize = SizeOrZero( + GEP->getModule()->getDataLayout().getTypeAllocSize(MemTy)); + GEPStrideInt += idxOffset.StrideInt * MemSize; + } else { + StrideVariable = true; + } + } + + if (StrideVariable) { + // We don't know what the stride is yet, + // but we know it's linear and variable. + setStride(nullptr); + } else { + setStride(GEPStrideInt); + } + return *this; + } else if (auto *Select = dyn_cast(Address)) { + const auto LHS = SAR.analyze(Select->getOperand(1)); + const auto RHS = SAR.analyze(Select->getOperand(2)); + if (SAR.UVR.isVarying(Select->getCondition())) { + // Note that we analyze the operands before returning here, for the + // benefit of the Ternary Transform Pass, which does its work ONLY + // when the condition is varying. + return setMayDiverge(); + } + + // If the condition isn't varying and both operands have the same + // constant stride, the result will also have the same constant stride. + if (LHS.hasStride() && RHS.hasStride() && LHS.StrideInt == RHS.StrideInt && + LHS.isStrideConstantInt()) { + // Merge the bitmasks from either source - we are selecting one of them. + BitMask = LHS.BitMask | RHS.BitMask; + return copyStrideFrom(LHS); + } + return setMayDiverge(); + } + + // If it's varying we can't analyze it any further. + return setMayDiverge(); +} + +OffsetInfo &OffsetInfo::manifest(IRBuilder<> &B, StrideAnalysisResult &SAR) { + if (ManifestStride || Kind != eOffsetLinear) { + // If we already manifested the stride, or if it's not a linear value, + // there is nothing to do. + return *this; + } + + if (StrideInt != 0) { + // It's an integer stride so we can just create a `ConstantInt`. + ManifestStride = getSizeInt(B, StrideInt); + return *this; + } + + Instruction *Offset = cast(ActualValue); + // Analyse binary instructions. + if (BinaryOperator *BOp = dyn_cast(Offset)) { + const auto &LHS = SAR.manifest(B, BOp->getOperand(0)); + const auto &RHS = SAR.manifest(B, BOp->getOperand(1)); + + // Build strides immediately before their instructions + B.SetInsertPoint(BOp); + switch (BOp->getOpcode()) { + default: + return *this; + case Instruction::Add: + return manifestAdd(B, LHS, RHS); + case Instruction::Sub: + return manifestSub(B, LHS, RHS); + case Instruction::And: + return manifestAnd(B, LHS, RHS); + case Instruction::Or: + return manifestOr(B, LHS, RHS); + case Instruction::Xor: + return manifestXor(B, LHS, RHS); + case Instruction::Mul: + return manifestMul(B, LHS, RHS); + case Instruction::Shl: + return manifestShl(B, LHS, RHS); + case Instruction::AShr: + return manifestAShr(B, LHS, RHS); + } + } + + // Consider that integer casts cannot scale item IDs. + if (CastInst *Cast = dyn_cast(Offset)) { + return copyStrideFrom(SAR.manifest(B, Cast->getOperand(0))); + } + + if (auto *Phi = dyn_cast(Offset)) { + auto NumIncoming = Phi->getNumIncomingValues(); + Value *SrcVal = nullptr; + if (NumIncoming == 1) { + // LCSSA Phi, just go right through it.. + SrcVal = Phi->getIncomingValue(0); + } else if (auto *const CVal = Phi->hasConstantValue()) { + SrcVal = CVal; + } else if (NumIncoming == 2) { + auto identifyIncrement = [&](Value *incoming) -> bool { + if (auto *BOp = dyn_cast(incoming)) { + // If this consumes the Phi node, we have found the increment. 
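+          // (No uniformity re-check is needed here; the earlier analyze()
+          // step already vetted this increment before marking the phi as
+          // linear.)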
+ return BOp->getOperand(0) == Phi || BOp->getOperand(1) == Phi; + } else if (auto *GEP = dyn_cast(incoming)) { + return GEP->getPointerOperand() == Phi; + } + return false; + }; + + // Try the PHI node's incoming values both ways round. + if (identifyIncrement(Phi->getIncomingValue(1))) { + SrcVal = Phi->getIncomingValue(0); + } else if (identifyIncrement(Phi->getIncomingValue(0))) { + SrcVal = Phi->getIncomingValue(1); + } + } + assert(SrcVal && "Unexpected Phi node during stride manifestation"); + return copyStrideFrom(SAR.manifest(B, SrcVal)); + } + + if (auto *GEP = dyn_cast(Offset)) { + const auto &Ptr = SAR.manifest(B, GEP->getPointerOperand()); + copyStrideFrom(Ptr); + + PointerType *GEPPtrTy = dyn_cast(GEP->getPointerOperandType()); + if (!GEPPtrTy) { + // A GEP base can be a vector of pointers, for instance. (Unexpected!) + return setMayDiverge(); + } + + Value *GEPStride = nullptr; + SmallVector Indices; + for (unsigned i = 0; i < GEP->getNumIndices(); i++) { + // Analyze each GEP offset. + Value *GEPIndex = GEP->getOperand(1 + i); + assert(GEPIndex && "Could not get operand from GEP"); + + const auto &idxOffset = SAR.manifest(B, GEPIndex); + + Indices.push_back(GEPIndex); + if (!idxOffset.hasStride()) { + continue; + } + + Type *MemTy = GetElementPtrInst::getIndexedType( + GEP->getSourceElementType(), Indices); + + // Build stride instructions immediately before the GEP. Note that the + // process of manifesting the indices can change the insert point. + B.SetInsertPoint(GEP); + Value *idxStride = nullptr; + const uint64_t MemSize = + SizeOrZero(GEP->getModule()->getDataLayout().getTypeAllocSize(MemTy)); + if (MemSize == 1) { + // Don't need to do anything if the size is 1 + idxStride = idxOffset.ManifestStride; + } else { + if (isPowerOf2_64(MemSize)) { + // the size is a power of two, so shift to get the offset in bytes + auto *const SizeVal = getSizeInt(B, highbit(MemSize)); + idxStride = B.CreateShl(idxOffset.ManifestStride, SizeVal); + } else { + // otherwise, multiply + auto *const SizeVal = getSizeInt(B, MemSize); + idxStride = B.CreateMul(idxOffset.ManifestStride, SizeVal); + } + } + + // Add all the strides together, + // since `Base + (A * X) + (B * X) == Base + (A + B) * X` + if (GEPStride) { + GEPStride = B.CreateAdd(GEPStride, idxStride); + } else { + GEPStride = idxStride; + } + } + + if (GEPStride) { + setStride(GEPStride); + } + } + + return *this; +} + +uint64_t OffsetInfo::getConstantMemoryStride(Type *PtrEleTy, + const DataLayout *DL) const { + const uint64_t PtrEleSize = SizeOrZero(DL->getTypeAllocSize(PtrEleTy)); + VECZ_FAIL_IF(!PtrEleSize); + + // It's not a valid stride if it's not divisible by the element size. + // Can't generate a valid interleaved MemOp from it! + if (StrideInt != 0 && StrideInt % PtrEleSize != 0) { + return 0; + } + return StrideInt / PtrEleSize; +} + +Value *OffsetInfo::buildMemoryStride(IRBuilder<> &B, Type *PtrEleTy, + const DataLayout *DL) const { + if (!ManifestStride) { + assert(Kind != eOffsetLinear && + "buildMemoryStride: linear stride not manifest"); + return nullptr; + } + + const uint64_t PtrEleSize = SizeOrZero(DL->getTypeAllocSize(PtrEleTy)); + VECZ_FAIL_IF(!PtrEleSize); + + // It's not a valid stride if it's not divisible by the element size. + // Can't generate a valid interleaved MemOp from it! 
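+  // Illustrative example: an i32 element (4 bytes) with a byte stride of 6
+  // has no whole-element stride, so no interleaved operation can be built.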
+ if (StrideInt != 0 && StrideInt % PtrEleSize != 0) { + return nullptr; + } + + if (isPowerOf2_64(PtrEleSize)) { + auto ShiftVal = highbit(PtrEleSize); + if (auto *BinOp = dyn_cast(ManifestStride)) { + if (BinOp->getOpcode() == Instruction::Shl) { + if (auto *ConstSize = dyn_cast(BinOp->getOperand(1))) { + if (ConstSize->getZExtValue() == ShiftVal) { + return BinOp->getOperand(0); + } + } + } + } + + auto *const stride = + B.CreateAShr(ManifestStride, ConstantInt::get(getSizeTy(B), ShiftVal)); + return stride; + } else { + if (auto *BinOp = dyn_cast(ManifestStride)) { + if (BinOp->getOpcode() == Instruction::Mul) { + if (auto *ConstSize = dyn_cast(BinOp->getOperand(1))) { + if (ConstSize->getZExtValue() == PtrEleSize) { + return BinOp->getOperand(0); + } + } + } + } + + auto *const stride = B.CreateSDiv( + ManifestStride, ConstantInt::get(getSizeTy(B), PtrEleSize)); + return stride; + } +} + +OffsetInfo &OffsetInfo::combineAdd(const OffsetInfo &LHS, + const OffsetInfo &RHS) { + BitMask &= LHS.BitMask | RHS.BitMask | (LHS.BitMask + RHS.BitMask); + + if (LHS.hasStride()) { + if (RHS.hasStride()) { + // Linear + Linear + if (LHS.isStrideConstantInt() && RHS.isStrideConstantInt()) { + return setStride(LHS.StrideInt + RHS.StrideInt); + } else { + return setStride(nullptr); + } + } else { + // Linear + Uniform + return copyStrideFrom(LHS); + } + } else if (RHS.hasStride()) { + // Uniform + Linear + return copyStrideFrom(RHS); + } + + Kind = combineKinds(LHS.Kind, RHS.Kind); + return *this; +} + +OffsetInfo &OffsetInfo::manifestAdd(IRBuilder<> &B, const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if (LHS.hasStride()) { + if (RHS.hasStride()) { + // Linear + Linear + auto *const newAdd = B.CreateAdd(LHS.ManifestStride, RHS.ManifestStride); + return setStride(newAdd); + } else { + // Linear + Uniform + return copyStrideFrom(LHS); + } + } else if (RHS.hasStride()) { + // Uniform + Linear + return copyStrideFrom(RHS); + } + return *this; +} + +OffsetInfo &OffsetInfo::combineSub(const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if (LHS.hasStride()) { + if (RHS.hasStride()) { + // Linear - Linear + if (LHS.isStrideConstantInt() && RHS.isStrideConstantInt()) { + return setStride(LHS.StrideInt - RHS.StrideInt); + } else { + return setStride(nullptr); + } + } else { + // Linear - Uniform + return copyStrideFrom(LHS); + } + } else if (RHS.hasStride()) { + // Uniform - Linear + // Subtracting an item ID results in a negative stride. + if (RHS.isStrideConstantInt()) { + return setStride(-RHS.StrideInt); + } else { + return setStride(nullptr); + } + } + Kind = combineKinds(LHS.Kind, RHS.Kind); + return *this; +} + +OffsetInfo &OffsetInfo::manifestSub(IRBuilder<> &B, const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if (LHS.hasStride()) { + if (RHS.hasStride()) { + // Linear - Linear + auto *const newSub = B.CreateSub(LHS.ManifestStride, RHS.ManifestStride); + return setStride(newSub); + } else { + // Linear - Uniform + return copyStrideFrom(LHS); + } + } else if (RHS.hasStride()) { + // Uniform - Linear + // Subtracting an item ID results in a negative stride. + auto *const newNeg = B.CreateNeg(RHS.ManifestStride); + return setStride(newNeg); + } + return *this; +} + +OffsetInfo &OffsetInfo::combineAnd(const OffsetInfo &LHS, + const OffsetInfo &RHS) { + BitMask = LHS.BitMask & RHS.BitMask; + if (LHS.hasStride()) { + if (RHS.hasStride()) { + // Linear & Linear -> can't analyze + return setMayDiverge(); + } else { + // Linear & Uniform + // If we didn't lose any bits of the LHS, we can do it. 
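+      // e.g. masking (4 * %gid) with a constant whose set bits cover every
+      // possibly-set bit of the LHS leaves the value, and hence the stride,
+      // unchanged (illustrative example).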
+ if (BitMask == LHS.BitMask) { + return copyStrideFrom(LHS); + } else { + return setMayDiverge(); + } + } + } else if (RHS.hasStride()) { + // Uniform & Linear + // If we didn't lose any bits of the RHS, we can do it. + if (BitMask == RHS.BitMask) { + return copyStrideFrom(RHS); + } else { + return setMayDiverge(); + } + } + + Kind = combineKinds(LHS.Kind, RHS.Kind); + return *this; +} + +OffsetInfo &OffsetInfo::manifestAnd(IRBuilder<> &, const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if (LHS.hasStride()) { + return copyStrideFrom(LHS); + } else if (RHS.hasStride()) { + return copyStrideFrom(RHS); + } + return *this; +} + +OffsetInfo &OffsetInfo::combineOr(const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if ((LHS.BitMask & RHS.BitMask) == 0) { + // An Or is equivalent to an Add if the operands have no bits in common. + return combineAdd(LHS, RHS); + } + + if (LHS.hasStride() || RHS.hasStride()) { + return setMayDiverge(); + } + + BitMask = LHS.BitMask | RHS.BitMask; + Kind = combineKinds(LHS.Kind, RHS.Kind); + return *this; +} + +OffsetInfo &OffsetInfo::manifestOr(IRBuilder<> &B, const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if ((LHS.BitMask & RHS.BitMask) == 0) { + // An Or is equivalent to an Add if the operands have no bits in common. + return manifestAdd(B, LHS, RHS); + } + return *this; +} + +OffsetInfo &OffsetInfo::combineXor(const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if ((LHS.BitMask & RHS.BitMask) == 0) { + // An Xor is equivalent to an Add if the operands have no bits in common. + return combineAdd(LHS, RHS); + } + + if (LHS.hasStride() || RHS.hasStride()) { + return setMayDiverge(); + } + + BitMask = LHS.BitMask | RHS.BitMask; + Kind = combineKinds(LHS.Kind, RHS.Kind); + return *this; +} + +OffsetInfo &OffsetInfo::manifestXor(IRBuilder<> &B, const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if ((LHS.BitMask & RHS.BitMask) == 0) { + // An Xor is equivalent to an Add if the operands have no bits in common. + return manifestAdd(B, LHS, RHS); + } + return *this; +} + +OffsetInfo &OffsetInfo::combineShl(const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if (RHS.hasStride()) { + return setMayDiverge(); + } else if (LHS.hasStride()) { + auto *const Shift = RHS.getUniformValue(); + if (!Shift) { + return setMayDiverge(); + } + + if (ConstantInt *CShift = dyn_cast(Shift)) { + const auto CVal = CShift->getZExtValue(); + BitMask = LHS.BitMask << CVal; + return setStride(LHS.StrideInt << CVal); + } + + BitMask = ~uint64_t(0); + return setStride(nullptr); + } + + Kind = combineKinds(LHS.Kind, RHS.Kind); + return *this; +} + +OffsetInfo &OffsetInfo::manifestShl(IRBuilder<> &B, const OffsetInfo &LHS, + const OffsetInfo &RHS) { + auto *const Shift = RHS.getUniformValue(); + if (Shift && LHS.hasStride()) { + auto *const sizeShift = matchSizeType(B, Shift, false); + auto *const newShl = B.CreateShl(LHS.ManifestStride, sizeShift); + return setStride(newShl); + } + return *this; +} + +OffsetInfo &OffsetInfo::combineAShr(const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if (RHS.hasStride()) { + return setMayDiverge(); + } else if (LHS.hasStride()) { + auto *const Shift = RHS.getUniformValue(); + if (!Shift) { + return setMayDiverge(); + } + + // We have to be careful with right shifts, because some bits of the stride + // could get shifted out of the right-hand-side, causing it not to be + // uniform anymore. + if (RHS.Kind == eOffsetConstant) { + auto CShift = RHS.getValueAsConstantInt(); + if (CShift < 0 || CShift >= 64) { + // Unlikely, but just in case.. 
+        return setMayDiverge();
+      }
+
+      // Note that we shift the bitmask as a signed value.
+      // Note also that the BitMask has been initialized to the width of the
+      // integer type.
+      const uint64_t signMask = (BitMask >> 1) + 1;
+      if (LHS.BitMask & signMask) {
+        // If it's possible for the source value to be negative, all of the
+        // bits in the extended value might be set.
+        BitMask &= (LHS.BitMask >> CShift) | ~(BitMask >> CShift);
+      } else {
+        BitMask &= LHS.BitMask >> CShift;
+      }
+
+      if (LHS.isStrideConstantInt()) {
+        const auto lostBits = ((uint64_t(1) << CShift) - 1);
+        if ((LHS.StrideInt & lostBits) == 0 || (LHS.BitMask & lostBits) == 0) {
+          return setStride(LHS.StrideInt >> CShift);
+        }
+      } else if ((LHS.BitMask & ((uint64_t(1) << CShift) - 1)) == 0) {
+        return setStride(nullptr);
+      }
+    }
+    return setMayDiverge();
+  }
+  Kind = combineKinds(LHS.Kind, RHS.Kind);
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::manifestAShr(IRBuilder<> &B, const OffsetInfo &LHS,
+                                     const OffsetInfo &RHS) {
+  if (RHS.Kind == eOffsetConstant) {
+    auto *const Shift = RHS.getUniformValue();
+    const auto CShift = RHS.getValueAsConstantInt();
+
+    if (!LHS.isStrideConstantInt() &&
+        (LHS.BitMask & ((uint64_t(1) << CShift) - 1)) == 0) {
+      auto *const sizeShift = matchSizeType(B, Shift, false);
+      auto *const newAShr = B.CreateAShr(LHS.ManifestStride, sizeShift);
+      return setStride(newAShr);
+    }
+  }
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::combineMul(const OffsetInfo &LHS,
+                                   const OffsetInfo &RHS) {
+  if (LHS.hasStride() && RHS.hasStride()) {
+    // Linear * Linear = not Linear
+    return setMayDiverge();
+  }
+
+  if (LHS.hasStride()) {
+    // Linear * Uniform
+    if (LHS.isStrideConstantInt() && RHS.Kind == eOffsetConstant) {
+      return setStride(LHS.StrideInt * RHS.getValueAsConstantInt());
+    } else {
+      return setStride(nullptr);
+    }
+  } else if (RHS.hasStride()) {
+    // Uniform * Linear
+    if (RHS.isStrideConstantInt() && LHS.Kind == eOffsetConstant) {
+      return setStride(RHS.StrideInt * LHS.getValueAsConstantInt());
+    } else {
+      return setStride(nullptr);
+    }
+  }
+
+  Kind = combineKinds(LHS.Kind, RHS.Kind);
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::manifestMul(IRBuilder<> &B, const OffsetInfo &LHS,
+                                    const OffsetInfo &RHS) {
+  if (LHS.hasStride()) {
+    // Linear * Uniform
+    if (auto *const RHSUniform = RHS.getUniformValue()) {
+      auto *const sizeMul = matchSizeType(B, RHSUniform, true);
+      auto *const newMul = B.CreateMul(LHS.ManifestStride, sizeMul);
+      return setStride(newMul);
+    }
+  } else if (RHS.hasStride()) {
+    // Uniform * Linear
+    if (auto *const LHSUniform = LHS.getUniformValue()) {
+      auto *const sizeMul = matchSizeType(B, LHSUniform, true);
+      auto *const newMul = B.CreateMul(RHS.ManifestStride, sizeMul);
+      return setStride(newMul);
+    }
+  }
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::copyStrideFrom(const OffsetInfo &Other) {
+  Kind = Other.Kind;
+  StrideInt = Other.StrideInt;
+  ManifestStride = Other.ManifestStride;
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::copyStrideAndBitMaskFrom(const OffsetInfo &Other) {
+  BitMask = Other.BitMask;
+  return copyStrideFrom(Other);
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
new file mode 100644
index 0000000000000..5d27b424d9d00
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
@@ -0,0 +1,364 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions;
you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "vecz/pass.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "vectorization_context.h" +#include "vectorization_helpers.h" +#include "vectorization_unit.h" +#include "vectorizer.h" +#include "vecz/vecz_choices.h" +#include "vecz/vecz_target_info.h" +#include "vecz_pass_builder.h" + +#define DEBUG_TYPE "vecz" + +using namespace llvm; + +/// @brief Provide debug logging for Vecz's PassManager +/// +/// This flag is intended for testing and debugging purposes. +static cl::opt + DebugVeczPipeline("debug-vecz-pipeline", + cl::desc("Enable debug logging of the vecz PassManager")); + +/// @brief Provide debug logging for Vecz's PassManager +/// +/// This flag specifies a textual description of the optimization pass pipeline +/// to run over the kernel. +static cl::opt VeczPassPipeline( + "vecz-passes", + cl::desc( + "A textual description of the pass pipeline. To have analysis passes " + "available before a certain pass, add 'require'.")); + +namespace vecz { +using FnVectorizationResult = std::pair; +AnalysisKey VeczPassOptionsAnalysis::Key; + +PreservedAnalyses RunVeczPass::run(Module &M, ModuleAnalysisManager &MAM) { + auto getVeczOptions = MAM.getResult(M); + auto preserved = PreservedAnalyses::none(); + // Cache the current set of functions as the vectorizer will insert new ones, + // which we don't want to revisit. + SmallVector>, 4> + FnOpts; + for (auto &Fn : M.functions()) { + llvm::SmallVector Opts; + if (!getVeczOptions(Fn, MAM, Opts)) { + continue; + } + FnOpts.emplace_back(std::make_pair(&Fn, std::move(Opts))); + } + + ModulePassManager PM; + + auto &device_info = MAM.getResult(M); + TargetInfo *target_info = MAM.getResult(M); + assert(target_info && "Missing TargetInfo"); + auto &builtin_info = MAM.getResult(M); + + VectorizationContext Ctx(M, *target_info, builtin_info); + VeczPassMachinery Mach(M.getContext(), target_info->getTargetMachine(), Ctx, + /*verifyEach*/ false, + DebugVeczPipeline + ? compiler::utils::DebugLogging::Normal + : compiler::utils::DebugLogging::None); + Mach.initializeStart(); + Mach.getMAM().registerPass([&device_info] { + return compiler::utils::DeviceInfoAnalysis(device_info); + }); + Mach.initializeFinish(); + + // Forcibly compute the DeviceInfoAnalysis so that cached retrievals work. + PM.addPass( + RequireAnalysisPass()); + + const bool Check = VeczPassPipeline.empty(); + if (Check) { + if (!buildPassPipeline(PM)) { + return PreservedAnalyses::all(); + } + } else { + if (auto Err = Mach.getPB().parsePassPipeline(PM, VeczPassPipeline)) { + // NOTE this is a command line user error print, not a debug print. + // We may want to hoist this out of Vecz once replacing RunVeczPass with + // a passbuilder is resolved. 
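+      // (On a parse failure nothing has been modified yet, so returning
+      // PreservedAnalyses::all() below leaves the module untouched.)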
+ errs() << "vecz pipeline: " << toString(std::move(Err)) << "\n"; + return PreservedAnalyses::all(); + } + } + + // Create the vectorization units and clone the kernels + using ResultTy = + SmallVector, 2>; + SmallDenseMap Results; + for (auto &P : FnOpts) { + Function *Fn = P.first; + ResultTy T; + Results.insert(std::make_pair(Fn, std::move(T))); + for (auto &Opts : P.second) { + // If we've been given an auto width, try and fit it to any requirements + // that the kernel/device places on its sub-groups. + if (Opts.vecz_auto) { + if (auto AutoSGOpts = getAutoSubgroupSizeOpts(*Fn, MAM)) { + Opts = *AutoSGOpts; + } + } + + auto *const VU = + createVectorizationUnit(Ctx, Fn, Opts, Mach.getFAM(), Check); + if (!VU) { + LLVM_DEBUG(llvm::dbgs() << Fn->getName() << " was not vectorized\n"); + continue; + } + Results[Fn].emplace_back(std::make_pair(VU, &Opts)); + + if (auto *const VecFn = vecz::cloneFunctionToVector(*VU)) { + VU->setVectorizedFunction(VecFn); + + // Allows the Vectorization Unit Analysis to work on the vector kernel + Ctx.setActiveVU(VecFn, VU); + } else { + LLVM_DEBUG(llvm::dbgs() << Fn->getName() << " could not be cloned\n"); + } + } + } + + // Vectorize everything + PM.run(M, Mach.getMAM()); + + auto AllOnModule = llvm::PreservedAnalyses::allInSet>(); + auto eraseFailed = [&](VectorizationUnit *VU) { + Function *VectorizedFn = VU->vectorizedFunction(); + if (VectorizedFn) { + // If we fail to vectorize a function, we still cloned and then + // deleted it which affects internal addresses. The module has changed + // and we can't cache any analyses. + Mach.getFAM().invalidate(*VectorizedFn, llvm::PreservedAnalyses::none()); + // Remove the partially-vectorized function if something went wrong. + Ctx.clearActiveVU(VectorizedFn); + VU->setVectorizedFunction(nullptr); + VectorizedFn->eraseFromParent(); + } + MAM.invalidate(M, AllOnModule); + }; + + // Fix up the metadata and clean out any dead kernels + for (auto &P : Results) { + auto &Result = P.second; + for (auto &R : Result) { + VectorizationUnit *VU = R.first; + trackVeczSuccessFailure(*VU); + if (!createVectorizedFunctionMetadata(*VU)) { + LLVM_DEBUG(dbgs() << P.first->getName() << " failed to vectorize\n"); + eraseFailed(VU); + } + } + } + return PreservedAnalyses::none(); +} + +PreservedAnalyses VeczPassOptionsPrinterPass::run(Module &M, + ModuleAnalysisManager &MAM) { + auto getVeczOptions = MAM.getResult(M); + for (auto &F : M.functions()) { + OS << "Function '" << F.getName() << "'"; + llvm::SmallVector Opts; + if (!getVeczOptions(F, MAM, Opts)) { + OS << " will not be vectorized\n"; + continue; + } + + OS << " will be vectorized {\n"; + for (auto &O : Opts) { + OS << " VF = "; + if (O.factor.isScalable()) { + OS << "vscale x "; + } + OS << O.factor.getKnownMinValue(); + + if (O.vecz_auto) { + OS << ", (auto)"; + } + + OS << ", vec-dim = " << O.vec_dim_idx; + + if (O.local_size) { + OS << ", local-size = " << O.local_size; + } + + OS << ", choices = ["; + OS.tell(); + auto AvailChoices = VectorizationChoices::queryAvailableChoices(); + unsigned NumChoices = 0; + + for (auto &C : AvailChoices) { + if (!O.choices.isEnabled(C.number)) { + continue; + } + if (!NumChoices) { + OS << "\n "; + } else { + OS << ","; + } + OS << C.name; + NumChoices++; + } + // Pretty-print the list of choices on one line if empty, else formatted + // across several lines. Always end with a newline, meaning the options + // are closed off with a '}' on the first column. 
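+      // Illustrative output shape (hypothetical function and options):
+      //   Function 'foo' will be vectorized {
+      //     VF = 4, vec-dim = 0, choices = []
+      //   }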
+ if (NumChoices) { + OS << "\n ]\n"; + } else { + OS << "]\n"; + } + } + OS << "}\n"; + } + + return PreservedAnalyses::all(); +} + +std::optional getReqdSubgroupSizeOpts(Function &F) { + if (auto reqd_sg_size = compiler::utils::getReqdSubgroupSize(F)) { + vecz::VeczPassOptions vecz_opts; + // Disable auto - we want a specific width + vecz_opts.vecz_auto = false; + vecz_opts.vec_dim_idx = 0; + // If we can't vectorize to the required sub-group size then we must bail. + if (*reqd_sg_size % compiler::utils::getMuxSubgroupSize(F)) { + return std::nullopt; + } + // Else we must vectorize such that we multiply the existing mux sub-group + // size up to the required one. + vecz_opts.factor = ElementCount::getFixed( + *reqd_sg_size / compiler::utils::getMuxSubgroupSize(F)); + vecz_opts.choices.enable(vecz::VectorizationChoices::eDivisionExceptions); + return vecz_opts; + } + return std::nullopt; +} + +std::optional +getAutoSubgroupSizeOpts(Function &F, ModuleAnalysisManager &AM) { + // If there's a required sub-group size, we must return a vectorization + // factor that gets us there. + if (auto opts = getReqdSubgroupSizeOpts(F)) { + return opts; + } + + auto &M = *F.getParent(); + const auto &GSGI = AM.getResult(M); + + // If the function doesn't use sub-groups (from the user's perspective) then + // we don't need to adhere to a specific sub-group size. + if (!GSGI.usesSubgroups(F)) { + return std::nullopt; + } + + // Use the device's sub-group sizes to determine which to vectorize to. + auto &DI = AM.getResult(M); + + // We don't force devices to support any sub-group sizes. + if (DI.reqd_sub_group_sizes.empty()) { + return std::nullopt; + } + + vecz::VeczPassOptions vecz_opts; + vecz_opts.vec_dim_idx = 0; + // Disable auto - we want a specific width + vecz_opts.vecz_auto = false; + // Enable some default choices + vecz_opts.choices.enable(vecz::VectorizationChoices::eDivisionExceptions); + + // Now try and choose the best width. + std::optional best_width; + const auto mux_sub_group_size = compiler::utils::getMuxSubgroupSize(F); + + auto can_produce_legal_width = [&mux_sub_group_size](unsigned size) { + // We only support vectorization widths where there's a clean multiple, and + // we can vectorize *up* to the desired size - we can't shrink the + // sub-group size by vectorizing. + return size >= mux_sub_group_size && (size % mux_sub_group_size) == 0; + }; + + for (auto size : DI.reqd_sub_group_sizes) { + if (!can_produce_legal_width(size)) { + continue; + } + const unsigned candidate_width = size / mux_sub_group_size; + // Try and choose at least one width. + if (!best_width) { + best_width = candidate_width; + continue; + } + + // Prefer non-scalar widths. + if (best_width == 1 && candidate_width > 1) { + best_width = candidate_width; + continue; + } + + // If we have a required work-group size, prefer one that will fit well + // with that. + if (auto wgs = compiler::utils::parseRequiredWGSMetadata(F)) { + const uint64_t local_size_x = wgs.value()[0]; + const bool best_fits = !(local_size_x % *best_width); + const bool cand_fits = !(local_size_x % candidate_width); + if (!best_fits && cand_fits) { + best_width = candidate_width; + continue; + } else if (best_fits && !cand_fits) { + continue; + } + } + + // Else, prefer powers of two. + if (!isPowerOf2_32(*best_width) && isPowerOf2_32(candidate_width)) { + best_width = candidate_width; + continue; + } + } + + // Return nothing if we couldn't find a good, legal, width. 
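+  // Worked example (illustrative): with a mux sub-group size of 1, device
+  // sizes {8, 16} and a required work-group size of (24, 1, 1), both widths
+  // are legal, but only 8 divides 24 evenly, so 8 is chosen.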
+ if (!best_width) { + return std::nullopt; + } + + vecz_opts.factor = ElementCount::getFixed(*best_width); + + return vecz_opts; +} + +} // namespace vecz diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def new file mode 100644 index 0000000000000..0cba927e215da --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def @@ -0,0 +1,53 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// This is a simplified version of LLVMs llvm/lib/Passes/PassRegistry.def. It +// outlines all vecz-specific passes (FIXME: not analyses). +#ifndef MODULE_PASS +#define MODULE_PASS(NAME, CREATE_PASS) +#endif +MODULE_PASS("builtin-inlining", BuiltinInliningPass()) +MODULE_PASS("define-builtins", DefineInternalBuiltinsPass()) +#undef MODULE_PASS + +#ifndef FUNCTION_PASS +#define FUNCTION_PASS(NAME, CREATE_PASS) +#endif +FUNCTION_PASS("vecz-mem2reg", BasicMem2RegPass()) +FUNCTION_PASS("pre-linearize", PreLinearizePass()) +FUNCTION_PASS("remove-int-ptr", RemoveIntPtrPass()) +FUNCTION_PASS("squash-small-vecs", SquashSmallVectorsPass()) +FUNCTION_PASS("uniform-reassoc", UniformReassociationPass()) +FUNCTION_PASS("ternary-transform", TernaryTransformPass()) +FUNCTION_PASS("cfg-convert", ControlFlowConversionPass()) +FUNCTION_PASS("cleanup-divergence", DivergenceCleanupPass()) +FUNCTION_PASS("gep-elim", CommonGEPEliminationPass()) +FUNCTION_PASS("scalarize", ScalarizationPass()) +FUNCTION_PASS("mask-memops", SimplifyMaskedMemOpsPass()) +FUNCTION_PASS("packetizer", PacketizationPass()) +FUNCTION_PASS("inline-post-vecz", InlinePostVectorizationPass()) +FUNCTION_PASS("interleave-combine-loads", InterleavedGroupCombinePass(eInterleavedLoad)) +FUNCTION_PASS("interleave-combine-stores", InterleavedGroupCombinePass(eInterleavedStore)) + +FUNCTION_PASS("print", StrideAnalysisPrinterPass(llvm::dbgs())) +#undef FUNCTION_PASS + +#ifndef LOOP_PASS +#define LOOP_PASS(NAME, CREATE_PASS) +#endif +LOOP_PASS("simplify-infinite-loops", SimplifyInfiniteLoopPass()) +LOOP_PASS("vecz-loop-rotate", VeczLoopRotatePass()) +#undef LOOP_PASS diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp new file mode 100644 index 0000000000000..4c2ac445b32c3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp @@ -0,0 +1,281 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "reachability.h" + +#include +#include +#include +#include +#include + +#include "debugging.h" + +#define DEBUG_TYPE "vecz-reachability" + +// HOW IT WORKS +// +// It builds two complementary topological sorts of the supplied basic blocks, +// which it then uses to filter out obviously unreachable blocks as early as +// possible. Where we have two blocks A and B and B has any topology index +// less than that of A, then B is definitely not reachable from A. However, +// if B has a higher index, it might be (but we have to check to be sure). +// +// For details on the above approach, see "Reachability Queries in Very Large +// Graphs: A Fast Refined Online Search Approach" by +// Renê R. Veloso, Loïc Cerf, Wagner Meira Jr, Mohammed J. Zaki. +// +// It also uses data from the Dominator Tree and Post Dominator Tree, in order +// to skip ahead. If we want to know if B is reachable from A and we know +// that C dominates B, if A->C is not ruled out by the topology indices then we +// know there can be no path from A to B that does NOT go through C, therefore +// we only need to check if C is reachable from A. The same follows in reverse +// for Post Dominators. + +using namespace llvm; + +namespace vecz { + +Reachability::Reachability(DominatorTree &p_DT, PostDominatorTree &p_PDT, + LoopInfo &p_LI) + : DT(p_DT), PDT(p_PDT), LI(p_LI) {} + +void Reachability::update(Function &F) { + if (graph.empty()) { + recalculate(F); + } +} + +void Reachability::clear() { + indexMap.clear(); + graph.clear(); +} + +void Reachability::recalculate(Function &F) { + clear(); + + indexMap.reserve(F.size()); + graph.resize(F.size()); + { + size_t i = 0; + for (auto &BB : F) { + indexMap[&BB] = i++; + } + } + + for (auto &BB : F) { + auto &node = graph[indexMap[&BB]]; + + auto *const loop = LI.getLoopFor(&BB); + auto *const header = loop ? loop->getHeader() : nullptr; + for (BasicBlock *succ : successors(&BB)) { + if (succ == header) { + continue; + } + + const size_t succIndex = indexMap[succ]; + + node.successors.push_back(succIndex); + auto &succNode = graph[succIndex]; + ++succNode.predecessors; + } + std::sort(node.successors.begin(), node.successors.end()); + + if (auto *DTNode = DT.getNode(&BB)) { + if (auto *IDom = DTNode->getIDom()) { + const size_t dom = indexMap[IDom->getBlock()]; + node.dom = dom; + } + } + if (auto *PDTNode = PDT.getNode(&BB)) { + if (auto *IPDom = PDTNode->getIDom()) { + const size_t postDom = indexMap[IPDom->getBlock()]; + node.postDom = postDom; + } + } + } + + std::vector roots; + size_t Xindex = 0; + size_t Yindex = 0; + + // It would be surprising in fact if there was more than one root, because + // we only expect a single entry block for a function, however we deal with + // it for completeness, and in case this is required to be valid for some + // intermediate state. 
+  {
+    size_t i = 0;
+    for (auto &node : graph) {
+      if (node.successors.empty()) {
+        node.postDom = ~size_t(0);
+      }
+      node.predTmp = node.predecessors;
+      if (node.predecessors == 0) {
+        roots.push_back(i);
+      }
+      ++i;
+    }
+  }
+  // A copy of the roots vector so we don't need to build it again when we come
+  // to construct the Y index.
+  std::vector<size_t> rootsY = roots;
+
+  while (!roots.empty()) {
+    const size_t u = roots.back();
+    roots.pop_back();
+
+    auto &uNode = graph[u];
+    uNode.X = Xindex++;
+    for (const size_t v : uNode.successors) {
+      auto &vNode = graph[v];
+      if (--vNode.predTmp == 0) {
+        roots.push_back(v);
+      }
+    }
+  }
+
+  for (auto &node : graph) {
+    node.predTmp = node.predecessors;
+  }
+  roots.swap(rootsY);
+
+  // Y heap represents right-most vertices (max X)
+  auto cmpY = [this](size_t lhs, size_t rhs) -> bool {
+    return graph[lhs].X < graph[rhs].X;
+  };
+
+  // The vector of roots has strictly decreasing X index, so it already has
+  // the property of a max heap. No need to make_heap!
+  while (!roots.empty()) {
+    std::pop_heap(roots.begin(), roots.end(), cmpY);
+    const size_t u = roots.back();
+    roots.pop_back();
+
+    auto &uNode = graph[u];
+    uNode.Y = Yindex++;
+    for (auto vi = uNode.successors.rbegin(), ve = uNode.successors.rend();
+         vi != ve; ++vi) {
+      const size_t v = *vi;
+      auto &vNode = graph[v];
+      if (--vNode.predTmp == 0) {
+        roots.push_back(v);
+        std::push_heap(roots.begin(), roots.end(), cmpY);
+      }
+    }
+  }
+
+  LLVM_DEBUG({
+    size_t I = 0;
+    for (auto &BB : F) {
+      auto &Node = graph[I];
+      dbgs() << BB.getName() << ":\n";
+      dbgs() << "[ " << Node.X << ", " << Node.Y << " ] : ";
+      dbgs() << "( " << Node.dom << ", " << Node.postDom << " ) : ";
+      for (const size_t S : Node.successors) {
+        if (graph[S].X <= graph[I].X) {
+          dbgs() << "!x!";
+        }
+        if (graph[S].Y <= graph[I].Y) {
+          dbgs() << "!y!";
+        }
+        dbgs() << S << "; ";
+      }
+      dbgs() << "\n\n";
+      ++I;
+    }
+  });
+
+  assert(validate() && "Topological indices not valid for reachability graph");
+}
+
+bool Reachability::validate() const {
+  for (auto &node : graph) {
+    for (const size_t s : node.successors) {
+      if (graph[s].X <= node.X || graph[s].Y <= node.Y) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+bool Reachability::isReachableImpl(size_t from, size_t to) const {
+  DenseSet<size_t> visited;
+  std::vector<size_t> worklist;
+
+  while (true) {
+    auto &nodeFrom = graph[from];
+    auto &nodeTo = graph[to];
+
+    if (nodeFrom.X > nodeTo.X || nodeFrom.Y > nodeTo.Y) {
+      return false;
+    }
+
+    const size_t dom = nodeTo.dom;
+    const size_t postDom = nodeFrom.postDom;
+    if (dom == from || postDom == to) {
+      return true;
+    }
+
+    auto &nodeDom = graph[dom];
+    if (nodeFrom.X < nodeDom.X && nodeFrom.Y < nodeDom.Y) {
+      to = dom;
+      continue;
+    }
+
+    if (postDom != ~size_t(0)) {
+      auto &nodePDom = graph[postDom];
+      if (nodePDom.X < nodeTo.X && nodePDom.Y < nodeTo.Y) {
+        from = postDom;
+        continue;
+      }
+    }
+
+    // possible false positive, so check recursively..
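+    // The topological filter only ever proves unreachability; when both
+    // indices still permit a path we must walk the successor lists to
+    // confirm it, hence the worklist below.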
+ for (const size_t succ : nodeFrom.successors) { + if (succ == to) { + return true; + } + auto &nodeSucc = graph[succ]; + if (nodeSucc.X < nodeTo.X && nodeSucc.Y < nodeTo.Y) { + if (visited.insert(succ).second) { + worklist.push_back(succ); + } + } + } + if (worklist.empty()) { + return false; + } + from = worklist.back(); + worklist.pop_back(); + } + return false; +} + +bool Reachability::isReachable(BasicBlock *from, BasicBlock *to) const { + auto fromI = indexMap.find(from); + if (fromI == indexMap.end()) { + return false; + } + + auto toI = indexMap.find(to); + if (toI == indexMap.end()) { + return false; + } + + return from == to || isReachableImpl(fromI->second, toI->second); +} + +} // namespace vecz diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp new file mode 100644 index 0000000000000..6f0c952bf64c4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp @@ -0,0 +1,53 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "simd_packet.h" + +#define DEBUG_TYPE "vecz-simd" + +using namespace llvm; +using namespace vecz; + +llvm::Value *SimdPacket::at(unsigned Index) const { + if (Index >= size()) { + return nullptr; + } else { + return (*this)[Index]; + } +} + +void SimdPacket::set(unsigned Index, Value *V) { + if (Index < size()) { + (*this)[Index] = V; + Mask.enable(Index); + } +} + +SimdPacket &SimdPacket::update(const SimdPacket &Other) { + for (unsigned i = 0; i < size(); i++) { + if (Other.Mask.isEnabled(i)) { + (*this)[i] = Other[i]; + } + } + Mask.Value |= Other.Mask.Value; + return *this; +} + +void PacketMask::enableAll(unsigned NumLanes) { + for (unsigned i = 0; i < NumLanes; i++) { + enable(i); + } +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp new file mode 100644 index 0000000000000..e8c6c086828a8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp @@ -0,0 +1,243 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "debugging.h" +#include "transform/passes.h" + +using namespace llvm; +using namespace vecz; + +#define DEBUG_TYPE "vecz-mem2reg" + +PreservedAnalyses BasicMem2RegPass::run(Function &F, + FunctionAnalysisManager &) { + LLVM_DEBUG(dbgs() << "\n\nVECZ MEM2REG on " << F.getName() << "\n"); + bool modified = false; + if (F.empty()) { + return PreservedAnalyses::all(); + } + + // Find allocas that can be promoted. + SmallVector PromotableAllocas; + BasicBlock &EntryBB = F.getEntryBlock(); + for (Instruction &I : EntryBB) { + if (AllocaInst *Alloca = dyn_cast(&I)) { + if (canPromoteAlloca(Alloca)) { + PromotableAllocas.push_back(Alloca); + } + } + } + + // Promote them. + for (AllocaInst *Alloca : PromotableAllocas) { + if (promoteAlloca(Alloca)) { + LLVM_DEBUG(dbgs() << "VM2R: Promoted :" << *Alloca << "\n"); + Alloca->eraseFromParent(); + modified = true; + } + } + + if (!modified) { + return PreservedAnalyses::all(); + } + + PreservedAnalyses PA; + PA.preserveSet(); + return PA; +} + +bool BasicMem2RegPass::canPromoteAlloca(AllocaInst *Alloca) const { + BasicBlock *ParentBB = Alloca->getParent(); + Function *F = ParentBB->getParent(); + BasicBlock &EntryBB = F->getEntryBlock(); + if (&EntryBB != ParentBB) { + return false; + } + + const unsigned SrcPointeeBits = + Alloca->getAllocatedType()->getPrimitiveSizeInBits(); + + if (SrcPointeeBits == 0) { + return false; + } + + // Validate the alloca's users. + StoreInst *TheStore = nullptr; + SmallPtrSet NonStoreUsers; + for (User *U : Alloca->users()) { + if (StoreInst *Store = dyn_cast(U)) { + // There can be at most one store. + if (TheStore) { + return false; + } + // Stores must be in the entry block. + if (Store->getParent() != &EntryBB) { + return false; + } + // Check if the store is actually storing a value *in* the alloca and not + // using the alloca itself as the value to be stored. For example, in the + // following IR code, the store can be used to promote p_639 but not + // c_640: + // + // %c_640 = alloca %struct.S2, align 16 + // %p_639 = alloca %struct.S2*, align 8 + // store %struct.S2* %c_640, %struct.S2** %p_639, align 8 + // + // Also, if the alloca pointer is stored in some other variable, we can + // not promote the alloca as we need the pointer. + if (Store->getPointerOperand() != Alloca) { + return false; + } + // Everything is fine, use this store + TheStore = Store; + } else if (isa(U)) { + // The loaded type doesn't necessarily equal the alloca type when opaque + // pointers are involved: + // %a = alloca i32 + // %v = load i16, ptr %a + // We can only promote the alloca if we can bitcast between the two + // underlying types as well. + // We could probably zero-extend or trunc if we had to? + const unsigned DstPointeeBits = U->getType()->getPrimitiveSizeInBits(); + if (!DstPointeeBits || SrcPointeeBits != DstPointeeBits) { + return false; + } + NonStoreUsers.insert(U); + } else if (BitCastInst *Cast = dyn_cast(U)) { + // The bitcast must be from one pointer type to another. + PointerType *SrcPtrTy = dyn_cast(Cast->getSrcTy()); + PointerType *DstPtrTy = dyn_cast(Cast->getType()); + if (!SrcPtrTy || !DstPtrTy) { + return false; + } + // The cast must have one load user. 
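+      // Illustrative promotable pattern (typed-pointer form):
+      //   %a = alloca i32
+      //   %c = bitcast i32* %a to float*
+      //   %v = load float, float* %c   ; 32 bits either way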
+      if (!Cast->hasOneUse()) {
+        return false;
+      }
+      User *CastUser = *Cast->user_begin();
+      if (!isa<LoadInst>(CastUser)) {
+        return false;
+      }
+      // Since this is a bitcast, we can only promote the alloca if we can
+      // bitcast between the two underlying types as well.
+      const unsigned DstPointeeBits =
+          CastUser->getType()->getPrimitiveSizeInBits();
+      if (!DstPointeeBits || SrcPointeeBits != DstPointeeBits) {
+        return false;
+      }
+      NonStoreUsers.insert(U);
+    } else {
+      // Do not allow other kinds of users.
+      return false;
+    }
+  }
+
+  // If the alloca has no value stored into it, then there is no value to get
+  // and we can't promote it.
+  if (!TheStore) {
+    return false;
+  }
+
+  // Stores must precede other users.
+  for (Instruction &I : EntryBB) {
+    if (NonStoreUsers.contains(&I)) {
+      return false;
+    } else if (&I == TheStore) {
+      break;
+    }
+  }
+
+  return true;
+}
+
+bool BasicMem2RegPass::promoteAlloca(AllocaInst *Alloca) const {
+  LLVM_DEBUG(dbgs() << "VM2R: NOW AT :" << *Alloca << "\n");
+  // Find the value stored in the alloca.
+  Value *StoredValue = nullptr;
+  SmallVector<Instruction *, 4> ToDelete;
+  for (User *U : Alloca->users()) {
+    if (StoreInst *Store = dyn_cast<StoreInst>(U)) {
+      StoredValue = Store->getValueOperand();
+      ToDelete.push_back(Store);
+      break;
+    }
+  }
+  assert(StoredValue != nullptr && "Could not find value stored in alloca");
+
+  // Replace non-store users with the stored value.
+  for (User *U : Alloca->users()) {
+    if (isa<StoreInst>(U)) {
+      continue;
+    }
+    LoadInst *Load = dyn_cast<LoadInst>(U);
+    Value *NewValue = StoredValue;
+    BitCastInst *Cast = dyn_cast<BitCastInst>(U);
+    if (Cast) {
+      // We've already verified that a bitcast must have a load attached.
+      Load = cast<LoadInst>(*Cast->user_begin());
+      LLVM_DEBUG(dbgs() << "VM2R: Cast :" << *Cast << "\n");
+    }
+    if (!Load) {
+      return false;
+    }
+    LLVM_DEBUG(dbgs() << "VM2R: Load :" << *Load << "\n");
+    // Handle any type changes - not necessarily from the BitCastInst we've
+    // checked above! We've already verified that the loaded type and the
+    // alloca size must be identical...
+    assert(Load->getType()->getPrimitiveSizeInBits() ==
+           Alloca->getAllocatedType()->getPrimitiveSizeInBits());
+    if (Load->getType() != NewValue->getType()) {
+      // ... but we haven't checked that the stored value is the right size:
+      //   %a = alloca i32
+      //   store i16, ptr %a
+      //   %v = load i32, ptr %a
+      // Note: we could do other things if the type sizes didn't match.
+      if (Load->getType()->getPrimitiveSizeInBits() !=
+          NewValue->getType()->getPrimitiveSizeInBits()) {
+        return false;
+      }
+      auto *CI =
+          CastInst::CreateBitOrPointerCast(StoredValue, Load->getType());
+      CI->insertBefore(Load->getIterator());
+      NewValue = CI;
+    }
+    LLVM_DEBUG(dbgs() << "VM2R: Replaced :" << *Load << "\n");
+    LLVM_DEBUG(dbgs() << "          |-> with :" << *NewValue << "\n");
+    Load->replaceAllUsesWith(NewValue);
+    if (Cast) {
+      ToDelete.push_back(Cast);
+    }
+    ToDelete.push_back(Load);
+  }
+
+  // Clean up instructions bottom-up (users first).
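+  // Users are popped before the instructions they use (each load before its
+  // bitcast, the store last), so the use_empty() check below only ever
+  // deletes instructions whose dependents are already gone.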
+  while (!ToDelete.empty()) {
+    Instruction *I = ToDelete.pop_back_val();
+    if (I->use_empty()) {
+      LLVM_DEBUG(dbgs() << "VM2R: Deleted :" << *I << "\n");
+      I->eraseFromParent();
+    }
+  }
+  return true;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
new file mode 100644
index 0000000000000..9e865838c021e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
@@ -0,0 +1,294 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/IR/DataLayout.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include <llvm/IR/LegacyPassManager.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Transforms/IPO/AlwaysInliner.h>
+
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "transform/passes.h"
+
+using namespace llvm;
+using namespace vecz;
+
+PreservedAnalyses BuiltinInliningPass::run(Module &M,
+                                           ModuleAnalysisManager &AM) {
+  bool modified = false;
+  bool needToRunInliner = false;
+  llvm::FunctionAnalysisManager &FAM =
+      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  for (Function &F : M.functions()) {
+    SmallVector<Instruction *, 4> ToDelete;
+    for (BasicBlock &BB : F) {
+      if (!FAM.getResult<VectorizationUnitAnalysis>(F).hasResult()) {
+        continue;
+      }
+      for (Instruction &I : BB) {
+        // Only look at call instructions as those are the only things that
+        // can be builtins.
+        CallInst *CI = dyn_cast<CallInst>(&I);
+        if (!CI) {
+          continue;
+        }
+
+        bool NeedLLVMInline = false;
+        Value *NewCI = processCallSite(CI, NeedLLVMInline);
+        needToRunInliner |= NeedLLVMInline;
+        if ((NewCI == CI) || !NewCI) {
+          continue;
+        }
+
+        if (!CI->getType()->isVoidTy()) {
+          CI->replaceAllUsesWith(NewCI);
+        }
+        ToDelete.push_back(CI);
+        modified = true;
+      }
+    }
+    // Clean up.
+    while (!ToDelete.empty()) {
+      Instruction *I = ToDelete.pop_back_val();
+      I->eraseFromParent();
+    }
+  }
+
+  // Run the LLVM inliner if some calls were marked as needing inlining.
+  if (needToRunInliner) {
+    llvm::legacy::PassManager PM;
+    PM.add(llvm::createAlwaysInlinerLegacyPass());
+    modified |= PM.run(M);
+  }
+
+  // Recursively run the pass to inline any newly introduced functions.
+  if (modified) {
+    run(M, AM);
+  }
+
+  return modified ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+static Value *emitBuiltinMemSet(Function *F, IRBuilder<> &B,
+                                ArrayRef<Value *> Args, llvm::CallBase *CB) {
+  LLVMContext &Context = F->getContext();
+  auto &DL = F->getParent()->getDataLayout();
+  const unsigned PtrBits = DL.getPointerSizeInBits();
+
+  // Check the alignment constraints do not exceed the algorithmic
+  // requirements of doing 64 bits at a time.
+
+  // @llvm.memset defines 0 and 1 to both mean no alignment.
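+  // Illustrative overall shape of the expansion: a constant-length,
+  // well-aligned memset such as memset(p, c, 20) becomes two i64 stores
+  // followed by four i8 stores. When the prerequisites below fail, this
+  // helper returns nullptr and the intrinsic call is left unmodified.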
+  const auto &MSI = cast<MemSetInst>(CB);
+
+  // Note that once LLVM 8.0 is deprecated we can use actual alignment
+  // classes.
+  const Align Alignment = MSI->getDestAlign().valueOrOne();
+  const Align Int64Alignment = DL.getABITypeAlign(B.getInt64Ty());
+  if (Alignment < std::max(Int64Alignment, Align(8u))) {
+    return nullptr;
+  }
+
+  Value *DstPtr = Args[0];
+  Type *Int8Ty = B.getInt8Ty();
+
+  Value *StoredValue = Args[1];
+  const bool IsVolatile = (Args.back() == ConstantInt::getTrue(Context));
+  llvm::StoreInst *MS = nullptr;
+
+  // For nicely named IR instructions.
+  const std::string DstName = DstPtr->getName().str();
+
+  // We can only replace memset instructions if they have a constant length.
+  ConstantInt *CL = dyn_cast<ConstantInt>(Args[2]);
+  if (!CL) {
+    return nullptr;
+  }
+  const int64_t Bytes = CL->getValue().getZExtValue();
+
+  // Unlike memcpy, if we want to use 64-bit stores in memset we need to
+  // construct the 64-bit value from an 8-bit one.
+  // First, check if we can get the value at compile time.
+  ConstantInt *ConstantValue = dyn_cast<ConstantInt>(StoredValue);
+  Value *StoredValue64 = nullptr;
+  if (ConstantValue) {
+    // If we can get the value at compile time, calculate the 64-bit value at
+    // compile time as well.
+    const unsigned IntValue = ConstantValue->getZExtValue();
+    APInt APValue(64, IntValue);
+    for (int i = 1; IntValue && i < 8; ++i) {
+      APValue |= APValue << 8;
+    }
+    StoredValue64 = ConstantInt::get(Context, APValue);
+  } else {
+    // If we can't get the value at compile time, we have to emit instructions
+    // to generate it at runtime.
+    StoredValue64 = B.CreateZExt(StoredValue, Type::getInt64Ty(Context));
+    for (int i = 1; i < 8; ++i) {
+      StoredValue64 = B.CreateOr(
+          StoredValue64,
+          B.CreateShl(StoredValue64,
+                      llvm::ConstantInt::get(Context, llvm::APInt(64, 8))));
+    }
+  }
+  StoredValue64->setName("ms64val");
+
+  // Emit enough stores to replicate the behaviour of memset.
+  int64_t byte = 0;
+  // Initially we use 64-bit stores, in order to avoid emitting too many
+  // instructions.
+  for (; byte <= Bytes - 8; byte += 8) {
+    Value *Idx = B.getIntN(PtrBits, byte);
+    Value *OffsetDstPtr = B.CreateInBoundsGEP(Int8Ty, DstPtr, Idx);
+    MS = B.CreateStore(StoredValue64, OffsetDstPtr, IsVolatile);
+
+    // Set the store's alignment to be the minimum of that from the
+    // instruction and what is required for 8-byte stores.
+    const Align StoreAlign =
+        byte == 0 ? Alignment : std::min(Align(8u), Alignment);
+    MS->setAlignment(StoreAlign);
+  }
+  // ...and then we fill in the remainder with 8-bit stores.
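+  // Worked example: memset(p, 0x2A, 13) widens the byte to the i64 splat
+  // 0x2A2A2A2A2A2A2A2A, emits one i64 store at offset 0, then five i8
+  // stores at offsets 8..12.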
+  for (; byte < Bytes; byte += 1) {
+    Value *Idx = B.getIntN(PtrBits, byte);
+    Value *OffsetDstPtr = B.CreateInBoundsGEP(Int8Ty, DstPtr, Idx, DstName);
+    MS = B.CreateStore(StoredValue, OffsetDstPtr, IsVolatile);
+    MS->setAlignment(llvm::Align(1));
+  }
+
+  return MS;
+}
+
+static Value *emitBuiltinMemCpy(Function *F, IRBuilder<> &B,
+                                ArrayRef<Value *> Args, llvm::CallBase *CB) {
+  LLVMContext &Context = F->getContext();
+  auto &DL = F->getParent()->getDataLayout();
+
+  const auto &MTI = cast<MemTransferInst>(CB);
+  const Align DestAlignment = MTI->getDestAlign().valueOrOne();
+  const Align SourceAlignment = MTI->getSourceAlign().valueOrOne();
+  const Align Int64Alignment = DL.getABITypeAlign(B.getInt64Ty());
+
+  if (DestAlignment < std::max(Int64Alignment, Align(8u))) {
+    return nullptr;
+  }
+
+  if (SourceAlignment < std::max(Int64Alignment, Align(8u))) {
+    return nullptr;
+  }
+
+  const unsigned PtrBits = DL.getPointerSizeInBits();
+
+  Value *DstPtr = Args[0];
+  Value *SrcPtr = Args[1];
+  Type *Int8Ty = B.getInt8Ty();
+
+  const bool IsVolatile = (Args.back() == ConstantInt::getTrue(Context));
+  llvm::StoreInst *MC = nullptr;
+
+  // For nicely named IR instructions.
+  const std::string DstName = DstPtr->getName().str();
+  const std::string SrcName = SrcPtr->getName().str();
+
+  // Get the length as a constant.
+  ConstantInt *CL = dyn_cast<ConstantInt>(Args[2]);
+  // We can only replace memcpy instructions if they have a constant length.
+  if (!CL) {
+    return nullptr;
+  }
+  const int64_t Length = CL->getValue().getSExtValue();
+
+  // Emit enough loads and stores to replicate the behaviour of memcpy.
+  int64_t byte = 0;
+  // Initially we use 64-bit loads and stores, in order to avoid emitting too
+  // many instructions...
+  Type *Int64Ty = B.getInt64Ty();
+
+  for (; byte <= Length - 8; byte += 8) {
+    Value *Idx = B.getIntN(PtrBits, byte);
+    Value *OffsetSrcPtr = B.CreateInBoundsGEP(Int8Ty, SrcPtr, Idx);
+    Value *OffsetDstPtr = B.CreateInBoundsGEP(Int8Ty, DstPtr, Idx);
+    LoadInst *LoadValue =
+        B.CreateLoad(Int64Ty, OffsetSrcPtr, IsVolatile, SrcName);
+    MC = B.CreateStore(LoadValue, OffsetDstPtr, IsVolatile);
+
+    // Set the alignments of the load and store to be the minimum of that
+    // from the instruction and what is required for 8-byte loads/stores.
+    const Align StoreAlign =
+        byte == 0 ? DestAlignment : std::min(Align(8u), DestAlignment);
+    MC->setAlignment(StoreAlign);
+    const Align LoadAlign =
+        byte == 0 ? SourceAlignment : std::min(Align(8u), SourceAlignment);
+    LoadValue->setAlignment(LoadAlign);
+  }
+  // ...and then we fill in the remainder with 8-bit loads and stores.
+  for (; byte < Length; byte += 1) {
+    Value *Idx = B.getIntN(PtrBits, byte);
+    Value *OffsetSrcPtr = B.CreateInBoundsGEP(Int8Ty, SrcPtr, Idx);
+    Value *OffsetDstPtr = B.CreateInBoundsGEP(Int8Ty, DstPtr, Idx, DstName);
+    LoadInst *LoadValue =
+        B.CreateLoad(Int8Ty, OffsetSrcPtr, IsVolatile, SrcName);
+    MC = B.CreateStore(LoadValue, OffsetDstPtr, IsVolatile);
+    LoadValue->setAlignment(llvm::Align(1));
+    MC->setAlignment(llvm::Align(1));
+  }
+
+  return MC;
+}
+
+Value *BuiltinInliningPass::processCallSite(CallInst *CI,
+                                            bool &NeedLLVMInline) {
+  NeedLLVMInline = false;
+
+  Function *Callee = CI->getCalledFunction();
+  if (!Callee) {
+    return CI;
+  }
+
+  // Mark a user function as needing inlining by LLVM, unless it has the
+  // NoInline attribute.
+  if (!Callee->isDeclaration() &&
+      !Callee->hasFnAttribute(Attribute::NoInline)) {
+    CI->addFnAttr(Attribute::AlwaysInline);
+    NeedLLVMInline = true;
+    return CI;
+  }
+
+  // Specially inline some LLVM intrinsics.
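+  // Only @llvm.memcpy and @llvm.memset are special-cased here: expanding
+  // them into plain loads and stores lets later vecz stages mask and
+  // packetize the accesses instead of having to handle the intrinsics
+  // themselves.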
+  if (Callee->isIntrinsic()) {
+    if (Callee->getIntrinsicID() == Intrinsic::memcpy) {
+      IRBuilder<> B(CI);
+      const SmallVector<Value *, 4> Args(CI->args());
+      if (Value *Impl = emitBuiltinMemCpy(Callee, B, Args, CI)) {
+        return Impl;
+      }
+    }
+
+    if (Callee->getIntrinsicID() == Intrinsic::memset) {
+      IRBuilder<> B(CI);
+      const SmallVector<Value *, 4> Args(CI->args());
+      if (Value *Impl = emitBuiltinMemSet(Callee, B, Args, CI)) {
+        return Impl;
+      }
+    }
+  }
+
+  return CI;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/common_gep_elimination_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/common_gep_elimination_pass.cpp
new file mode 100644
index 0000000000000..7a6e7d00fb05e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/common_gep_elimination_pass.cpp
@@ -0,0 +1,112 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/common_gep_elimination_pass.h"
+
+#include <llvm/IR/Dominators.h>
+#include <llvm/IR/Instructions.h>
+
+#include <unordered_map>
+
+#include "analysis/control_flow_analysis.h"
+#include "analysis/divergence_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "ir_cleanup.h"
+#include "vectorization_unit.h"
+
+using namespace llvm;
+using namespace vecz;
+
+char CommonGEPEliminationPass::PassID = 0;
+
+PreservedAnalyses CommonGEPEliminationPass::run(Function &F,
+                                                FunctionAnalysisManager &AM) {
+  const DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+
+  // Redundant GEPs to remove.
+  SmallPtrSet<GetElementPtrInst *, 16> toDelete;
+  // GEPs we come across.
+  std::unordered_multimap<Value *, GetElementPtrInst *> GEPs;
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+        Value *Ptr = GEP->getPointerOperand();
+        // If this is the first time we meet the source of the GEP, just add
+        // it to the multimap and look for another GEP.
+        if (GEPs.find(Ptr) == GEPs.end()) {
+          GEPs.emplace(Ptr, GEP);
+          continue;
+        }
+
+        // The range of values that have the key `Ptr`.
+        auto Range = GEPs.equal_range(Ptr);
+        auto it = Range.first;
+        for (; it != Range.second; it++) {
+          auto *trackedGEP = it->second;
+          if (GEP->getNumIndices() != trackedGEP->getNumIndices()) {
+            continue;
+          }
+
+          // With opaque pointers, we need to check the element types as well.
+          if (GEP->getSourceElementType() !=
+              trackedGEP->getSourceElementType()) {
+            continue;
+          }
+
+          unsigned i = 0;
+          for (; i < GEP->getNumIndices(); i++) {
+            Value *lhs = GEP->getOperand(i + 1);
+            Value *rhs = trackedGEP->getOperand(i + 1);
+
+            // The two GEPs differ at this index, so stop comparing.
+            if (lhs != rhs) {
+              break;
+            }
+          }
+
+          // trackedGEP does the same operation as GEP, so replace GEP
+          // with the already tracked GEP.
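+          // Illustrative example: %p2 recomputes %p1 and is replaced by it,
+          // provided %p1's block dominates %p2's:
+          //   %p1 = getelementptr inbounds i32, ptr %base, i64 %idx
+          //   ...
+          //   %p2 = getelementptr inbounds i32, ptr %base, i64 %idx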
+          if (i == GEP->getNumIndices()) {
+            if (DT.dominates(trackedGEP->getParent(), GEP->getParent())) {
+              GEP->replaceAllUsesWith(trackedGEP);
+              toDelete.insert(GEP);
+              break;
+            }
+          }
+        }
+        // We iterated over all values whose key is Ptr, but haven't found
+        // a matching GEP, so add the latter to the multimap.
+        if (it == Range.second) {
+          GEPs.emplace(Ptr, GEP);
+        }
+      }
+    }
+  }
+
+  // Proceed to remove every duplicate GEP we found.
+  for (auto *GEP : toDelete) {
+    IRCleanup::deleteInstructionNow(GEP);
+  }
+
+  PreservedAnalyses Preserved;
+  Preserved.preserve<CFGAnalysis>();
+  Preserved.preserve<DivergenceAnalysis>();
+  Preserved.preserve<VectorizationUnitAnalysis>();
+  Preserved.preserve<DominatorTreeAnalysis>();
+
+  return Preserved;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
new file mode 100644
index 0000000000000..175e1f043729d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -0,0 +1,3306 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/control_flow_conversion_pass.h"
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/SmallPtrSet.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/Statistic.h>
+#include <llvm/ADT/Twine.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Analysis/PostDominators.h>
+#include <llvm/Analysis/ValueTracking.h>
+#include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/CFG.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Dominators.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/InstrTypes.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/Type.h>
+#include <llvm/IR/Value.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/Error.h>
+#include <llvm/Support/raw_ostream.h>
+
+#include <memory>
+#include <optional>
+
+#include "analysis/control_flow_analysis.h"
+#include "analysis/divergence_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "control_flow_boscc.h"
+#include "control_flow_roscc.h"
+#include "debugging.h"
+#include "ir_cleanup.h"
+#include "llvm_helpers.h"
+#include "memory_operations.h"
+#include "reachability.h"
+#include "transform/passes.h"
+#include "vecz/vecz_choices.h"
+
+#define DEBUG_TYPE "vecz-cf"
+
+using namespace llvm;
+using namespace vecz;
+
+class ControlFlowConversionState::Impl : public ControlFlowConversionState {
+public:
+  Impl(Function &F, FunctionAnalysisManager &AM)
+      : ControlFlowConversionState(F, AM) {}
+
+  PreservedAnalyses run(Function &, FunctionAnalysisManager &);
+
+private:
+  /// @brief Utility struct used by LinearizeCFG to allow block retargeting
+  /// info to be stored in a single contiguous vector of variable-length
+  /// subvectors. This avoids having to use a vector of vectors, and all
+  /// the individual heap allocations that would involve. Empirically (based
+  /// on UnitCL) we have approximately one new target per Basic Block overall,
+  /// and never more than 2 (which is not to say more than 2 is impossible).
+  /// Since we iterate over all NewTargetInfos linearly, we only need to
+  /// record the number of targets for each block, and not their starting
+  /// indices.
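+  /// For example (illustrative), blocks A, B and C with 2, 0 and 1 new
+  /// targets respectively are stored as infos = [{A,2}, {B,0}, {C,1}] and
+  /// data = [A0, A1, C0], read back by walking both vectors in step.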
+  struct Linearization {
+    struct NewTargetInfo {
+      BasicBlock *BB;
+      size_t numTargets = 0;
+
+      NewTargetInfo(BasicBlock *bb) : BB(bb) {}
+    };
+
+    std::vector<NewTargetInfo> infos;
+    std::vector<BasicBlock *> data;
+
+    void beginBlock(BasicBlock *BB) { infos.emplace_back(BB); }
+    size_t currentSize() const { return infos.back().numTargets; }
+    void push(BasicBlock *BB) {
+      data.push_back(BB);
+      ++infos.back().numTargets;
+    }
+  };
+
+  /// @brief Type that maps exit blocks to exit mask information.
+  using DenseExitPHIMap = SmallDenseMap<const BasicBlock *, PHINode *, 2>;
+  /// @brief Type that maps exiting blocks to update mask information.
+  using DenseExitUpdateMap =
+      SmallDenseMap<const BasicBlock *, BinaryOperator *, 2>;
+
+  struct LoopMasksInfo {
+    /// @brief Keep track of which instances left the loop through which exit
+    /// (persisted throughout the whole loop).
+    DenseExitPHIMap persistedDivergentExitMasks;
+    /// @brief Divergent loop exit masks updated for the current iteration.
+    DenseExitUpdateMap updatedPersistedDivergentExitMasks;
+    /// @brief Combined divergent loop exit masks of the current iteration.
+    Instruction *combinedDivergentExitMask = nullptr;
+    /// @brief Combined divergent loop exit masks of the whole loop.
+    Instruction *persistedCombinedDivergentExitMask = nullptr;
+  };
+
+  /// @brief Convert the function's CFG to data-flow.
+  /// @return true if the function's CFG was converted, false otherwise.
+  bool convertToDataFlow();
+
+  /// @brief Generate masks needed to do control-flow to data-flow conversion.
+  /// @return true if masks were generated successfully, false otherwise.
+  bool generateMasks();
+
+  /// @brief Generate masks for the given block.
+  /// @param[in] BB Block whose masks we are generating.
+  /// @return true if no problem occurred, false otherwise.
+  bool createMasks(BasicBlock &BB);
+
+  /// @brief Create entry mask for the given block.
+  /// @param[in] BB Block whose masks we are generating.
+  /// @return true if no problem occurred, false otherwise.
+  bool createEntryMasks(BasicBlock &BB);
+
+  /// @brief Create exit mask for the given block.
+  /// @param[in] BB Block whose masks we are generating.
+  /// @param[in] isBOSCCEntry Whether BB creates a uniform region.
+  /// @return true if no problem occurred, false otherwise.
+  bool createExitMasks(BasicBlock &BB, bool isBOSCCEntry = false);
+
+  /// @brief Create loop exit masks for the given loop.
+  /// @param[in,out] LTag Information on the loop we are evaluating.
+  /// @return true if no problem occurred, false otherwise.
+  bool createLoopExitMasks(LoopTag &LTag);
+
+  /// @brief Combine all information about instances that left the loop in the
+  /// current iteration.
+  /// @param[in,out] LTag Information on the loop we are evaluating.
+  /// @return true if no problem occurred, false otherwise.
+  bool createCombinedLoopExitMask(LoopTag &LTag);
+
+  /// @brief Apply masks to basic blocks in the function, to prevent
+  /// side-effects for inactive instances.
+  ///
+  /// @return llvm::Error::success if masks were applied successfully, an
+  /// error message explaining the failure otherwise.
+  Error applyMasks();
+
+  /// @brief Apply a mask to the given basic block, to prevent side-effects
+  /// for inactive instances.
+  ///
+  /// @param[in] BB Basic block to apply masks to.
+  /// @param[in] mask Mask to apply.
+  ///
+  /// @return llvm::Error::success if masks were applied successfully, an
+  /// error message explaining the failure otherwise.
+  Error applyMask(BasicBlock &BB, Value *mask);
+
+  /// @brief Emit a call instruction to the masked version of the called
+  /// function.
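+  ///
+  /// For example (illustrative names): under entry mask %m, the call
+  /// "call void @foo(i32 %x)" becomes "call void @foo.masked(i32 %x, i1 %m)"
+  /// so that inactive lanes skip foo's side-effects.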
+  ///
+  /// @param[in] CI The call instruction to create a masked version of
+  /// @param[in] entryBit The Value that determines if the lane is active or
+  /// not.
+  /// @return The call instruction to the masked version.
+  CallInst *emitMaskedVersion(CallInst *CI, Value *entryBit);
+
+  /// @brief Create a masked version of the given function.
+  ///
+  /// The Function (F) to be masked will be extracted from the CallInst and a
+  /// new Function (NewFunction) will be generated. NewFunction takes the
+  /// same arguments as F, plus an additional boolean argument that
+  /// determines if the lane is active or not. If the boolean argument is
+  /// true, then NewFunction will execute F and (if it's not void) return its
+  /// return value. Vararg functions are supported by expanding their
+  /// arguments.
+  ///
+  /// @param[in] CI The call instruction to create a masked version of
+  /// @return The masked function
+  Function *getOrCreateMaskedVersion(CallInst *CI);
+
+  /// @brief A type that maps unmasked instructions onto masked replacements.
+  using DeletionMap = SmallVector<std::pair<Instruction *, Value *>, 4>;
+
+  /// @brief Attempt to apply a mask to an Instruction as a binary operation.
+  ///
+  /// @param[in] I The binary operation to apply the mask to
+  /// @param[in] mask The mask to apply
+  /// @param[out] toDelete mapping of deleted unmasked operations
+  /// @param[out] safeDivisors a cache of re-usable known non-zero divisors
+  /// @return true if it was a BinOp, false otherwise
+  bool tryApplyMaskToBinOp(Instruction &I, Value *mask, DeletionMap &toDelete,
+                           DenseMap<Value *, Value *> &safeDivisors);
+
+  /// @brief Attempt to apply a mask to a memory operation.
+  ///
+  /// @param[in] op The MemOp to apply the mask to
+  /// @param[in] mask The mask to apply to the MemOp
+  /// @param[out] toDelete mapping of deleted unmasked operations
+  /// @return true if the MemOp got masked, false otherwise
+  bool tryApplyMaskToMemOp(MemOp &op, Value *mask, DeletionMap &toDelete);
+
+  /// @brief Attempt to apply a mask to a call instruction.
+  ///
+  /// @param[in] CI The call instruction to apply the mask to
+  /// @param[in] mask The mask to apply
+  /// @param[out] toDelete mapping of deleted unmasked operations
+  /// @return true if it is valid to mask this call, false otherwise
+  bool applyMaskToCall(CallInst *CI, Value *mask, DeletionMap &toDelete);
+
+  /// @brief Attempt to apply a mask to an atomic instruction via a builtin
+  /// call.
+  ///
+  /// @param[in] I The (atomic) instruction to apply the mask to
+  /// @param[in] mask The mask to apply to the masked atomic
+  /// @param[out] toDelete mapping of deleted unmasked operations
+  /// @return true if it is valid to mask this atomic, false otherwise
+  bool applyMaskToAtomic(Instruction &I, Value *mask, DeletionMap &toDelete);
+
+  /// @brief Linearize a CFG.
+  /// @return true if no problem occurred, false otherwise.
+  bool partiallyLinearizeCFG();
+
+  /// @brief Create the reduction functions needed to vectorize the branch.
+  /// @return true on success, false otherwise
+  bool createBranchReductions();
+
+  /// @brief Uniformize every divergent loop.
+  ///
+  /// @return true if no problem occurred, false otherwise.
+  bool uniformizeDivergentLoops();
+
+  /// @brief Assign a divergent loop a single loop exit from which all other
+  /// exits will be rewired.
+  /// @param[in] LTag Tag of the processed loop
+  /// @return true if no problem occurred, false otherwise.
+  bool computeDivergentLoopPureExit(LoopTag &LTag);
+
+  /// @brief Rewire every loop exit block such that the loop can be
+  /// considered uniform.
+  ///
+  /// @param[in] LTag Tag of the processed loop
+  /// @param[in] exitBlocks List of exit blocks before any transformation
+  /// @return true if no problem occurred, false otherwise.
+  bool rewireDivergentLoopExitBlocks(
+      LoopTag &LTag, const SmallVectorImpl<BasicBlock *> &exitBlocks);
+
+  /// @brief Generate blend operations to discard execution of inactive
+  /// instances.
+  /// @param[in] LTag The loop whose live value is being handled.
+  /// @return true if no problem occurred, false otherwise.
+  bool generateDivergentLoopResults(LoopTag &LTag);
+
+  /// @brief Generate loop live value update instructions.
+  /// @param[in] LLV The loop live value we want to generate instructions for.
+  /// @param[in] LTag The loop whose live value is being handled.
+  /// @return true if no problem occurred, false otherwise.
+  bool generateDivergentLoopResultUpdates(Value *LLV, LoopTag &LTag);
+
+  /// @brief Generate blend instructions for loop live values at the latch.
+  /// @param[in] LTag The loop whose live values are being handled.
+  /// @param[in] exitBlocks List of exit blocks before any transformation
+  /// @return true if no problem occurred, false otherwise.
+  bool blendDivergentLoopLiveValues(
+      LoopTag &LTag, const SmallVectorImpl<BasicBlock *> &exitBlocks);
+
+  /// @brief Generate blend instructions for loop exit masks at the latch.
+  ///
+  /// @param[in] LTag Tag of the processed loop
+  /// @param[in] exitEdges List of exit edges before any transformation
+  /// @param[in] exitBlocks List of exit blocks before any transformation
+  /// @return true if no problem occurred, false otherwise.
+  bool blendDivergentLoopExitMasks(
+      LoopTag &LTag, const SmallVectorImpl<Loop::Edge> &exitEdges,
+      const SmallVectorImpl<BasicBlock *> &exitBlocks);
+
+  /// @brief Replace uses of loop values outside of a divergent loop.
+  ///
+  /// @param[in] LTag Tag of the processed loop
+  /// @param[in] from Instruction to be replaced.
+  /// @param[in] to Instruction to replace `from` with.
+  /// @param[in] exitBlocks Exit blocks of the loop.
+  /// @return true if no problem occurred, false otherwise.
+  bool replaceUsesOutsideDivergentLoop(
+      LoopTag &LTag, Value *from, Value *to,
+      const SmallVectorImpl<BasicBlock *> &exitBlocks);
+
+  /// @brief Assign new targets to edges based on the dominance-compact
+  /// ordering.
+  /// @param[out] lin New target information for each BasicBlock
+  /// @return true if no problem occurred, false otherwise.
+  bool computeNewTargets(Linearization &lin);
+
+  /// @brief Linearize the CFG with the newly calculated edges.
+  /// @return true if no problem occurred, false otherwise.
+  bool linearizeCFG();
+
+  /// @brief Generate blend operations to discard execution of inactive
+  /// instances.
+  /// @return true if no problem occurred, false otherwise.
+  bool generateSelects();
+
+  /// @brief Split a phi instruction into several select instructions.
+  /// @param[in,out] PHI The PHI node we want to split.
+  /// @param[in] B The block PHI belongs to.
+  /// @return true if no problem occurred, false otherwise.
+  bool generateSelectFromPHI(PHINode *PHI, BasicBlock *B);
+
+  /// @brief Repair the SSA form. First blend and create new masks from the
+  /// new wires, then blend all the instructions that need blending.
+  /// @return true if no errors occurred.
+  bool repairSSA();
+
+  /// @brief Update the incoming blocks of phi nodes whose predecessors have
+  /// changed whilst rewiring.
+  /// @return true if no errors occurred.
+  bool updatePHIsIncomings();
+
+  /// @brief Blend instructions before their uses if divergence happened
+  /// in between.
+  /// @return true if no errors occurred.
+  bool blendInstructions();
+
+  /// @brief Simplify the mask instructions.
+  /// @return true if no errors occurred.
+  bool simplifyMasks();
+
+  /// @brief Check all blocks have a unique index order.
+  /// @return true if no errors occurred.
+  bool checkBlocksOrder() const;
+
+  /// @brief Upon modifying a mask, we need to update the in-memory masks as
+  /// well.
+  /// @param[in] src The block whose mask changed
+  /// @param[in] from The old mask
+  /// @param[in] to The new mask
+  void replaceMasks(BasicBlock *src, Value *from, Value *to);
+
+  /// @brief Upon removing an instruction, we need to also update our internal
+  /// containers.
+  /// @param[in] from The old value
+  /// @param[in] to The new value
+  void updateMaps(Value *from, Value *to);
+
+  BasicBlock *functionExitBlock = nullptr;
+  DenseSet<Instruction *> blends;
+  DenseMap<const Loop *, LoopMasksInfo> LoopMasks;
+};
+
+STATISTIC(VeczCFGFail,
+          "Number of kernels that failed control flow conversion [ID#L80]");
+
+// Set this to enable all-of masks in the latch of divergent loops. This can
+// be interesting if there exists an intrinsic that, when comparing vector
+// instructions, can immediately stop comparing if one of the operands is
+// false. In counterpart, this makes us update two more values per divergent
+// loop (the values that keep track of which instances left the loop).
+//
+// Because no such intrinsic exists to my knowledge, we don't set this by
+// default.
+#undef ALL_OF_DIVERGENT_LOOP_LATCH
+
+namespace {
+
+BasicBlock::iterator getInsertionPt(BasicBlock &BB) {
+  // We have to insert instructions after any Allocas.
+  auto it = BB.getFirstInsertionPt();
+  while (isa<AllocaInst>(*it)) {
+    ++it;
+  }
+  return it;
+}
+
+Instruction *copyMask(Value *mask, Twine name) {
+  VECZ_ERROR_IF(!mask, "Trying to copy mask with invalid arguments");
+  return BinaryOperator::CreateAnd(mask, getDefaultValue(mask->getType(), 1),
+                                   name);
+}
+
+Instruction *copyEntryMask(Value *mask, BasicBlock &BB) {
+  VECZ_ERROR_IF(!mask, "Trying to copy entry mask with invalid arguments");
+  auto *EM = copyMask(mask, BB.getName() + ".entry_mask");
+  EM->insertBefore(getInsertionPt(BB));
+  return EM;
+}
+
+Instruction *copyExitMask(Value *mask, StringRef base, BasicBlock &BB) {
+  VECZ_ERROR_IF(!mask, "Trying to copy exit mask with invalid arguments");
+  auto *EM = copyMask(mask, base + ".exit_mask");
+  EM->insertBefore(BB.getTerminator()->getIterator());
+  return EM;
+}
+
+/// Wrap a string into an llvm::StringError, pointing to an instruction.
+static inline Error makeStringError(const Twine &message, Instruction &I) {
+  std::string helper_str = message.str();
+  raw_string_ostream helper_stream(helper_str);
+  helper_stream << " " << I;
+  return make_error<StringError>(helper_stream.str(),
+                                 inconvertibleErrorCode());
+}
+
+// A helper method to determine whether a branch condition
+// (expected to be an i1 result of a comparison instruction) is truly uniform.
+static bool isBranchCondTrulyUniform(Value *cond, UniformValueResult &UVR) {
+  const auto *cmp = dyn_cast_if_present<CmpInst>(cond);
+  if (!cmp || cmp->getType()->isVectorTy()) {
+    return false;
+  }
+
+  return UVR.isTrueUniform(cmp);
+}
+}  // namespace
+
+////////////////////////////////////////////////////////////////////////////////
+
+char ControlFlowConversionPass::PassID = 0;
+
+PreservedAnalyses ControlFlowConversionPass::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+  ControlFlowConversionState::Impl state(F, AM);
+  return state.run(F, AM);
+}
+
+ControlFlowConversionState::ControlFlowConversionState(
+    Function &F, FunctionAnalysisManager &AM)
+    : F(F), AM(AM),
+      VU(AM.getResult<VectorizationUnitAnalysis>(F).getVU()),
+      Ctx(AM.getResult<VectorizationContextAnalysis>(F).getContext()) {}
+
+PreservedAnalyses
+ControlFlowConversionState::Impl::run(Function &F,
+                                      FunctionAnalysisManager &AM) {
+  const auto &CFGR = AM.getResult<CFGAnalysis>(F);
+  if (CFGR.getFailed()) {
+    ++VeczCFGFail;
+    return VU.setFailed("Cannot vectorize the CFG for", &F, &F);
+  } else if (!CFGR.isConversionNeeded()) {
+    return PreservedAnalyses::all();
+  }
+  functionExitBlock = CFGR.getExitBlock();
+
+  if (!convertToDataFlow()) {
+    // This pass may leave the function in an invalid state. Instead of doing
+    // so, and hoping that later passes don't throw verification failures back
+    // at us, replace the function body with an unreachable statement. Marking
+    // vectorization as having failed will mean the function will later be
+    // deleted.
+    // Note that this is quite coarse-grained; we could be cleverer, e.g., by
+    // returning whether convertToDataFlow has (potentially) left behind an
+    // invalid function.
+    ++VeczCFGFail;
+    VU.setFailed("Control flow conversion failed for", &F,
+                 VU.scalarFunction());
+    F.deleteBody();
+    BasicBlock *BB = BasicBlock::Create(F.getContext(), "entry", &F);
+    IRBuilder<> IRB(BB);
+    IRB.CreateUnreachable();
+    return PreservedAnalyses::none();
+  }
+
+  PreservedAnalyses Preserved;
+  Preserved.preserve<CFGAnalysis>();
+
+  return Preserved;
+}
+
+bool ControlFlowConversionState::replaceReachableUses(Reachability &RC,
+                                                      Instruction *from,
+                                                      Value *to,
+                                                      BasicBlock *src) {
+  for (auto it = from->use_begin(); it != from->use_end();) {
+    Use &U = *it++;
+    Instruction *user = cast<Instruction>(U.getUser());
+
+    if (user == to) {
+      continue;
+    }
+
+    BasicBlock *blockUse = user->getParent();
+
+    if (PHINode *PHI = dyn_cast<PHINode>(user)) {
+      // Cannot replace a use in a phi node with another phi node in the same
+      // block.
+      if (blockUse == src) {
+        if (isa<PHINode>(to)) {
+          continue;
+        }
+      } else {
+        // We must also check that 'src' can reach the incoming block to be
+        // allowed to replace the incoming value.
+        BasicBlock *incoming = PHI->getIncomingBlock(U);
+        if (!RC.isReachable(src, incoming)) {
+          continue;
+        }
+      }
+    }
+
+    if (auto toI = dyn_cast<Instruction>(to)) {
+      if (toI->getParent() == blockUse) {
+        for (Instruction &I : *src) {
+          // If we found the user before `to`, then skip this user as it lives
+          // before `to` in the same block.
+          if (&I == user) {
+            break;
+          }
+          if (&I == to) {
+            LLVM_DEBUG(dbgs() << "Replace " << *from << " with " << *to
+                              << " in " << *user << "\n");
+            U.set(to);
+            break;
+          }
+        }
+        // We've handled all possible cases if `to` lives in the same block as
+        // `user`, so iterate over a new instruction.
+        continue;
+      }
+    }
+
+    // `to` is in a different block than `user` so just check for reachability
+    // across BasicBlocks and not within them.
+    if (RC.isReachable(src, blockUse)) {
+      LLVM_DEBUG(dbgs() << "Replace " << *from << " with " << *to << " in "
+                        << *user << "\n");
+      U.set(to);
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::convertToDataFlow() {
+  DT = &AM.getResult<DominatorTreeAnalysis>(F);
+  PDT = &AM.getResult<PostDominatorTreeAnalysis>(F);
+  LI = &AM.getResult<LoopAnalysis>(F);
+  UVR = &AM.getResult<UniformValueAnalysis>(F);
+
+  // Make sure every loop has an entry in the masks table before we start.
+  for (auto *L : *LI) {
+    LoopMasks[L];
+  }
+
+  if (!VU.choices().linearizeBOSCC()) {
+    ROSCCGadget ROSCC(*this);
+    ROSCC.run(F);
+  }
+
+  RC = std::make_unique<Reachability>(*DT, *PDT, *LI);
+
+  // We do this after ROSCC, because it may have modified the CFG.
+  DR = &AM.getResult<DivergenceAnalysis>(F);
+
+  if (VU.choices().linearizeBOSCC()) {
+    BOSCC = std::make_unique<BOSCCGadget>(*this);
+    if (!BOSCC->duplicateUniformRegions()) {
+      emitVeczRemarkMissed(&F, VU.scalarFunction(),
+                           "Could not duplicate uniform regions for");
+      return false;
+    }
+  }
+
+  // Reserve space for the masks table and default-construct all entries, to
+  // avoid re-hashing/element relocation on access.
+  MaskInfos.reserve(F.size());
+  for (auto &BB : F) {
+    MaskInfos[&BB];
+  }
+
+  if (!generateMasks()) {
+    emitVeczRemarkMissed(&F, VU.scalarFunction(),
+                         "Could not generate masks for");
+    return false;
+  }
+  if (auto err = applyMasks()) {
+    emitVeczRemarkMissed(&F, VU.scalarFunction(), "Could not apply masks for",
+                         llvm::toString(std::move(err)));
+    return false;
+  }
+
+  if (!partiallyLinearizeCFG()) {
+    emitVeczRemarkMissed(&F, VU.scalarFunction(),
+                         "Could not partially linearize the CFG for");
+    return false;
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::generateMasks() {
+  LLVM_DEBUG(dbgs() << "MASKS GENERATION\n");
+
+  RC->update(F);
+
+  VECZ_FAIL_IF(!createMasks(*functionExitBlock));
+
+  if (BOSCC) {
+    // The BOSCC entry blocks that have not been duplicated need exit masks
+    // towards uniform blocks.
+    SmallVector<BasicBlock *, 4> entryBlocks;
+    BOSCC->getUnduplicatedEntryBlocks(entryBlocks);
+    for (auto *const entry : entryBlocks) {
+      VECZ_FAIL_IF(!createExitMasks(*entry, true));
+    }
+
+    // Link the masks of the predicated regions to the uniform regions.
+    VECZ_FAIL_IF(!BOSCC->linkMasks());
+  }
+
+  for (auto *const LTag : DR->getLoopOrdering()) {
+    VECZ_FAIL_IF(!createLoopExitMasks(*LTag));
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::createMasks(BasicBlock &BB) {
+  // If we have already set the mask for this block, don't do it again.
+  // Uniform blocks are handled separately because of their lack of context.
+  if (MaskInfos[&BB].entryMask) {
+    return true;
+  }
+
+  auto *const LTag = DR->getTag(&BB).loop;
+  auto *const header = LTag ? LTag->header : nullptr;
+  // If BB is a header, we will need the mask from its preheader.
+  // KLOCWORK "NPD.CHECK.MIGHT" possible false positive
+  // LTag is only dereferenced if it's not nullptr, but Klocwork doesn't
+  // follow the logic.
+  if (header == &BB) {
+    BasicBlock *preheader = LTag->preheader;
+    VECZ_FAIL_IF(!createMasks(*preheader));
+  } else {
+    // Otherwise we will need the mask from every incoming edge.
+    for (BasicBlock *pred : predecessors(&BB)) {
+      VECZ_FAIL_IF(!createMasks(*pred));
+    }
+  }
+
+  VECZ_FAIL_IF(!createEntryMasks(BB));
+  VECZ_FAIL_IF(!createExitMasks(BB));
+
+  // If the block is a loop header, its entry mask is a phi function with
+  // incoming values from the preheader and:
+  //   - the latch for divergent loops,
+  //   - nothing else for uniform loops (because if we enter a uniform loop,
+  //     all instances that were active upon entry remain active upon exit).
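+  // For a divergent loop this materializes as (illustrative):
+  //   %header.entry_mask = phi i1 [ %preheader.exit_mask, %preheader ],
+  //                               [ %latch.exit_mask, %latch ]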
+  if (header == &BB) {
+    BasicBlock *latch = LTag->latch;
+    VECZ_FAIL_IF(!createMasks(*latch));
+
+    if (LTag->isLoopDivergent()) {
+      auto *const entryMask = MaskInfos[&BB].entryMask;
+      assert(isa<PHINode>(entryMask) &&
+             "Divergent Loop entry mask must be a PHI Node!");
+      PHINode *phi = cast<PHINode>(entryMask);
+      // If the header has two incoming values, we have already processed it.
+      if (phi->getNumIncomingValues() != 2) {
+        Value *latchMask = MaskInfos[latch].exitMasks[header];
+        phi->addIncoming(latchMask, latch);
+
+        LLVM_DEBUG(dbgs() << "Divergent loop header " << header->getName()
+                          << ": entry mask: " << *phi << "\n");
+      }
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::createEntryMasks(BasicBlock &BB) {
+  auto &maskInfo = MaskInfos[&BB];
+  if (maskInfo.entryMask) {
+    return true;
+  }
+
+  Type *maskTy = Type::getInt1Ty(BB.getContext());
+
+  // If the block is by_all (i.e. executed by all lanes), it will always
+  // execute with a fully active mask. Similarly, if the block is uniform,
+  // its mask is true by definition.
+  if (DR->isByAll(BB) || DR->isUniform(BB)) {
+    maskInfo.entryMask = copyEntryMask(getDefaultValue(maskTy, 1), BB);
+    LLVM_DEBUG(dbgs() << BB.getName() << ": entry mask: "
+                      << *maskInfo.entryMask << "\n");
+    return true;
+  }
+
+  // If the block has only one predecessor, set its entry mask to be its
+  // predecessor's exit mask.
+  const unsigned numPreds = std::distance(pred_begin(&BB), pred_end(&BB));
+  if (numPreds == 1) {
+    BasicBlock *pred = *pred_begin(&BB);
+    maskInfo.entryMask = copyEntryMask(MaskInfos[pred].exitMasks[&BB], BB);
+    LLVM_DEBUG(dbgs() << BB.getName()
+                      << ": entry mask: its single predecessor's exit mask "
+                      << *maskInfo.entryMask << "\n");
+    return true;
+  }
+
+  // If the block is a loop header, its mask is a phi function with incoming
+  // values from the preheader and:
+  //   - the latch for divergent loops,
+  //   - nothing else for uniform loops (because if we enter a uniform loop,
+  //     all instances that were active upon entry remain active upon exit).
+  //
+  // Here we only store the preheader's exit mask; the latch, in case the
+  // loop is divergent, is handled in the caller function.
+  const auto *const LTag = DR->getTag(&BB).loop;
+  if (LTag && LTag->header == &BB) {
+    BasicBlock *preheader = LTag->preheader;
+    VECZ_ERROR_IF(!preheader, "BasicBlock tag is not defined");
+
+    if (LTag->isLoopDivergent()) {
+      PHINode *PHI = PHINode::Create(maskTy, 2, BB.getName() + ".entry_mask");
+      PHI->insertBefore(BB.begin());
+      PHI->addIncoming(MaskInfos[preheader].exitMasks[&BB], preheader);
+      maskInfo.entryMask = PHI;
+      LLVM_DEBUG(dbgs() << "Divergent loop header " << BB.getName()
+                        << ": entry mask: " << *maskInfo.entryMask << "\n");
+    } else {
+      maskInfo.entryMask =
+          copyEntryMask(MaskInfos[preheader].exitMasks[&BB], BB);
+      LLVM_DEBUG(dbgs() << "Uniform loop header " << BB.getName()
+                        << ": entry mask: " << *maskInfo.entryMask << "\n");
+    }
+    return true;
+  }
+
+  // If the dominator of this block is also post-dominated by this block,
+  // then if one is executed, the other must be too. So copy the mask.
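+  // e.g. in a diamond "A -> {B, C} -> D", A dominates D and D
+  // post-dominates A: both run with exactly the same set of active lanes,
+  // so D can simply reuse A's entry mask.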
+ auto *IDom = DT->getNode(&BB)->getIDom(); + while (IDom) { + BasicBlock *DomBB = IDom->getBlock(); + if (DR->getTag(DomBB).loop == LTag && PDT->dominates(&BB, DomBB)) { + maskInfo.entryMask = copyEntryMask(MaskInfos[DomBB].entryMask, BB); + LLVM_DEBUG(dbgs() << "Copied-via-domination " << BB.getName() + << ": entry mask: " << *maskInfo.entryMask << "\n"); + return true; + } + IDom = IDom->getIDom(); + } + + // In any other case, its mask is the disjunction of every incoming edge. + // The union of every predecessor if it is a join point of a varying branch. + if (DR->isBlend(BB)) { + for (auto it = pred_begin(&BB); it != pred_end(&BB); ++it) { + if (it == pred_begin(&BB)) { + maskInfo.entryMask = copyEntryMask(MaskInfos[*it].exitMasks[&BB], BB); + LLVM_DEBUG(dbgs() << "Blend block " << BB.getName() + << ": entry mask: " << *maskInfo.entryMask << "\n"); + } else { + auto InsertPt = std::next(maskInfo.entryMask->getIterator()); + maskInfo.entryMask = BinaryOperator::CreateOr( + maskInfo.entryMask, MaskInfos[*it].exitMasks[&BB], + BB.getName() + ".entry_mask"); + maskInfo.entryMask->insertBefore(InsertPt); + + LLVM_DEBUG(dbgs() << "Blend block " << BB.getName() + << ": entry mask: " << *maskInfo.entryMask << "\n"); + } + } + } else { + // A phi function of the predecessors otherwise. + PHINode *PHI = + PHINode::Create(maskTy, numPreds, BB.getName() + ".entry_mask"); + PHI->insertBefore(BB.begin()); + for (auto it = pred_begin(&BB); it != pred_end(&BB); ++it) { + PHI->addIncoming(MaskInfos[*it].exitMasks[&BB], *it); + } + maskInfo.entryMask = PHI; + LLVM_DEBUG(dbgs() << BB.getName() << ": entry mask: " << *maskInfo.entryMask + << "\n"); + } + + return true; +} + +bool ControlFlowConversionState::Impl::createExitMasks(BasicBlock &BB, + bool isBOSCCEntry) { + assert((!isBOSCCEntry || BOSCC) && + "Creating BOSCC Exit Masks when BOSCC object does not exist!"); + + auto &maskInfo = MaskInfos[&BB]; + + // If BB is a BOSCC entry, we want to compute the uniform exit masks for + // this block. + if (!isBOSCCEntry && !maskInfo.exitMasks.empty()) { + return true; + } + + const unsigned numSucc = std::distance(succ_begin(&BB), succ_end(&BB)); + + // If BB has no successor, there is obviously nothing to do. + if (numSucc == 0) { + return true; + } + + // If BB has only one successor, then the exit mask is the entry mask of BB. + if (numSucc == 1) { + BasicBlock *succ = *succ_begin(&BB); + maskInfo.exitMasks[succ] = + copyExitMask(maskInfo.entryMask, succ->getName(), BB); + LLVM_DEBUG(dbgs() << BB.getName() << ": exit mask to single successor " + << succ->getName() << ": " << *maskInfo.entryMask + << "\n"); + return true; + } + + const bool isVarying = DR->getTag(&BB).hasVaryingBranch(); + + // If BB has more than 1 successor, the exit mask of each successor is the + // conjunction of the entry mask of BB and the condition to jump to the + // successor. 
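+  // i.e. exit_mask(BB -> succ) = entry_mask(BB) & edge_cond, emitted as
+  // "select i1 %entry_mask, i1 %cond, i1 false" so that lanes inactive in
+  // BB can never appear active on one of its outgoing edges.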
+  auto *T = BB.getTerminator();
+  IRBuilder<> B(T);
+
+  if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
+    BasicBlock *trueBB = BI->getSuccessor(0);
+    BasicBlock *falseBB = BI->getSuccessor(1);
+    assert(trueBB && "Could not get successor 0 of branch");
+    assert(falseBB && "Could not get successor 1 of branch");
+
+    if (isBOSCCEntry) {
+      if (BasicBlock *trueBBUniform = BOSCC->getBlock(trueBB)) {
+        trueBB = trueBBUniform;
+      }
+      if (BasicBlock *falseBBUniform = BOSCC->getBlock(falseBB)) {
+        falseBB = falseBBUniform;
+      }
+    }
+
+    Value *cond = BI->getCondition();
+    if (isVarying) {
+      Value *constantFalse = getDefaultValue(cond->getType());
+
+      maskInfo.exitMasks[trueBB] =
+          B.CreateSelect(maskInfo.entryMask, cond, constantFalse,
+                         trueBB->getName() + ".exit_mask");
+
+      // For the false edge, we have to negate the condition.
+      Value *negCond = B.CreateNot(cond, cond->getName() + ".not");
+      maskInfo.exitMasks[falseBB] =
+          B.CreateSelect(maskInfo.entryMask, negCond, constantFalse,
+                         falseBB->getName() + ".exit_mask");
+
+      LLVM_DEBUG(dbgs() << BB.getName() << ": varying exit mask to "
+                        << trueBB->getName() << ": "
+                        << *maskInfo.exitMasks[trueBB] << "\n");
+      LLVM_DEBUG(dbgs() << BB.getName() << ": varying exit mask to "
+                        << falseBB->getName() << ": "
+                        << *maskInfo.exitMasks[falseBB] << "\n");
+    } else {
+      maskInfo.exitMasks[trueBB] = B.CreateSelect(
+          cond, maskInfo.entryMask, getDefaultValue(cond->getType()),
+          trueBB->getName() + ".exit_mask");
+      maskInfo.exitMasks[falseBB] =
+          B.CreateSelect(cond, getDefaultValue(cond->getType()),
+                         maskInfo.entryMask, falseBB->getName() + ".exit_mask");
+
+      LLVM_DEBUG(dbgs() << BB.getName() << ": uniform exit mask to "
+                        << trueBB->getName() << ": "
+                        << *maskInfo.exitMasks[trueBB] << "\n");
+      LLVM_DEBUG(dbgs() << BB.getName() << ": uniform exit mask to "
+                        << falseBB->getName() << ": "
+                        << *maskInfo.exitMasks[falseBB] << "\n");
+    }
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) {
+    Value *cond = SI->getCondition();
+    BasicBlock *defaultDest = SI->getDefaultDest();
+
+    if (isBOSCCEntry) {
+      if (BasicBlock *defaultDestUniform = BOSCC->getBlock(defaultDest)) {
+        defaultDest = defaultDestUniform;
+      }
+    }
+
+    // The default condition is the negation of the disjunction of every case
+    // condition, so that if no case has its condition true, then we choose
+    // the default.
+    Value *caseConds = nullptr;
+    Value *constantFalse = nullptr;
+    for (auto c : SI->cases()) {
+      Value *caseCond = B.CreateICmpEQ(cond, c.getCaseValue());
+      if (!caseConds) {
+        caseConds = caseCond;
+        constantFalse = getDefaultValue(caseCond->getType());
+      } else {
+        caseConds = B.CreateOr(caseConds, caseCond);
+      }
+      BasicBlock *caseBlock = c.getCaseSuccessor();
+      if (isBOSCCEntry) {
+        if (BasicBlock *caseBlockUniform = BOSCC->getBlock(caseBlock)) {
+          caseBlock = caseBlockUniform;
+        }
+      }
+
+      if (isVarying) {
+        maskInfo.exitMasks[caseBlock] =
+            B.CreateSelect(maskInfo.entryMask, caseCond, constantFalse,
+                           caseBlock->getName() + ".exit_mask");
+        LLVM_DEBUG(dbgs() << BB.getName() << ": varying exit mask to "
+                          << caseBlock->getName() << ": "
+                          << *maskInfo.exitMasks[caseBlock] << "\n");
+      } else {
+        maskInfo.exitMasks[caseBlock] =
+            B.CreateSelect(maskInfo.entryMask, caseCond, constantFalse,
+                           caseBlock->getName() + ".exit_mask");
+        LLVM_DEBUG(dbgs() << BB.getName() << ": uniform exit mask to "
+                          << caseBlock->getName() << ": "
+                          << *maskInfo.exitMasks[caseBlock] << "\n");
+      }
+    }
+
+    VECZ_ERROR_IF(!caseConds, "No switch condition was found");
+
+    Value *negCond = B.CreateNot(caseConds, caseConds->getName() + ".not");
+    if (isVarying) {
+      maskInfo.exitMasks[defaultDest] =
+          B.CreateSelect(maskInfo.entryMask, negCond, constantFalse,
+                         defaultDest->getName() + ".exit_mask");
+      LLVM_DEBUG(dbgs() << BB.getName() << ": varying exit mask to "
+                        << defaultDest->getName() << ": "
+                        << *maskInfo.exitMasks[defaultDest] << "\n");
+    } else {
+      maskInfo.exitMasks[defaultDest] =
+          B.CreateSelect(maskInfo.entryMask, negCond, constantFalse,
+                         defaultDest->getName() + ".exit_mask");
+      LLVM_DEBUG(dbgs() << BB.getName() << ": uniform exit mask to "
+                        << defaultDest->getName() << ": "
+                        << *maskInfo.exitMasks[defaultDest] << "\n");
+    }
+  } else {
+    // We should not have a case where we don't have a BranchInst nor a
+    // SwitchInst but more than one successor.
+    return false;
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::createLoopExitMasks(LoopTag &LTag) {
+  auto &LMask = LoopMasks[LTag.loop];
+  // If the loop already has a combined exit mask, we have already processed
+  // it.
+  if (LMask.combinedDivergentExitMask) {
+    return true;
+  }
+
+  Type *maskTy = Type::getInt1Ty(F.getContext());
+  SmallVector<Loop::Edge, 4> exitEdges;
+  LTag.loop->getExitEdges(exitEdges);
+  for (const Loop::Edge &EE : exitEdges) {
+    const auto *const exitingBlock = EE.first;
+    const auto *const exitBlock = EE.second;
+    // Divergent loops need to keep track of which instances left at which
+    // exit.
+    if (LTag.isLoopDivergent() && DR->isDivergent(*exitBlock)) {
+      // The value of the exit mask of a divergent loop is a phi function
+      // between the mask update and the loop exit mask phi.
+      auto *const exitMask =
+          PHINode::Create(maskTy, 2, exitBlock->getName() + ".loop_exit_mask");
+      exitMask->insertBefore(LTag.header->getFirstNonPHIIt());
+      LMask.persistedDivergentExitMasks[exitingBlock] = exitMask;
+      if (BOSCC) {
+        BOSCC->createReference(exitMask, getDefaultValue(maskTy));
+      }
+    }
+  }
+
+  for (Loop *L : LTag.loop->getSubLoops()) {
+    VECZ_FAIL_IF(!createLoopExitMasks(DR->getTag(L)));
+  }
+
+  // If the loop is uniform, all instances that enter the loop will leave it
+  // together.
+  if (!LTag.isLoopDivergent()) {
+    return true;
+  }
+
+  // Check if the exit edge leaves multiple loops, in which case we return
+  // the next inner loop left by it.
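+  // e.g. with loops L1 (outer) and L2 (inner) and a break jumping from a
+  // block of L2 straight out of L1, the edge leaves both loops; when
+  // processing L1 this helper returns L2, whose accumulated update mask
+  // feeds L1's own mask update.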
+  auto nextInnerLoopLeft = [this, &LTag](BasicBlock *exitingBlock,
+                                         BasicBlock *exitBlock) -> Loop * {
+    Loop *innerLoop = nullptr;
+    Loop *loop = DR->getTag(exitingBlock).loop->loop;
+    // Iterate until we reach the current loop.
+    while (loop && loop != LTag.loop) {
+      // If this is an exit edge.
+      if (loop->contains(exitingBlock) && !loop->contains(exitBlock)) {
+        innerLoop = loop;
+      }
+
+      loop = loop->getParentLoop();
+    }
+
+    return innerLoop;
+  };
+
+  for (const Loop::Edge &EE : exitEdges) {
+    BasicBlock *exitingBlock = const_cast<BasicBlock *>(EE.first);
+    BasicBlock *exitBlock = const_cast<BasicBlock *>(EE.second);
+
+    if (DR->isDivergent(*exitBlock)) {
+      PHINode *REM = LMask.persistedDivergentExitMasks[exitingBlock];
+      REM->addIncoming(getDefaultValue(REM->getType()), LTag.preheader);
+
+      const auto *const exitingLTag = DR->getTag(exitingBlock).loop;
+      VECZ_ERROR_IF(!exitingLTag, "Loop tag is not defined");
+
+      // By default, the second operand of the mask update is the exit
+      // condition.
+      auto &exitMasks = MaskInfos[exitingBlock].exitMasks;
+      Value *maskUpdateOperand = exitMasks[exitBlock];
+
+      // If the exit leaves multiple loops and the current loop is not the
+      // innermost left by this exit, set the update mask to be a disjunction
+      // of the exit mask and the accumulated update mask from the next inner
+      // loop left by this exit.
+      if (exitingLTag->loop != LTag.loop) {
+        if (Loop *nestedLoop = nextInnerLoopLeft(exitingBlock, exitBlock)) {
+          maskUpdateOperand =
+              LoopMasks[nestedLoop]
+                  .updatedPersistedDivergentExitMasks[exitingBlock];
+        }
+      }
+
+      BinaryOperator *maskUpdate = BinaryOperator::CreateOr(
+          REM, maskUpdateOperand,
+          exitBlock->getName() + ".loop_exit_mask.update");
+      maskUpdate->insertBefore(exitingBlock->getTerminator()->getIterator());
+
+      LMask.updatedPersistedDivergentExitMasks[exitingBlock] = maskUpdate;
+
+      if (BOSCC) {
+        // The uniform version of a divergent loop exit mask is the edge's
+        // exit mask.
+        BOSCC->addReference(maskUpdate, exitMasks[exitBlock]);
+      }
+
+      // If this is the outermost loop left by this exit, update the exit
+      // mask.
+      if (DR->getTag(exitBlock).outermostExitedLoop == &LTag) {
+        VECZ_ERROR_IF(!isa<Instruction>(exitMasks[exitBlock]),
+                      "Trying to replace uses of a value");
+        VECZ_FAIL_IF(!replaceReachableUses(
+            *RC, cast<Instruction>(exitMasks[exitBlock]), maskUpdate,
+            exitBlock));
+
+        exitMasks[exitBlock] = maskUpdate;
+      }
+
+      REM->addIncoming(maskUpdate, LTag.latch);
+
+      LLVM_DEBUG(dbgs() << "Divergent loop " << LTag.loop->getName()
+                        << ": divergent loop exit edges ["
+                        << exitingBlock->getName() << " -> "
+                        << exitBlock->getName() << "]: exit mask: " << *REM
+                        << "\n");
+      LLVM_DEBUG(dbgs() << "Divergent loop " << LTag.loop->getName()
+                        << ": divergent loop exit edges ["
+                        << exitingBlock->getName() << " -> "
+                        << exitBlock->getName()
+                        << "]: update exit mask: " << *maskUpdate << "\n");
+    }
+  }
+
+  VECZ_FAIL_IF(!createCombinedLoopExitMask(LTag));
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::createCombinedLoopExitMask(
+    LoopTag &LTag) {
+  // Gather all the information about the instances that left the loop in the
+  // current iteration.
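+  // The per-iteration combined mask ORs together every divergent exit
+  // condition taken this time around the loop, while the persisted variant
+  // also ORs in earlier iterations, so the latch knows which lanes remain.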
+  SmallVector<Loop::Edge, 4> exitEdges;
+  auto *const L = LTag.loop;
+  L->getExitEdges(exitEdges);
+  auto &LMask = LoopMasks[L];
+  for (const Loop::Edge &EE : exitEdges) {
+    BasicBlock *exitingBlock = const_cast<BasicBlock *>(EE.first);
+    BasicBlock *exitBlock = const_cast<BasicBlock *>(EE.second);
+    if (DR->isDivergent(*exitBlock)) {
+      if (!LMask.combinedDivergentExitMask) {
+        LMask.combinedDivergentExitMask = copyMask(
+            LMask.updatedPersistedDivergentExitMasks[exitingBlock]->getOperand(
+                1),
+            L->getName() + ".combined_divergent_exit_mask");
+
+        LMask.persistedCombinedDivergentExitMask = copyMask(
+            LMask.updatedPersistedDivergentExitMasks[exitingBlock],
+            L->getName() + ".persisted_combined_divergent_exit_mask");
+      } else {
+        LMask.combinedDivergentExitMask = BinaryOperator::CreateOr(
+            LMask.combinedDivergentExitMask,
+            LMask.updatedPersistedDivergentExitMasks[exitingBlock]->getOperand(
+                1),
+            L->getName() + ".combined_divergent_exit_mask");
+
+        LMask.persistedCombinedDivergentExitMask = BinaryOperator::CreateOr(
+            LMask.persistedCombinedDivergentExitMask,
+            LMask.updatedPersistedDivergentExitMasks[exitingBlock],
+            L->getName() + ".persisted_combined_divergent_exit_mask");
+      }
+      LMask.combinedDivergentExitMask->insertBefore(
+          LTag.latch->getTerminator()->getIterator());
+      LMask.persistedCombinedDivergentExitMask->insertBefore(
+          LTag.latch->getTerminator()->getIterator());
+    }
+  }
+
+  VECZ_ERROR_IF(!LMask.combinedDivergentExitMask ||
+                    !LMask.persistedCombinedDivergentExitMask,
+                "Divergent loop has no loop exit condition");
+
+  LLVM_DEBUG(dbgs() << "Divergent loop " << LTag.loop->getName()
+                    << ": current iteration combined divergent loop exit: "
+                    << *LMask.combinedDivergentExitMask << "\n");
+  LLVM_DEBUG(dbgs() << "Divergent loop " << LTag.loop->getName()
+                    << ": whole loop combined divergent loop exit: "
+                    << *LMask.persistedCombinedDivergentExitMask << "\n");
+
+  return true;
+}
+
+Error ControlFlowConversionState::Impl::applyMasks() {
+  for (auto &BB : F) {
+    // Use masks with instructions that have side-effects.
+    if (!DR->isUniform(BB) && !DR->isByAll(BB)) {
+      auto *const entryMask = MaskInfos[&BB].entryMask;
+      VECZ_ERROR_IF(!entryMask, "BasicBlock should have an entry mask");
+      if (auto err = applyMask(BB, entryMask)) {
+        return err;
+      }
+    }
+  }
+  return Error::success();
+}
+
+Error ControlFlowConversionState::Impl::applyMask(BasicBlock &BB,
+                                                  Value *mask) {
+  // Packetization hasn't happened yet so this had better be a scalar 1-bit
+  // int.
+  assert(mask->getType()->isIntegerTy(1) && "CFG mask type should be int1");
+  // Map the unmasked instruction to the masked one.
+  DeletionMap toDelete;
+  DenseMap<Value *, Value *> safeDivisors;
+
+  for (Instruction &I : BB) {
+    if (tryApplyMaskToBinOp(I, mask, toDelete, safeDivisors)) {
+      continue;
+    }
+    std::optional<MemOp> memOp = MemOp::get(&I);
+    // Turn loads and stores into masked loads and stores.
+    if (memOp && (memOp->isLoad() || memOp->isStore())) {
+      if (!tryApplyMaskToMemOp(*memOp, mask, toDelete)) {
+        return makeStringError("Could not apply mask to MemOp", I);
+      }
+    } else if (auto *CI = dyn_cast<CallInst>(&I)) {
+      // Turn calls into masked calls if possible.
+      if (!applyMaskToCall(CI, mask, toDelete)) {
+        return makeStringError("Could not apply mask to call instruction", I);
+      }
+    } else if (I.isAtomic() && !isa<FenceInst>(&I)) {
+      // Turn atomics into calls to masked builtins if possible.
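+      // e.g. an atomicrmw can be routed through an internal builtin that
+      // takes the mask as an extra operand and only performs the update for
+      // active lanes (see applyMaskToAtomic).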
+      if (!applyMaskToAtomic(I, mask, toDelete)) {
+        return makeStringError("Could not apply mask to atomic instruction", I);
+      }
+    } else if (auto *branch = dyn_cast<BranchInst>(&I)) {
+      // We have to be careful with infinite loops, because if they exist on a
+      // divergent code path, they will always be entered and will hang the
+      // kernel. Therefore, we replace the branch condition with the mask of
+      // the preheader, to ensure they only loop if at least one lane is
+      // actually executed.
+      if (branch->isConditional()) {
+        auto *const cond = dyn_cast<Constant>(branch->getCondition());
+        if (cond && cond->isOneValue()) {
+          auto *const loop = DR->getTag(&BB).loop;
+          if (loop && loop->latch == &BB) {
+            auto *const loopMask = MaskInfos[loop->preheader].entryMask;
+            branch->setCondition(loopMask);
+          }
+        }
+      }
+    }
+  }
+
+  for (auto &pair : toDelete) {
+    Instruction *unmasked = pair.first;
+    Value *masked = pair.second;
+    updateMaps(unmasked, masked);
+    IRCleanup::deleteInstructionNow(unmasked);
+  }
+
+  return Error::success();
+}
+
+CallInst *ControlFlowConversionState::Impl::emitMaskedVersion(CallInst *CI,
+                                                              Value *entryBit) {
+  // Get the masked function.
+  Function *newFunction = Ctx.getOrCreateMaskedFunction(CI);
+  VECZ_FAIL_IF(!newFunction);
+  SmallVector<Value *, 8> fnArgs;
+  for (unsigned i = 0; i < CI->arg_size(); ++i) {
+    fnArgs.push_back(CI->getOperand(i));
+  }
+  fnArgs.push_back(entryBit);
+
+  CallInst *newCI = CallInst::Create(newFunction, fnArgs);
+  newCI->insertBefore(CI->getIterator());
+  newCI->setCallingConv(CI->getCallingConv());
+  newCI->setAttributes(CI->getAttributes());
+
+  return newCI;
+}
+
+bool ControlFlowConversionState::Impl::tryApplyMaskToBinOp(
+    Instruction &I, Value *mask, DeletionMap &toDelete,
+    DenseMap<Value *, Value *> &safeDivisors) {
+  if (auto *binOp = dyn_cast<BinaryOperator>(&I)) {
+    if (!VU.choices().isEnabled(VectorizationChoices::eDivisionExceptions)) {
+      // We don't need to mask division operations if they don't trap.
+      return true;
+    }
+    // We might have to mask integer divides to avoid division errors.
+    // NOTE we don't generate any specific error checks ourselves, on the
+    // assumption that the incoming IR is already guarded against these,
+    // so it is sufficient to use the mask generated from the CFG.
+    bool isUnsigned = false;
+    switch (binOp->getOpcode()) {
+      case Instruction::UDiv:
+      case Instruction::URem:
+        isUnsigned = true;
+        LLVM_FALLTHROUGH;
+      case Instruction::SDiv:
+      case Instruction::SRem: {
+        auto *divisor = binOp->getOperand(1);
+        // No need to mask divides by a constant.
+        if (auto *C = dyn_cast<Constant>(divisor)) {
+          if (C->isZeroValue()) {
+            // Divides by constant zero can be a NOP since there is no
+            // division by zero exception in OpenCL.
+            auto *nop = binOp->getOperand(0);
+            I.replaceAllUsesWith(nop);
+            toDelete.emplace_back(&I, nop);
+          }
+        } else {
+          auto &masked = safeDivisors[divisor];
+          if (!masked) {
+            // NOTE this function does not check for the pattern
+            // "select (x eq 0) 1, x" or equivalent, so we might want to
+            // write it ourselves, but Instruction Combining cleans it up.
+            // NOTE that for a signed division, we also have to consider the
+            // potential overflow situation, which is not so simple.
+            if (isUnsigned &&
+                isKnownNonZero(divisor, F.getParent()->getDataLayout())) {
+              // Static analysis concluded it can't be zero, so we don't need
+              // to do anything.
+              masked = divisor;
+            } else {
+              auto *SI = SelectInst::Create(
+                  mask, divisor, ConstantInt::get(divisor->getType(), 1),
+                  divisor->getName() + ".masked");
+              SI->insertBefore(I.getIterator());
+              masked = SI;
+            }
+          }
+
+          if (masked != divisor) {
+            binOp->setOperand(1, masked);
+          }
+        }
+      } break;
+
+      default:
+        break;
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool ControlFlowConversionState::Impl::tryApplyMaskToMemOp(
+    MemOp &memOp, Value *mask, DeletionMap &toDelete) {
+  VECZ_FAIL_IF(!memOp.isLoad() && !memOp.isStore());
+  auto *I = memOp.getInstr();
+  VECZ_FAIL_IF(!I);
+  auto *dataVecTy = dyn_cast<FixedVectorType>(memOp.getDataType());
+  const unsigned dataWidth = dataVecTy ? dataVecTy->getNumElements() : 1;
+  Value *wideMask = mask;
+  if (dataWidth > 1) {
+    // If it's a vector mem-op it gets the same mask for every element.
+    IRBuilder<> B(I);
+    wideMask = B.CreateVectorSplat(dataWidth, mask);
+  }
+
+  // Turn loads and stores into masked loads and stores.
+  if (memOp.isLoadStoreInst()) {
+    // Create a new mem-op the same as the original except for the addition
+    // of the mask.
+    Instruction *newVal = nullptr;
+    if (memOp.isLoad()) {
+      newVal = createMaskedLoad(
+          Ctx, memOp.getDataType(), memOp.getPointerOperand(), wideMask,
+          /*VL*/ nullptr, memOp.getAlignment(), I->getName());
+    } else {
+      newVal = createMaskedStore(
+          Ctx, memOp.getDataOperand(), memOp.getPointerOperand(), wideMask,
+          /*VL*/ nullptr, memOp.getAlignment(), I->getName());
+    }
+    VECZ_FAIL_IF(!newVal);
+
+    newVal->insertBefore(I->getIterator());
+
+    if (!I->getType()->isVoidTy()) {
+      I->replaceAllUsesWith(newVal);
+    }
+    toDelete.emplace_back(I, newVal);
+    return true;
+  }
+
+  if (auto *opMask = memOp.getMaskOperand()) {
+    auto *mask = BinaryOperator::CreateAnd(wideMask, opMask, "composite_mask");
+    mask->insertBefore(I->getIterator());
+    memOp.setMaskOperand(mask);
+    return true;
+  }
+
+  return false;
+}
+
+bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI,
+                                                       Value *mask,
+                                                       DeletionMap &toDelete) {
+  LLVM_DEBUG(dbgs() << "vecz-cf: Now at CallInst " << *CI << "\n");
+  // It might be that we need to mask the function call here because we
+  // won't be able to packetize it later on.
+  Function *callee = CI->getCalledFunction();
+  if (!callee) {
+    callee = dyn_cast<Function>(CI->getCalledOperand()->stripPointerCasts());
+  }
+  VECZ_FAIL_IF(!callee);  // TODO: Support indirect function calls.
+  // Check to see if this is a function that we know we won't be able to
+  // handle in any other way.
+  VECZ_FAIL_IF(callee->cannotDuplicate());
+
+  // Do not mess with internal builtins.
+  if (Ctx.isInternalBuiltin(callee)) {
+    LLVM_DEBUG(dbgs() << "vecz-cf: Called function is an internal builtin\n");
+    return true;
+  }
+
+  // Functions without side-effects do not need to be masked.
+  if (callee->onlyReadsMemory() || callee->doesNotAccessMemory()) {
+    LLVM_DEBUG(
+        dbgs() << "vecz-cf: Called function does not have any side-effects\n");
+    return true;
+  }
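+  // Illustrative sketch (hypothetical names, not from the original source):
+  // when a call with side-effects survives the checks below, emitMaskedVersion
+  // rewrites it so the mask becomes a trailing parameter, e.g.
+  //   %r = call i32 @foo(i32 %x)
+  // becomes something along the lines of
+  //   %r = call i32 @foo.masked(i32 %x, i1 %mask)
+  // where the masked wrapper only performs the work for active lanes.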
+  // Builtins without side effects do not need to be masked.
+  if (const auto builtin = Ctx.builtins().analyzeBuiltin(*callee)) {
+    const auto props = builtin->properties;
+    if (props & compiler::utils::eBuiltinPropertyNoSideEffects) {
+      LLVM_DEBUG(dbgs() << "vecz-cf: Called function is a pure builtin\n");
+      return true;
+    }
+    if (props & compiler::utils::eBuiltinPropertyWorkItem) {
+      LLVM_DEBUG(
+          dbgs() << "vecz-cf: Called function is a workitem ID builtin\n");
+      return true;
+    }
+    if (props & compiler::utils::eBuiltinPropertyExecutionFlow) {
+      LLVM_DEBUG(
+          dbgs() << "vecz-cf: Called function is an execution flow builtin\n");
+      // Masking this kind of builtin (a barrier) is not valid.
+      return false;
+    }
+    // We don't want to mask work-group collective builtins, because they are
+    // barriers (see above). This should actually be a rare situation, as these
+    // builtins are required to be uniform/convergent and so either all
+    // work-items or no work-items should hit them. Most of the time, this
+    // situation relies on the vectorizer failing to trace the branch flow and
+    // failing to realize the conditions are in fact uniform.
+    if (auto info = Ctx.builtins().isMuxGroupCollective(builtin->ID);
+        info && info->isWorkGroupScope()) {
+      LLVM_DEBUG(
+          dbgs() << "vecz-cf: Called function is a work-group collective\n");
+      return true;
+    }
+  }
+
+  // Create the new function and replace the old one with it.
+  CallInst *newCI = emitMaskedVersion(CI, mask);
+  VECZ_FAIL_IF(!newCI);
+  if (!CI->getType()->isVoidTy()) {
+    CI->replaceAllUsesWith(newCI);
+  }
+  toDelete.emplace_back(CI, newCI);
+
+  LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << *CI << "\n");
+  LLVM_DEBUG(dbgs() << "          with " << *newCI << "\n");
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::applyMaskToAtomic(
+    Instruction &I, Value *mask, DeletionMap &toDelete) {
+  LLVM_DEBUG(dbgs() << "vecz-cf: Now at atomic inst " << I << "\n");
+
+  SmallVector<Value *, 8> maskedFnArgs;
+  VectorizationContext::MaskedAtomic MA;
+  MA.VF = ElementCount::getFixed(1);
+  MA.IsVectorPredicated = VU.choices().vectorPredication();
+
+  if (auto *atomicI = dyn_cast<AtomicRMWInst>(&I)) {
+    MA.Align = atomicI->getAlign();
+    MA.BinOp = atomicI->getOperation();
+    MA.IsVolatile = atomicI->isVolatile();
+    MA.Ordering = atomicI->getOrdering();
+    MA.SyncScope = atomicI->getSyncScopeID();
+    MA.ValTy = atomicI->getType();
+    MA.PointerTy = atomicI->getPointerOperand()->getType();
+
+    // Set up the arguments to this function.
+    maskedFnArgs = {atomicI->getPointerOperand(), atomicI->getValOperand(),
+                    mask};
+
+  } else if (auto *cmpxchgI = dyn_cast<AtomicCmpXchgInst>(&I)) {
+    MA.Align = cmpxchgI->getAlign();
+    MA.BinOp = AtomicRMWInst::BAD_BINOP;
+    MA.IsWeak = cmpxchgI->isWeak();
+    MA.IsVolatile = cmpxchgI->isVolatile();
+    MA.Ordering = cmpxchgI->getSuccessOrdering();
+    MA.CmpXchgFailureOrdering = cmpxchgI->getFailureOrdering();
+    MA.SyncScope = cmpxchgI->getSyncScopeID();
+    MA.ValTy = cmpxchgI->getCompareOperand()->getType();
+    MA.PointerTy = cmpxchgI->getPointerOperand()->getType();
+
+    // Set up the arguments to this function.
+    maskedFnArgs = {cmpxchgI->getPointerOperand(),
+                    cmpxchgI->getCompareOperand(), cmpxchgI->getNewValOperand(),
+                    mask};
+  } else {
+    return false;
+  }
+
+  // Create the new function and replace the old one with it.
+  // Get the masked function.
+  Function *maskedAtomicFn = Ctx.getOrCreateMaskedAtomicFunction(
+      MA, VU.choices(), ElementCount::getFixed(1));
+  VECZ_FAIL_IF(!maskedAtomicFn);
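+  // Illustrative sketch (hypothetical builtin name, not from the original
+  // source): an atomic such as
+  //   %old = atomicrmw add ptr %p, i32 %v acq_rel
+  // is replaced below by a call to a masked builtin along the lines of
+  //   %old = call i32 @masked_atomicrmw_add(ptr %p, i32 %v, i1 %mask)
+  // which only performs the atomic operation for active lanes.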
+  // We don't have a vector length just yet - pass in one as a dummy.
+  if (MA.IsVectorPredicated) {
+    maskedFnArgs.push_back(
+        ConstantInt::get(IntegerType::getInt32Ty(I.getContext()), 1));
+  }
+
+  CallInst *maskedCI = CallInst::Create(maskedAtomicFn, maskedFnArgs);
+  VECZ_FAIL_IF(!maskedCI);
+  maskedCI->insertBefore(I.getIterator());
+
+  I.replaceAllUsesWith(maskedCI);
+  toDelete.emplace_back(&I, maskedCI);
+
+  LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << I << "\n");
+  LLVM_DEBUG(dbgs() << "          with " << *maskedCI << "\n");
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::partiallyLinearizeCFG() {
+  // Two methods are possible to transform the divergent loops into uniform
+  // ones:
+  // 1) rewire the exit edges to the single latch, which means the loop live
+  //    masks have to be updated at each exiting block.
+  // 2) delete the divergent loop exit edges and update the loop live masks at
+  //    the latch.
+  //
+  // The former means more overhead when a loop exit is reached because we
+  // always have to update the masks, but it allows us to retain the exiting
+  // branches.
+  // The latter means we only blend at the latch, thus less overhead at the
+  // loop exits, but if we reach a divergent loop exit, and it happens that all
+  // lanes have exited the loop, we still have to finish the iteration until we
+  // reach the latch and exit the loop.
+  //
+  // We are currently using the latter.
+  VECZ_FAIL_IF(!uniformizeDivergentLoops());
+
+  // ... and actually rewire them.
+  VECZ_FAIL_IF(!linearizeCFG());
+
+  // Transform phi nodes into selects for blocks that got blended.
+  VECZ_FAIL_IF(!generateSelects());
+
+  // Connect BOSCC regions if it is activated.
+  VECZ_FAIL_IF(BOSCC && !BOSCC->connectBOSCCRegions());
+
+  // Repair the CFG because the rewiring broke it.
+  VECZ_FAIL_IF(!repairSSA());
+
+  // Now we create the opaque calls to builtins that compute the real branch
+  // values. This must come before instruction simplification, otherwise LLVM
+  // can fold branch predicates that appear unreachable now, but would later
+  // become vector masks, thus mangling the control flow.
+  VECZ_FAIL_IF(!createBranchReductions());
+
+  // ... and now we can do instruction simplification on the masks and know
+  // they won't be prematurely folded.
+  VECZ_FAIL_IF(!simplifyMasks());
+
+  // Finally, if we used BOSCC it might want to do some tidying up.
+  VECZ_FAIL_IF(BOSCC && !BOSCC->cleanUp());
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::createBranchReductions() {
+  // Try to retrieve the builtin if it already exists.
+  const auto baseName =
+      Twine(VectorizationContext::InternalBuiltinPrefix).concat("divergence");
+  const StringRef nameAny = "_any";
+  const StringRef nameAll = "_all";
+
+  Type *boolTy = Type::getInt1Ty(F.getContext());
+  FunctionType *FT = FunctionType::get(boolTy, {boolTy}, false);
+
+  for (BasicBlock &BB : F) {
+    const bool needsAllOfMask = DR->hasFlag(BB, eBlockNeedsAllOfMask);
+
+    // If the block is uniform and is not a BOSCC indirection, all its lanes
+    // are true or false, not both. Thus, we don't need to packetize the
+    // condition.
+    if (!needsAllOfMask && DR->isUniform(BB)) {
+      continue;
+    }
+
+    auto *TI = BB.getTerminator();
+    if (BranchInst *Branch = dyn_cast<BranchInst>(TI)) {
+      if (Branch->isConditional()) {
+        auto *cond = Branch->getCondition();
+        if (isa<Constant>(cond)) {
+          continue;
+        }
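+        // Illustration (sketch only; the builtin name is derived from the
+        // internal builtin prefix as in the code below): a divergent
+        // conditional branch
+        //   br i1 %cond, label %then, label %else
+        // is rewritten into a reduction over all lanes, e.g.
+        //   %cond_any = call i1 @<prefix>divergence_any(i1 %cond)
+        //   br i1 %cond_any, label %then, label %else
+        // with the "_all" variant used when the block needs an all-of mask.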
+        // On divergent paths, ensure that only active lanes contribute to a
+        // branch condition; merge the branch condition with the active lane
+        // mask. This ensures that disabled lanes don't spuriously contribute a
+        // 'true' value into the reduced branch condition.
+        // Note that the distinction between 'uniform' and 'divergent' isn't
+        // 100% sufficient for our purposes here, because even uniform values
+        // may read undefined/poison values when masked out.
+        // Don't perform this on uniform loops as those may be unconditionally
+        // entered even when no work-items are active. Masking the loop exit
+        // with the entry mask would mean that the loop never exits.
+        // FIXME: Is this missing incorrect branches in uniform blocks/loops?
+        if (auto *LTag = DR->getTag(&BB).loop;
+            DR->isDivergent(BB) && (!LTag || LTag->isLoopDivergent())) {
+          if (!isBranchCondTrulyUniform(cond, *UVR)) {
+            auto *newcond = SelectInst::Create(MaskInfos[&BB].entryMask, cond,
+                                               getDefaultValue(cond->getType()),
+                                               cond->getName() + "_active");
+            newcond->insertBefore(Branch->getIterator());
+            cond = newcond;
+          }
+        }
+
+        const auto &name = needsAllOfMask ? nameAll : nameAny;
+        Function *const F = Ctx.getOrCreateInternalBuiltin(
+            Twine(baseName).concat(name).str(), FT);
+        VECZ_FAIL_IF(!F);
+
+        auto *const newCall =
+            CallInst::Create(F, {cond}, Twine(cond->getName()).concat(name));
+        newCall->insertBefore(Branch->getIterator());
+        Branch->setCondition(newCall);
+      }
+    } else if (isa<SwitchInst>(TI) &&
+               DR->hasFlag(BB, eBlockHasDivergentBranch)) {
+      // Not sure what to actually do with switch instructions.
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::uniformizeDivergentLoops() {
+  LLVM_DEBUG(dbgs() << "CFC: UNIFORMIZE DIVERGENT LOOPS\n");
+
+  // For every divergent loop of the function, we want to create a new exit
+  // edge whose source is the latch of the loop. That exit is called "pure".
+  // The target of this edge is a new divergent loop exit that will start a
+  // cascade of if conditions to branch to the original loop exits. The
+  // divergent loop exits will no longer be exits, while the optional loop
+  // exits will retain their branch but they will be rewired to the pure exit.
+  //
+  // Given the following *divergent* loop:
+  //
+  //        preheader
+  //            |
+  //          header <----------.
+  //           /  \             |
+  //         ...  ...           |
+  //         /      \           |
+  //   %exit2.o     ...         |
+  //    /    /        \         |
+  //  %d  %exit1.o    ...       |
+  //       /    \               |
+  //     %b     ...             |
+  //           /   \            |
+  //    %exit2.r   ...          |
+  //      /    \                |
+  //    %c    %latch.r ---------'
+  //           /
+  //    %exit1.r
+  //        |
+  //       %a
+  //
+  // with:
+  //   - %a, %b, %c, %d = a group of non-specific basic blocks
+  //   - %exit*.* = loop exits
+  //   - *.o = optional blocks
+  //   - *.r = divergent blocks
+  //   - %latch.r = the latch of the loop. It is necessarily a divergent
+  //                block because the loop is divergent
+  //
+  // The following transformation is performed:
+  //
+  //        preheader
+  //            |
+  //          header <---------------.
+  //           /  \                  |
+  //         ...  ...                |
+  //         /      \                |
+  //  %exit2.split1.o  ...           |
+  //     |      /        \           |
+  //     \  %exit1.split1.o  ...     |
+  //      \     |             \      |
+  //       \     \            ...    |
+  //        \     \             \    |
+  //         \     \            ...  |
+  //          \     \             \  |
+  //           \     \         %latch.r --'
+  //            \     \            |
+  //             `-----`--> %loop.pure_exit
+  //                       /         |
+  //               %exit1.r    %exit1.else.r
+  //               /          /        |
+  //             %a    %exit2.r   %exit2.else.r
+  //                   /          /        |
+  //                 %c          /         |
+  //                            /          |
+  //             %exit1.split2.o    %exit1.else.o
+  //             /             /           |
+  //           %b   %exit2.split2.o   %exit2.else.o
+  //                /
+  //              %d
+  //
+  // with:
+  //   - %exit*.split1.o = the first half of the original %exit*.o with only
+  //     phi nodes
+  //   - %exit*.split2.o = the second half of the original %exit*.o without
+  //     the phi nodes
+  //   - %loop.pure_exit = a new loop exit starting a cascade of ifs towards
+  //     the original loop exits
+  //   - %exit*.else.* = a new block whose only purpose is to branch to other
+  //     blocks
+  //
+  // Each introduced conditional branch uses the entry mask of the exit block
+  // as the condition.
+  // Each introduced divergent conditional block is marked as div causing,
+  // thus linearizing them.
+  // Each introduced optional conditional block is marked as divergent, thus
+  // retaining the branches and branching to the true path only if any of the
+  // lanes that executed the loop left through the exit the true path targets.
+  //
+  // The state of the loop after the transformation is invalid and relies on
+  // the linearizer to correctly rewire the introduced blocks. The result of
+  // the above transformed loop after linearization will be:
+  //
+  //        preheader
+  //            |
+  //          header <----------------.
+  //           /  \                   |
+  //         ...  ...                 |
+  //         /      \                 |
+  //  %exit2.split1.o  ...            |
+  //     |      \                     |
+  //     |      ...                   |
+  //     |        \                   |
+  //     |        ...                 |
+  //     |        /  \                |
+  //     |  %exit1.split1.o  ...      |
+  //      \     |             \       |
+  //       \    |          %latch.r --'
+  //        \   |             |
+  //         `---> %loop.pure_exit
+  //                      |
+  //                  %exit1.r
+  //                      |
+  //                     %a
+  //                      |
+  //               %exit1.else.r
+  //                      |
+  //                  %exit2.r
+  //                      |
+  //                     %c
+  //                      |
+  //               %exit2.else.r
+  //               /          |
+  //     %exit1.split2.o   %exit1.else.o
+  //      /               /        |
+  //    %b   %exit2.split2.o  %exit2.else.o
+  //         /                    ...
+  //       %d
+  //
+  // Note that only one branch introduced from an optional loop exit
+  // ('%exit2.else.r' and '%exit1.else.o' in this example) can evaluate to
+  // true because as soon as an optional loop exit is taken, all the active
+  // lanes in the loop leave through it.
+  // However, as many as all the branches introduced from divergent loop exits
+  // may evaluate to true. The '...' at the end of the CFG will be replaced by
+  // whatever would originally succeed the original divergent loop exits.
+  bool modified = false;
+  for (auto *const LTag : DR->getLoopOrdering()) {
+    if (LTag->isLoopDivergent()) {
+      Loop *L = LTag->loop;
+
+      // Store the loop exit blocks and edges before doing any modification.
+      SmallVector<BasicBlock *, 4> exitBlocks;
+      SmallVector<Loop::Edge, 4> exitEdges;
+      {
+        L->getExitEdges(exitEdges);
+        // 1) Retrieve the unique loop exit blocks.
+        // 2) Remove any loop exit for which 'L' is not the outermost loop
+        //    left.
+        // 3) Sort the loop exit blocks.
+        //
+        // We can't use the `getUniqueExitBlocks' method because the loop may
+        // not be in a canonical form because of BOSCC.
+        if (BOSCC) {
+          L->getExitBlocks(exitBlocks);
+          SmallPtrSet<BasicBlock *, 4> _uniqueExitBlocks;
+          for (auto it = exitBlocks.begin(); it != exitBlocks.end();) {
+            if (!_uniqueExitBlocks.insert(*it).second) {
+              it = exitBlocks.erase(it);
+            } else {
+              ++it;
+            }
+          }
+        } else {
+          L->getUniqueExitBlocks(exitBlocks);
+        }
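+        // Worked example (hypothetical blocks, for illustration only): with
+        // exits {%e1 (optional, DCBI 5), %e2 (divergent, DCBI 7),
+        // %e3 (divergent, DCBI 3)}, the partition and sorts below yield
+        // [%e3, %e2, %e1]: divergent exits first, each group in increasing
+        // dominance-compact index order.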
+        // Only handle outermost loops left by the exits.
+        exitBlocks.erase(
+            std::remove_if(exitBlocks.begin(), exitBlocks.end(),
+                           [this, LTag](BasicBlock *EB) {
+                             return DR->getTag(EB).outermostExitedLoop != LTag;
+                           }),
+            exitBlocks.end());
+        // Order the loop exit blocks such that:
+        //   - divergent loop exits come first
+        //   - smallest DCBI come first
+        const auto middle = std::partition(
+            exitBlocks.begin(), exitBlocks.end(),
+            [this](BasicBlock *BB) { return DR->isDivergent(*BB); });
+        std::sort(exitBlocks.begin(), middle,
+                  [this](BasicBlock *LHS, BasicBlock *RHS) {
+                    return DR->getTagIndex(LHS) < DR->getTagIndex(RHS);
+                  });
+        std::sort(middle, exitBlocks.end(),
+                  [this](BasicBlock *LHS, BasicBlock *RHS) {
+                    return DR->getTagIndex(LHS) < DR->getTagIndex(RHS);
+                  });
+      }
+
+      if (exitBlocks.empty()) {
+        LLVM_DEBUG(dbgs() << "Loop " << L->getName()
+                          << " has no loop exits eligible for rewiring.\n");
+        continue;
+      }
+
+      VECZ_FAIL_IF(!computeDivergentLoopPureExit(*LTag));
+      VECZ_FAIL_IF(!rewireDivergentLoopExitBlocks(*LTag, exitBlocks));
+
+      VECZ_FAIL_IF(!generateDivergentLoopResults(*LTag));
+      VECZ_FAIL_IF(!blendDivergentLoopLiveValues(*LTag, exitBlocks));
+      VECZ_FAIL_IF(!blendDivergentLoopExitMasks(*LTag, exitEdges, exitBlocks));
+
+      modified = true;
+    }
+  }
+
+  // We have modified the divergent loops into uniform ones, thus changing the
+  // dominance-compact block ordering. We need to recompute it.
+  if (modified) {
+    DT->recalculate(F);
+    PDT->recalculate(F);
+    // And make sure we correctly updated the DomTrees.
+    VECZ_ERROR_IF(!DT->verify(), "DominatorTree incorrectly updated");
+    VECZ_ERROR_IF(!PDT->verify(), "PostDominatorTree incorrectly updated");
+    VECZ_FAIL_IF(!computeBlockOrdering());
+
+    RC->clear();
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::computeDivergentLoopPureExit(
+    LoopTag &LTag) {
+  LLVM_DEBUG(dbgs() << "CFC: COMPUTE PURE EXIT FOR LOOP "
+                    << LTag.loop->getName() << "\n");
+
+  auto *const latchBB = LTag.latch;
+  BasicBlock *pureExit =
+      BasicBlock::Create(F.getContext(), LTag.loop->getName() + ".pure_exit",
+                         &F, latchBB->getNextNode());
+  BasicBlockTag &pureExitTag = DR->getOrCreateTag(pureExit);
+
+  // Set the tags.
+  auto &LMask = LoopMasks[LTag.loop];
+  MaskInfos[pureExit].entryMask = LMask.persistedCombinedDivergentExitMask;
+  pureExitTag.outermostExitedLoop = &LTag;
+
+  auto *const preheaderLoopTag = DR->getTag(LTag.preheader).loop;
+  if (preheaderLoopTag) {
+    pureExitTag.loop = preheaderLoopTag;
+    preheaderLoopTag->loop->addBasicBlockToLoop(pureExit, *LI);
+  }
+  DR->setFlag(*pureExit,
+              static_cast<BlockDivergenceFlag>(
+                  BlockDivergenceFlag::eBlockIsVirtualDivergentLoopExit |
+                  BlockDivergenceFlag::eBlockHasDivergentBranch |
+                  BlockDivergenceFlag::eBlockIsDivergent));
+
+  LTag.pureExit = pureExit;
+
+  LLVM_DEBUG(dbgs() << "Pure exit: " << pureExit->getName() << "\n");
+
+  if (BOSCC) {
+    BOSCC->addInRegions(pureExit, latchBB);
+  }
+
+  auto *latchT = latchBB->getTerminator();
+#ifndef ALL_OF_DIVERGENT_LOOP_LATCH
+  Value *cond = MaskInfos[latchBB].exitMasks[LTag.header];
+  auto *newT = BranchInst::Create(LTag.header, pureExit, cond, latchBB);
+#else
+  // Exit the loop through the single divergent loop exit only if all instances
+  // that entered the loop left it.
+  ICmpInst *cond = new ICmpInst(
+      latchT, CmpInst::ICMP_EQ, LMask.persistedCombinedDivergentExitMask,
+      MaskInfos[LTag.preheader].exitMasks[LTag.header]);
+  auto *newT = BranchInst::Create(pureExit, LTag.header, cond, latchBB);
+  DR->setFlag(*latchBB, eBlockNeedsAllOfMask);
+#endif
+
+  updateMaps(latchT, newT);
+
+  IRCleanup::deleteInstructionNow(latchT);
+
+  MaskInfos[latchBB].exitMasks[pureExit] =
+      LMask.persistedCombinedDivergentExitMask;
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::rewireDivergentLoopExitBlocks(
+    LoopTag &LTag, const SmallVectorImpl<BasicBlock *> &exitBlocks) {
+  LLVM_DEBUG(dbgs() << "CFC: REWIRE EXIT BLOCKS FOR LOOP "
+                    << LTag.loop->getName() << "\n");
+
+  auto removeSuccessor = [this](Instruction *T, unsigned succIdx) {
+    switch (T->getOpcode()) {
+      default:
+        // Any other kind of Terminator cannot be handled and until
+        // proven otherwise, should not.
+        break;
+      case Instruction::Br: {
+        const unsigned keepIdx = succIdx == 0 ? 1 : 0;
+        auto *newT = BranchInst::Create(T->getSuccessor(keepIdx));
+        newT->insertBefore(T->getIterator());
+
+        updateMaps(T, newT);
+
+        IRCleanup::deleteInstructionNow(T);
+        break;
+      }
+      case Instruction::Switch: {
+        SwitchInst *SI = cast<SwitchInst>(T);
+        if (succIdx == 0) {
+          SI->setDefaultDest(SI->getSuccessor(1));
+          SI->removeCase(SI->case_begin());
+        } else {
+          SI->removeCase(std::next(SI->case_begin(), succIdx - 1));
+        }
+        break;
+      }
+      case Instruction::IndirectBr: {
+        IndirectBrInst *IBI = cast<IndirectBrInst>(T);
+        IBI->removeDestination(succIdx);
+        break;
+      }
+    }
+  };
+
+  // 'divergentLE' represents the current virtual divergent loop exit that a
+  // loop exit needs to be rewired to/from.
+  BasicBlock *divergentLE = LTag.pureExit;
+  for (unsigned idx = 0; idx < exitBlocks.size(); ++idx) {
+    BasicBlock *EB = exitBlocks[idx];
+
+    // The target of 'divergentLE'.
+    BasicBlock *target = nullptr;
+
+    // If 'EB' is optional, we split it at the terminator so that the exiting
+    // block keeps its edge towards it. The second half of 'EB' will be
+    // targeted by the cascade if.
+    if (DR->isOptional(*EB)) {
+      LLVM_DEBUG(dbgs() << "Optional loop exit " << EB->getName() << ":\n");
+
+      target =
+          EB->splitBasicBlock(EB->getTerminator(), EB->getName() + ".split");
+      auto &targetTag = DR->getOrCreateTag(target);
+
+      LLVM_DEBUG(dbgs() << "\tSplit " << EB->getName() << " into "
+                        << target->getName() << "\n");
+
+      // Set the tags.
+      // We have to be very careful copying a value from one key to another, in
+      // case one key did not exist, and constructing it caused rehashing.
+      {
+        auto EBmasks = MaskInfos[EB];
+        MaskInfos[target] = std::move(EBmasks);
+      }
+
+      auto *const EBLoopTag = DR->getTag(EB).loop;
+      if (EBLoopTag) {
+        targetTag.loop = EBLoopTag;
+        EBLoopTag->loop->addBasicBlockToLoop(target, *LI);
+      }
+
+      // If 'EB' is the preheader of a loop then 'target' takes its place.
+      for (auto *const ordered : DR->getLoopOrdering()) {
+        if (ordered->preheader == EB) {
+          LLVM_DEBUG(dbgs()
+                     << "\t" << target->getName() << " is now the preheader of "
+                     << ordered->loop->getName() << "\n");
+          ordered->preheader = target;
+        }
+      }
+
+      if (BOSCC) {
+        BOSCC->addReference(target, EB);
+        BOSCC->addInRegions(target, EB);
+      }
+      DR->setFlag(*target, DR->getFlag(*EB));
+
+      // Rewire 'EB' to the pure exit.
+      auto *const pureExit = LTag.pureExit;
+      EB->getTerminator()->setSuccessor(0, pureExit);
+
+      LLVM_DEBUG(dbgs() << "\t" << EB->getName() << " now targets "
+                        << pureExit->getName() << "\n");
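+      // Illustration (hypothetical blocks): an optional exit
+      //   %exit.o:       [ phis... ; br %succ ]
+      // has now become
+      //   %exit.o:       [ phis... ; br %loop.pure_exit ]
+      //   %exit.o.split: [ br %succ ]
+      // so the exiting edge is kept while the cascade can still reach the
+      // original successor through the split half.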
+      // Retain branch for optional loop exits.
+      DR->clearFlag(*divergentLE,
+                    BlockDivergenceFlag::eBlockHasDivergentBranch);
+      // Set all-of mask because the first successor of 'divergentLE' is taken
+      // if no lane exited through the optional loop exit.
+      DR->setFlag(*divergentLE, eBlockNeedsAllOfMask);
+
+      // 'EB' now has only one single exit edge.
+      auto &EBmasks = MaskInfos[EB];
+      EBmasks.exitMasks[pureExit] = EBmasks.entryMask;
+    } else {
+      LLVM_DEBUG(dbgs() << "Divergent loop exit " << EB->getName() << ":\n");
+
+      // Otherwise, the edge exiting-block-to-divergent-exit-block is
+      // removed ...
+      {
+        SmallPtrSet<BasicBlock *, 4> predsToRemove;
+        for (BasicBlock *pred : predecessors(EB)) {
+          const auto *const predLTag = DR->getTag(pred).loop;
+          // All predecessors of the divergent loop exit that belong in a loop
+          // contained in the outermost loop left by that exit need their
+          // edge removed.
+          if (predLTag && LTag.loop->contains(predLTag->loop)) {
+            predsToRemove.insert(pred);
+          }
+        }
+        for (BasicBlock *pred : predsToRemove) {
+          auto *predT = pred->getTerminator();
+          for (unsigned succIdx = 0; succIdx < predT->getNumSuccessors();
+               ++succIdx) {
+            if (predT->getSuccessor(succIdx) == EB) {
+              removeSuccessor(predT, succIdx);
+              LLVM_DEBUG(dbgs() << "\tRemove predecessor: " << pred->getName()
+                                << "\n");
+
+              break;
+            }
+          }
+        }
+        PHINode *PHI = nullptr;
+        while ((PHI = dyn_cast<PHINode>(&EB->front()))) {
+          VECZ_FAIL_IF(!generateSelectFromPHI(PHI, EB));
+        }
+      }
+
+      // ... and the exit block gets targeted by the current divergent loop
+      // exit.
+      target = EB;
+    }
+
+    VECZ_ERROR_IF(!target, "No target was found");
+
+    // If we are processing the last exit block, and it happens to be
+    // divergent, there is no optional loop exit it can branch to, so create an
+    // unconditional branch.
+    if ((idx + 1 == exitBlocks.size()) && DR->isDivergent(*target)) {
+      BranchInst::Create(target, divergentLE);
+      auto &maskInfo = MaskInfos[divergentLE];
+      maskInfo.exitMasks[target] = maskInfo.entryMask;
+
+      LLVM_DEBUG(dbgs() << "\tVirtual Divergent Loop Exit "
+                        << divergentLE->getName()
+                        << ":\n\t\tSuccessor 0: " << target->getName() << "\n");
+    } else {
+      // The DCBI ordering sets the right sibling to be of an index less than
+      // the left sibling if they are on the same level of dominance. For that
+      // reason, we want to set the original loop exit as the right sibling so
+      // that the latter gets processed first while linearizing, and branches
+      // to the left sibling. We thus have to negate the condition to do so.
+      //
+      // The said condition is the entry mask of the exit block, i.e. whether
+      // any exiting block left through it.
+      auto &targetMasks = MaskInfos[target];
+      Instruction *cond = cast<Instruction>(targetMasks.entryMask);
+      // If that entry mask is defined in the loop (if the exit block has only
+      // one predecessor), then we can directly use that mask as the condition.
+      // Otherwise, we must move the latter in the pure exit so that
+      // 'divergentLE' can refer to it.
+      if (cond->getParent() == target) {
+        if (PHINode *PHI = dyn_cast<PHINode>(cond)) {
+          VECZ_FAIL_IF(!generateSelectFromPHI(PHI, target));
+          cond = cast<Instruction>(targetMasks.entryMask);
+        }
+        std::queue<Instruction *> toMove;
+        toMove.push(cond);
+        // Make sure to move all the operands of the condition that are in its
+        // definition block.
+        while (!toMove.empty()) {
+          Instruction *move = toMove.front();
+          toMove.pop();
+          move->moveBefore(*LTag.pureExit, LTag.pureExit->begin());
+          for (Value *op : move->operands()) {
+            if (Instruction *opI = dyn_cast<Instruction>(op)) {
+              if (opI->getParent() == target) {
+                toMove.push(opI);
+              }
+            }
+          }
+        }
+      }
+
+      auto *negCond = BinaryOperator::CreateNot(cond, cond->getName() + ".not",
+                                                divergentLE);
+      BasicBlock *newDivergentLE = BasicBlock::Create(
+          F.getContext(), EB->getName() + ".else", &F, EB->getNextNode());
+      BranchInst::Create(newDivergentLE, target, negCond, divergentLE);
+
+      // The divergentLE block "ought" to exist in the masks map already, but
+      // it is safer to take a local copy and retire `targetMasks`.
+      auto *const targetEntryMask = targetMasks.entryMask;
+
+      // No use of `targetMasks` after this line.
+      auto &divgLEMask = MaskInfos[divergentLE];
+      divgLEMask.exitMasks[target] = targetEntryMask;
+      divgLEMask.exitMasks[newDivergentLE] = negCond;
+
+      LLVM_DEBUG(dbgs() << "\tCreate new virtual divergent loop exit "
+                        << newDivergentLE->getName() << "\n");
+
+      LLVM_DEBUG(
+          dbgs() << "\tVirtual Divergent Loop Exit " << divergentLE->getName()
+                 << ":\n\t\tSuccessor 0: " << target->getName()
+                 << "\n\t\tSuccessor 1: " << newDivergentLE->getName() << "\n");
+
+      auto &newDivergentLETag = DR->getOrCreateTag(newDivergentLE);
+
+      // Set the tags.
+      MaskInfos[newDivergentLE].entryMask = negCond;
+      if (auto *const divLoopTag = DR->getTag(divergentLE).loop) {
+        newDivergentLETag.loop = divLoopTag;
+        newDivergentLETag.loop->loop->addBasicBlockToLoop(newDivergentLE, *LI);
+      }
+
+      DR->setFlag(*newDivergentLE,
+                  static_cast<BlockDivergenceFlag>(
+                      DR->getFlag(*divergentLE) |
+                      BlockDivergenceFlag::eBlockIsVirtualDivergentLoopExit |
+                      BlockDivergenceFlag::eBlockHasDivergentBranch |
+                      BlockDivergenceFlag::eBlockIsDivergent));
+
+      if (BOSCC) {
+        BOSCC->addInRegions(newDivergentLE, LTag.latch);
+      }
+
+      divergentLE = newDivergentLE;
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::generateDivergentLoopResults(
+    LoopTag &LTag) {
+  LLVM_DEBUG(dbgs() << "CFC: GENERATE DIVERGENT LOOP RESULTS FOR "
+                    << LTag.loop->getName() << "\n");
+
+  // First create instructions to save the value of the last iteration ...
+  IRBuilder<> B(LTag.header, LTag.header->getFirstNonPHIIt());
+  for (Value *LLV : LTag.loopLiveValues) {
+    LTag.loopResultPrevs[LLV] =
+        B.CreatePHI(LLV->getType(), 2, LLV->getName() + ".prev");
+    LLVM_DEBUG(dbgs() << "Create result phi: "
+                      << LTag.loopResultPrevs[LLV]->getName() << "\n");
+  }
+
+  // ... then create instructions to retrieve the updated value in the current
+  // iteration.
+  for (Value *LLV : LTag.loopLiveValues) {
+    VECZ_FAIL_IF(!generateDivergentLoopResultUpdates(LLV, LTag));
+  }
+
+  if (BOSCC) {
+    // Clone the loop live values update instructions in the uniform version.
+    if (Loop *uniformL = BOSCC->getLoop(LTag.loop)) {
+      auto *const uniformHeader = DR->getTag(uniformL).header;
+      for (Value *LLV : LTag.loopLiveValues) {
+        BOSCC->addReference(LTag.loopResultUpdates[LLV], LLV);
+        PHINode *LRP = LTag.loopResultPrevs[LLV];
+        // We only need to clone the value of the previous iteration.
+        PHINode *uniformLRP = cast<PHINode>(LRP->clone());
+
+        uniformLRP->setIncomingValue(1, LLV);
+
+        uniformLRP->insertBefore(uniformHeader->getFirstNonPHIIt());
+        BOSCC->createReference(LRP, uniformLRP, true);
+      }
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::generateDivergentLoopResultUpdates(
+    Value *LLV, LoopTag &LTag) {
+  auto &LMask = LoopMasks[LTag.loop];
+  Value *mask = LMask.combinedDivergentExitMask;
+  VECZ_ERROR_IF(!mask, "Divergent loop does not have an exit mask");
+  PHINode *PHI = LTag.loopResultPrevs[LLV];
+  SelectInst *select =
+      SelectInst::Create(mask, LLV, PHI, LLV->getName() + ".update");
+  select->insertBefore(LTag.latch->getTerminator()->getIterator());
+  LTag.loopResultUpdates[LLV] = select;
+
+  // The PHI function of each loop live value has one incoming value from
+  // the preheader if this is the outermost loop, or from the PHI function from
+  // the outer loop otherwise.
+  auto *const ParentL = LTag.loop->getParentLoop();
+  auto *const ParentLT = ParentL ? &DR->getTag(ParentL) : nullptr;
+  if (!ParentLT || !ParentLT->loopResultPrevs.contains(LLV)) {
+    PHI->addIncoming(getDefaultValue(PHI->getType()), LTag.preheader);
+  } else {
+    BasicBlock *LLVDef = cast<Instruction>(LLV)->getParent();
+    if (LLVDef != LTag.header && DR->isReachable(LLVDef, LTag.header)) {
+      PHI->addIncoming(LLV, LTag.preheader);
+    } else {
+      PHI->addIncoming(ParentLT->loopResultPrevs[LLV], LTag.preheader);
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "Create result update: " << *select << "\n");
+
+  // The second incoming value is the updated value from the latch.
+  PHI->addIncoming(select, LTag.latch);
+
+  LLVM_DEBUG(dbgs() << "Update result phi: " << *PHI << "\n");
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::blendDivergentLoopLiveValues(
+    LoopTag &LTag, const SmallVectorImpl<BasicBlock *> &exitBlocks) {
+  LLVM_DEBUG(dbgs() << "CFC: BLEND DIVERGENT LOOP LIVE VALUES FOR "
+                    << LTag.loop->getName() << "\n");
+
+  // Get the exit blocks that were not removed.
+  SmallVector<BasicBlock *, 2> optionalExitBlocks;
+  LTag.loop->getExitBlocks(optionalExitBlocks);
+  // Remove the pure exit from it.
+  for (auto it = optionalExitBlocks.begin(); it != optionalExitBlocks.end();
+       ++it) {
+    if (*it == LTag.pureExit) {
+      (void)optionalExitBlocks.erase(it);
+      break;
+    }
+  }
+
+  for (Value *LLV : LTag.loopLiveValues) {
+    BasicBlock *LLVDef = cast<Instruction>(LLV)->getParent();
+    PHINode *prev = LTag.loopResultPrevs[LLV];
+    SelectInst *update = LTag.loopResultUpdates[LLV];
+
+    VECZ_ERROR_IF(
+        !update,
+        "Divergent loop live value does not have an update instruction");
+    VECZ_ERROR_IF(
+        !prev, "Divergent loop live value does not have a persist instruction");
+
+    PHINode *blend =
+        PHINode::Create(LLV->getType(), 2, LLV->getName() + ".blend");
+    blend->insertBefore(LTag.pureExit->begin());
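+    // Illustrative sketch (hypothetical IR) of how the three values created
+    // for a loop live value %v cooperate:
+    //   header:    %v.prev   = phi [ <default>, %preheader ],
+    //                              [ %v.update, %latch ]
+    //   latch:     %v.update = select i1 %combined_divergent_exit_mask,
+    //                                 %v, %v.prev
+    //   pure exit: %v.blend  = phi [ ..., optional exits ],
+    //                              [ %v.update, %latch ]
+    // so each lane observes the value it had on the iteration it left the
+    // loop.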
+    // Replace all uses outside the loop.
+    VECZ_FAIL_IF(
+        !replaceUsesOutsideDivergentLoop(LTag, LLV, blend, optionalExitBlocks));
+
+    for (BasicBlock *EB : exitBlocks) {
+      if (DR->isOptional(*EB)) {
+        if (!DR->isReachable(LLVDef, EB)) {
+          blend->addIncoming(prev, EB);
+        } else {
+          blend->addIncoming(LLV, EB);
+        }
+      }
+    }
+    blend->addIncoming(update, LTag.latch);
+
+    if (BOSCC) {
+      BOSCC->addReference(blend, update);
+    }
+
+    LLVM_DEBUG(dbgs() << "Create blend " << *blend << " for LLV " << *LLV
+                      << "\n");
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::blendDivergentLoopExitMasks(
+    LoopTag &LTag, const SmallVectorImpl<Loop::Edge> &exitEdges,
+    const SmallVectorImpl<BasicBlock *> &exitBlocks) {
+  LLVM_DEBUG(dbgs() << "CFC: BLEND DIVERGENT LOOP EXIT MASKS FOR "
+                    << LTag.loop->getName() << "\n");
+
+  // Get the exit blocks that were not removed.
+  SmallVector<BasicBlock *, 2> optionalExitBlocks;
+  LTag.loop->getExitBlocks(optionalExitBlocks);
+  // Remove the pure exit from it.
+  for (auto it = optionalExitBlocks.begin(); it != optionalExitBlocks.end();
+       ++it) {
+    if (*it == LTag.pureExit) {
+      (void)optionalExitBlocks.erase(it);
+      break;
+    }
+  }
+
+  auto &LMask = LoopMasks[LTag.loop];
+  for (const Loop::Edge &EE : exitEdges) {
+    BasicBlock *exitingBlock = const_cast<BasicBlock *>(EE.first);
+    BasicBlock *exitBlock = const_cast<BasicBlock *>(EE.second);
+
+    if (DR->isDivergent(*exitBlock)) {
+      PHINode *prev = LMask.persistedDivergentExitMasks[exitingBlock];
+      BinaryOperator *update =
+          LMask.updatedPersistedDivergentExitMasks[exitingBlock];
+
+      VECZ_ERROR_IF(
+          !update,
+          "Divergent loop exit mask does not have an update instruction");
+      VECZ_ERROR_IF(
+          !prev,
+          "Divergent loop exit mask does not have a persist instruction");
+
+      PHINode *blend =
+          PHINode::Create(prev->getType(), 2, prev->getName() + ".blend");
+      blend->insertBefore(LTag.pureExit->begin());
+
+      // Replace all uses outside the loop.
+      VECZ_FAIL_IF(!replaceUsesOutsideDivergentLoop(LTag, update, blend,
+                                                    optionalExitBlocks));
+
+      for (BasicBlock *EB : exitBlocks) {
+        if (DR->isOptional(*EB)) {
+          blend->addIncoming(prev, EB);
+        }
+      }
+      blend->addIncoming(update, LTag.latch);
+
+      if (BOSCC) {
+        BOSCC->addReference(blend, update);
+      }
+
+      LLVM_DEBUG(dbgs() << "Create blend " << *blend << " for loop exit mask "
+                        << *update << "\n");
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::replaceUsesOutsideDivergentLoop(
+    LoopTag &LTag, Value *from, Value *to,
+    const SmallVectorImpl<BasicBlock *> &exitBlocks) {
+  for (auto it = from->use_begin(); it != from->use_end();) {
+    Use &U = *it++;
+    Instruction *user = cast<Instruction>(U.getUser());
+    BasicBlock *blockUse = user->getParent();
+    // Don't replace uses within the loop.
+    if (LTag.loop->contains(blockUse) ||
+        // If the use is in a loop exit block, then 'to' can't reach it.
+        std::count(exitBlocks.begin(), exitBlocks.end(), blockUse)) {
+      continue;
+    }
+    // If the use is in a pure exit block of a divergent loop, don't replace
+    // the use if it comes from an optional exit block of that loop.
+    if (PHINode *PHI = dyn_cast<PHINode>(user)) {
+      const auto *const exitedLoop = DR->getTag(blockUse).outermostExitedLoop;
+      if (exitedLoop && exitedLoop->pureExit == blockUse) {
+        BasicBlock *incoming = PHI->getIncomingBlock(U);
+        if (!exitedLoop->loop->contains(incoming)) {
+          continue;
+        }
+      }
+    }
+    U.set(to);
+    LLVM_DEBUG(dbgs() << "Replace loop value " << *from << " with blend "
+                      << to->getName() << "\n");
+  }
+
+  return true;
+}
+
+namespace {
+using DenseDeferralMap =
+    SmallDenseMap<BasicBlock *, SmallPtrSet<BasicBlock *, 4>, 32>;
+
+void addDeferral(BasicBlock *newSrc, BasicBlock *deferred,
+                 DenseDeferralMap &deferrals) {
+  auto newSrcIt = deferrals.find(newSrc);
+  if (newSrcIt != deferrals.end()) {
+    // If the deferral edge already exists, there is no need to add it again.
+    if (newSrcIt->second.contains(deferred)) {
+      LLVM_DEBUG(dbgs() << "\t\tDeferral (" << newSrc->getName() << ", "
+                        << deferred->getName() << ") already exists\n");
+      return;
+    }
+  }
+  auto deferredIt = deferrals.find(deferred);
+  if (deferredIt != deferrals.end()) {
+    // If the deferral edge already exists the other way around, we don't want
+    // to add it the opposite way, at the risk of creating an infinite loop in
+    // the CFG.
+    if (deferredIt->second.contains(newSrc)) {
+      LLVM_DEBUG(dbgs() << "\t\tOpposite deferral (" << deferred->getName()
+                        << ", " << newSrc->getName() << ") already exists\n");
+      return;
+    }
+  }
+
+  deferrals[newSrc].insert(deferred);
+
+  LLVM_DEBUG(dbgs() << "\t\tAdd deferral (" << newSrc->getName() << ", "
+                    << deferred->getName() << ")\n");
+}
+
+void removeDeferrals(BasicBlock *src, DenseDeferralMap &deferrals) {
+  auto deferredIt = deferrals.find(src);
+  if (deferredIt != deferrals.end()) {
+#ifndef NDEBUG
+    for (BasicBlock *deferred : deferredIt->second) {
+      LLVM_DEBUG(dbgs() << "\tRemove deferral (" << src->getName() << ", "
+                        << deferred->getName() << ")\n");
+    }
+#endif
+    deferrals.erase(deferredIt);
+  }
+}
+}  // namespace
+
+bool ControlFlowConversionState::Impl::computeNewTargets(Linearization &lin) {
+  // The entry block cannot be targeted.
+  const auto &DCBI = DR->getBlockOrdering();
+  const size_t numBlocks = DCBI.size();
+  DenseSet<BasicBlock *> targets(numBlocks - 1);
+  for (const auto &tag : make_range(std::next(DCBI.begin()), DCBI.end())) {
+    targets.insert(tag.BB);
+  }
+
+  DenseDeferralMap deferrals;
+
+  LLVM_DEBUG(dbgs() << "CFC: COMPUTE NEW TARGETS\n");
+
+  // For each basic block, select its new targets based on previous blocks that
+  // have been deferred because of divergence, and their current successors.
+  // Select the target that has the lowest DCBI, i.e. the block whose dominance
+  // encloses or is equal to the other available targets.
+  //
+  // If we assign a target different from the current successor of the block,
+  // we must add a deferral edge from the selected target to the current
+  // successor (that got replaced by the selected target) such that an edge
+  // from the current block to the replaced successor exists in the modified
+  // graph.
+  lin.infos.reserve(numBlocks);
+  lin.data.reserve(numBlocks);
+  for (size_t BBIndex = 0; BBIndex != numBlocks; ++BBIndex) {
+    const auto &BBTag = DR->getBlockTag(BBIndex);
+    BasicBlock *BB = BBTag.BB;
+    lin.beginBlock(BB);
+
+    LLVM_DEBUG(dbgs() << "BB " << BB->getName() << ":\n");
+
+    // Retrieve the rewire list for 'BB'.
+    SmallPtrSet<BasicBlock *, 4> availableTargets;
+    {
+      auto deferredIt = deferrals.find(BB);
+      if (deferredIt != deferrals.end()) {
+        for (BasicBlock *deferred : deferredIt->second) {
+          availableTargets.insert(deferred);
+        }
+      }
+    }
+
+    if (!DR->isDivCausing(*BB) ||
+        // Loop latches must have their branch retained.
+        (BBTag.loop && BBTag.loop->latch == BB)) {
+      // If 'BB' ends in a uniform branch.
+      LLVM_DEBUG(dbgs() << " uniform branch\n");
+
+      // Keep track of what blocks we have targeted in case we have a deferred
+      // block that is a current successor (which could lead to choosing the
+      // same block twice!).
+      SmallPtrSet<BasicBlock *, 2> targeted;
+
+      for (BasicBlock *succ : successors(BB)) {
+        size_t nextIndex = ~size_t(0);
+        for (BasicBlock *deferred : availableTargets) {
+          if (targeted.contains(deferred)) {
+            continue;
+          }
+
+          const size_t deferredIndex = DR->getTagIndex(deferred);
+          if (nextIndex == ~size_t(0) || nextIndex > deferredIndex) {
+            nextIndex = deferredIndex;
+          }
+        }
+
+        const size_t succIndex = DR->getTagIndex(succ);
+        if (!targeted.contains(succ)) {
+          // If we have not found a target or there is a better one.
+          if (nextIndex == ~size_t(0) || nextIndex > succIndex) {
+            nextIndex = succIndex;
+          }
+        }
+
+        VECZ_ERROR_IF(nextIndex == ~size_t(0), "No target was found");
+
+        auto *const next = DR->getBlockTag(nextIndex).BB;
+        lin.push(next);
+        targeted.insert(next);
+
+        LLVM_DEBUG(dbgs() << "\tsuccessor " << lin.currentSize() - 1 << ": "
+                          << next->getName() << "\n");
+
+        // Virtually remove backedges.
+        if (!BBTag.isLoopBackEdge(next)) {
+          targets.erase(next);
+          // Don't add deferred edges to blocks already processed.
+          if (BBIndex < nextIndex) {
+            auto S = availableTargets;
+            S.insert(succ);
+
+            for (BasicBlock *deferred : S) {
+              if (deferred != next) {
+                addDeferral(next, deferred, deferrals);
+              }
+            }
+          }
+        }
+      }
+    } else {
+      LLVM_DEBUG(dbgs() << " divergent branch\n");
+
+      for (BasicBlock *succ : successors(BB)) {
+        availableTargets.insert(succ);
+      }
+
+      size_t nextIndex = ~size_t(0);
+      for (BasicBlock *deferred : availableTargets) {
+        const size_t deferredIndex = DR->getTagIndex(deferred);
+        if (nextIndex == ~size_t(0) || nextIndex > deferredIndex) {
+          LLVM_DEBUG(dbgs()
+                     << (nextIndex == ~size_t(0)
+                             ? "\tchoosing successor: "
+                             : "\tpreferring instead successor: ")
+                     << DR->getBlockTag(deferredIndex).BB->getName() << "\n");
+          nextIndex = deferredIndex;
+        }
+      }
+
+      VECZ_ERROR_IF(nextIndex == ~size_t(0), "No target was found");
+
+      BasicBlock *const next = DR->getBlockTag(nextIndex).BB;
+      lin.push(next);
+
+      // The last eBlockIsVirtualDivergentLoopExit introduced from an optional
+      // loop exit wasn't given a block to branch to, it is thus empty.
+      if (DR->hasFlag(*BB,
+                      BlockDivergenceFlag::eBlockIsVirtualDivergentLoopExit) &&
+          !BB->getTerminator()) {
+        BranchInst::Create(next, BB);
+      }
+
+      LLVM_DEBUG(dbgs() << "\tsuccessor 0: " << next->getName() << "\n");
+
+      // Virtually remove backedges.
+      if (!BBTag.isLoopBackEdge(next)) {
+        targets.erase(next);
+        // Don't add deferred edges to blocks already processed.
+        if (BBIndex < nextIndex) {
+          for (BasicBlock *deferred : availableTargets) {
+            if (deferred != next) {
+              addDeferral(next, deferred, deferrals);
+            }
+          }
+        }
+      }
+    }
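+    // Worked example (hypothetical diamond CFG, for illustration only): given
+    // A -> {B, C}, B -> D, C -> D with a divergent branch in A and DCBI order
+    // A < B < C < D, processing A picks B (lowest index) and defers C, adding
+    // the deferral (B, C); processing B then targets C instead of D and
+    // defers D; finally C targets D, yielding the linearized chain
+    // A -> B -> C -> D.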
+    // Remove the deferrals that involved 'BB'.
+    removeDeferrals(BB, deferrals);
+
+    // clang-format off
+    LLVM_DEBUG(
+      dbgs() << " deferral list:";
+      if (deferrals.empty()) {
+        dbgs() << " (empty)\n";
+      } else {
+        dbgs() << "\n";
+        for (const auto &Pair : deferrals) {
+          for (BasicBlock *Deferred : Pair.second) {
+            LLVM_DEBUG(dbgs() << "\t(" << Pair.first->getName() << ", "
+                              << Deferred->getName() << ")\n");
+          }
+        }
+      }
+    );
+    // clang-format on
+  }
+
+  // There shouldn't remain any deferral edges.
+  VECZ_ERROR_IF(!deferrals.empty(), "Deferrals remain");
+  // All blocks should have been targeted at least once.
+  VECZ_ERROR_IF(!targets.empty(), "Not all blocks have been rewired");
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::linearizeCFG() {
+  LLVM_DEBUG(dbgs() << "CFC: LINEARIZE\n");
+
+  // Compute the new targets ...
+  Linearization lin;
+  VECZ_FAIL_IF(!computeNewTargets(lin));
+
+  auto dataIt = lin.data.begin();
+  for (const auto &newTargetInfo : lin.infos) {
+    BasicBlock *BB = newTargetInfo.BB;
+
+    // Get the new target info for this block.
+    const auto numTargets = newTargetInfo.numTargets;
+    const auto newTargets = dataIt;
+    dataIt += numTargets;
+
+    LLVM_DEBUG(dbgs() << BB->getName() << ":\n");
+
+    auto *T = BB->getTerminator();
+
+    // If we have set a new target that is already a successor of BB, but we
+    // have not set it at the same successor's position, then do it!
+    // This avoids having to update the phi nodes.
+    SmallDenseMap<BasicBlock *, unsigned, 4> successors;
+    for (unsigned idx = 0; idx < T->getNumSuccessors(); ++idx) {
+      BasicBlock *succ = T->getSuccessor(idx);
+      successors.try_emplace(succ, idx);
+    }
+    for (unsigned idx = 0; idx < numTargets; ++idx) {
+      auto succIt = successors.find(newTargets[idx]);
+      // If we have a successor set as a new target ...
+      if (succIt != successors.end()) {
+        // ... but we have not set it at the same position ...
+        if (succIt->second != idx && succIt->second < numTargets) {
+          // ... then swap both blocks.
+          std::swap(newTargets[idx], newTargets[succIt->second]);
+        }
+      }
+    }
+
+    // Now iterate over the new targets to set them as successors of BB if
+    // they were not already.
+    unsigned idx = 0;
+    for (; idx < numTargets; ++idx) {
+      BasicBlock *const newTarget = newTargets[idx];
+
+      VECZ_ERROR_IF(
+          idx >= T->getNumSuccessors(),
+          "BasicBlock should not have more successors after linearization");
+
+      BasicBlock *oldSucc = T->getSuccessor(idx);
+
+      LLVM_DEBUG(dbgs() << "\tOld successor: " << oldSucc->getName() << "\n");
+
+      // If we have set the current successor to be the new target, there is
+      // nothing to do.
+      if (oldSucc == newTarget) {
+        LLVM_DEBUG(dbgs() << "\tUntouched successor: " << oldSucc->getName()
+                          << "\n");
+        continue;
+      }
+
+      // Uniform blocks should not be rewired.
+      VECZ_ERROR_IF(DR->isUniform(*oldSucc),
+                    "Uniform BasicBlock should not have its edge modified");
+
+      // Otherwise update the successor.
+      T->setSuccessor(idx, newTarget);
+      LLVM_DEBUG(dbgs() << "\tAdd successor: " << newTarget->getName()
+                        << "\n");
+    }
+
+    // We have either processed a divergent branch (with only one successor),
+    // or we have processed a uniform branch (with all its successors
+    // untouched).
+    VECZ_ERROR_IF(idx != 1 && idx != T->getNumSuccessors(),
+                  "Number of processed new targets is undefined");
+
+    // Finally, clear the remaining successors that have not been set as new
+    // targets.
+    if (idx != T->getNumSuccessors()) {
+      for (; idx < T->getNumSuccessors(); ++idx) {
+        BasicBlock *succ = T->getSuccessor(idx);
+
+        // Uniform blocks should not be rewired.
+        VECZ_ERROR_IF(DR->isUniform(*succ),
+                      "Uniform BasicBlock should not have its edge modified");
+
+        LLVM_DEBUG(dbgs() << "\tRemove successor: " << succ->getName()
+                          << "\n");
+      }
+
+      auto *newT = BranchInst::Create(T->getSuccessor(0));
+      newT->insertBefore(T->getIterator());
+
+      updateMaps(T, newT);
+
+      IRCleanup::deleteInstructionNow(T);
+    }
+  }
+  assert(dataIt == lin.data.end() &&
+         "Failed to reach end of Linearization data vector!");
+
+  // Updating on-the-fly the DomTree and PostDomTree whilst rewiring the CFG
+  // is extremely tedious, and may not even be possible due to all the invalid
+  // states that happen during it ... Therefore, we have no choice but to
+  // recalculate the DomTree and PostDomTree from scratch.
+  DT->recalculate(F);
+  PDT->recalculate(F);
+  VECZ_ERROR_IF(!DT->verify(), "DominatorTree incorrectly updated");
+  VECZ_ERROR_IF(!PDT->verify(), "PostDominatorTree incorrectly updated");
+  VECZ_FAIL_IF(!computeBlockOrdering());
+  RC->clear();
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::generateSelects() {
+  LLVM_DEBUG(dbgs() << "CFC: GENERATE SELECTS FROM PHI NODES\n");
+  // For each basic block that has only one predecessor and phi nodes, we need
+  // to either blend those phi nodes into select instructions or try to move
+  // the phi nodes up the chain of linearized path.
+  for (const auto &BTag : DR->getBlockOrdering()) {
+    BasicBlock *B = BTag.BB;
+    if (B->hasNPredecessors(1) || DR->isBlend(*B)) {
+      if (PHINode *PHI = dyn_cast<PHINode>(&B->front())) {
+        LLVM_DEBUG(dbgs() << B->getName() << ":\n");
+        const SmallPtrSet<BasicBlock *, 4> incomings(PHI->block_begin(),
+                                                     PHI->block_end());
+        BasicBlock *cur = B;
+        while (cur->hasNPredecessors(1) && !incomings.empty()) {
+          cur = cur->getSinglePredecessor();
+          if (incomings.contains(cur)) {
+            break;
+          }
+        }
+        // Only move the phis up the chain of linearized path:
+        //   - if the block whose phis we are processing is not a blend block
+        //     (because the latter do need to have its phis transformed into
+        //     selects),
+        //   - if the last block of the chain is not an incoming block, and
+        //   - if the last block of the chain is a convergence block.
+        if (!DR->isBlend(*B) && !incomings.contains(cur) &&
+            cur->hasNPredecessorsOrMore(2) && PHI->getNumIncomingValues() > 1) {
+          // All PHI nodes have the same incoming blocks so we update the exit
+          // masks of the incoming blocks of the first PHI node here.
+          for (unsigned i = 0; i < PHI->getNumIncomingValues(); ++i) {
+            auto &maskInfo = MaskInfos[PHI->getIncomingBlock(i)];
+            Value *&exitMask = maskInfo.exitMasks[cur];
+
+            if (!exitMask) {
+              exitMask = maskInfo.exitMasks[B];
+            }
+          }
+
+          while ((PHI = dyn_cast<PHINode>(&B->front()))) {
+            LLVM_DEBUG(dbgs() << "\tMove " << *PHI << " in " << cur->getName()
+                              << "\n");
+            PHI->moveBefore(*cur, cur->begin());
+          }
+        } else {
+          while ((PHI = dyn_cast<PHINode>(&B->front()))) {
+            VECZ_FAIL_IF(!generateSelectFromPHI(PHI, B));
+          }
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::generateSelectFromPHI(PHINode *PHI,
+                                                             BasicBlock *B) {
+  const unsigned phiNumIncVals = PHI->getNumIncomingValues();
+  VECZ_ERROR_IF(phiNumIncVals == 0, "PHINode does not have any incoming value");
+
+  Value *newVal = nullptr;
+  auto &maskInfo = MaskInfos[B];
+  if (PHI == maskInfo.entryMask) {
+    // The entry mask of a blend value should be the disjunction of the
+    // incoming masks, so change it.
+    maskInfo.entryMask = copyEntryMask(PHI->getIncomingValue(0), *B);
+    for (unsigned i = 1; i < phiNumIncVals; i++) {
+      Value *V = PHI->getIncomingValue(i);
+      auto InsertPt = std::next(maskInfo.entryMask->getIterator());
+      maskInfo.entryMask = BinaryOperator::CreateOr(
+          maskInfo.entryMask, V, B->getName() + ".entry_mask");
+      maskInfo.entryMask->insertBefore(InsertPt);
+    }
+    newVal = maskInfo.entryMask;
+  } else {
+    Value *select = PHI->getIncomingValue(0);
+    for (unsigned i = 1; i < phiNumIncVals; i++) {
+      Value *V = PHI->getIncomingValue(i);
+      BasicBlock *PHIB = PHI->getIncomingBlock(i);
+      Value *cond = MaskInfos[PHIB].exitMasks[B];
+      VECZ_ERROR_IF(!cond, "Exit mask does not exist");
+
+      auto InsertPt = B->getFirstInsertionPt();
+      if (i == 1) {
+        if (Instruction *condI = dyn_cast<Instruction>(cond)) {
+          BasicBlock *maskParent = condI->getParent();
+          if (maskParent == B) {
+            InsertPt = std::next(condI->getIterator());
+          }
+        }
+      } else {
+        InsertPt = std::next(cast<Instruction>(select)->getIterator());
+      }
+      auto *selectInst =
+          SelectInst::Create(cond, V, select, PHI->getName() + ".blend");
+      selectInst->insertBefore(InsertPt);
+      select = selectInst;
+    }
+    newVal = select;
+  }
+
+  LLVM_DEBUG(dbgs() << "\tReplace " << *PHI << " with " << *newVal << "\n");
+
+  updateMaps(PHI, newVal);
+
+  PHI->replaceAllUsesWith(newVal);
+
+  IRCleanup::deleteInstructionNow(PHI);
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::repairSSA() {
+  // Check that all the blocks have a unique position.
+  VECZ_FAIL_IF(!checkBlocksOrder());
+  RC->update(F);
+
+  VECZ_FAIL_IF(!updatePHIsIncomings());
+  VECZ_FAIL_IF(!blendInstructions());
+
+  VECZ_ERROR_IF(!DT->verify(), "DominatorTree incorrectly updated");
+  VECZ_ERROR_IF(!PDT->verify(), "PostDominatorTree incorrectly updated");
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::updatePHIsIncomings() {
+  // We need to update the incoming blocks of phi nodes whose predecessors may
+  // have changed since we have not changed the phi nodes during the rewiring.
+  for (const auto &BBTag : DR->getBlockOrdering()) {
+    BasicBlock *BB = BBTag.BB;
+    const SmallPtrSet<BasicBlock *, 4> preds(pred_begin(BB), pred_end(BB));
+    for (auto it = BB->begin(); it != BB->end();) {
+      Instruction &I = *it++;
+      PHINode *PHI = dyn_cast<PHINode>(&I);
+      if (!PHI) {
+        break;
+      }
+
+      const SmallPtrSet<BasicBlock *, 4> incomings(PHI->block_begin(),
+                                                   PHI->block_end());
+
+      // If no predecessor of `BB` is an incoming block of its PHI Node, then
+      // completely transform the PHI Node into multiple select instructions.
+      bool intersect = false;
+      for (BasicBlock *inc : incomings) {
+        for (BasicBlock *pred : preds) {
+          if (pred == inc) {
+            intersect = true;
+            break;
+          }
+        }
+        if (intersect) {
+          break;
+        }
+      }
+      if (!intersect) {
+        VECZ_FAIL_IF(!generateSelectFromPHI(PHI, BB));
+        continue;
+      }
+      // Otherwise, only transform the incoming blocks of predecessors that got
+      // linearized into selects.
+      //
+      // Instruction that will combine the phi node and the select instructions
+      // created from it if some incoming blocks are no longer predecessors.
+      Instruction *newBlend = nullptr;
+      const BasicBlock::iterator InsertPt = getInsertionPt(*BB);
+
+      auto &maskInfo = MaskInfos[BB];
+      const bool isEntryMask = PHI == maskInfo.entryMask;
+      for (unsigned idx = 0; idx < PHI->getNumIncomingValues(); ++idx) {
+        BasicBlock *incoming = PHI->getIncomingBlock(idx);
+        if (preds.contains(incoming)) {
+          continue;
+        }
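+        // Illustrative sketch (hypothetical IR): if %p2 is no longer a
+        // predecessor after linearization, a node such as
+        //   %x = phi i32 [ %a, %p1 ], [ %b, %p2 ]
+        // is rewritten below into a masked blend along the lines of
+        //   %x.blend = select i1 %p2.exit_mask, i32 %b, i32 %x
+        // (or an OR of the incoming masks when %x is the block's entry mask).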
+        // If the incoming block is no longer a predecessor, transform it into
+        // a select instruction, or a binary OR if it is an entry mask.
+        Value *V = PHI->getIncomingValue(idx);
+
+        if (isEntryMask) {
+          // The entry mask of a blend value should be the disjunction of
+          // the incoming masks, so change it.
+          if (!newBlend) {
+            newBlend =
+                BinaryOperator::CreateOr(PHI, V, BB->getName() + ".entry_mask");
+          } else {
+            newBlend = BinaryOperator::CreateOr(newBlend, V,
+                                                BB->getName() + ".entry_mask");
+          }
+          maskInfo.entryMask = newBlend;
+        } else {
+          Value *cond = MaskInfos[incoming].exitMasks[BB];
+          VECZ_ERROR_IF(!cond, "Exit mask does not exist");
+          if (!newBlend) {
+            newBlend =
+                SelectInst::Create(cond, V, PHI, PHI->getName() + ".blend");
+          } else {
+            newBlend = SelectInst::Create(cond, V, newBlend,
+                                          PHI->getName() + ".blend");
+          }
+        }
+        newBlend->insertBefore(InsertPt);
+        PHI->removeIncomingValue(idx--);
+      }
+
+      // If we have created select instructions from `PHI`, update the users
+      // of the latter.
+      if (newBlend) {
+        VECZ_FAIL_IF(!replaceReachableUses(*RC, PHI, newBlend, BB));
+        updateMaps(PHI, newBlend);
+      }
+
+      // And add any new incoming blocks that do not replace any previous.
+      for (BasicBlock *pred : preds) {
+        if (!incomings.contains(pred)) {
+          PHI->addIncoming(getDefaultValue(PHI->getType()), pred);
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::blendInstructions() {
+  LLVM_DEBUG(dbgs() << "CFC: BLEND INSTRUCTIONS\n");
+
+  auto addSuccessors = [this](const BasicBlockTag &BTag, BlockQueue &queue,
+                              DenseSet<BasicBlock *> &visited,
+                              const BasicBlockTag &dstTag) {
+    for (BasicBlock *succ : successors(BTag.BB)) {
+      // Allow latch if 'succ' belongs in 'dst's loop and 'dst' is the header
+      // of that loop.
+      const bool allowLatch =
+          dstTag.isLoopHeader() && dstTag.loop->loop->contains(succ);
+
+      if (!allowLatch && BTag.isLoopBackEdge(succ)) {
+        continue;
+      }
+
+      if (allowLatch) {
+        // The fast Reachability calculation can't follow back edges yet.
+        if (!DR->isReachable(succ, dstTag.BB, allowLatch)) {
+          continue;
+        }
+      } else if (!RC->isReachable(succ, dstTag.BB)) {
+        continue;
+      }
+
+      if (visited.insert(succ).second) {
+        LLVM_DEBUG(dbgs() << "\t\t\tInsert " << succ->getName()
+                          << " in the queue\n");
+        queue.push(DR->getTagIndex(succ));
+      }
+    }
+
+    // clang-format off
+    LLVM_DEBUG(
+      dbgs() << "\t\t\tWorklist: [";
+      if (!queue.empty()) {
+        dbgs() << DR->getBlockTag(*queue.begin()).BB->getName();
+        for (auto It = std::next(queue.begin()); It != queue.end(); ++It) {
+          dbgs() << ", " << DR->getBlockTag(*It).BB->getName();
+        }
+        dbgs() << "]\n";
+      }
+    );
+    // clang-format on
+  };
+
+  DenseMap<Instruction *, DenseMap<BasicBlock *, Value *>> blendMap;
+
+  auto getValueOfAt = [&blendMap](Instruction *opDef,
+                                  BasicBlock *B) -> Value * {
+    auto it = blendMap.find(opDef);
+    if (it != blendMap.end()) {
+      auto it2 = it->second.find(B);
+      if (it2 != it->second.end()) {
+        return it2->second;
+      }
+    }
+    return nullptr;
+  };
+
+  auto createBlend = [this, &blendMap, &getValueOfAt](
+                         BasicBlock *B, Instruction *opDef) -> Value * {
+    if (Value *V = getValueOfAt(opDef, B)) {
+      return V;
+    }
+
+    Type *T = opDef->getType();
+    const unsigned numPreds = std::distance(pred_begin(B), pred_end(B));
+    Value *blend = nullptr;
+    PHINode *PHI = PHINode::Create(T, numPreds, opDef->getName() + ".merge");
+    PHI->insertBefore(B->begin());
+
+    const auto *const LTag = DR->getTag(B).loop;
+    bool hasVisitedPred = false;
+    for (BasicBlock *pred : predecessors(B)) {
+      Value *incomingV = nullptr;
+      if (Value *predV = getValueOfAt(opDef, pred)) {
+        incomingV = predV;
+        hasVisitedPred = true;
+      } else {
+        // be the one coming from the preheader if that value dominates the
+        // latch and the latch has no definition of the value we are trying
+        // to blend.
+        if (DR->getTag(pred).isLoopBackEdge(B)) {
+          if (Value *preheaderV = getValueOfAt(opDef, LTag->preheader)) {
+            if (auto *instV = dyn_cast<Instruction>(preheaderV)) {
+              if (DT->dominates(instV->getParent(), pred)) {
+                incomingV = preheaderV;
+              }
+            } else {
+              incomingV = preheaderV;
+            }
+          }
+        }
+      }
+
+      if (!incomingV) {
+        incomingV = getDefaultValue(T);
+      }
+      PHI->addIncoming(incomingV, pred);
+    }
+    if (!hasVisitedPred) {
+      IRCleanup::deleteInstructionNow(PHI);
+      return nullptr;
+    }
+
+    if (PHI->hasConstantValue()) {
+      blend = PHI->getIncomingValue(0);
+      IRCleanup::deleteInstructionNow(PHI);
+    } else {
+      blend = PHI;
+      blends.insert(PHI);
+    }
+
+    blendMap[opDef][B] = blend;
+
+    return blend;
+  };
+
+  // Manually set the entry point of persisted loop live values and persisted
+  // loop exit masks.
+  for (auto *const LTag : DR->getLoopOrdering()) {
+    auto *const header = LTag->header;
+    for (Value *LLV : LTag->loopLiveValues) {
+      Instruction *LLVI = cast<Instruction>(LLV);
+      if (LLVI->getParent() != header) {
+        blendMap[LLVI][header] = LTag->loopResultPrevs[LLV];
+      }
+    }
+
+    auto &LMask = LoopMasks[LTag->loop];
+    for (auto &UPREM : LMask.updatedPersistedDivergentExitMasks) {
+      if (UPREM.first != header) {
+        blendMap[UPREM.second][header] =
+            LMask.persistedDivergentExitMasks[UPREM.first];
+      }
+    }
+  }
+
+  SmallPtrSet<Value *, 16> spareBlends;
+
+  for (const auto &dstTag : DR->getBlockOrdering()) {
+    BasicBlock *dst = dstTag.BB;
+    LLVM_DEBUG(dbgs() << "Blending instructions used in " << dst->getName()
+                      << ":\n");
+    for (Instruction &I : *dst) {
+      // Don't try to blend a blend value.
+      if (blends.contains(&I)) {
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "\tInstruction " << I << ":\n");
+
+      for (unsigned idx = 0; idx < I.getNumOperands(); ++idx) {
+        Instruction *opDef = dyn_cast<Instruction>(I.getOperand(idx));
+        if (!opDef) {
+          continue;
+        }
+
+        BasicBlock *src = opDef->getParent();
+
+        LLVM_DEBUG(dbgs() << "\t\tOperand " << *opDef << "\n\t\tdefined in "
+                          << src->getName() << ":\n");
+
+        blendMap[opDef][src] = opDef;
+
+        // There exist two possible ways to exit the blending early:
+        // - the current block dominates 'dst';
+        // - the current block dominates the incoming block of the phi node
+        //   'I' we are blending in 'dst'.
+        //
+        // 'dst' can freely access the values of 'src'.
+        if (DT->dominates(src, dst)) {
+          LLVM_DEBUG(dbgs() << "\t\t\tDefinition dominates use\n");
+          continue;
+        }
+        // The incoming block of this phi node is dominated by the definition
+        // block of the incoming value.
+        BasicBlock *incoming = nullptr;
+        if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+          incoming = PHI->getIncomingBlock(idx);
+          if (DT->dominates(src, incoming)) {
+            LLVM_DEBUG(dbgs() << "\t\t\tDefinition dominates use\n");
+            continue;
+          }
+        }
+
+        DenseSet<BasicBlock *> visited;
+        BlockQueue queue(*DR);
+
+        const auto &srcTag = DR->getTag(src);
+
+        addSuccessors(srcTag, queue, visited, dstTag);
+
+        auto *const srcLoop = srcTag.loop;
+        if (srcLoop && srcLoop->isLoopDivergent()) {
+          if (dst != srcLoop->header) {
+            auto &srcMasks = LoopMasks[srcLoop->loop];
+            const auto &headerTag = DR->getTag(srcLoop->header);
+
+            // If 'opDef' is an updated loop exit mask, set an entry point in
+            // the loop header.
+ auto UPREMIt = + srcMasks.updatedPersistedDivergentExitMasks.find(src); + if (UPREMIt != srcMasks.updatedPersistedDivergentExitMasks.end()) { + if (UPREMIt->second == opDef) { + LLVM_DEBUG(dbgs() + << "\t\t\tFound persisted value of the operand: " + << srcMasks.persistedDivergentExitMasks[src] + << "\n"); + addSuccessors(headerTag, queue, visited, dstTag); + } + } + // If 'opDef' is a loop live value, set an entry point in the loop + // header. + if (srcLoop->loopLiveValues.contains(opDef)) { + LLVM_DEBUG(dbgs() + << "\t\t\tFound persisted value of the operand: " + << srcLoop->loopResultPrevs[opDef] << "\n"); + addSuccessors(headerTag, queue, visited, dstTag); + } + } + } + + while (!queue.empty()) { + const BasicBlockTag &curTag = queue.pop(); + BasicBlock *const cur = curTag.BB; + + LLVM_DEBUG(dbgs() << "\t\t\tPopping " << cur->getName() << "\n"); + + // We have reached 'dst' without finding a block that dominates it, + // we need to create a phi node if the user is not one, and replace + // the operand with the last blended value. + if (cur == dst) { + LLVM_DEBUG(dbgs() << "\t\t\tReached destination: "); + VECZ_ERROR_IF(!queue.empty(), "Blocks remain in the queue"); + if (PHINode *PHI = dyn_cast(&I)) { + BasicBlock *incoming = PHI->getIncomingBlock(idx); + Value *V = getValueOfAt(opDef, incoming); + VECZ_ERROR_IF(!V, "No blend value was found"); + I.setOperand(idx, V); + } else { + Value *blend = createBlend(cur, opDef); + VECZ_ERROR_IF(!blend, "No blend value was found"); + spareBlends.erase(blend); + I.setOperand(idx, blend); + } + LLVM_DEBUG(dbgs() << "new operand: " << *I.getOperand(idx) << "\n"); + break; + } + + const bool curDomDst = DT->dominates(cur, dst); + const bool curDomInc = incoming && DT->dominates(cur, incoming); + const bool srcDomCur = DT->dominates(src, cur); + + auto &opDefBlend = blendMap[opDef]; + // If either condition is true, we can early exit: + // - 'dst' can freely access the values of 'cur', + // - 'incoming' can freely access the values of 'cur'. + if ((curDomDst || curDomInc) && queue.empty()) { + LLVM_DEBUG(dbgs() << "\t\t\tBlock " << cur->getName() + << " dominates destination: "); + if (srcDomCur) { + auto *const blend = opDefBlend[src]; + opDefBlend[cur] = blend; + I.setOperand(idx, blend); + } else { + auto *const blend = createBlend(cur, opDef); + VECZ_ERROR_IF(!blend, "No blend value was found"); + spareBlends.erase(blend); + I.setOperand(idx, blend); + } + LLVM_DEBUG(dbgs() << "new operand: " << *I.getOperand(idx) << "\n"); + break; + } + + addSuccessors(curTag, queue, visited, dstTag); + + // 'cur' can freely access 'opDef'. + if (srcDomCur) { + // DANGER! operator[] returns a reference, which may be invalidated + // by a second call to it. Therefore we have to copy the value via + // a temporary variable. + auto *const blendSrc = opDefBlend[src]; + opDefBlend[cur] = blendSrc; + continue; + } + + // 'cur' does not have a blend value of 'opDef' so create one. 
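+          // For intuition, an illustrative sketch (not from the original
+          // sources): in the diamond
+          //        A
+          //       / \
+          //      B   C
+          //       \ /
+          //        D
+          // a definition %v in B does not dominate a use in D, so the
+          // traversal eventually pops D and a blend
+          //   %v.merge = phi [ %v, B ], [ <default>, C ]
+          // is created there (see createBlend above) to stand in for %v.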
+ Value *blend = createBlend(cur, opDef); + VECZ_ERROR_IF(!blend, "No blend value was found"); + if (isa(blend)) { + spareBlends.insert(blend); + } + } + } + } + } + + for (auto *blend : spareBlends) { + auto *I = cast(blend); + if (I->use_empty()) { + IRCleanup::deleteInstructionNow(I); + } + } + + return true; +} + +bool ControlFlowConversionState::Impl::simplifyMasks() { + const SimplifyQuery Q(F.getParent()->getDataLayout(), nullptr, DT); + + // We might like to just look at the masks pointed to by the block/loop tags, + // however linearization and/or BOSCC can sometimes delete them from under + // our nose so it's only safe just to go through all the boolean operations + // and see if we can simplify any of them. + for (const auto &BBTag : DR->getBlockOrdering()) { + SmallVector toDelete; + for (auto &I : *BBTag.BB) { + if (isa(&I) || (I.getType()->getScalarSizeInBits() == 1 && + (isa(&I) || + isa(&I) || isa(&I)))) { + if (I.use_empty()) { + toDelete.push_back(&I); + } else { + Value *simpleMask = simplifyInstruction(&I, Q); + if (simpleMask && simpleMask != &I) { + I.replaceAllUsesWith(simpleMask); + toDelete.push_back(&I); + } + } + } + } + for (auto *I : toDelete) { + IRCleanup::deleteInstructionNow(I); + } + } + + return true; +} + +bool ControlFlowConversionState::computeBlockOrdering() { + LLVM_DEBUG(dbgs() << "CFC: COMPUTE BLOCK ORDERING\n"); + RC->clear(); + return DR->computeBlockOrdering(*DT); +} + +bool ControlFlowConversionState::Impl::checkBlocksOrder() const { + const auto &DCBI = DR->getBlockOrdering(); + VECZ_ERROR_IF(F.size() != DCBI.size(), + "Worklist does not contain all blocks"); + + uint32_t next = 0u; + for (const auto &BBTag : DCBI) { + VECZ_ERROR_IF(BBTag.pos != next, + "BasicBlock indices not in consecutive order"); + ++next; + } + + return true; +} + +void ControlFlowConversionState::Impl::updateMaps(Value *from, Value *to) { + // Because we keep track of mapping values between uniform and predicated + // version, since we replace 'from' with 'to', we also have to update + // the hashtable. + if (BOSCC) { + BOSCC->updateValue(from, to); + } + + // Because we keep track of loop live values, since we replace 'from' with + // 'to', we also have to update the hashset. + for (auto *const LTag : DR->getLoopOrdering()) { + if (LTag->loopLiveValues.erase(from)) { + LTag->loopLiveValues.insert(to); + auto LRPIt = LTag->loopResultPrevs.find(from); + if (LRPIt != LTag->loopResultPrevs.end()) { + PHINode *from = LRPIt->second; + LTag->loopResultPrevs.erase(LRPIt); + LTag->loopResultPrevs[to] = from; + } + auto LRUIt = LTag->loopResultUpdates.find(from); + if (LRUIt != LTag->loopResultUpdates.end()) { + SelectInst *select = LRUIt->second; + LTag->loopResultUpdates.erase(LRUIt); + LTag->loopResultUpdates[to] = select; + } + } + } +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp new file mode 100644 index 0000000000000..3953e4257e1a9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp @@ -0,0 +1,132 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/inline_post_vectorization_pass.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "vecz/vecz_choices.h" + +using namespace llvm; +using namespace vecz; + +namespace { +/// @brief Process a call site, inlining it or marking it as needing inlining +/// if required. +/// +/// @param[in] CI Call site to inspect. +/// @param[out] NeedLLVMInline Whether the call site needs LLVM inlining. +/// @param[in] BI Builtin database. +/// +/// @return New return value for the call instruction. +Value *processCallSite(CallInst *CI, bool &NeedLLVMInline, + compiler::utils::BuiltinInfo &BI) { + NeedLLVMInline = false; + + Function *Callee = CI->getCalledFunction(); + if (!Callee) { + return CI; + } + + // Mark called function as needing inlining by LLVM, unless it has the + // NoInline attribute + if (!Callee->isDeclaration() && + !Callee->hasFnAttribute(Attribute::NoInline)) { + CI->addFnAttr(Attribute::AlwaysInline); + NeedLLVMInline = true; + return CI; + } + + // Emit builtins inline when they have no vector/scalar equivalent. + IRBuilder<> B(CI); + const auto Builtin = BI.analyzeBuiltin(*Callee); + if (Builtin && Builtin->properties & + compiler::utils::eBuiltinPropertyInlinePostVectorization) { + const SmallVector Args(CI->args()); + if (Value *Impl = BI.emitBuiltinInline(Callee, B, Args)) { + VECZ_ERROR_IF( + Impl->getType() != CI->getType(), + "The inlined function type must match that of the original function"); + return Impl; + } + } + + return CI; +} + +} // namespace + +PreservedAnalyses +InlinePostVectorizationPass::run(Function &F, FunctionAnalysisManager &AM) { + bool modified = false; + bool needToRunInliner = false; + auto &BI = + AM.getResult(F).getContext().builtins(); + + SmallVector ToDelete; + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + // Look for calls to builtins with no vector/scalar equivalent. + CallInst *CI = dyn_cast(&I); + if (!CI) { + continue; + } + + bool NeedLLVMInline = false; + Value *NewCI = processCallSite(CI, NeedLLVMInline, BI); + needToRunInliner |= NeedLLVMInline; + if ((NewCI == CI) || !NewCI) { + continue; + } + + if (!CI->getType()->isVoidTy()) { + CI->replaceAllUsesWith(NewCI); + } + ToDelete.push_back(CI); + modified = true; + } + } + + // Clean up. + while (!ToDelete.empty()) { + Instruction *I = ToDelete.pop_back_val(); + I->eraseFromParent(); + } + + // Run the LLVM inliner if some calls were marked as needing inlining. + if (needToRunInliner) { + llvm::legacy::PassManager PM; + PM.add(llvm::createAlwaysInlinerLegacyPass()); + PM.run(*F.getParent()); + modified = true; + } + + // Recursively run the pass to inline any newly introduced functions. 
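+  // Note: the recursion below only happens when this round actually inlined
+  // a call or emitted a builtin body. E.g. (hypothetical names) inlining
+  // foo() may expose a call to bar(); the next round handles bar(), and the
+  // round after that finds nothing left to do and stops.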
+ if (modified) { + run(F, AM); + } + + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp new file mode 100644 index 0000000000000..4235885c8a564 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp @@ -0,0 +1,351 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/instantiation_pass.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "analysis/instantiation_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "llvm_helpers.h" +#include "memory_operations.h" +#include "transform/packetization_helpers.h" +#include "transform/packetizer.h" +#include "vectorization_context.h" +#include "vecz/vecz_choices.h" + +#define DEBUG_TYPE "vecz-instantiation" + +#undef VECZ_FAIL +#define VECZ_FAIL() return packetizer.getEmptyRange(); + +using namespace vecz; +using namespace llvm; + +STATISTIC(VeczInstantiated, "Number of instructions instantiated [ID#I00]"); +STATISTIC(VeczPacketizeFailInstantiate, + "Packetize: instantiation failures [ID#P84]"); + +InstantiationPass::InstantiationPass(Packetizer &pp) + : Ctx(pp.context()), packetizer(pp) {} + +PacketRange InstantiationPass::instantiate(Value *V) { + VECZ_FAIL_IF(packetizer.width().isScalable()); + if (auto info = packetizer.getPacketized(V)) { + const unsigned SimdWidth = packetizer.width().getFixedValue(); + return info.getAsPacket(SimdWidth); + } + + // Handle uniform values first, which instantiate to the same value for all + // items. + auto *Ins = dyn_cast(V); + if (Ins && packetizer.uniform().isMaskVarying(V)) { + const PacketRange P = simdBroadcast(Ins); + if (!P) { + emitVeczRemark(&packetizer.function(), V, + "Failed to broadcast Mask Varying instruction"); + VECZ_FAIL(); + } + return assignInstance(P, V); + } + + if (!packetizer.uniform().isVarying(V)) { + return assignInstance(broadcast(V), V); + } + + if (Ins) { + return instantiateInstruction(Ins); + } + + VECZ_STAT_FAIL_IF(true, VeczPacketizeFailInstantiate); +} + +PacketRange InstantiationPass::instantiateInternal(Value *V) { + if (packetizer.uniform().isVarying(V)) { + // The packetizer will call back into the instantiator when it needs to + VECZ_FAIL_IF(packetizer.width().isScalable()); + const unsigned SimdWidth = packetizer.width().getFixedValue(); + return packetizer.packetize(V).getAsPacket(SimdWidth); + } else { + return instantiate(V); + } +} + +PacketRange InstantiationPass::instantiateInstruction(Instruction *Ins) { + // Figure out what kind of instruction it is and try to instantiate it. 
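+  // Illustrative example (not from the original sources): with SimdWidth 4,
+  //   %sum = add i32 %a, %b
+  // is instantiated by cloning into four scalar adds, one per lane, whereas
+  // calls and allocas take the dedicated paths below.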
+  switch (Ins->getOpcode()) {
+    default:
+      // No special handling for this Instruction, so just clone it across
+      // the lanes.
+      break;
+
+    case Instruction::Call:
+      return assignInstance(instantiateCall(cast<CallInst>(Ins)), Ins);
+
+    case Instruction::Alloca:
+      return assignInstance(instantiateAlloca(cast<AllocaInst>(Ins)), Ins);
+  }
+
+  return assignInstance(instantiateByCloning(Ins), Ins);
+}
+
+PacketRange InstantiationPass::assignInstance(const PacketRange P, Value *V) {
+  if (!P) {
+    emitVeczRemarkMissed(&packetizer.function(), V, "Could not instantiate");
+    VECZ_STAT_FAIL_IF(!P, VeczPacketizeFailInstantiate);
+  } else {
+    ++VeczInstantiated;
+  }
+  return P;
+}
+
+PacketRange InstantiationPass::broadcast(Value *V) {
+  VECZ_FAIL_IF(packetizer.width().isScalable());
+  const unsigned SimdWidth = packetizer.width().getFixedValue();
+  PacketRange P = packetizer.createPacket(V, SimdWidth);
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    P[i] = V;
+  }
+  return P;
+}
+
+PacketRange InstantiationPass::instantiateCall(CallInst *CI) {
+  VECZ_FAIL_IF(packetizer.width().isScalable());
+  const unsigned SimdWidth = packetizer.width().getFixedValue();
+  // Handle special call instructions that return a lane ID.
+  const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  const auto Builtin = BI.analyzeBuiltinCall(*CI, packetizer.dimension());
+  if (Builtin &&
+      Builtin->properties & compiler::utils::eBuiltinPropertyWorkItem) {
+    const auto Uniformity = Builtin->uniformity;
+    if (Uniformity == compiler::utils::eBuiltinUniformityNever) {
+      // Can't handle these (global/local linear ID, probably).
+      VECZ_FAIL();
+    } else if (Uniformity & compiler::utils::eBuiltinUniformityInstanceID) {
+      Type *RetTy = CI->getType();
+      PacketRange P = packetizer.createPacket(CI, SimdWidth);
+      VECZ_FAIL_IF(!P);
+      IRBuilder<> B(CI);
+      for (unsigned j = 0; j < SimdWidth; j++) {
+        P[j] = B.CreateAdd(CI, ConstantInt::get(RetTy, j));
+      }
+      packetizer.deleteInstructionLater(CI);
+      return P;
+    }
+  }
+
+  // We can't instantiate noduplicate functions.
+  VECZ_FAIL_IF(CI->hasFnAttr(Attribute::NoDuplicate));
+
+  packetizer.deleteInstructionLater(CI);
+  // Check if the instruction has any uses or not, and also if we want to
+  // instantiate call instructions with loops or not.
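+  // Sketch of the two strategies that follow (illustrative): a call whose
+  // result is used, e.g. %r = call i32 @f(i32 %x), must yield one value per
+  // lane and is unrolled into SimdWidth separate calls; a use-less call may
+  // instead be replayed in a small runtime loop when the choices allow it,
+  // which keeps code size independent of the SIMD width.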
+  if (CI->hasNUsesOrMore(1) ||
+      !packetizer.choices().instantiateCallsInLoops()) {
+    // Instantiate each lane as a separate call.
+    SmallVector<PacketRange> OpPackets;
+    for (unsigned i = 0; i < CI->arg_size(); i++) {
+      Value *Op = CI->getArgOperand(i);
+      const PacketRange OpPacket = instantiateInternal(Op);
+      VECZ_FAIL_IF(!OpPacket);
+      OpPackets.push_back(OpPacket);
+    }
+    PacketRange P = packetizer.createPacket(CI, SimdWidth);
+    VECZ_FAIL_IF(!P);
+    IRBuilder<> B(CI);
+    for (unsigned j = 0; j < SimdWidth; j++) {
+      SmallVector<Value *> Ops;
+      for (unsigned i = 0; i < CI->arg_size(); i++) {
+        Ops.push_back(OpPackets[i][j]);
+      }
+      auto *NewCI = B.CreateCall(CI->getFunctionType(), CI->getCalledOperand(),
+                                 Ops, CI->getName());
+      NewCI->setCallingConv(CI->getCallingConv());
+      NewCI->setAttributes(CI->getAttributes());
+      P[j] = NewCI;
+    }
+    return P;
+  } else {
+    // Instantiate in a loop.
+    BasicBlock *BeforeCI = CI->getParent();
+    BasicBlock *AfterCI = SplitBlock(BeforeCI, CI);
+    BasicBlock *LoopHeader = BasicBlock::Create(
+        CI->getContext(), "instloop.header", CI->getFunction(), AfterCI);
+    BasicBlock *LoopBody = BasicBlock::Create(
+        CI->getContext(), "instloop.body", CI->getFunction(), AfterCI);
+
+    // Change the branch instruction from BeforeCI -> AfterCI to
+    // BeforeCI -> LoopHeader.
+    BeforeCI->getTerminator()->setSuccessor(0, LoopHeader);
+
+    IRBuilder<> B(LoopHeader);
+    // Create the induction variable.
+    PHINode *Ind = B.CreatePHI(B.getInt32Ty(), 2, "instance");
+
+    // Create the conditional jump based on the current iteration number.
+    Value *ICmp = B.CreateICmpULT(Ind, B.getInt32(SimdWidth));
+    B.CreateCondBr(ICmp, LoopBody, AfterCI);
+
+    B.SetInsertPoint(LoopBody);
+    SmallVector<Value *> Operands;
+    for (auto &Arg : CI->args()) {
+      // We call the packetizer explicitly, instead of calling the
+      // instantiator, because we need a packetized value and not an
+      // instantiated one.
+      Value *Packetized = packetizer.packetize(Arg).getAsValue();
+      VECZ_FAIL_IF(!Packetized);
+      VECZ_ERROR_IF(!Packetized->getType()->isVectorTy(),
+                    "The packetized Value has to be of a vector type");
+      Operands.push_back(Packetized);
+    }
+    // Each Op is an element extracted from a packetized instruction.
+    SmallVector<Value *> Ops;
+    for (unsigned i = 0; i < Operands.size(); ++i) {
+      Ops.push_back(B.CreateExtractElement(Operands[i], Ind));
+    }
+    // Create the function call.
+    auto CO = CI->getCalledOperand();
+    FunctionType *FTy = CI->getFunctionType();
+    CallInst *NewCI = B.CreateCall(FTy, CO, Ops);
+    NewCI->setCallingConv(CI->getCallingConv());
+    NewCI->setAttributes(CI->getAttributes());
+    // Increment the induction variable and jump back to the loop header.
+    Value *IndInc = B.CreateAdd(Ind, B.getInt32(1), "");
+    B.CreateBr(LoopHeader);
+
+    // Set the operands of the phi node in the loop header.
+    Ind->addIncoming(B.getInt32(0), BeforeCI);
+    Ind->addIncoming(IndInc, LoopBody);
+
+    // Set the Packet, even though we are not going to be using this value
+    // (we have checked that the call has 0 users), so we don't need to
+    // populate it.
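+    // Schematically, the generated control flow is (illustrative):
+    //   BeforeCI:          br instloop.header
+    //   instloop.header:   %instance = phi [ 0, BeforeCI ], [ %inc, body ]
+    //                      %cond = icmp ult i32 %instance, SimdWidth
+    //                      br %cond, instloop.body, AfterCI
+    //   instloop.body:     <extract lane %instance of each arg, make call>
+    //                      %inc = add i32 %instance, 1
+    //                      br instloop.header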
+ return packetizer.createPacket(CI, SimdWidth); + } +} + +PacketRange InstantiationPass::instantiateAlloca(AllocaInst *Alloca) { + VECZ_FAIL_IF(packetizer.width().isScalable()); + const unsigned SimdWidth = packetizer.width().getFixedValue(); + PacketRange P = packetizer.createPacket(Alloca, SimdWidth); + VECZ_FAIL_IF(!P); + IRBuilder<> B(Alloca); + for (unsigned i = 0; i < SimdWidth; i++) { + Type *Ty = Alloca->getAllocatedType(); + AllocaInst *New = B.CreateAlloca(Ty, nullptr, Alloca->getName()); + New->setAlignment(Alloca->getAlign()); + + P[i] = New; + } + packetizer.deleteInstructionLater(Alloca); + return P; +} + +PacketRange InstantiationPass::instantiateByCloning(Instruction *I) { + VECZ_FAIL_IF(packetizer.width().isScalable()); + auto SimdWidth = packetizer.width().getFixedValue(); + PacketRange P = packetizer.createPacket(I, SimdWidth); + if (!P || P.at(SimdWidth - 1)) { + return P; + } + + // Clone breadth first so that the packet is complete before fixing up the + // operands, that way we get less stack-thrashing, especially when there + // is a circular dependency. + SmallVector Clones; + for (decltype(SimdWidth) i = 0; i < SimdWidth; ++i) { + if (P.at(i)) { + Clones.push_back(nullptr); + continue; + } + Instruction *Clone = I->clone(); + Clone->insertBefore(I->getIterator()); + P[i] = Clone; + Clones.push_back(Clone); + } + + for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) { + Value *V = I->getOperand(i); + if (isa(V) || isa(V)) { + continue; + } + + if (const auto OpP = instantiateInternal(V)) { + for (decltype(SimdWidth) lane = 0; lane < SimdWidth; ++lane) { + if (auto *Clone = Clones[lane]) { + if (auto *At = OpP.at(lane)) { + Clone->setOperand(i, At); + } + } + } + } else { + VECZ_FAIL(); + } + } + + packetizer.deleteInstructionLater(I); + return P; +} + +PacketRange InstantiationPass::simdBroadcast(Instruction *I) { + VECZ_FAIL_IF(packetizer.width().isScalable()); + auto SimdWidth = packetizer.width().getFixedValue(); + PacketRange P = packetizer.createPacket(I, SimdWidth); + if (!P || P.at(0)) { + return P; + } + + for (auto &i : P) { + i = I; + } + + auto Op = MemOp::get(I); + if (!Op || !Op->getMaskOperand()) { + return P; + } + + if (auto *MaskInst = dyn_cast(Op->getMaskOperand())) { + const auto MP = instantiateInternal(MaskInst); + VECZ_FAIL_IF(!MP); + + auto W = SimdWidth; + SmallVector Reduce; + for (decltype(SimdWidth) i = 0; i < SimdWidth; i++) { + Reduce.push_back(MP.at(i)); + } + + IRBuilder<> B(buildAfter(Reduce.back(), packetizer.function())); + while ((W >>= 1)) { + for (decltype(W) i = 0; i < W; ++i) { + Reduce[i] = B.CreateOr(Reduce[i], Reduce[i + W], "any_of_mask"); + } + } + Op->setMaskOperand(Reduce.front()); + } + + return P; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp new file mode 100644 index 0000000000000..f2308d7bb050f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp @@ -0,0 +1,548 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/interleaved_group_combine_pass.h" + +#include +#include +#include +#include +#include +#include + +#include + +#include "analysis/uniform_value_analysis.h" +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "ir_cleanup.h" +#include "memory_operations.h" + +#define DEBUG_TYPE "vecz" + +using namespace llvm; +using namespace vecz; + +char InterleavedGroupCombinePass::PassID = 0; + +struct GroupMemberInfo { + int64_t Offset; + int64_t Order; + CallInst *MemOp; + Value *Ptr; + Type *DataTy; +}; + +/// @brief Information about an interleaved operation. +struct InterleavedGroupCombinePass::InterleavedOpInfo { + /// @brief Interleaved operation. + CallInst *Op; + /// @brief Kind of interleaved operation. + InterleavedOperation Kind; + /// @brief Interleaved stride. + int Stride; + /// @brief Whether the operation was removed or not. + bool Removed; +}; + +struct InterleavedGroupCombinePass::InterleavedGroupInfo { + BasicBlock *BB = nullptr; + SmallVector Data; + SmallVector Info; + Value *Base = nullptr; + unsigned Stride = 0; + int Offset = 0; + InterleavedOperation Kind = eInterleavedInvalid; + + void clear() { + BB = nullptr; + Data.clear(); + Info.clear(); + Base = nullptr; + Stride = 0; + Offset = 0; + Kind = eInterleavedInvalid; + } + + bool isConsecutive() const { + auto InfoIt = Info.begin(); + auto InfoE = Info.end(); + assert(InfoIt != InfoE); + int ExpectedOffset = Info.front().Offset; + for (++InfoIt; InfoIt != InfoE; ++InfoIt) { + if (InfoIt->Offset != ++ExpectedOffset) { + return false; + } + } + return true; + } + + bool canDeinterleaveMask(const Instruction &Mask) const; +}; + +namespace { + +bool canSwap(Instruction *IA, Instruction *IB) { + // we need to check for usage-relations here, because a load instruction + // might depend on a mask calculation and its uses that might end up + // swapped + for (auto *const Op : IB->operand_values()) { + if (isa(Op)) { + // GEPs get eliminated later so ignore them for now + continue; + } + if (Op == IA) { + return false; + } + } + + if (IA->mayReadOrWriteMemory()) { + if (isa(IB)) { + // can't swap any memory operation with a fence + return false; + } + } else { + // if either instruction is not a memory operation, we can swap them. + return true; + } + + if (IB->mayReadOrWriteMemory()) { + if (isa(IA)) { + return false; + } + } else { + return true; + } + + // can't swap a write with a write, or a write with a read, + // but it should be ok to swap two reads + if (IA->mayWriteToMemory() || IB->mayWriteToMemory()) { + return false; + } + + return true; +} + +bool canMoveUp(const SmallVectorImpl &Group, Instruction *IB) { + auto Ig = Group.rbegin(); + auto Ie = Group.rend(); + Instruction *IA = IB; + + // It looks through all preceding instructions, skipping over any that are + // already in the Group, until it reaches the first member of the group, + // terminating if it can't move IB through the current instruction. 
+ // If it reaches the first member of the Group, it is safe to move IB there. + while ((IA = IA->getPrevNode())) { + if (IA == *Ig) { + if (++Ig == Ie) { + // we met every group member so we're done + return true; + } + } else if (!canSwap(IA, IB)) { + return false; + } + } + // if we get here, it means we didn't pass any of the other group members, + // which shouldn't be able to happen. + assert(false); + return false; +} + +bool canMoveDown(const SmallVectorImpl &Group, Instruction *IA) { + auto Ig = Group.rbegin(); + auto Ie = Group.rend(); + Instruction *IB = IA; + + // It looks through all following instructions, skipping over any that are + // already in the Group, until it reaches the first member of the group, + // terminating if it can't move IA through the current instruction. + // If it reaches the first member of the Group, it is safe to move IA there. + while ((IB = IB->getNextNode())) { + if (IB == *Ig) { + if (++Ig == Ie) { + // we met every group member so we're done + return true; + } + } else if (!canSwap(IA, IB)) { + return false; + } + } + // if we get here, it means we didn't pass any of the other group members, + // which shouldn't be able to happen. + assert(false); + return false; +} + +} // namespace + +bool InterleavedGroupCombinePass::InterleavedGroupInfo::canDeinterleaveMask( + const Instruction &Mask) const { + // If the mask definition is not in the same block as the group members, it + // is safe to de-interleave. + if (Mask.getParent() != BB) { + return true; + } + + SmallPtrSet Ops; + for (auto &Op : Mask.operands()) { + if (auto *OpI = dyn_cast(Op.get())) { + // We only care about operands in the same basic block, since otherwise + // they cannot be group members or in between group members. + if (OpI->getParent() == BB) { + Ops.insert(OpI); + } + } + } + + // If the mask has no dependency on anything in the group basic block, it is + // safe to de-interleave. + if (Ops.empty()) { + return true; + } + + // Note that the mask can hardly depend on the last group member, since it is + // itself an operand of this member. + Instruction *IA = cast(Data.back()); + + // It looks through all instructions from the last member of the group + // back to the first, looking to see if the mask depends on any of them. + // If it reaches the first member of the Group, it is safe to move the mask. + // If it finds any of the mask's own operands as group members or in + // between group members, the mask cannot be (trivially) moved. + while (IA) { + if (Ops.contains(IA)) { + // We found something the mask depends on, so we can't de-interleave... + return false; + } else if (IA == Data.front()) { + // we met every group member so we're done + return true; + } + IA = IA->getPrevNode(); + } + + // the mask definition was before every group member + return true; +} + +PreservedAnalyses +InterleavedGroupCombinePass::run(Function &F, FunctionAnalysisManager &AM) { + auto &Ctx = AM.getResult(F).getContext(); + IRCleanup IC; + + const bool IsLoad = + (Kind == eInterleavedLoad) || (Kind == eMaskedInterleavedLoad); + + LLVM_DEBUG(dbgs() << "vecz: InterleavedGroupCombinePass on " << F.getName() + << "\n"); + + scalarEvolution = &AM.getResult(F); + + UniformValueResult &UVR = AM.getResult(F); + const auto &DL = F.getParent()->getDataLayout(); + std::vector InterleavedOps; + for (BasicBlock &BB : F) { + // Look for interleaved operations. 
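+    // For example (illustrative): vectorizing accesses to the .x field of a
+    //   struct { float x; float y; }
+    // array typically produces a load with constant stride 2, reading every
+    // other element (ptr[0], ptr[2], ptr[4], ...); such operations are
+    // collected here as candidates for combining.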
+ for (Instruction &I : BB) { + CallInst *CI = dyn_cast(&I); + if (!CI) { + continue; + } + + std::optional Op = MemOp::get(CI); + // We can't optimize interleaved memops if we don't know the stride at + // runtime, since we need to check if the stride and the group size match. + if (!Op || !Op->isStrideConstantInt()) { + continue; + } + const int64_t Stride = Op->getStrideAsConstantInt(); + if ((Stride == 0) || (Stride == 1)) { + continue; + } + Value *Mask = Op->getMaskOperand(); + InterleavedOpInfo Info; + + const bool OpIsLoad = Op->isLoad(); + if (OpIsLoad) { + Info.Kind = Mask ? eMaskedInterleavedLoad : eInterleavedLoad; + } else { + Info.Kind = Mask ? eMaskedInterleavedStore : eInterleavedStore; + } + Info.Op = CI; + Info.Stride = Stride; + Info.Removed = false; + + // only add the interleaved operation kinds we actually care about + if (IsLoad == OpIsLoad) { + InterleavedOps.push_back(Info); + } + } + if (!InterleavedOps.empty()) { + if (Kind == eInterleavedStore) { + // stores are collated downwards, so reverse the list.. + std::reverse(InterleavedOps.begin(), InterleavedOps.end()); + } + + InterleavedGroupInfo Group; + Group.BB = &BB; + + while (findGroup(InterleavedOps, UVR, Group)) { + // Loads have their uses afterwards, while stores use preceding values. + // Group.Info is in forwards order for Loads, reverse order for Stores. + IRBuilder<> B(Group.Info.front().MemOp); + + Value *Base = Group.Base; + if (Kind == eInterleavedLoad && Group.Offset != 0) { + auto *EltTy = Group.Info.front().DataTy->getScalarType(); + // if it's a Load group that was out of order, we have to use the + // sequentially first GEP in order to preserve use-def ordering, + // which means we have to offset it with an additional GEP and + // hope this optimizes out later. + // Note that this is not necessary for Stores, since instructions + // are inserted at the last Store. + Base = Group.Info.front().Ptr; + auto *Offset = ConstantInt::getSigned( + DL.getIntPtrType(Base->getType()), Group.Offset); + + Base = B.CreateInBoundsGEP(EltTy, Base, Offset, "reorder_offset"); + } + + SmallVector Masks; + if (Group.Kind == eMaskedInterleavedStore || + Group.Kind == eMaskedInterleavedLoad) { + Masks.reserve(Group.Data.size()); + for (auto *V : Group.Data) { + std::optional Op = MemOp::get(cast(V)); + assert(Op && "Unanalyzable interleaved access?"); + Masks.push_back(Op->getMaskOperand()); + } + } + if (Ctx.targetInfo().optimizeInterleavedGroup( + B, Group.Kind, Group.Data, Masks, Base, Group.Stride)) { + for (Value *V : Group.Data) { + if (Instruction *Ins = dyn_cast(V)) { + IC.deleteInstructionLater(Ins); + } + } + } + + // Remove the group no matter whether we optimized it or not. Otherwise + // we will just iterate indefinitely. + for (const auto &Info : Group.Info) { + InterleavedOps[Info.Order].Removed = true; + } + } + InterleavedOps.clear(); + } + } + IC.deleteInstructions(); + + LLVM_DEBUG(dbgs() << "vecz: InterleavedGroupCombinePass done!\n"); + + PreservedAnalyses Preserved; + Preserved.preserve(); + Preserved.preserve(); + Preserved.preserve(); + + return Preserved; +} + +bool InterleavedGroupCombinePass::findGroup( + const std::vector &Ops, UniformValueResult &UVR, + InterleavedGroupInfo &Group) { + VECZ_FAIL_IF(Ops.empty()); + // this check keeps clang-tidy happy + VECZ_FAIL_IF(Kind != eInterleavedStore && Kind != eInterleavedLoad); + + auto &SE = *scalarEvolution; + + for (unsigned i = 0; i < Ops.size(); i++) { + // Extract the first memory instruction at the given offset. 
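+    // A worked example of the grouping below (illustrative): four stride-4
+    // float loads off one base at element offsets 0, 1, 2 and 3 form a
+    // consecutive group, which the target may then replace with a single
+    // contiguous load plus de-interleaving shuffles.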
+ const InterleavedOpInfo &Info0 = Ops[i]; + if (Info0.Removed) { + continue; + } + + Type *DataType0 = nullptr; + Value *Ptr0 = nullptr; + if (Kind == eInterleavedStore) { + DataType0 = Info0.Op->getOperand(0)->getType(); + Ptr0 = Info0.Op->getOperand(1); + } else if (Kind == eInterleavedLoad) { + DataType0 = Info0.Op->getType(); + Ptr0 = Info0.Op->getOperand(0); + } + + const IRBuilder<> B(cast(Info0.Op)); + Value *Base0 = UVR.extractMemBase(Ptr0); + if (!Base0) { + continue; + } + + PointerType *PtrTy = dyn_cast(Ptr0->getType()); + if (!PtrTy) { + continue; + } + + Type *EleTy = DataType0->getScalarType(); + const unsigned Align = EleTy->getScalarSizeInBits() / 8; + assert(Align != 0 && + "interleaved memory operation with zero-sized elements"); + + Group.clear(); + Group.Data.push_back(Info0.Op); + Group.Info.emplace_back(GroupMemberInfo{0, i, Info0.Op, Ptr0, DataType0}); + Group.Kind = Info0.Kind; + + // Try to find others that have the same stride and base pointer. + for (unsigned j = i + 1; j < Ops.size(); j++) { + const InterleavedOpInfo &InfoN = Ops[j]; + if (InfoN.Removed) { + continue; + } + + if (Group.Kind != InfoN.Kind) { + continue; + } + + Type *DataTypeN = nullptr; + Value *PtrN = nullptr; + if (Kind == eInterleavedStore) { + DataTypeN = InfoN.Op->getOperand(0)->getType(); + PtrN = InfoN.Op->getOperand(1); + } else if (Kind == eInterleavedLoad) { + DataTypeN = InfoN.Op->getType(); + PtrN = InfoN.Op->getOperand(0); + } + + if ((InfoN.Stride != Info0.Stride) || (DataTypeN != DataType0)) { + continue; + } + + const IRBuilder<> B(cast(InfoN.Op)); + Value *BaseN = UVR.extractMemBase(PtrN); + if (!BaseN || BaseN != Base0) { + continue; + } + + const SCEV *PtrDiff = SE.getMinusSCEV(SE.getSCEV(PtrN), SE.getSCEV(Ptr0)); + const auto *ConstDiff = dyn_cast(PtrDiff); + if (!ConstDiff) { + continue; + } + + // Note that the offset calculated here is a byte offset + int64_t Offset = ConstDiff->getAPInt().getSExtValue(); + if (Offset % Align == 0) { + // only add them to the group if it is possible to collate them together + // at the same place in the function + bool CanMove = false; + if (Kind == eInterleavedLoad) { + CanMove = canMoveUp(Group.Data, cast(InfoN.Op)); + + if (InfoN.Kind == eMaskedInterleavedLoad) { + std::optional Op = MemOp::get(InfoN.Op); + assert(Op && "Unanalyzable load?"); + if (auto *MaskInst = dyn_cast(Op->getMaskOperand())) { + CanMove &= Group.canDeinterleaveMask(*MaskInst); + } + } + } else if (Kind == eInterleavedStore) { + CanMove = canMoveDown(Group.Data, cast(InfoN.Op)); + } + + if (CanMove) { + Offset /= Align; + Group.Data.push_back(InfoN.Op); + Group.Info.emplace_back( + GroupMemberInfo{Offset, j, InfoN.Op, PtrN, DataTypeN}); + } + } + } + + if (Group.Data.size() > 1) { + auto InfoB = Group.Info.begin(); + auto InfoE = Group.Info.end(); + + if (Kind == eInterleavedStore) { + // In the case of stores, the instructions are processed in reverse + // order, so this just puts them back in forwards order + std::reverse(InfoB, InfoE); + } + + // Sort the group members in order of their offsets. Use a stable sort + // so that any duplicates don't get re-ordered (important for stores). + std::stable_sort( + InfoB, InfoE, + [](const GroupMemberInfo &a, const GroupMemberInfo &b) -> bool { + return a.Offset < b.Offset; + }); + + // If the same offset occurs several times, we can still de-interleave + // the unique ones, and maybe catch the rest the next time round. 
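+      // E.g. (illustrative) offsets {0, 1, 1, 2, 3}: std::unique keeps
+      // {0, 1, 2, 3} for this round, and the duplicate operation at offset 1
+      // is not marked Removed, so it is considered again on a later
+      // iteration.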
+ InfoE = + Group.Info.erase(std::unique(InfoB, InfoE, + [](const GroupMemberInfo &a, + const GroupMemberInfo &b) -> bool { + return a.Offset == b.Offset; + }), + InfoE); + + if (Group.Info.size() <= 1) { + // This could happen if our entire group has the same address, in + // which case "std::unique" removes all but the first element and we + // don't have a Group anymore. + continue; + } + + const unsigned Stride = Info0.Stride; + Group.Stride = Stride; + // If the group is bigger than the stride we can still de-interleave the + // first "Stride" members + if (Group.Info.size() > Stride) { + Group.Info.resize(Stride); + InfoB = Group.Info.begin(); + InfoE = Group.Info.end(); + } + + if (!Group.isConsecutive()) { + // The group of memory instructions was not consecutive, try further. + continue; + } + + // Everything is fine, return this group in offset-sorted order. + { + Group.Data.resize(Group.Info.size()); + auto InfoIt = InfoB; + for (auto &Op : Group.Data) { + assert(InfoIt != InfoE); + Op = (InfoIt++)->MemOp; + } + } + + Group.Base = Group.Info.front().Ptr; + Group.Offset = Group.Info.front().Offset; + + // Put the Info list back into original Ops vector order + // (reverse order for Stores) + std::sort(InfoB, InfoE, + [](const GroupMemberInfo &a, const GroupMemberInfo &b) -> bool { + return a.Order < b.Order; + }); + return true; + } + } + return false; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp new file mode 100644 index 0000000000000..b1274d91cf196 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp @@ -0,0 +1,40 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/passes.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Scalar/LoopRotation.h" + +using namespace llvm; + +llvm::PreservedAnalyses +vecz::VeczLoopRotatePass::run(llvm::Loop &L, llvm::LoopAnalysisManager &LAM, + llvm::LoopStandardAnalysisResults &AR, + llvm::LPMUpdater &LU) { + // Only process loops whose latch cannot exit the loop and its predecessors + // cannot either. 
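+  // In other words: rotation (see LoopRotatePass below) is only attempted
+  // for loops that still test for exit away from the latch, e.g. a
+  // while-style loop
+  //   header(exit test) -> body -> latch -> header
+  // which rotation turns into do-while form; loops that already exit at the
+  // latch, or at a predecessor of it, are returned unchanged.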
+ if (L.isLoopExiting(L.getLoopLatch())) { + return PreservedAnalyses::all(); + } + + for (BasicBlock *pred : predecessors(L.getLoopLatch())) { + if (L.contains(pred) && L.isLoopExiting(pred)) { + return PreservedAnalyses::all(); + } + } + + return LoopRotatePass().run(L, LAM, AR, LU); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp new file mode 100644 index 0000000000000..cbeb82b3c47f6 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp @@ -0,0 +1,728 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file contains all the code to perform, on demand, the plumbing between +// values that have been vectorized, vector-widened, instantiated, or +// semi-widened/instantiated (otherwise known as Vector Sub-Widening), +// including the broadcast of uniform values, scatters, gathers, vector splits +// and concatenations. + +#include "transform/packetization_helpers.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "debugging.h" +#include "transform/packetizer.h" +#include "vectorization_context.h" +#include "vectorization_unit.h" +#include "vecz/vecz_target_info.h" + +#define DEBUG_TYPE "vecz-packetization" + +using namespace llvm; +using namespace vecz; + +namespace { +Value *scalableBroadcastHelper(Value *subvec, ElementCount factor, + const vecz::TargetInfo &TI, IRBuilder<> &B, + bool URem); + +// Helper to broadcast a fixed vector thus: +// -> vscale x 1 -> +Value *createScalableBroadcastOfFixedVector(const vecz::TargetInfo &TI, + IRBuilder<> &B, Value *subvec, + ElementCount factor) { + assert(factor.isScalable()); + return scalableBroadcastHelper(subvec, factor, TI, B, /*URem*/ true); +} + +// Helper to broadcast a scalable vector thus: +// -> x 2 +Value *createFixedBroadcastOfScalableVector(const vecz::TargetInfo &TI, + IRBuilder<> &B, Value *subvec, + ElementCount factor) { + assert(!factor.isScalable()); + return scalableBroadcastHelper(subvec, factor, TI, B, /*URem*/ false); +} +} // namespace + +namespace vecz { +IRBuilder<> buildAfter(Value *V, Function &F, bool IsPhi) { + if (auto *const I = dyn_cast(V)) { + BasicBlock::iterator Next = I->getIterator(); + const BasicBlock::iterator End = Next->getParent()->end(); + do { + ++Next; + } while (!IsPhi && (Next != End) && + (isa(Next) || isa(Next))); + // If there is debug info between this instruction and the next, insert + // before the debug info. This is required for PHIs and makes sense for + // other instructions too. + Next.setHeadBit(true); + return {I->getParent(), Next}; + } + // Else find the first point in the function after any allocas. 
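+  // (V is not an Instruction on this path, e.g. an Argument or Constant, so
+  // any entry-block point after the allocas dominates every possible use.)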
+ auto it = F.getEntryBlock().begin(); + while (isa(*it)) { + ++it; + } + return {&F.getEntryBlock(), it}; +} + +static Constant *getShuffleMask(ShuffleVectorInst *shuffle) { + // The mask value seems not to be a proper operand for LLVM 11. + // NOTE this is marked as "temporary" in the docs! + return shuffle->getShuffleMaskForBitcode(); +} + +Value *createOptimalShuffle(IRBuilder<> &B, Value *srcA, Value *srcB, + const SmallVectorImpl &mask, + const Twine &name) { + const auto &maskC = mask; + auto *shuffleA = dyn_cast(srcA); + // If we have a unary shuffle of a shuffle, we can just pre-shuffle the masks + if (shuffleA && isa(srcB)) { + auto *const srcMask = getShuffleMask(shuffleA); + auto *const newMask = ConstantExpr::getShuffleVector( + srcMask, PoisonValue::get(srcMask->getType()), maskC); + + return B.CreateShuffleVector(shuffleA->getOperand(0), + shuffleA->getOperand(1), newMask, name); + } + + auto *shuffleB = dyn_cast(srcB); + + if (shuffleA && shuffleB) { + auto *const shuffleSrcA = shuffleA->getOperand(0); + auto *const shuffleSrcB = shuffleA->getOperand(1); + + // If we have a shuffle of two shuffles with identical source operands, + // we can just pre-shuffle their masks together. + if (shuffleB->getOperand(0) == shuffleSrcA && + shuffleB->getOperand(1) == shuffleSrcB) { + auto *const srcMaskA = getShuffleMask(shuffleA); + auto *const srcMaskB = getShuffleMask(shuffleB); + auto *const newMask = + ConstantExpr::getShuffleVector(srcMaskA, srcMaskB, maskC); + + return B.CreateShuffleVector(shuffleSrcA, shuffleSrcB, newMask, name); + } + } + + // If either operand is a unary shuffle, we can pull a few more tricks.. + // For instance: + // + // shuffle(shuffle(A, poison, maskA), shuffle(B, poison, maskB), maskC) + // => shuffle(A, B, shuffle(maskA, adjust(maskB), maskC)) + // where "adjust" refers to adjusting the mask values to refer to the second + // source vector by adding the width of the first operand to the indices. + // + // If either source operand is something other than a unary shuffle, we can + // "pretend" it is a NOP shuffle of that operand (i.e. a mask of <0, 1, 2..>) + // and proceed as before, absorbing the unary shuffle from the other operand. + if (shuffleA && !isa(shuffleA->getOperand(1))) { + shuffleA = nullptr; + } + if (shuffleB && !isa(shuffleB->getOperand(1))) { + shuffleB = nullptr; + } + + if (shuffleA || shuffleB) { + // We can absorb one or two unary shuffles into the new shuffle.. + auto *const shuffleAsrc = shuffleA ? shuffleA->getOperand(0) : srcA; + auto *const shuffleBsrc = shuffleB ? shuffleB->getOperand(0) : srcB; + const auto srcASize = + cast(shuffleAsrc->getType())->getNumElements(); + const auto srcBSize = + cast(shuffleBsrc->getType())->getNumElements(); + if (srcASize == srcBSize) { + Constant *srcMaskA = nullptr; + Constant *srcMaskB = nullptr; + + if (shuffleA) { + srcMaskA = getShuffleMask(shuffleA); + } else { + // if one operand is not a shuffle, we can make a pretend shuffle.. + SmallVector newMaskA; + for (unsigned i = 0; i < srcASize; ++i) { + newMaskA.push_back(B.getInt32(i)); + } + srcMaskA = ConstantVector::get(newMaskA); + } + + if (shuffleB) { + auto *const maskB = getShuffleMask(shuffleB); + + // adjust the second mask to refer to the second vector.. + srcMaskB = ConstantExpr::getAdd( + maskB, ConstantVector::getSplat( + multi_llvm::getVectorElementCount(maskB->getType()), + B.getInt32(srcASize))); + } else { + // if one operand is not a shuffle, we can make a pretend shuffle.. 
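+        // E.g. (illustrative) with 4-element sources: if srcB is not a
+        // shuffle, pretend it is shuffle(srcB, poison, <4, 5, 6, 7>), i.e.
+        // an identity mask already adjusted to index the second operand.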
+ SmallVector newMaskB; + for (unsigned i = 0; i < srcBSize; ++i) { + newMaskB.push_back(B.getInt32(i + srcASize)); + } + srcMaskB = ConstantVector::get(newMaskB); + } + + auto *const newMask = + ConstantExpr::getShuffleVector(srcMaskA, srcMaskB, maskC); + + return B.CreateShuffleVector(shuffleAsrc, shuffleBsrc, newMask, name); + } + } + + // No more optimal alternative, just build a new one + return B.CreateShuffleVector(srcA, srcB, maskC, name); +} + +bool createSubSplats(const vecz::TargetInfo &TI, IRBuilder<> &B, + SmallVectorImpl &srcs, unsigned subWidth) { + // Scalable sub-splats must be handled specially. + if (isa(srcs.front()->getType())) { + if (srcs.size() != 1) { + return false; + } + Value *&val = srcs.front(); + val = createFixedBroadcastOfScalableVector( + TI, B, val, ElementCount::getFixed(subWidth)); + return val != nullptr; + } + + auto *const vecTy = dyn_cast(srcs.front()->getType()); + + if (!vecTy) { + return false; + } + + const unsigned srcWidth = vecTy->getNumElements(); + + // Build shuffle mask to widen the vector condition. + SmallVector mask; + for (unsigned i = 0; i < srcWidth; ++i) { + for (unsigned j = 0; j < subWidth; ++j) { + mask.push_back(i); + } + } + + auto *poison = PoisonValue::get(srcs.front()->getType()); + for (auto &src : srcs) { + src = createOptimalShuffle(B, src, poison, mask); + } + return true; +} + +Value *createMaybeVPReduction(IRBuilderBase &B, Value *Val, RecurKind Kind, + Value *VL) { + assert(isa(Val->getType()) && "Must be vector type"); + // If VL is null, it's not a vector-predicated reduction. + if (!VL) { + return createSimpleReduction(B, Val, Kind); + } + auto IntrinsicOp = Intrinsic::not_intrinsic; + switch (Kind) { + default: + break; + case RecurKind::None: + return nullptr; + case RecurKind::Add: + IntrinsicOp = Intrinsic::vp_reduce_add; + break; + case RecurKind::Mul: + IntrinsicOp = Intrinsic::vp_reduce_mul; + break; + case RecurKind::Or: + IntrinsicOp = Intrinsic::vp_reduce_or; + break; + case RecurKind::And: + IntrinsicOp = Intrinsic::vp_reduce_and; + break; + case RecurKind::Xor: + IntrinsicOp = Intrinsic::vp_reduce_xor; + break; + case RecurKind::FAdd: + IntrinsicOp = Intrinsic::vp_reduce_fadd; + break; + case RecurKind::FMul: + IntrinsicOp = Intrinsic::vp_reduce_fmul; + break; + case RecurKind::SMin: + IntrinsicOp = Intrinsic::vp_reduce_smin; + break; + case RecurKind::SMax: + IntrinsicOp = Intrinsic::vp_reduce_smax; + break; + case RecurKind::UMin: + IntrinsicOp = Intrinsic::vp_reduce_umin; + break; + case RecurKind::UMax: + IntrinsicOp = Intrinsic::vp_reduce_umax; + break; + case RecurKind::FMin: + IntrinsicOp = Intrinsic::vp_reduce_fmin; + break; + case RecurKind::FMax: + IntrinsicOp = Intrinsic::vp_reduce_fmax; + break; + } + + auto *const F = Intrinsic::getOrInsertDeclaration( + B.GetInsertBlock()->getModule(), IntrinsicOp, Val->getType()); + assert(F && "Could not declare vector-predicated reduction intrinsic"); + + auto *const VecTy = cast(Val->getType()); + auto *const NeutralVal = + compiler::utils::getNeutralVal(Kind, VecTy->getElementType()); + auto *const Mask = createAllTrueMask(B, VecTy->getElementCount()); + return B.CreateCall(F, {NeutralVal, Val, Mask, VL}); +} + +Value *getGatherIndicesVector(IRBuilder<> &B, Value *Indices, Type *Ty, + unsigned FixedVecElts, const Twine &N) { + auto *const Steps = B.CreateStepVector(Ty); + + const auto EltCount = multi_llvm::getVectorElementCount(Ty); + auto *const ElTy = multi_llvm::getVectorElementType(Ty); + + auto *const FixedVecEltsSplat = + 
+      B.CreateVectorSplat(EltCount, ConstantInt::get(ElTy, FixedVecElts));
+  auto *const StepsMul = B.CreateMul(Steps, FixedVecEltsSplat);
+  return B.CreateAdd(StepsMul, Indices, N);
+}
+
+Value *createAllTrueMask(IRBuilderBase &B, ElementCount EC) {
+  return ConstantInt::getTrue(VectorType::get(B.getInt1Ty(), EC));
+}
+
+Value *createIndexSequence(IRBuilder<> &Builder, VectorType *VecTy,
+                           const Twine &Name) {
+  auto EC = VecTy->getElementCount();
+  if (EC.isScalable()) {
+    // FIXME: This intrinsic works on fixed-length types too: should we
+    // migrate to using it starting from LLVM 13?
+    return Builder.CreateStepVector(VecTy, Name);
+  }
+
+  SmallVector<Constant *> Indices;
+  auto *EltTy = VecTy->getElementType();
+  for (unsigned i = 0, e = EC.getFixedValue(); i != e; i++) {
+    Indices.push_back(ConstantInt::get(EltTy, i));
+  }
+  return ConstantVector::get(Indices);
+}
+
+} // namespace vecz
+
+PacketRange PacketInfo::getRange(std::vector<Value *> &d,
+                                 unsigned width) const {
+  auto found = packets.find(width);
+  if (found != packets.end()) {
+    return PacketRange(d, found->second, width);
+  } else {
+    return PacketRange(d);
+  }
+}
+
+Value *Packetizer::Result::getAsValue() const {
+  if (!scalar || !info) {
+    return nullptr;
+  }
+
+  if (info->vector) {
+    return info->vector;
+  }
+
+  const auto numInstances = info->numInstances;
+  if (numInstances == 0) {
+    return broadcast(1).info->vector;
+  }
+
+  const auto packet = getRange(numInstances);
+  assert(packet && "Packet doesn't exist when it should");
+
+  // If the instantiator broadcast the value, it will have set its own packet,
+  // so we fix that here.
+  bool splat = true;
+  for (auto *v : packet) {
+    if (v != scalar) {
+      splat = false;
+      break;
+    }
+  }
+
+  if (splat) {
+    info->numInstances = 0;
+    return broadcast(1).info->vector;
+  }
+
+  Type *const eleTy = packet.front()->getType();
+  assert(!eleTy->isVoidTy() && "Should not be getting a vector of voids");
+
+  auto name = scalar->getName();
+
+  if (FixedVectorType::isValidElementType(eleTy)) {
+    Value *gather =
+        PoisonValue::get(FixedVectorType::get(eleTy, packet.size()));
+
+    IRBuilder<> B(buildAfter(packet.back(), packetizer.F));
+    for (unsigned i = 0; i < packet.size(); i++) {
+      gather = B.CreateInsertElement(gather, packet.at(i), B.getInt32(i),
+                                     Twine(name, ".gather"));
+    }
+    info->vector = gather;
+  } else if (eleTy->isVectorTy()) {
+    // Gather an instantiated vector by concatenating all the lanes.
+    auto parts = narrow(2);
+    auto *vecTy = cast<FixedVectorType>(parts.front()->getType());
+    const unsigned fullWidth = vecTy->getNumElements() * 2;
+
+    SmallVector<int> mask;
+    for (size_t j = 0; j < fullWidth; ++j) {
+      mask.push_back(j);
+    }
+
+    IRBuilder<> B(buildAfter(parts[1], packetizer.F));
+    info->vector = B.CreateShuffleVector(parts[0], parts[1], mask,
+                                         Twine(name, ".concatenate"));
+  } else {
+    Value *gather = PoisonValue::get(ArrayType::get(eleTy, packet.size()));
+
+    IRBuilder<> B(buildAfter(packet.back(), packetizer.F));
+    for (unsigned i = 0; i < packet.size(); i++) {
+      gather =
+          B.CreateInsertValue(gather, packet.at(i), i, Twine(name, ".gather"));
+    }
+    info->vector = gather;
+  }
+  return info->vector;
+}
+
+PacketRange Packetizer::Result::getAsPacket(unsigned width) const {
+  if (!scalar || !info) {
+    return PacketRange(packetizer.packetData);
+  }
+
+  if (const auto range = getRange(width)) {
+    return range;
+  }
+
+  auto numInstances = info->numInstances;
+  if (numInstances == 0) {
+    return broadcast(width).getRange(width);
+  }
+
+  if (numInstances != 1) {
+    if (numInstances < width) {
+      return widen(width);
+    } else if (numInstances > width) {
+      return narrow(width);
+    } else {
+      assert(false &&
+             "Supposedly unreachable condition in Packetizer::Result");
+    }
+  }
+
+  if (!info->vector) {
+    return PacketRange(packetizer.packetData);
+  }
+
+  auto packet = createPacket(width);
+
+  Value *vec = info->vector;
+  if (auto *const vecTy = dyn_cast<VectorType>(vec->getType())) {
+    assert(isa<FixedVectorType>(vecTy) && "Must be a fixed vector type here!");
+    const unsigned scalarWidth =
+        cast<FixedVectorType>(vecTy)->getNumElements() / width;
+    if (scalarWidth > 1 || scalar->getType()->isVectorTy()) {
+      auto *const poison = PoisonValue::get(vec->getType());
+
+      // Build shuffle mask to perform the subvector extracts.
+      IRBuilder<> B(buildAfter(vec, packetizer.F));
+      for (size_t i = 0, k = 0; i < width; ++i) {
+        SmallVector<int> mask;
+        for (size_t j = 0; j < scalarWidth; ++j, ++k) {
+          mask.push_back(k);
+        }
+        packet[i] = createOptimalShuffle(B, vec, poison, mask,
+                                         Twine(scalar->getName(), ".split"));
+      }
+    } else {
+      IRBuilder<> B(buildAfter(vec, packetizer.F));
+      for (unsigned i = 0; i < width; i++) {
+        packet[i] = B.CreateExtractElement(vec, B.getInt32(i));
+      }
+    }
+  } else {
+    // Check the type itself here, since vecTy is null on this branch.
+    assert(isa<ArrayType>(vec->getType()) && "Must be an array here!");
+    IRBuilder<> B(buildAfter(vec, packetizer.F));
+    for (unsigned i = 0; i < width; i++) {
+      packet[i] = B.CreateExtractValue(vec, i);
+    }
+  }
+  return packet;
+}
+
+void Packetizer::Result::getPacketValues(
+    SmallVectorImpl<Value *> &vals) const {
+  assert(info && "No packet info for this packetization result");
+  const auto width = info->numInstances;
+  if (width != 0) {
+    getPacketValues(width, vals);
+  }
+}
+
+void Packetizer::Result::getPacketValues(
+    unsigned width, SmallVectorImpl<Value *> &vals) const {
+  assert(width != 0 && "Can't get a zero width packet");
+  if (width == 1) {
+    if (auto *const val = getAsValue()) {
+      vals.push_back(val);
+    }
+  } else {
+    auto p = getAsPacket(width);
+    vals.assign(p.begin(), p.end());
+  }
+}
+
+PacketRange Packetizer::Result::createPacket(unsigned width) const {
+  assert(info && "Can't create a packet on a fail state");
+  assert(!info->packets.contains(width) &&
+         "Shouldn't create the same packet twice");
+
+  const auto start = packetizer.packetData.size();
+  packetizer.packetData.resize(start + width, nullptr);
+  info->packets[width] = start;
+  return PacketRange(packetizer.packetData, start, width);
+}
+
+PacketRange Packetizer::Result::getRange(unsigned width) const {
+  return info->getRange(packetizer.packetData, width);
+}
+
+// Makes a wider packet by splitting the sub-vectors.
+PacketRange Packetizer::Result::widen(unsigned width) const {
+  const auto numInstances = info->numInstances;
+  const auto parts = getRange(numInstances);
+  auto *const vecTy = dyn_cast<FixedVectorType>(parts.front()->getType());
+  assert(vecTy && "Expected a fixed vector type");
+
+  auto packet = createPacket(width);
+  const auto origWidth = vecTy->getNumElements();
+  const auto newWidth = (origWidth * numInstances) / width;
+  const auto name = scalar->getName();
+
+  auto *it = parts.begin();
+  IRBuilder<> B(buildAfter(parts.back(), packetizer.F));
+  if (newWidth > 1) {
+    auto *const poison = PoisonValue::get(vecTy);
+
+    // Build shuffle mask to perform the subvector extracts.
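+    // Worked example (illustrative): widening 2 instances of <4 x i32> to
+    // width 4 gives newWidth == 2, so the masks <0,1> and <2,3> split each
+    // original part into two sub-vectors.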
+    for (size_t i = 0, origIdx = 0; i < width; ++i) {
+      if (origIdx == origWidth) {
+        origIdx = 0;
+        ++it;
+      }
+      SmallVector<int> mask;
+      for (size_t j = 0; j < newWidth; ++j, ++origIdx) {
+        mask.push_back(origIdx);
+      }
+      packet[i] =
+          createOptimalShuffle(B, *it, poison, mask, Twine(name, ".split"));
+    }
+  } else {
+    for (size_t i = 0, origIdx = 0; i < width; ++i, ++origIdx) {
+      if (origIdx == origWidth) {
+        origIdx = 0;
+        ++it;
+      }
+      packet[i] = B.CreateExtractElement(*it, B.getInt32(origIdx),
+                                         Twine(name, ".split"));
+    }
+  }
+  return packet;
+}
+
+// Makes a narrower packet by concatenating the sub-vectors.
+PacketRange Packetizer::Result::narrow(unsigned width) const {
+  if (const auto range = getRange(width)) {
+    return range;
+  }
+
+  // Narrow recursively.
+  const auto parts = narrow(width * 2);
+  assert(parts && "Error during packet narrowing");
+
+  auto packet = createPacket(width);
+  auto *const ty = parts.front()->getType();
+  auto *const vecTy = dyn_cast<FixedVectorType>(ty);
+  if (!vecTy) {
+    // Build vectors out of pairs of scalar values.
+    const auto name = scalar->getName();
+    IRBuilder<> B(buildAfter(parts.back(), packetizer.F));
+    Value *poison = PoisonValue::get(FixedVectorType::get(ty, 2));
+    for (size_t i = 0, pairIdx = 0; i < width; ++i, pairIdx += 2) {
+      Value *in = B.CreateInsertElement(poison, parts[pairIdx], B.getInt32(0),
+                                        Twine(name, ".gather"));
+      packet[i] = B.CreateInsertElement(in, parts[pairIdx + 1], B.getInt32(1),
+                                        Twine(name, ".gather"));
+    }
+    return packet;
+  }
+
+  const unsigned fullWidth = vecTy->getNumElements() * 2;
+
+  SmallVector<int> mask;
+  for (size_t j = 0; j < fullWidth; ++j) {
+    mask.push_back(j);
+  }
+
+  // Build wider vectors by concatenating pairs of sub-vectors.
+  const auto name = scalar->getName();
+  IRBuilder<> B(buildAfter(parts.back(), packetizer.F));
+  for (size_t i = 0, pairIdx = 0; i < width; ++i, pairIdx += 2) {
+    packet[i] = createOptimalShuffle(B, parts[pairIdx], parts[pairIdx + 1],
+                                     mask, Twine(name, ".concatenate"));
+  }
+  return packet;
+}
+
+namespace {
+// This helper creates the following sequence to broadcast a fixed-length
+// vector to a scalable one, or to broadcast a scalable vector by a fixed
+// amount, barring any optimizations we can perform for broadcasting a splat
+// vector.
+// The general idea is first to store the subvector to a stack 'alloca', then
+// use a gather operation with a vector of pointers created using a step vector
+// modulo the fixed amount.
+// Note that other sequences are possible, such as a series of blend
+// operations. This could perhaps be a target choice.
+Value *scalableBroadcastHelper(Value *subvec, ElementCount factor,
+                               const vecz::TargetInfo &TI, IRBuilder<> &B,
+                               bool URem) {
+  auto *ty = subvec->getType();
+  const auto subVecEltCount = multi_llvm::getVectorElementCount(ty);
+  assert(subVecEltCount.isScalable() ^ factor.isScalable() &&
+         "Must either broadcast fixed vector by scalable factor or scalable "
+         "vector by fixed factor");
+  auto *const wideTy = getWideType(ty, factor);
+  auto wideEltCount = multi_llvm::getVectorElementCount(wideTy);
+
+  // If this vector is a constant splat, just splat it to the wider scalable
+  // type.
+  if (auto *const cvec = dyn_cast<Constant>(subvec)) {
+    if (auto *const splat = cvec->getSplatValue()) {
+      return ConstantVector::getSplat(wideEltCount, splat);
+    }
+  }
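+  // Illustrative sketch of the general (non-splat) gather emitted below,
+  // e.g. broadcasting a fixed <2 x i32> subvector <a,b> by a scalable
+  // factor: the wide result is loaded through a vector of pointers formed
+  // from a step vector, using either the remainder (URem) or the quotient
+  // of the step by the fixed amount, yielding <a,b,a,b,...> or
+  // <a,...,a,b,...,b> respectively (the exact layout is the target's
+  // broadcast flavour).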
+  // Or if it's a splat value, re-splat it. Note we do Constants separately
+  // above as it generates more canonical code, e.g., a splat of 0 becomes
+  // zeroinitializer rather than an insertelement/shufflevector sequence.
+  if (const auto *const splat = getSplatValue(subvec)) {
+    return B.CreateVectorSplat(wideEltCount, const_cast<Value *>(splat));
+  }
+
+  // Compiler support for masked.gather on i1 vectors is lacking, so emit this
+  // operation as the equivalent i8 vector instead.
+  const bool upcast_i1_as_i8 = ty->getScalarType()->isIntegerTy(1);
+  if (upcast_i1_as_i8) {
+    auto *const int8Ty = Type::getInt8Ty(B.getContext());
+    ty = llvm::VectorType::get(int8Ty, subVecEltCount);
+    subvec = B.CreateSExt(subvec, ty);
+  }
+
+  Value *gather =
+      URem ? TI.createOuterScalableBroadcast(B, subvec, /*VL*/ nullptr, factor)
+           : TI.createInnerScalableBroadcast(B, subvec, /*VL*/ nullptr, factor);
+
+  // If we've been performing this broadcast as i8, now's the time to truncate
+  // back down to i1.
+  if (upcast_i1_as_i8) {
+    gather = B.CreateTrunc(gather, wideTy);
+  }
+
+  return gather;
+}
+} // namespace
+
+const Packetizer::Result &Packetizer::Result::broadcast(unsigned width) const {
+  const auto factor = packetizer.width().divideCoefficientBy(width);
+  auto *const ty = scalar->getType();
+  assert(!ty->isVoidTy() && "Should not be broadcasting a void type");
+
+  if (width != 1 && !factor.isScalable() && factor.getFixedValue() == 1) {
+    // Pure instantiation broadcast.
+    for (auto &v : createPacket(width)) {
+      v = scalar;
+    }
+    return *this;
+  }
+
+  auto &F = packetizer.F;
+  Value *result = nullptr;
+  const auto &TI = packetizer.context().targetInfo();
+  if (isa<PoisonValue>(scalar)) {
+    result = PoisonValue::get(getWideType(ty, factor));
+  } else if (isa<UndefValue>(scalar)) {
+    result = PoisonValue::get(getWideType(ty, factor));
+  } else if (ty->isVectorTy() && factor.isScalable()) {
+    IRBuilder<> B(buildAfter(scalar, F));
+    result = createScalableBroadcastOfFixedVector(TI, B, scalar, factor);
+  } else if (ty->isVectorTy()) {
+    auto *const vecTy = cast<FixedVectorType>(ty);
+    const unsigned scalarWidth = vecTy->getNumElements();
+
+    const unsigned simdWidth = factor.getFixedValue();
+
+    // Build shuffle mask to perform the splat.
+    SmallVector<int> mask;
+    for (size_t i = 0; i < simdWidth; ++i) {
+      for (size_t j = 0; j < scalarWidth; ++j) {
+        mask.push_back(j);
+      }
+    }
+
+    IRBuilder<> B(buildAfter(scalar, packetizer.F));
+    result = createOptimalShuffle(B, scalar, PoisonValue::get(ty), mask,
+                                  Twine(scalar->getName(), ".broadcast"));
+  } else if (auto *const C = dyn_cast<Constant>(scalar)) {
+    result = ConstantVector::getSplat(factor, C);
+  } else {
+    IRBuilder<> B(buildAfter(scalar, packetizer.F));
+    result = B.CreateVectorSplat(factor, scalar);
+  }
+
+  if (!result) {
+    // Failed to broadcast this value; return the empty result.
+    return *this;
+  }
+
+  if (width == 1) {
+    info->vector = result;
+  } else {
+    for (auto &v : createPacket(width)) {
+      v = result;
+    }
+  }
+  return *this;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
new file mode 100644
index 0000000000000..e45e2d91bf9d5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
@@ -0,0 +1,80 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/packetization_pass.h"
+
+#include <llvm/ADT/Statistic.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/raw_ostream.h>
+
+#include "analysis/control_flow_analysis.h"
+#include "analysis/divergence_analysis.h"
+#include "analysis/simd_width_analysis.h"
+#include "analysis/stride_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "transform/packetizer.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_target_info.h"
+
+#define DEBUG_TYPE "vecz-packetization"
+
+using namespace vecz;
+using namespace llvm;
+
+STATISTIC(VeczPacketizeFail,
+          "Number of kernels that failed to packetize [ID#P80]");
+STATISTIC(VeczSimdAnalysisFail, "Number of kernels that SIMD Width Analysis "
+                                "suggested not to packetize [ID#P81]");
+
+char PacketizationPass::PassID = 0;
+
+PreservedAnalyses PacketizationPass::run(Function &F,
+                                         llvm::FunctionAnalysisManager &AM) {
+  VectorizationUnit &VU = AM.getResult<VectorizationUnitAnalysis>(F).getVU();
+
+  if (!VU.width().isScalable()) {
+    const unsigned SimdWidth = VU.width().getFixedValue();
+    if (VU.autoWidth() && VU.context().targetInfo().getTargetMachine()) {
+      LLVM_DEBUG(dbgs() << "vecz: Original SIMD width: " << SimdWidth << "\n");
+      const unsigned NewSimdWidth = AM.getResult<SimdWidthAnalysis>(F).value;
+      LLVM_DEBUG(dbgs() << "vecz: Re-determined SIMD width: " << NewSimdWidth
+                        << "\n");
+
+      if (NewSimdWidth <= 1u) {
+        ++VeczSimdAnalysisFail;
+        return VU.setFailed("SIMD Width Analysis suggested not to packetize");
+      }
+
+      if (NewSimdWidth < SimdWidth) {
+        VU.setWidth(ElementCount::getFixed(NewSimdWidth));
+      }
+    }
+  }
+
+  if (!Packetizer::packetize(F, AM, VU.width(), VU.dimension())) {
+    ++VeczPacketizeFail;
+    return VU.setFailed("packetization failed");
+  }
+
+  PreservedAnalyses Preserved;
+  Preserved.preserve<CFGAnalysis>();
+  Preserved.preserve<DivergenceAnalysis>();
+  Preserved.preserve<VectorizationUnitAnalysis>();
+  Preserved.preserve<VectorizationContextAnalysis>();
+  return Preserved;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
new file mode 100644
index 0000000000000..5e0a1fbc7e12e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -0,0 +1,4050 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/packetizer.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "analysis/instantiation_analysis.h" +#include "analysis/packetization_analysis.h" +#include "analysis/stride_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "llvm_helpers.h" +#include "memory_operations.h" +#include "transform/instantiation_pass.h" +#include "transform/packetization_helpers.h" +#include "vectorization_context.h" +#include "vectorization_unit.h" +#include "vecz/vecz_choices.h" +#include "vecz/vecz_target_info.h" + +#define DEBUG_TYPE "vecz-packetization" + +using namespace vecz; +using namespace llvm; + +STATISTIC(VeczPacketized, "Number of instructions packetized [ID#P00]"); +STATISTIC(VeczPacketizeFailCall, + "Packetize: missing function declarations [ID#P81]"); +STATISTIC(VeczPacketizeFailType, + "Packetize: inconsistent vector parameters [ID#P87]"); +STATISTIC(VeczPacketizeFailPtr, + "Packetize: inconsistent pointer parameters [ID#P88]"); +STATISTIC(VeczPacketizeFailStride, + "Packetize: non-constant strides in pointer parameters [ID#P8A]"); + +// Just a little macro that can return an empty SmallVector, as a drop-in +// replacement for VECZ_FAIL_IF.. +#define PACK_FAIL_IF(cond) \ + do { \ + if (cond) { \ + return {}; \ + } \ + } while (false) + +namespace { +// Returns a type equivalent to the input type plus padding. +// This converts a <3 x Ty> into a <4 x Ty>, leaving other types unchanged. +Type *getPaddedType(Type *Ty) { + if (auto *VecTy = dyn_cast(Ty)) { + if (VecTy->getNumElements() == 3) { + return VectorType::get(VecTy->getElementType(), + ElementCount::getFixed(4)); + } + } + return Ty; +} +} // namespace + +using ValuePacket = SmallVector; + +/// @brief Private implementation of the Packetizer. +/// It inherits its own outer class, which has only private constructors. This +/// allows us to pass it by reference to functions that need to access the +/// Packetizer, while also ensuring that a Packetizer cannot be created except +/// as the base class of its own implementation. +class Packetizer::Impl : public Packetizer { +public: + Impl(llvm::Function &F, llvm::FunctionAnalysisManager &AM, ElementCount Width, + unsigned Dim); + Impl() = delete; + Impl(const Packetizer &) = delete; + Impl(Packetizer &&) = delete; + ~Impl(); + + bool packetize(); + + /// @brief Handle packetization failure. This method ensures that + /// packetization failure does not leave behind invalid IR. + void onFailure(); + + /// @brief Packetize the given value from the function. + /// + /// @param[in] V Value to packetize. + /// + /// @return Packetized value. + Result packetize(Value *V); + + /// @brief Packetize the given value and return the packet by values + /// + /// @param[in] V Value to packetize. + /// + /// @return Packetized values. + ValuePacket packetizeAndGet(Value *V); + + /// @brief Packetize the given value to a specified packet width, and return + /// the packet by values + /// + /// @param[in] V Value to packetize. + /// @param[in] Width the requested packet width + /// + /// @return Packetized values. 
+ ValuePacket packetizeAndGet(Value *V, unsigned Width); + + /// @brief Helper to produce a Result from a Packet + Packetizer::Result + getPacketizationResult(Instruction *I, const SmallVectorImpl &Packet, + bool UpdateStats = false); + + /// @brief Packetize the given value from the function, only if it is a + /// varying value. Ensures Mask Varying values are handled correctly. + /// + /// @param[in] V Value to packetize. + /// + /// @return Packetized value if varying, or the original value if Uniform. + Value *packetizeIfVarying(Value *V); + + /// @brief Packetize a uniform value by broadcasting to all vector lanes. + /// + /// @param[in] V Value to broadcast + /// + /// @return Packetized instruction + Result broadcast(Value *V); + /// @brief Reduce a varying boolean condition to a scalar + /// + /// @param[in] cond Condition to packetize. + /// @param[in] terminator Terminator instruction. + /// @param[in] allOf Whether to create a all of mask, or any of. + /// + /// @return reduced boolean value. + Value *reduceBranchCond(Value *cond, Instruction *terminator, bool allOf); + /// @brief Compute the ideal packet width for subwidening the given type + /// + /// @param[in] ty Type of the value to subwiden + /// @param[in] limit The maximum vector width we allow + /// + /// @return width of the packet to create + unsigned getPacketWidthForType(Type *ty, unsigned limit = ~0u) const; + /// @brief Packetize an instruction. + /// + /// @param[in] Ins Instruction to packetize. + /// + /// @return Packetized instructions. + Result packetizeInstruction(Instruction *Ins); + /// @brief Packetize a mask-varying instruction. + /// + /// @param[in] I Instruction to packetize. + /// + /// @return Packetized instruction. + Value *packetizeMaskVarying(Instruction *I); + /// @brief Packetize a mask-varying subgroup/workgroup reduction. + /// + /// @param[in] I Instruction to packetize. + /// + /// @return Packetized instruction. + Value *packetizeGroupReduction(Instruction *I); + /// @brief Packetize a subgroup/workgroup broadcast. + /// + /// @param[in] I Instruction to packetize. + /// + /// @return Packetized instruction. + Value *packetizeGroupBroadcast(Instruction *I); + /// @brief Returns true if the instruction is any subgroup shuffle. + /// + /// @param[in] I Instruction to query. + /// + /// @return The group collective data if the instruction is a call to any of + /// the mux subgroup shuffle builtins; std::nullopt otherwise. + std::optional + isSubgroupShuffleLike(Instruction *I); + /// @brief Packetize a sub-group shuffle builtin + /// + /// Note - not any shuffle-like operation, but specifically the 'shuffle' + /// builtin. + /// + /// @param[in] Ins Instruction to packetize. + /// + /// @return Packetized instructions. + Value *packetizeSubgroupShuffle(Instruction *Ins); + /// @brief Packetize a sub-group shuffle-xor builtin + /// + /// Note - not any shuffle-like operation, but specifically the 'shuffle_xor' + /// builtin. + /// + /// @param[in] Ins Instruction to packetize. + /// @param[in] ShuffleXor Shuffle to packetize. + /// + /// @return Packetized instructions. + Result + packetizeSubgroupShuffleXor(Instruction *Ins, + compiler::utils::GroupCollective ShuffleXor); + /// @brief Packetize a sub-group shuffle-up or shuffle-down builtin + /// + /// Note - not any shuffle-like operation, but specifically the 'shuffle_up' + /// and 'shuffle_down' builtins. + /// + /// @param[in] Ins Instruction to packetize. + /// @param[in] ShuffleUpDown Shuffle to packetize. 
+ /// + /// @return Packetized instructions. + Result packetizeSubgroupShuffleUpDown( + Instruction *Ins, compiler::utils::GroupCollective ShuffleUpDown); + + /// @brief Packetize PHI node. + /// + /// @param[in] Phi PHI Node to packetize. + /// + /// @return Packetized values. + ValuePacket packetizePHI(PHINode *Phi); + /// @brief Packetize a call instruction. + /// + /// @param[in] CI Call Instruction to packetize. + /// + /// @return Packetized values. + ValuePacket packetizeCall(CallInst *CI); + /// @brief Packetize a subgroup/workgroup scan. + /// + /// @param[in] CI CallInst to packetize. + /// @param[in] Scan type of scan to packetized. + /// + /// @return Packetized values. + ValuePacket packetizeGroupScan(CallInst *CI, + compiler::utils::GroupCollective Scan); + /// @brief Perform post-packetization tasks for the given scalar value. + /// + /// @param[in] Scalar Scalar value to assign a vectorized value. + /// @param[in] Vectorized Packetized value to assign. + /// + /// @return Packetized values. + Result assign(Value *Scalar, Value *Vectorized); + /// @brief Vectorize an instruction. + /// + /// @param[in] Ins Instruction to packetize. + /// + /// @return Packetized instruction. + Value *vectorizeInstruction(Instruction *Ins); + /// @brief Packetize a load instruction. + /// + /// @param[in] Load Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeLoad(LoadInst *Load); + /// @brief Packetize a store instruction. + /// + /// @param[in] Store Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeStore(StoreInst *Store); + /// @brief Packetize a memory operation. + /// + /// @param[in] Op Memory operation to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeMemOp(MemOp &Op); + /// @brief Packetize a masked atomicrmw or cmpxchg operation. + /// + /// @param[in] CI Masked atomic builtin call to packetize. + /// @param[in] AtomicInfo Information about the masked atomic. + /// + /// @return Packetized instruction. + ValuePacket + packetizeMaskedAtomic(CallInst &CI, + VectorizationContext::MaskedAtomic AtomicInfo); + /// @brief Packetize a GEP instruction. + /// + /// @param[in] GEP Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeGEP(GetElementPtrInst *GEP); + /// @brief Packetize a cast instruction. + /// + /// @param[in] CastI Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeCast(CastInst *CastI); + /// @brief Packetize a binary operator instruction. + /// + /// @param[in] BinOp Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeBinaryOp(BinaryOperator *BinOp); + /// @brief Packetize a freeze instruction. + /// + /// @param[in] FreezeI Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeFreeze(FreezeInst *FreezeI); + /// @brief Packetize an atomic cmpxchg instruction. + /// + /// @param[in] AtomicI Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeAtomicCmpXchg(AtomicCmpXchgInst *AtomicI); + /// @brief Packetize a unary operator instruction. + /// + /// @param[in] UnOp Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeUnaryOp(UnaryOperator *UnOp); + /// @brief Packetize an integer compare instruction. + /// + /// @param[in] Cmp Instruction to packetize. + /// + /// @return Packetized instruction. 
+ ValuePacket packetizeICmp(ICmpInst *Cmp); + /// @brief Packetize a floating-point compare instruction. + /// + /// @param[in] Cmp Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeFCmp(FCmpInst *Cmp); + /// @brief Packetize a select instruction. + /// + /// @param[in] Select Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeSelect(SelectInst *Select); + /// @brief Packetize a return instruction. + /// + /// @param[in] Return Instruction to packetize. + /// + /// @return Packetized instruction. + Value *vectorizeReturn(ReturnInst *Return); + /// @brief Packetize a call instruction. + /// + /// @param[in] CI Instruction to packetize. + /// + /// @return Packetized instruction. + Value *vectorizeCall(CallInst *CI); + /// @brief Packetize a call to a work-group builtin. + /// + /// @param[in] CI Instruction to packetize. + /// @param[in] Builtin Builtin identifier. + /// + /// @return Packetized instruction. + Value *vectorizeWorkGroupCall(CallInst *CI, + const compiler::utils::BuiltinCall &Builtin); + /// @brief Packetize an alloca instruction. + /// + /// @param[in] Alloca Instruction to packetize. + /// + /// @return Packetized instruction. + Value *vectorizeAlloca(AllocaInst *Alloca); + /// @brief Packetize an extract value instruction. + /// + /// @param[in] ExtractElement Instruction to packetize. + /// + /// @return Packetized instruction. + Value *vectorizeExtractValue(ExtractValueInst *ExtractElement); + /// @brief Packetize an insert element instruction. + /// + /// @param[in] InsertElement Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeInsertElement(InsertElementInst *InsertElement); + /// @brief Packetize an extract element instruction. + /// + /// @param[in] ExtractElement Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeExtractElement(ExtractElementInst *ExtractElement); + /// @brief Packetize an insert value instruction. + /// + /// Only packetizes inserts into literal struct types. + /// + /// @param[in] InsertValue Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeInsertValue(InsertValueInst *InsertValue); + /// @brief Packetize an extract value instruction. + /// + /// Only packetizes extracts from literal struct types. + /// + /// @param[in] ExtractValue Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeExtractValue(ExtractValueInst *ExtractValue); + /// @brief Packetize a shuffle vector instruction. + /// + /// @param[in] Shuffle Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeShuffleVector(ShuffleVectorInst *Shuffle); + /// @brief Preserves debug information attached to old scalar instruction, + /// updating the debug info type to match the vector width. + /// + /// @param[in] Scalar Scalar instruction before packetization. + /// @param[in] Packet Packetized instruction. + void vectorizeDI(Instruction *Scalar, Value *Packet); + + /// @brief Helps handle instructions that cannot be packetized. + std::unique_ptr Instantiator; + + /// @brief List of phi nodes that can be used by passes to defer the + /// processing of these nodes. 
+  std::vector<PHINode *> pendingPhis;
+
+  /// @brief The target transform info
+  const TargetTransformInfo TTI;
+};
+
+Packetizer::Packetizer(llvm::Function &F, llvm::FunctionAnalysisManager &AM,
+                       ElementCount Width, unsigned Dim)
+    : AM(AM), VU(AM.getResult<VectorizationUnitAnalysis>(F).getVU()),
+      Ctx(AM.getResult<VectorizationContextAnalysis>(F).getContext()),
+      Choices(VU.choices()), UVR(AM.getResult<UniformValueAnalysis>(F)),
+      SAR(AM.getResult<StrideAnalysis>(F)),
+      PAR(AM.getResult<PacketizationAnalysis>(F)), F(F), SimdWidth(Width),
+      Dimension(Dim) {}
+
+Packetizer::Impl::Impl(llvm::Function &F, llvm::FunctionAnalysisManager &AM,
+                       ElementCount Width, unsigned Dim)
+    : Packetizer(F, AM, Width, Dim), TTI(Ctx.getTargetTransformInfo(F)) {
+  Instantiator.reset(new InstantiationPass(*this));
+}
+
+Packetizer::Impl::~Impl() = default;
+
+bool Packetizer::packetize(llvm::Function &F, llvm::FunctionAnalysisManager &AM,
+                           ElementCount Width, unsigned Dim) {
+  Impl impl(F, AM, Width, Dim);
+  const bool Res = impl.packetize();
+  if (!Res) {
+    impl.onFailure();
+  }
+  return Res;
+}
+
+bool Packetizer::Impl::packetize() {
+  LLVM_DEBUG(if (PAR.isEmpty()) {
+    llvm::dbgs() << "No vector leaves in function "
+                 << VU.scalarFunction()->getName() << "\n";
+  });
+
+  // If requested, set up the base vector length for this kernel based on the
+  // number of remaining work items: the local size minus the local id. Since
+  // VP intrinsics are undefined for %evl values larger than the actual vector
+  // width, we also constrain it based on the vectorization width.
+  BasicBlock &EntryBB = F.getEntryBlock();
+  IRBuilder<> B(&*EntryBB.getFirstInsertionPt());
+
+  if (Choices.vectorPredication()) {
+    auto &M = *F.getParent();
+    auto *const I32Ty = Type::getInt32Ty(F.getContext());
+    auto *const LocalIdFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+        compiler::utils::eMuxBuiltinGetLocalId, M);
+    auto *const LocalSizeFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+        compiler::utils::eMuxBuiltinGetLocalSize, M);
+    assert(LocalIdFn && LocalSizeFn && "Unable to create mux builtins");
+    auto *const ID =
+        B.CreateCall(LocalIdFn, B.getInt32(VU.dimension()), "local.id");
+    ID->setAttributes(LocalIdFn->getAttributes());
+    ID->setCallingConv(LocalIdFn->getCallingConv());
+    auto *const Size =
+        B.CreateCall(LocalSizeFn, B.getInt32(VU.dimension()), "local.size");
+    Size->setAttributes(LocalSizeFn->getAttributes());
+    Size->setCallingConv(LocalSizeFn->getCallingConv());
+    VECZ_FAIL_IF(!ID || !Size);
+
+    VL = B.CreateSub(Size, ID, "work.remaining", /*HasNUW*/ true,
+                     /*HasNSW*/ true);
+
+    if (auto *RVVVL = Ctx.targetInfo().createVPKernelWidth(
+            B, VL, /*WidestType*/ 32, VU.width())) {
+      VL = RVVVL;
+    } else {
+      auto *const VectorLength =
+          B.CreateElementCount(VL->getType(), VU.width());
+      VL = B.CreateIntrinsic(Intrinsic::umin, {VL->getType()},
+                             {VL, VectorLength});
+
+      VL = B.CreateTrunc(VL, I32Ty);
+    }
+  }
+
+  // Manifest the memory operation stride values as actual `llvm::Value`s.
+  SAR.manifestAll(B);
+
+  // Pre-process the arguments first to replace any placeholders with their
+  // proper vector values, and convert pointer return arguments to vector of
+  // pointers where required.
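+  // For illustration (hypothetical IR): a pointer-return argument %arg with
+  // pointee type i32 at a fixed width of 4 is rewritten through
+  //   %index.vec = <i32 0, i32 1, i32 2, i32 3>
+  //   %gep = getelementptr i32, ptr %arg, <4 x i32> %index.vec
+  // so that each lane sees its own return slot.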
+ { + Value *idxVector = nullptr; + for (const auto &TargetArg : VU.arguments()) { + if (auto *const Placeholder = TargetArg.Placeholder) { + auto &info = packets[Placeholder]; + info.vector = TargetArg.NewArg; + info.numInstances = 1; + } else if (TargetArg.PointerRetPointeeTy && + PAR.needsPacketization(TargetArg.NewArg)) { + if (!idxVector) { + idxVector = createIndexSequence( + B, VectorType::get(B.getInt32Ty(), SimdWidth), "index.vec"); + } + + // This implementation looks unlikely to be correct, but for + // now we just maintain the original behaviour, until we have a better + // idea of what is going on or whether any of this is still needed. + // This case will never be encountered during kernel vectorization. + auto *const Arg = TargetArg.NewArg; + auto *const EleTy = TargetArg.PointerRetPointeeTy; + auto &info = packets[Arg]; + info.vector = B.CreateGEP(EleTy, Arg, idxVector); + info.numInstances = 1; + } + } + } + + // Build an ordered list of the instructions to packetize, in depth first + // order so that we don't have to recurse too much. We build the list first + // because packetization of calls can produce loops, which messes up our + // iteration over the basic blocks of the function. + std::vector ordered; + for (auto *BB : depth_first(&F)) { + for (auto &I : *BB) { + if (PAR.needsPacketization(&I)) { + ordered.push_back(&I); + } + } + } + + for (auto *const I : ordered) { + if (!packetize(I)) { + emitVeczRemarkMissed(&F, I, "Could not packetize"); + VECZ_FAIL(); + } + } + + // Packetize remaining phi nodes until they have all been packetized. + // Packetizing one phi node may involve the packetization of another node. + // Some nodes might need to be instantiated instead of being packetized, but + // we are handling this here because the instantiation pass is not run as a + // standalone pass. + // Note: pendingPhis *may* change as we progress through this loop, by + // calling packetize(Incoming). Therefore we can't cache the vector size when + // setting up the loop. + for (unsigned i = 0; i < pendingPhis.size(); i++) { + PHINode *Phi = pendingPhis[i]; + auto &info = packets[Phi]; + assert(info.numInstances > 0 && "A PHI pending packetization has no stub"); + if (info.numInstances == 1) { + auto *NewPhi = cast(info.vector); + for (unsigned i = 0; i < Phi->getNumIncomingValues(); ++i) { + Value *Incoming = Phi->getIncomingValue(i); + BasicBlock *BB = Phi->getIncomingBlock(i); + Value *VecIncoming = packetize(Incoming).getAsValue(); + VECZ_FAIL_IF(!VecIncoming); + NewPhi->addIncoming(VecIncoming, BB); + } + } else { + const auto PhiPacket = info.getRange(packetData); + for (unsigned i = 0; i < Phi->getNumIncomingValues(); ++i) { + Value *Incoming = Phi->getIncomingValue(i); + BasicBlock *BB = Phi->getIncomingBlock(i); + auto PackIncoming = packetize(Incoming).getAsPacket(PhiPacket.size()); + for (unsigned j = 0; j < PhiPacket.size(); ++j) { + auto *NewPhi = cast(PhiPacket.at(j)); + auto *Incoming = PackIncoming.at(j); + VECZ_FAIL_IF(!NewPhi); + VECZ_FAIL_IF(!Incoming); + NewPhi->addIncoming(Incoming, BB); + } + } + } + IC.deleteInstructionLater(Phi); + } + + auto *insertPt = &*EntryBB.begin(); + for (auto &I : EntryBB) { + auto *const alloca = dyn_cast(&I); + if (!alloca) { + insertPt = I.getNextNode(); + continue; + } + + while (isa(insertPt)) { + insertPt = insertPt->getNextNode(); + } + + // It's possible for some uses of the alloca to be packetized and others + // not. 
For instance, where we have a store to a constant address, since + // the execution order of work items is undefined, the data operand need + // not be packetized, and we can end up with uses of the scalar alloca + // still present in the vector function. In such a case we can replace it + // with the first element of the packetized alloca. + if (auto res = getPacketized(alloca)) { + SmallVector vals; + res.getPacketValues(vals); + if (vals.empty()) { + // It is a broadcast value, so we don't need to do anything. + continue; + } + auto *element0 = vals.front(); + + if (!isa(element0)) { + assert(isa(element0) && "vecz: expected GEP"); + auto *const GEP = cast(element0); + // If the alloca was packetized, it will be indexed by a GEP. + // We only need the original, un-indexed pointer. + alloca->replaceAllUsesWith(GEP->getPointerOperand()); + continue; + } + + if (element0->getType()->isVectorTy()) { + B.SetInsertPoint(insertPt); + element0 = B.CreateExtractElement(element0, B.getInt32(0)); + } + alloca->replaceAllUsesWith(element0); + continue; + } + + // We have to widen allocas if they are varying, regardless of the result + // of the packetization analysis, because they need enough storage for all + // lanes, even though they are only accessed through a scalar pointer. + // We do this last, otherwise it messes with the stride analysis etc. + // Only non-instantiated allocas should be left by now. + if (!UVR.isVarying(alloca)) { + continue; + } + // Array allocas need to be instantiated. + assert(!alloca->isArrayAllocation() && + "vecz: unexpected array alloca; should have been instantiated"); + + B.SetInsertPoint(alloca); + auto *const dataTy = alloca->getAllocatedType(); + if (dataTy->isVectorTy() || VectorType::isValidElementType(dataTy)) { + // We can vectorize or vector widen this type. + auto *const newAlloca = + B.CreateAlloca(getWideType(getPaddedType(dataTy), SimdWidth)); + newAlloca->setAlignment(alloca->getAlign()); + newAlloca->takeName(alloca); + + // Absorb other bitcasts (e.g. i8* for lifetime instrinsics, or bitcasts + // back to vector type for contiguous loads/stores) + bool needCast = false; + auto *const newTy = newAlloca->getType(); + for (const Use &U : alloca->uses()) { + auto *const user = dyn_cast(U.getUser()); + if (!user) { + needCast = true; + continue; + } + + auto *const dstTy = user->getType(); + if (dstTy == newTy) { + // Bitcasts totally redundant + user->replaceAllUsesWith(newAlloca); + } else { + // Bitcast into different bitcast + B.SetInsertPoint(user); + user->replaceAllUsesWith(B.CreateBitCast(newAlloca, user->getType())); + } + IC.deleteInstructionLater(cast(user)); + } + + if (needCast) { + // Insert the bitcast after all the allocas + B.SetInsertPoint(insertPt); + auto *const scalarPtr = + B.CreatePointerCast(newAlloca, alloca->getType()); + alloca->replaceAllUsesWith(scalarPtr); + } + } else { + // We couldn't vectorize the type, so create an array instead. + VECZ_FAIL_IF(SimdWidth.isScalable()); + const unsigned fixedWidth = SimdWidth.getFixedValue(); + + AllocaInst *const wideAlloca = + B.CreateAlloca(dataTy, getSizeInt(B, fixedWidth), alloca->getName()); + auto align = alloca->getAlign(); + + // Make sure the alloca has an alignment at least as wide as any of the + // packetized loads or stores using it. 
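+      // E.g. (illustrative): a scalar i32 alloca with align 4 whose users
+      // were packetized into a <4 x i32> store naturally aligned to 16 bytes
+      // must be given align 16 here, or the vector access would be
+      // under-aligned.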
+ SmallVector users; + for (const Use &U : alloca->uses()) { + users.push_back(cast(U.getUser())); + } + while (!users.empty()) { + auto *const user = users.pop_back_val(); + if (isa(user) || isa(user)) { + for (const Use &U : user->uses()) { + users.push_back(cast(U.getUser())); + } + } else if (auto memop = MemOp::get(user)) { + const auto memAlign = memop->getAlignment(); + if (memAlign > align.value()) { + align = Align(memAlign); + } + } + } + + wideAlloca->setAlignment(align); + wideAlloca->takeName(alloca); + + // It's just a direct replacement. + alloca->replaceAllUsesWith(wideAlloca); + } + + // Note that we don't assign the widened allocas a packet, because they + // are not really being packetized. The problem is, a packetized alloca + // would be expected to be a vector of pointers to scalars, not a scalar + // pointer to a vector. Only instantiation can create such a packet. + IC.deleteInstructionLater(alloca); + } + + const compiler::utils::NameMangler Mangler(&F.getContext()); + + // Handle __mux_get_sub_group_size specially (i.e., not in BuiltinInfo) since + // inlining it requires extra vectorization context, such as the vectorization + // width and choices; this inlining is too tightly coupled to the vectorizer + // context to exist in a generic sense. + for (auto &BB : F) { + for (auto &I : BB) { + CallInst *CI = dyn_cast(&I); + if (!CI) { + continue; + } + + auto *const Callee = CI->getCalledFunction(); + if (!Callee) { + continue; + } + auto B = Ctx.builtins().analyzeBuiltin(*Callee); + if (!B) { + continue; + } + if (B->ID == compiler::utils::eMuxBuiltinGetSubGroupSize) { + auto *const replacement = [this](CallInst *CI) -> Value * { + // The vectorized sub-group size is the mux sub-group reduction sum + // of all of the vectorized sub-group sizes: + // | mux 0 | mux 1 | + // | < a,b,c,d > | < e,f,g > (vl=3) | + // The total sub-group size above is 4 + 3 => 7. + // Note that this expects that the mux sub-group consists entirely of + // equivalently vectorized kernels. + Value *VecgroupSize; + IRBuilder<> B(CI); + auto *const I32Ty = B.getInt32Ty(); + if (VL) { + VecgroupSize = VL; + } else { + VecgroupSize = B.CreateElementCount(I32Ty, SimdWidth); + } + assert(VecgroupSize && "Could not determine vector group size"); + + auto *ReduceFn = Ctx.builtins().getOrDeclareMuxBuiltin( + compiler::utils::eMuxBuiltinSubgroupReduceAdd, *F.getParent(), + {I32Ty}); + assert(ReduceFn && "Could not get reduction builtin"); + + return B.CreateCall(ReduceFn, VecgroupSize, "subgroup.size"); + }(CI); + CI->replaceAllUsesWith(replacement); + IC.deleteInstructionLater(CI); + } + } + } + + IC.deleteInstructions(); + return true; +} + +void Packetizer::Impl::onFailure() { + // On failure, clean up pending Phis, which may still be invalid in that they + // have no incoming operands. For simplicity, just erase and replace all of + // them with poison: the failed vectorized function will be removed anyway. + for (auto *Phi : pendingPhis) { + auto &info = packets[Phi]; + assert(info.numInstances > 0 && "A PHI pending packetization has no stub"); + if (info.numInstances == 1) { + IRCleanup::deleteInstructionNow(cast(info.vector)); + } else { + const auto PhiPacket = info.getRange(packetData); + for (unsigned j = 0; j < PhiPacket.size(); ++j) { + IRCleanup::deleteInstructionNow(cast(PhiPacket.at(j))); + } + } + } +} + +Packetizer::Result Packetizer::packetize(Value *V) { + // This is safe because we only ever create an instance of Impl, never an + // instance of the base class. 
+ return static_cast(this)->packetize(V); +} + +Packetizer::Result Packetizer::getPacketized(Value *V) { + auto found = packets.find(V); + auto *info = found != packets.end() ? &found->second : nullptr; + return Packetizer::Result(*this, V, info); +} + +PacketRange Packetizer::createPacket(Value *V, unsigned width) { + auto &info = packets[V]; + info.numInstances = width; + return Result(*this, V, &info).createPacket(width); +} + +Packetizer::Result Packetizer::Impl::getPacketizationResult( + Instruction *I, const SmallVectorImpl &Packet, bool UpdateStats) { + if (Packet.empty()) { + return Result(*this); + } + auto PacketWidth = Packet.size(); + + // If there's only one value in the packet, we can assign the new packetized + // value to the old instruction directly. + if (PacketWidth == 1) { + Value *Vec = Packet.front(); + if (Vec != I) { + // Only delete if the vectorized value is different from the scalar. + IC.deleteInstructionLater(I); + } + vectorizeDI(I, Vec); + return assign(I, Vec); + } + + // Otherwise we have to create a 'Result' out of the packetized values. + IC.deleteInstructionLater(I); + auto &Info = packets[I]; + auto Res = Result(*this, I, &Info); + auto P = Res.createPacket(PacketWidth); + for (unsigned i = 0; i < PacketWidth; ++i) { + P[i] = Packet[i]; + } + + if (UpdateStats) { + ++VeczPacketized; + } + Info.numInstances = PacketWidth; + return Res; +} + +Value *Packetizer::Impl::reduceBranchCond(Value *cond, Instruction *terminator, + bool allOf) { + // Get the branch condition at its natural packet width + auto conds = packetizeAndGet(cond); + VECZ_FAIL_IF(conds.empty()); + + // Branches can only take a scalar mask. The new branch condition is true + // only if the original condition is true for any lane (or for all lanes if + // the condition is used in a BOSCC block indirection.) + IRBuilder<> B(terminator); + const auto name = cond->getName(); + + // Reduce the packet to a single value + auto w = conds.size(); + + if (VL && w != 1) { + emitVeczRemarkMissed(&F, cond, + "Can not vector-predicate packets larger than 1"); + return nullptr; + } + + while ((w >>= 1)) { + for (decltype(w) i = 0; i < w; ++i) { + conds[i] = + allOf ? B.CreateAnd(conds[i], conds[i + w], Twine(name, ".all_of")) + : B.CreateOr(conds[i], conds[i + w], Twine(name, ".any_of")); + } + } + + const RecurKind kind = allOf ? RecurKind::And : RecurKind::Or; + + // VP reduction intrinsics didn't make it into LLVM 13 so we have to make do + // by pre-sanitizing the input such that elements past VL get the identity + // value. + Value *&f = conds.front(); + + return createMaybeVPReduction(B, f, kind, VL); +} + +Packetizer::Result Packetizer::Impl::assign(Value *Scalar, Value *Vectorized) { + if (!Vectorized) { + emitVeczRemarkMissed(&F, Scalar, "Failed to vectorize"); + return Packetizer::Result(*this); + } else { + ++VeczPacketized; + auto &info = packets[Scalar]; + info.vector = Vectorized; + info.numInstances = 1; + return Packetizer::Result(*this, Scalar, &info); + } +} + +Value *Packetizer::Impl::packetizeIfVarying(Value *V) { + if (UVR.isVarying(V)) { + return packetize(V).getAsValue(); + } else if (UVR.isMaskVarying(V)) { + VECZ_FAIL_IF(!packetize(V)); + } + return V; +} + +Packetizer::Result Packetizer::Impl::packetize(Value *V) { + // Do not packetize the same value twice. + if (const auto res = getPacketized(V)) { + return res; + } + // Now check whether this value is actually packetizable. 
+  if (!Ctx.targetInfo().canPacketize(V, SimdWidth)) {
+    return Packetizer::Result(*this);
+  }
+
+  if (!isa<Instruction>(V)) {
+    return broadcast(V);
+  }
+
+  auto *const Ins = cast<Instruction>(V);
+
+  if (auto *const Branch = dyn_cast<BranchInst>(Ins)) {
+    if (Branch->isConditional()) {
+      // Varying reductions need to be packetized.
+      auto *newCond = packetize(Branch->getCondition()).getAsValue();
+      if (!newCond) {
+        return Packetizer::Result(*this);
+      }
+
+      // Packetization should normally have produced a reduction to scalar.
+      // However, when Packetize Uniform is on, a uniform branch won't have
+      // a divergence reduction so it will need reducing manually here.
+      if (newCond->getType()->isVectorTy()) {
+        IRBuilder<> B(Branch);
+        const RecurKind kind = RecurKind::Or;
+        newCond = createMaybeVPReduction(B, newCond, kind, VL);
+      }
+
+      Branch->setCondition(newCond);
+    }
+    return broadcast(Ins);
+  }
+
+  if (isa<SwitchInst>(Ins)) {
+    // We can't handle varying switches.
+    return Packetizer::Result(*this);
+  }
+
+  if (UVR.isMaskVarying(Ins)) {
+    if (auto *const res = packetizeMaskVarying(Ins)) {
+      return broadcast(res);
+    }
+    // Fall back on instantiation if the instruction could not be packetized.
+    Instantiator->instantiate(Ins);
+    return getPacketized(Ins);
+  }
+
+  if (auto *reduction = packetizeGroupReduction(Ins)) {
+    return broadcast(reduction);
+  }
+
+  if (auto *brdcast = packetizeGroupBroadcast(Ins)) {
+    return broadcast(brdcast);
+  }
+
+  if (auto shuffle = isSubgroupShuffleLike(Ins)) {
+    switch (shuffle->Op) {
+      default:
+        break;
+      case compiler::utils::GroupCollective::OpKind::Shuffle:
+        if (auto *s = packetizeSubgroupShuffle(Ins)) {
+          return broadcast(s);
+        }
+        break;
+      case compiler::utils::GroupCollective::OpKind::ShuffleXor:
+        if (auto s = packetizeSubgroupShuffleXor(Ins, *shuffle)) {
+          return s;
+        }
+        break;
+      case compiler::utils::GroupCollective::OpKind::ShuffleUp:
+      case compiler::utils::GroupCollective::OpKind::ShuffleDown:
+        if (auto s = packetizeSubgroupShuffleUpDown(Ins, *shuffle)) {
+          return s;
+        }
+        break;
+    }
+    // We can't packetize all sub-group shuffle-like operations, but we also
+    // can't vectorize or instantiate them - so provide a diagnostic saying as
+    // much.
+    emitVeczRemarkMissed(&F, Ins, "Could not packetize sub-group shuffle");
+    return Packetizer::Result(*this);
+  }
+
+  // Check if we should broadcast the instruction.
+  // Broadcast uniform instructions, unless we want to packetize uniform
+  // instructions as well. We can assume that isMaskVarying is false at this
+  // point.
+  bool shouldBroadcast = !UVR.isVarying(Ins) && !Choices.packetizeUniform();
+  // Or unless this instruction is in a loop and we want to packetize uniform
+  // instructions in loops.
+  if (shouldBroadcast && Choices.packetizeUniformInLoops()) {
+    const LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+    shouldBroadcast = !LI.getLoopFor(Ins->getParent());
+  }
+
+  // The packetization of a mask-varying value takes care of its own broadcast.
+  if (shouldBroadcast) {
+    // Insert broadcast instructions after the instruction to broadcast.
+    return broadcast(Ins);
+  }
+
+  if (const auto res = packetizeInstruction(Ins)) {
+    return res;
+  }
+  // Fall back on instantiation if the instruction could not be packetized,
+  // unless we're vector predicating.
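+  // (Illustrative rationale: instantiation emits one scalar clone per lane,
+  // which presumes a lane count known at compile time; with an active vector
+  // length VL the lane count is only known at runtime, so we bail out.)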
+  if (VL) {
+    return Packetizer::Result(*this);
+  }
+  Instantiator->instantiate(Ins);
+  return getPacketized(Ins);
+}
+
+ValuePacket Packetizer::Impl::packetizeAndGet(Value *v) {
+  ValuePacket results;
+  if (auto res = packetize(v)) {
+    res.getPacketValues(results);
+  }
+  return results;
+}
+
+ValuePacket Packetizer::Impl::packetizeAndGet(Value *v, unsigned w) {
+  ValuePacket results;
+  if (auto res = packetize(v)) {
+    res.getPacketValues(w, results);
+  }
+  return results;
+}
+
+Packetizer::Result Packetizer::Impl::broadcast(Value *V) {
+  return Result(*this, V, &packets[V]);
+}
+
+unsigned Packetizer::Impl::getPacketWidthForType(Type *ty,
+                                                 unsigned limit) const {
+  if (SimdWidth.isScalable()) {
+    return 1;
+  }
+
+  const unsigned simdWidth = SimdWidth.getFixedValue();
+  unsigned maxWidth = 0;
+
+  if (!Choices.targetIndependentPacketization()) {
+    maxWidth = std::min(limit, Ctx.targetInfo().getVectorWidthForType(
+                                   TTI, *ty->getScalarType()));
+
+    // We let the target return a value wider than the SIMD Width, but not
+    // narrower.
+    if (maxWidth) {
+      maxWidth = std::max(simdWidth, maxWidth);
+    }
+  }
+
+  if (maxWidth == 0) {
+    maxWidth = std::max(simdWidth, 16u);
+  }
+
+  unsigned elts = 1;
+  if (ty->isVectorTy()) {
+    auto *vecTy = cast<FixedVectorType>(ty);
+    elts = vecTy->getNumElements();
+  }
+
+  const unsigned fullWidth = elts * simdWidth;
+  if (fullWidth <= maxWidth) {
+    return 1;
+  }
+
+  // Round up to the next power of two. This should only be needed if the
+  // type was a 3-vector. Note that we don't really expect huge values here:
+  // over 16 is still currently not officially supported, over 256 would be
+  // astonishing, and over 65536 would be inconceivable, so we don't bother
+  // to >> 16.
+  unsigned width = (fullWidth / maxWidth) - 1;
+  width |= width >> 1;
+  width |= width >> 2;
+  width |= width >> 4;
+  width |= width >> 8;
+
+  // Can't have a packet wider than the simdWidth.
+  return std::min(width + 1, simdWidth);
+}
+
+Packetizer::Result Packetizer::Impl::packetizeInstruction(Instruction *Ins) {
+  ValuePacket results;
+
+  // Figure out what kind of instruction it is and try to vectorize it.
+  switch (Ins->getOpcode()) {
+    default:
+      if (Ins->isBinaryOp()) {
+        results = packetizeBinaryOp(cast<BinaryOperator>(Ins));
+      } else if (Ins->isCast()) {
+        results = packetizeCast(cast<CastInst>(Ins));
+      } else if (Ins->isUnaryOp()) {
+        results = packetizeUnaryOp(cast<UnaryOperator>(Ins));
+      }
+      break;
+
+    case Instruction::PHI:
+      results = packetizePHI(cast<PHINode>(Ins));
+      break;
+    case Instruction::GetElementPtr:
+      results = packetizeGEP(cast<GetElementPtrInst>(Ins));
+      break;
+    case Instruction::Store:
+      results = packetizeStore(cast<StoreInst>(Ins));
+      break;
+    case Instruction::Load:
+      results = packetizeLoad(cast<LoadInst>(Ins));
+      break;
+    case Instruction::Call:
+      results = packetizeCall(cast<CallInst>(Ins));
+      break;
+    case Instruction::ICmp:
+      results = packetizeICmp(cast<ICmpInst>(Ins));
+      break;
+    case Instruction::FCmp:
+      results = packetizeFCmp(cast<FCmpInst>(Ins));
+      break;
+    case Instruction::Select:
+      results = packetizeSelect(cast<SelectInst>(Ins));
+      break;
+    case Instruction::InsertElement:
+      results = packetizeInsertElement(cast<InsertElementInst>(Ins));
+      break;
+    case Instruction::ExtractElement:
+      results = packetizeExtractElement(cast<ExtractElementInst>(Ins));
+      break;
+    case Instruction::InsertValue:
+      results = packetizeInsertValue(cast<InsertValueInst>(Ins));
+      break;
+    case Instruction::ExtractValue:
+      results = packetizeExtractValue(cast<ExtractValueInst>(Ins));
+      break;
+    case Instruction::ShuffleVector:
+      results = packetizeShuffleVector(cast<ShuffleVectorInst>(Ins));
+      break;
+    case Instruction::Freeze:
+      results = packetizeFreeze(cast<FreezeInst>(Ins));
+      break;
+    case Instruction::AtomicCmpXchg:
+      results = packetizeAtomicCmpXchg(cast<AtomicCmpXchgInst>(Ins));
+      break;
+  }
+
+  if (auto res = getPacketizationResult(Ins, results, /*update stats*/ true)) {
+    return res;
+  }
+
+  if (auto *vec = vectorizeInstruction(Ins)) {
+    return assign(Ins, vec);
+  }
+
+  return Packetizer::Result(*this, Ins, nullptr);
+}
+
+Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
+  auto *const CI = dyn_cast<CallInst>(I);
+  if (!CI) {
+    return nullptr;
+  }
+  const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  Function *callee = CI->getCalledFunction();
+  if (!callee) {
+    return nullptr;
+  }
+
+  const auto Builtin = BI.analyzeBuiltin(*callee);
+  if (!Builtin) {
+    return nullptr;
+  }
+  const auto Info = BI.isMuxGroupCollective(Builtin->ID);
+
+  if (!Info || (!Info->isSubGroupScope() && !Info->isWorkGroupScope()) ||
+      (!Info->isAnyAll() && !Info->isReduction())) {
+    return nullptr;
+  }
+
+  const bool isWorkGroup = Info->isWorkGroupScope();
+  const unsigned argIdx = isWorkGroup ? 1 : 0;
+
+  SmallVector<Value *> opPackets;
+  IRBuilder<> B(CI);
+  auto *const argTy = CI->getArgOperand(argIdx)->getType();
+  auto packetWidth = getPacketWidthForType(argTy);
+
+  // Don't vector predicate if we have to split into multiple packets. The
+  // introduction of instructions to manage the splitting up of our VL into N
+  // chunks is likely to kill performance anyway.
+  if (VL && packetWidth != 1) {
+    emitVeczRemarkMissed(&F, CI,
+                         "Can not vector-predicate packets larger than 1");
+    return nullptr;
+  }
+
+  auto op = packetize(CI->getArgOperand(argIdx));
+
+  // Reduce the packet values in-place.
+  // TODO: can we add 'reassoc' to the floating-point reductions to absolve
+  // them of ordering?
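+  // Illustrative example (hypothetical values): with packetWidth == 4, the
+  // reduction loop below halves the packet in place,
+  //   [p0, p1, p2, p3] -> [p0 op p2, p1 op p3] -> [(p0 op p2) op (p1 op p3)]
+  // leaving a single vector for the final reduction to scalar.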
+ op.getPacketValues(packetWidth, opPackets); + + assert((!VL || packetWidth) && + "Should have bailed if dealing with more than one VP packet"); + + // According to the OpenCL Spec, we are allowed to rearrange the operation + // order of a workgroup/subgroup reduction any way we like (even though + // floating point addition is not associative so might not produce exactly + // the same result), so we reduce to a single vector first, if necessary, and + // then do a single reduction to scalar. This is more efficient than doing + // multiple reductions to scalar and then BinOp'ing multiple scalars + // together. + // + // Reduce to a single vector. + while ((packetWidth >>= 1)) { + for (decltype(packetWidth) i = 0; i < packetWidth; ++i) { + Value *const lhs = opPackets[i]; + Value *const rhs = opPackets[i + packetWidth]; + opPackets[i] = compiler::utils::createBinOpForRecurKind(B, lhs, rhs, + Info->Recurrence); + } + } + + // Reduce to a scalar. + Value *v = createMaybeVPReduction(B, opPackets.front(), Info->Recurrence, VL); + + // We leave the original reduction function and divert the vectorized + // reduction through it, giving us a reduction over the full apparent + // sub-group or work-group size (vecz * mux). + CI->setOperand(argIdx, v); + + return CI; +} + +Value *Packetizer::Impl::packetizeGroupBroadcast(Instruction *I) { + auto *const CI = dyn_cast(I); + if (!CI) { + return nullptr; + } + const compiler::utils::BuiltinInfo &BI = Ctx.builtins(); + Function *callee = CI->getCalledFunction(); + if (!callee) { + return nullptr; + } + const auto Builtin = BI.analyzeBuiltin(*callee); + if (!Builtin) { + return nullptr; + } + + bool isWorkGroup = false; + if (auto Info = BI.isMuxGroupCollective(Builtin->ID)) { + if (!Info->isBroadcast() || + (!Info->isSubGroupScope() && !Info->isWorkGroupScope())) { + return nullptr; + } + isWorkGroup = Info->isWorkGroupScope(); + } else { + return nullptr; + } + + IRBuilder<> B(CI); + + const unsigned argIdx = isWorkGroup ? 1 : 0; + auto *const src = CI->getArgOperand(argIdx); + + auto op = packetize(src); + PACK_FAIL_IF(!op); + + // If the source operand happened to be a broadcast value already, we can use + // it directly. + if (op.info->numInstances == 0) { + IC.deleteInstructionLater(CI); + CI->replaceAllUsesWith(src); + return src; + } + + auto *idx = CI->getArgOperand(argIdx + 1); + // We need to sanitize the input index so that it stays within the range of + // one vectorized group. + Value *idxFactor = B.CreateElementCount(idx->getType(), SimdWidth); + auto *const vecIdx = B.CreateURem(idx, idxFactor); + + Value *val = nullptr; + // Optimize the constant fixed-vector case, where we can choose the exact + // subpacket to extract from directly. + if (isa(vecIdx) && !SimdWidth.isScalable()) { + ValuePacket opPackets; + op.getPacketValues(opPackets); + auto factor = SimdWidth.divideCoefficientBy(opPackets.size()); + const unsigned subvecSize = factor.getFixedValue(); + assert(subvecSize > 0 && "Subvector size cannot be zero"); + const unsigned idxVal = cast(vecIdx)->getZExtValue(); + // If individual elements are scalar (through instantiation, say) then just + // use the desired packet directly. + if (subvecSize == 1) { + val = opPackets[idxVal]; + } else { + // Else extract from the correct packet, adjusting the index as we go. 
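+      // E.g. (illustrative): VF == 8 split into two <4 x T> packets with
+      // idxVal == 6 reads opPackets[6 / 4 == 1], element 6 % 4 == 2.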
+ val = B.CreateExtractElement( + opPackets[idxVal / subvecSize], + ConstantInt::get(vecIdx->getType(), idxVal % subvecSize)); + } + } else { + val = B.CreateExtractElement(op.getAsValue(), vecIdx); + } + + // We leave the original broadcast function and divert the vectorized + // broadcast through it, giving us a broadcast over the full apparent + // sub-group or work-group size (vecz * mux). + CI->setOperand(argIdx, val); + if (!isWorkGroup) { + // For sub-groups, we need to normalize the sub-group ID into the range of + // mux sub-groups. + // |-----------------|-----------------| + // | broadcast(X, 6) | broadcast(A, 6) | + // VF=4 |-----------------|-----------------| + // | b(, 6) | b(, 6) | + // |-----------------|-----------------| + // M=I/4 | 1 | 1 | + // V=I%4 | 2 | 2 | + // |-----------------|-----------------| + // | [V] | [V] | + // | Z | C | + // |-----------------|-----------------| + // | broadcast(Z, M) | broadcast(C, M) | + // res | C | C | + // splat | | | + // |-----------------|-----------------| + auto *const muxIdx = B.CreateUDiv(idx, idxFactor); + CI->setOperand(argIdx + 1, muxIdx); + } + + return CI; +} + +std::optional +Packetizer::Impl::isSubgroupShuffleLike(Instruction *I) { + auto *const CI = dyn_cast(I); + if (!CI) { + return std::nullopt; + } + const compiler::utils::BuiltinInfo &BI = Ctx.builtins(); + Function *callee = CI->getCalledFunction(); + if (!callee) { + return std::nullopt; + } + + const auto Builtin = BI.analyzeBuiltin(*callee); + if (!Builtin) { + return std::nullopt; + } + + const auto Info = BI.isMuxGroupCollective(Builtin->ID); + + if (Info && Info->isSubGroupScope() && Info->isShuffleLike()) { + return Info; + } + + return std::nullopt; +} + +Value *Packetizer::Impl::packetizeSubgroupShuffle(Instruction *I) { + auto *const CI = cast(I); + + // We don't support scalable vectorization of sub-group shuffles. + if (SimdWidth.isScalable()) { + return nullptr; + } + + auto *const Data = CI->getArgOperand(0); + auto *const Idx = CI->getArgOperand(1); + + auto PackData = packetize(Data); + if (!PackData) { + return nullptr; + } + + // If the data operand happened to be a broadcast value already, we can use + // it directly. + if (PackData.info->numInstances == 0) { + IC.deleteInstructionLater(CI); + CI->replaceAllUsesWith(Data); + return Data; + } + + // We can't packetize varying shuffle indices yet. + if (UVR.isVarying(Idx)) { + return nullptr; + } + + IRBuilder<> B(CI); + + // We need to sanitize the input index so that it stays within the range of + // one vectorized group. + const unsigned VF = SimdWidth.getFixedValue(); + auto *const VecIdxFactor = ConstantInt::get(Idx->getType(), VF); + // This index is the element of the vector-group which holds the desired + // data, per mux sub-group. + // , : idx 1 -> vector element 1, idx 2 -> vector element 0. + auto *const VecIdx = B.CreateURem(Idx, VecIdxFactor); + // This index is the mux sub-group in which the desired data resides. + // , : idx 1 -> mux sub-group 0, idx 3 -> mux sub-group 1. + auto *const MuxIdx = B.CreateUDiv(Idx, VecIdxFactor); + + Value *VecData = PackData.getAsValue(); + + // Note: in each illustrative example, imagine two invocations across a + // single mux sub-groups, each being vectorized by 4; in other words, 8 + // 'original' invocations to a sub-group, running in two vectorized + // invocations. + if (auto *const DataVecTy = dyn_cast(Data->getType()); + !DataVecTy) { + // The vectorized shuffle is producing a scalar (assuming uniform indices, + // see above). 
Imagine i=6 (6 % 4 = 2 and 6 / 4 = 1): + // | shuffle(X, 6) | shuffle(A, 6) | + // VF=4 |-----------------|-----------------| + // | s(, 2) | s(, 2) | + // elt 2 | Z | C | + // shuff | shuffle(Z, 1) | shuffle(C, 1) | + // | C | C | + // bcast | | | + // This way we can see how each of the 8 invocations end up with the 6th + // element of the total sub-group. + VecData = B.CreateExtractElement(VecData, VecIdx, "vec.extract"); + } else if (auto *const CIdx = dyn_cast(VecIdx)) { + // The shuffle produces a vector, and we have a constant shuffle index - we + // can extract a subvector easily. + // Imagine i=6 (6 % 4 = 2 and 6 / 4 = 1): + // | shuffle(, 6) | shuffle(, 6) | + // VF=4 |-------------------------|-------------------------| + // | s(, 2) | s(, 2) | + // vec 2 | | | + // shuff | shuffle(, 1) | shuffle(, 1) | + // | | | + // bcast | | | + // This way we can see how each of the 8 invocations end up with the 6th + // element of the total sub-group, which is a two-element vector. + + // Note: the subvector vector index type has to be i64. Scale it up by the + // size of the vector we're extracting: the index is the *element* from + // which to extract - it is not implicitly scaled by the vector size. + auto *const ExtractIdx = B.getInt64( + CIdx->getZExtValue() * DataVecTy->getElementCount().getFixedValue()); + VecData = B.CreateExtractVector(Data->getType(), VecData, ExtractIdx, + "vec.extract"); + } else { + // This is as above, but the process of extracting the initial vector is + // more complicated - we have to manually extract and insert each element. + // It's possible that for some targets and for some combinations of vector + // width and vectorization factor, that going through memory would be + // faster. + Value *ExtractedVec = PoisonValue::get(DataVecTy); + const unsigned DataNumElts = DataVecTy->getElementCount().getFixedValue(); + auto *const BaseIdx = B.CreateMul(VecIdx, B.getInt32(DataNumElts)); + for (unsigned i = 0; i < DataNumElts; i++) { + auto *const SubIdx = B.CreateAdd(BaseIdx, B.getInt32(i)); + auto *const Elt = B.CreateExtractElement(VecData, SubIdx); + ExtractedVec = B.CreateInsertElement(ExtractedVec, Elt, B.getInt32(i)); + } + VecData = ExtractedVec; + } + + // We leave the original shuffle function and divert the vectorized + // shuffle through it, giving us a shuffle over the full apparent + // sub-group size (vecz * mux). + CI->setOperand(0, VecData); + CI->setOperand(1, MuxIdx); + + return CI; +} + +Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleXor( + Instruction *I, compiler::utils::GroupCollective ShuffleXor) { + auto *const CI = cast(I); + + // We don't support scalable vectorization of sub-group shuffles. + if (SimdWidth.isScalable()) { + return Packetizer::Result(*this); + } + const unsigned VF = SimdWidth.getFixedValue(); + + auto *const Data = CI->getArgOperand(0); + auto *const Val = CI->getArgOperand(1); + + auto PackData = packetize(Data); + if (!PackData) { + return Packetizer::Result(*this); + } + + // If the data operand happened to be a broadcast value already, we can use + // it directly. + if (PackData.info->numInstances == 0) { + IC.deleteInstructionLater(CI); + CI->replaceAllUsesWith(Data); + return PackData; + } + + auto PackVal = packetize(Val); + if (!PackVal) { + return Packetizer::Result(*this); + } + + // With the packetize operands in place, we have to perform the actual + // shuffling operation. Since we are one layer higher than the mux + // sub-groups, our IDs do not easily translate to the mux level. 
Therefore we
+  // perform each shuffle using the regular 'shuffle' and do the XOR of the
+  // IDs ourselves.
+
+  // Note: in this illustrative example, imagine two invocations across a
+  // single mux sub-group, each being vectorized by 4; in other words, 8
+  // 'original' invocations to a sub-group, running in two vectorized
+  // invocations. Imagine value = 5:
+  //                | shuffle(X, 5)        | shuffle(A, 5)        |
+  // VF=4           |----------------------|----------------------|
+  //                | s(<X,Y,Z,W>, 5)      | s(<A,B,C,D>, 5)      |
+  // SG IDs         | 0,1,2,3              | 4,5,6,7              |
+  // SG IDs^5       | 5,4,7,6              | 1,0,3,2              |
+  // I=(SG IDs^5)/4 | 1,1,1,1              | 0,0,0,0              |
+  // J=(SG IDs^5)%4 | 1,0,3,2              | 1,0,3,2              |
+  // <data>[J]      | Y,X,W,Z              | B,A,D,C              |
+  // Mux-shuffle[I] | [Y,B][1],[X,A][1],.. | [Y,B][0],[X,A][0],.. |
+  //                | B,A,D,C              | Y,X,W,Z              |
+  IRBuilder<> B(CI);
+
+  auto *const SubgroupLocalIDFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+      compiler::utils::eMuxBuiltinGetSubGroupLocalId, *F.getParent(),
+      {CI->getType()});
+  assert(SubgroupLocalIDFn);
+
+  auto *const SubgroupLocalID =
+      B.CreateCall(SubgroupLocalIDFn, {}, "sg.local.id");
+  const auto Builtin =
+      Ctx.builtins().analyzeBuiltinCall(*SubgroupLocalID, Dimension);
+  if (!Builtin) {
+    return Packetizer::Result(*this);
+  }
+
+  // Vectorize the sub-group local ID
+  auto *const VecSubgroupLocalID =
+      vectorizeWorkGroupCall(SubgroupLocalID, *Builtin);
+  if (!VecSubgroupLocalID) {
+    return Packetizer::Result(*this);
+  }
+  VecSubgroupLocalID->setName("vec.sg.local.id");
+
+  // The value is always i32, as is the sub-group local ID. Vectorizing both
+  // of them should result in the same vector type, with as many elements as
+  // the vectorization factor.
+  auto *const VecVal = PackVal.getAsValue();
+
+  assert(VecVal->getType() == VecSubgroupLocalID->getType() &&
+         VecVal->getType()->isVectorTy() &&
+         cast<VectorType>(VecVal->getType())
+                 ->getElementCount()
+                 .getKnownMinValue() == VF &&
+         "Unexpected vectorization of sub-group shuffle xor");
+
+  // Perform the XOR of the sub-group IDs with the 'value', as per the
+  // semantics of the builtin.
+  auto *const XoredID = B.CreateXor(VecSubgroupLocalID, VecVal);
+
+  // We need to sanitize the input index so that it stays within the range of
+  // one vectorized group.
+  auto *const VecIdxFactor = ConstantInt::get(SubgroupLocalID->getType(), VF);
+
+  // Bring this ID into the range of 'mux' sub-groups by dividing it by the
+  // vector size.
+  auto *const MuxXoredID =
+      B.CreateUDiv(XoredID, B.CreateVectorSplat(VF, VecIdxFactor));
+  // And into the range of the vector group
+  auto *const VecXoredID =
+      B.CreateURem(XoredID, B.CreateVectorSplat(VF, VecIdxFactor));
+
+  // Now we perform each per-lane shuffle through the regular mux 'shuffle'
+  // builtin.
+  auto RegularShuffle = ShuffleXor;
+  RegularShuffle.Op = compiler::utils::GroupCollective::OpKind::Shuffle;
+
+  auto RegularShuffleID = Ctx.builtins().getMuxGroupCollective(RegularShuffle);
+  assert(RegularShuffleID);
+
+  auto *const RegularShuffleFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+      *RegularShuffleID, *F.getParent(), {CI->getType()});
+  assert(RegularShuffleFn);
+
+  auto *const VecData = PackData.getAsValue();
+  Value *CombinedShuffle = PoisonValue::get(VecData->getType());
+
+  for (unsigned i = 0; i < VF; i++) {
+    auto *Idx = B.getInt32(i);
+    // Get the XORd index local to the vector group that this vector group
+    // element wants to shuffle with.
+    auto *const VecGroupIdx = B.CreateExtractElement(VecXoredID, Idx);
+    // Grab that element. It may be a vector, in which case we must extract
+    // each element individually. 
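+    // (Illustrative recap of the index math above, per lane, as scalar
+    //  pseudocode: xored = sg_local_id ^ value; VecGroupIdx = xored % VF;
+    //  MuxID = xored / VF.)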
+ Value *DataElt = nullptr; + if (auto *DataVecTy = dyn_cast(Data->getType()); !DataVecTy) { + DataElt = B.CreateExtractElement(VecData, VecGroupIdx); + } else { + DataElt = PoisonValue::get(DataVecTy); + auto VecWidth = DataVecTy->getElementCount().getFixedValue(); + // VecGroupIdx is the 'base' of the subvector, whose elements are stored + // sequentially from that point. + auto *const VecVecGroupIdx = + B.CreateMul(VecGroupIdx, B.getInt32(VecWidth)); + for (unsigned j = 0; j != VecWidth; j++) { + auto *const Elt = B.CreateExtractElement( + VecData, B.CreateAdd(VecVecGroupIdx, B.getInt32(j))); + DataElt = B.CreateInsertElement(DataElt, Elt, B.getInt32(j)); + } + } + assert(DataElt); + // Shuffle it across the mux sub-group. + auto *const MuxID = B.CreateExtractElement(MuxXoredID, Idx); + auto *const Shuff = B.CreateCall(RegularShuffleFn, {DataElt, MuxID}); + // Combine that back into the final shuffled vector. + if (auto *DataVecTy = dyn_cast(Data->getType()); !DataVecTy) { + CombinedShuffle = B.CreateInsertElement(CombinedShuffle, Shuff, Idx); + } else { + auto VecWidth = DataVecTy->getElementCount().getFixedValue(); + CombinedShuffle = B.CreateInsertVector( + CombinedShuffle->getType(), CombinedShuffle, Shuff, + B.getInt64(static_cast(i) * VecWidth)); + } + } + + IC.deleteInstructionLater(CI); + return assign(CI, CombinedShuffle); +} + +Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleUpDown( + Instruction *I, compiler::utils::GroupCollective ShuffleUpDown) { + const bool IsDown = + ShuffleUpDown.Op == compiler::utils::GroupCollective::OpKind::ShuffleDown; + assert((IsDown || ShuffleUpDown.Op == + compiler::utils::GroupCollective::OpKind::ShuffleUp) && + "Invalid shuffle kind"); + + auto *const CI = cast(I); + + // We don't support scalable vectorization of sub-group shuffles. + if (SimdWidth.isScalable()) { + return Packetizer::Result(*this); + } + const unsigned VF = SimdWidth.getFixedValue(); + + // LHS is 'current' for a down-shuffle, and 'previous' for an up-shuffle. + auto *const LHSOp = CI->getArgOperand(0); + // RHS is 'next' for a down-shuffle, and 'current' for an up-shuffle. + auto *const RHSOp = CI->getArgOperand(1); + auto *const DeltaOp = CI->getArgOperand(2); + + auto PackDelta = packetize(DeltaOp); + if (!PackDelta) { + return Packetizer::Result(*this); + } + + auto PackLHS = packetize(LHSOp); + if (!PackLHS) { + return Packetizer::Result(*this); + } + + auto PackRHS = packetize(RHSOp); + if (!PackRHS) { + return Packetizer::Result(*this); + } + + auto *const LHSPackVal = PackLHS.getAsValue(); + auto *const RHSPackVal = PackRHS.getAsValue(); + assert(LHSPackVal && RHSPackVal && + LHSPackVal->getType() == RHSPackVal->getType()); + + // Remember in the example below that the builtins take *deltas* which add + // onto the mux sub-group local ID. Therefore a delta of 2 returns different + // data for each of the mux sub-group elements. 
+ // |----------------------------|----------------------------| + // | shuffle_down(A, X, 2) | shuffle_down(E, I, 2) | + // VF=4 |----------------------------|----------------------------| + // | s(, , 2) | s(, , 2) | + // SGIds | 0,1,2,3 | 4,5,6,7 | + // SGIds+D | 2,3,4,5 | 6,7,8,9 | + // MuxSGIds | 0,0,0,0 | 1,1,1,1 | + // |----------------------------|----------------------------| + // M=(SGIds+D)/VF | 0,0,1,1 | 1,1,2,2 | + // V=(SGIds+D)%VF | 2,3,0,1 | 2,3,0,1 | + // |----------------------------|----------------------------| + // M - MuxSGIds | 0,0,1,1 | 0,0,1,1 | + // |----------------------------|----------------------------| + // Shuff[0] | s(, , 0) | s(, , 0) | + // Data returned | 0+0 => 0 => | 1+0 => 1 => | + // Shuff[0][V[0]] | [2] = C | [2] = G | + // |----------------------------|----------------------------| + // Shuff[1] | s(, , 0) | s(, , 0) | + // Data returned | 0+0 => 0 => | 1+0 => 1 => | + // Shuff[1][V[1]] | [3] = D | [3] = H | + // |----------------------------|----------------------------| + // Shuff[2] | s(, , 1) | s(, , 1) | + // Data returned | 0+1 => 1 => | 1+1 => 2 => 0 => | + // Shuff[2][V[2]] | [0] = E | [0] = X | + // |----------------------------|----------------------------| + // Shuff[3] | s(, , 1) | s(, , 1) | + // Data returned | 0+1 => 1 => | 1+1 => 2 => 0 => | + // Shuff[3][V[3]] | [1] = F | [1] = Y | + // |----------------------------|----------------------------| + // Result | C,D,E,F | G,H,X,Y | + IRBuilder<> B(CI); + + // Grab the packetized/vectorized sub-group local IDs + auto *const SubgroupLocalIDFn = Ctx.builtins().getOrDeclareMuxBuiltin( + compiler::utils::eMuxBuiltinGetSubGroupLocalId, *F.getParent(), + {CI->getType()}); + assert(SubgroupLocalIDFn); + + auto *const SubgroupLocalID = + B.CreateCall(SubgroupLocalIDFn, {}, "sg.local.id"); + const auto Builtin = + Ctx.builtins().analyzeBuiltinCall(*SubgroupLocalID, Dimension); + if (!Builtin) { + return Packetizer::Result(*this); + } + + // Vectorize the sub-group local ID + auto *const VecSubgroupLocalID = + vectorizeWorkGroupCall(SubgroupLocalID, *Builtin); + if (!VecSubgroupLocalID) { + return Packetizer::Result(*this); + } + VecSubgroupLocalID->setName("vec.sg.local.id"); + + auto *const DeltaVal = PackDelta.getAsValue(); + + // The delta is always i32, as is the sub-group local ID. Vectorizing both of + // them should result in the same vector type, with as many elements as the + // vectorization factor. + assert(DeltaVal->getType() == VecSubgroupLocalID->getType() && + DeltaVal->getType()->isVectorTy() && + cast(DeltaVal->getType()) + ->getElementCount() + .getKnownMinValue() == VF && + "Unexpected vectorization of sub-group shuffle up/down"); + + // Produce the sum of the sub-group IDs with the 'delta', as per the + // semantics of the builtin. + auto *const IDPlusDelta = IsDown ? B.CreateAdd(VecSubgroupLocalID, DeltaVal) + : B.CreateSub(VecSubgroupLocalID, DeltaVal); + + // We need to sanitize the input indices so that they stay within the range + // of one vectorized group. + auto *const VecIdxFactor = ConstantInt::get(SubgroupLocalID->getType(), VF); + + // Bring this ID into the range of 'mux' sub-groups by dividing it by the + // vector size. We have to do this differently for 'up' and 'down' shuffles + // because the 'up' shuffles use signed indexing, and we need to round down + // to negative infinity to get the right sub-group delta. 
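+  // For example, with VF=4 an 'up' target ID of -2 must land in mux
+  // sub-group -1, element 2; truncating division would instead give
+  // (-2/4, -2%4) == (0, -2), which is out of range (illustrative).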
+ Value *MuxAbsoluteIDs = nullptr; + Value *VecEltIDs = nullptr; + if (IsDown) { + MuxAbsoluteIDs = + B.CreateUDiv(IDPlusDelta, B.CreateVectorSplat(VF, VecIdxFactor)); + // And into the range of the vector group + VecEltIDs = + B.CreateURem(IDPlusDelta, B.CreateVectorSplat(VF, VecIdxFactor)); + } else { + // Note that shuffling up is more complicated, owing to the signed + // sub-group local IDs. + // The steps are identical to the example outlined above, except both the + // division and modulo operations performed on the sub-group IDs have to + // floor towards negative infinity. That is, we want to see: + // |----------------------------|---------------------------| + // | shuffle_up(A, X, 2) | shuffle_up(E, I, 2) | + // VF=4 |----------------------------|---------------------------| + // | s(, , 2) | s(, , 2)| + // SGIds | 0,1,2,3 | 4,5,6,7 | + // SGIds-D | -2,-1,0,1 | 2,3,4,5 | + // MuxSGIds | 0,0,0,0 | 1,1,1,1 | + // |----------------------------|---------------------------| + // both flooring: | | | + // M=(SGIds-D)/VF | -1,-1,0,0 | 0,0,1,1 | + // V=(SGIds-D)%VF | 2,3,0,1 | 2,3,0,1 | + // |----------------------------|---------------------------| + // MuxSGIds - M | 1,1,0,0 | 1,1,0,0 | + // |----------------------------|---------------------------| + // + // We use the following formulae for division and modulo: + // int div_floor(int x, int y) { + // int q = x/y; + // int r = x%y; + // if ((r!=0) && ((r<0) != (y<0))) --q; + // return q; + // } + // int mod_floor(int x, int y) { + // int r = x%y; + // if ((r!=0) && ((r<0) != (y<0))) { r += y; } + // return r; + // } + // We note also that the conditions are equal between the two operations, + // and that the condition is equivalent to: + // if ((r!=0) && ((x ^ y) < 0)) { ... } + // (see https://alive2.llvm.org/ce/z/ebGrdL) + auto *X = IDPlusDelta; + auto *Y = B.CreateVectorSplat(VF, VecIdxFactor); + auto *const Quotient = B.CreateSDiv(X, Y, "quotient"); + auto *const Remainder = B.CreateSRem(X, Y, "remainder"); + + auto *const ArgXor = B.CreateXor(X, Y, "arg.xor"); + auto *const One = ConstantInt::get(ArgXor->getType(), 1); + auto *const Zero = ConstantInt::get(ArgXor->getType(), 0); + auto *const ArgSignDifferent = + B.CreateICmpSLT(ArgXor, Zero, "signs.different"); + auto *const RemainderIsNotZero = + B.CreateICmpNE(Remainder, Zero, "remainder.nonzero"); + auto *const ConditionHolds = + B.CreateAnd(RemainderIsNotZero, ArgSignDifferent, "condition.holds"); + auto *const QuotientMinus1 = B.CreateSub(Quotient, One, "quotient.minus.1"); + auto *const RemainderPlusY = B.CreateAdd(Remainder, Y, "remainder.plus.y"); + + MuxAbsoluteIDs = B.CreateSelect(ConditionHolds, QuotientMinus1, Quotient); + VecEltIDs = B.CreateSelect(ConditionHolds, RemainderPlusY, Remainder); + } + + // We've produced the 'absolute' mux sub-group local IDs for the data we want + // to access in each shuffle, but we want to get back to 'relative' IDs in + // the form of deltas. Splat the mux sub-group local ID. + auto *const SplatSubgroupLocalID = + B.CreateVectorSplat(VF, SubgroupLocalID, "splat.sg.local.id"); + auto *DeltaLHS = MuxAbsoluteIDs; + auto *DeltaRHS = SplatSubgroupLocalID; + if (!IsDown) { + // For 'up' shuffles, we invert the operation as the deltas are implicitly + // negative. See above. 
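+    // (That is, the mux-level delta becomes MuxSGIds - M, matching the
+    // 'MuxSGIds - M' row in the table above and keeping the delta
+    // non-negative.)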
+ std::swap(DeltaLHS, DeltaRHS); + } + auto *const MuxDeltas = + B.CreateSub(DeltaLHS, DeltaRHS, "mux.sg.local.id.deltas"); + + auto ShuffleID = Ctx.builtins().getMuxGroupCollective(ShuffleUpDown); + assert(ShuffleID); + auto *const ShuffleFn = Ctx.builtins().getOrDeclareMuxBuiltin( + *ShuffleID, *F.getParent(), {LHSPackVal->getType()}); + assert(ShuffleFn); + + SmallVector Results(VF); + for (unsigned i = 0; i != VF; i++) { + auto *const MuxDelta = B.CreateExtractElement(MuxDeltas, B.getInt32(i)); + auto *const Shuffle = + B.CreateCall(ShuffleFn, {LHSPackVal, RHSPackVal, MuxDelta}); + + Value *Elt = nullptr; + auto *const Idx = B.CreateExtractElement(VecEltIDs, B.getInt32(i)); + if (auto *DataVecTy = dyn_cast(LHSOp->getType()); !DataVecTy) { + Elt = B.CreateExtractElement(Shuffle, Idx); + } else { + // For vector data types we need to extract consecutive elements starting + // at the sub-vector whose index is Idx. + Elt = PoisonValue::get(DataVecTy); + auto VecWidth = DataVecTy->getElementCount().getFixedValue(); + // Idx is the 'base' of the subvector, whose elements are stored + // sequentially from that point. + auto *const VecVecGroupIdx = B.CreateMul(Idx, B.getInt32(VecWidth)); + for (unsigned j = 0; j != VecWidth; j++) { + auto *const E = B.CreateExtractElement( + Shuffle, B.CreateAdd(VecVecGroupIdx, B.getInt32(j))); + Elt = B.CreateInsertElement(Elt, E, B.getInt32(j)); + } + } + Results[i] = Elt; + } + + IC.deleteInstructionLater(CI); + return getPacketizationResult(I, Results); +} + +Value *Packetizer::Impl::packetizeMaskVarying(Instruction *I) { + if (auto memop = MemOp::get(I)) { + auto *const mask = memop->getMaskOperand(); + if (!mask) { + return nullptr; + } + + Value *vecMask = nullptr; + + const MemOpDesc desc = memop->getDesc(); + const bool isVector = desc.getDataType()->isVectorTy(); + + // If only the mask operand is varying, we do not need to vectorize the + // MemOp itself, only reduce the mask with an OR. + if (!isVector) { + vecMask = packetize(mask).getAsValue(); + } else { + // If it's a vector, and the mask is splatted, then packetize the + // splatted value, reduce it, then re-splat it as a vector. Otherwise, we + // send it to the instantiator. + auto *const splatVal = getSplatValue(mask); + if (!splatVal) { + return nullptr; + } + vecMask = packetize(splatVal).getAsValue(); + } + + VECZ_FAIL_IF(!vecMask); + + // Build the reduction right after the vector to reduce register + // pressure, and to make it easier for CSE/GVN to combine them if there + // are multiple uses of the same value (we could cache these?) + auto *maskInst = dyn_cast(vecMask); + IRBuilder<> B = [&] { + if (maskInst) { + return buildAfter(maskInst, F); + } else { + return IRBuilder<>(I); + } + }(); + + Value *anyOfMask = createMaybeVPReduction(B, vecMask, RecurKind::Or, VL); + anyOfMask->setName("any_of_mask"); + + if (isVector) { + anyOfMask = B.CreateVectorSplat( + multi_llvm::getVectorElementCount(desc.getDataType()), anyOfMask); + } + + memop->setMaskOperand(anyOfMask); + + return I; + } + + auto *const CI = dyn_cast(I); + if (!CI) { + return nullptr; + } + + Function *callee = CI->getCalledFunction(); + + // Handle internal builtins. + if (Ctx.isInternalBuiltin(callee)) { + // Handle lane mask reductions. + // We treat these as Mask Varying instructions since their single argument + // represents a lane mask and their result is a reduction over all lanes, + // which means it is effectively uniform. 
We don't actually have to check + // that they are mask varying, because that is the only possible uniformity + // value of these function calls. + compiler::utils::Lexer L(callee->getName()); + VECZ_FAIL_IF(!L.Consume(VectorizationContext::InternalBuiltinPrefix)); + bool any = false; + bool divergence = false; + if (L.Consume("divergence_any")) { + divergence = true; + } else if (L.Consume("divergence_all")) { + any = true; + divergence = true; + } + + if (divergence) { + IC.deleteInstructionLater(CI); + auto *const reduce = reduceBranchCond(CI->getOperand(0), CI, any); + CI->replaceAllUsesWith(reduce); + return reduce; + } + } + + return nullptr; +} + +ValuePacket Packetizer::Impl::packetizePHI(PHINode *Phi) { + ValuePacket results; + auto *const ty = Phi->getType(); + + auto *wideTy = ty; + unsigned packetWidth = 0; + if (auto structTy = dyn_cast(ty); + ty->isVectorTy() || VectorType::isValidElementType(ty) || + (structTy && structTy->isLiteral())) { + packetWidth = getPacketWidthForType(ty); + wideTy = + getWideType(Phi->getType(), SimdWidth.divideCoefficientBy(packetWidth)); + } else { + // It's not a type we can widen, but we can save the instantiator the job.. + if (SimdWidth.isScalable()) { + // as long as we aren't requesting a scalable vectorization factor.. + return results; + } + packetWidth = SimdWidth.getFixedValue(); + } + + IRBuilder<> B(buildAfter(Phi, F, true)); + auto numVals = Phi->getNumIncomingValues(); + auto name = Phi->getName(); + for (unsigned i = 0; i < packetWidth; ++i) { + results.push_back(B.CreatePHI(wideTy, numVals, name)); + } + + // To avoid cycles in the use/def chain, packetize the incoming values later. + // This allows packetizing phi uses by creating an 'empty' phi placeholder. + pendingPhis.push_back(Phi); + return results; +} + +ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) { + ValuePacket results; + + Function *Callee = CI->getCalledFunction(); + if (!Callee) { + return results; + } + + IRBuilder<> B(CI); + // Handle LLVM intrinsics. + if (Callee->isIntrinsic()) { + auto IntrID = Intrinsic::ID(Callee->getIntrinsicID()); + if (IntrID == llvm::Intrinsic::lifetime_end || + IntrID == llvm::Intrinsic::lifetime_start) { + auto *ptr = CI->getArgOperand(CI->arg_size() - 1); + if (auto *const bcast = dyn_cast(ptr)) { + ptr = bcast->getOperand(0); + } + + if (auto *const alloca = dyn_cast(ptr)) { + if (!needsInstantiation(Ctx, *alloca)) { +#if LLVM_VERSION_GREATER_EQUAL(23, 0) + const bool HaveSizeArg = false; +#elif LLVM_VERSION_GREATER_EQUAL(22, 0) + // TODO Remove runtime check when we no longer need to worry about + // older LLVM 22 snapshots. + const bool HaveSizeArg = CI->arg_size() == 2; +#else + const bool HaveSizeArg = true; +#endif + if (HaveSizeArg) { + // If it's an alloca we can widen, we can just change the size + const llvm::TypeSize allocSize = + Ctx.dataLayout()->getTypeAllocSize(alloca->getAllocatedType()); + const auto lifeSize = + allocSize.isScalable() || SimdWidth.isScalable() + ? -1 + : allocSize.getKnownMinValue() * + SimdWidth.getKnownMinValue(); + CI->setOperand( + 0, ConstantInt::get(CI->getOperand(0)->getType(), lifeSize)); + } + results.push_back(CI); + } + } + return results; + } + + auto Builtin = Ctx.builtins().analyzeBuiltin(*Callee); + if (!Builtin || !(Builtin->properties & + compiler::utils::eBuiltinPropertyVectorEquivalent)) { + return results; + } + + // Only floating point intrinsics need this to be set to CI. + // The IR Builder helpfully crashes when we pass it unnecessarily. 
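+    // (That is, fastMathSrc lets CreateIntrinsic copy fast-math flags from
+    // the original call; it must stay null for non-FP intrinsics.)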
+    Instruction *fastMathSrc = isa<FPMathOperator>(CI) ? CI : nullptr;
+
+    // Using a native array with hard coded size for simplicity; make sure
+    // to increase this if intrinsics with more operands are to be handled.
+    size_t constexpr maxOperands = 3;
+    // Some LLVM intrinsics, such as abs, have arguments that are constants
+    // defined as llvm_i1_ty, which means those operands can't be packetized.
+    // As a temporary solution, we use this vector so that every case can
+    // independently specify which operands must be skipped.
+    SmallVector<bool, maxOperands> operandsToSkip(maxOperands, false);
+    switch (IntrID) {
+    case Intrinsic::abs:
+    case Intrinsic::ctlz:
+    case Intrinsic::cttz:
+      // def abs [LLVMMatchType<0>, llvm_i1_ty]
+      operandsToSkip = {false, true};
+      break;
+    default:
+      break;
+    }
+
+    auto *const ty = CI->getType();
+    auto packetWidth = getPacketWidthForType(ty);
+    auto *const wideTy =
+        getWideType(ty, SimdWidth.divideCoefficientBy(packetWidth));
+
+    const auto n = CI->arg_size();
+    assert(n <= maxOperands && "Intrinsic has too many arguments");
+
+    SmallVector<Value *, 4> opPackets[maxOperands];
+    for (auto i = decltype(n){0}; i < n; ++i) {
+      auto *argOperand = CI->getArgOperand(i);
+
+      if (operandsToSkip[i]) {
+        assert(isa<Constant>(argOperand) && "Operand should be a Constant");
+        opPackets[i].resize(packetWidth);
+        std::fill(opPackets[i].begin(), opPackets[i].end(), argOperand);
+      } else {
+        auto op = packetize(CI->getArgOperand(i));
+        if (!op) {
+          return results;
+        }
+        op.getPacketValues(packetWidth, opPackets[i]);
+        PACK_FAIL_IF(opPackets[i].empty());
+      }
+    }
+
+    const auto name = CI->getName();
+    Type *const types[1] = {wideTy};  // because LLVM 13 is a numpty
+    Value *opVals[maxOperands];
+    for (unsigned i = 0; i < packetWidth; ++i) {
+      for (unsigned j = 0; j < n; ++j) {
+        opVals[j] = opPackets[j][i];
+      }
+
+      results.push_back(B.CreateIntrinsic(
+          IntrID, types, ArrayRef<Value *>(opVals, n), fastMathSrc, name));
+    }
+    return results;
+  }
+
+  // Handle internal builtins.
+  if (Ctx.isInternalBuiltin(Callee)) {
+    // Handle masked loads and stores.
+    if (auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked)) {
+      if (MaskedOp->isMaskedMemOp()) {
+        return packetizeMemOp(*MaskedOp);
+      }
+    }
+    if (auto AtomicInfo = Ctx.isMaskedAtomicFunction(*Callee)) {
+      return packetizeMaskedAtomic(*CI, *AtomicInfo);
+    }
+  }
+
+  const auto Builtin = Ctx.builtins().analyzeBuiltin(*Callee);
+
+  // Handle scans, which defer to internal builtins.
+  if (Builtin) {
+    if (auto Info = Ctx.builtins().isMuxGroupCollective(Builtin->ID)) {
+      if (Info->isScan()) {
+        return packetizeGroupScan(CI, *Info);
+      }
+    }
+
+    // Handle external builtins.
+    const auto Props = Builtin->properties;
+    if (Props & compiler::utils::eBuiltinPropertyExecutionFlow ||
+        Props & compiler::utils::eBuiltinPropertyWorkItem) {
+      return results;
+    }
+  }
+
+  auto *const ty = CI->getType();
+
+  // Our builtins are only defined up to a width of 16, so we will not
+  // vectorize above that. Inspect the operands as well in case they are
+  // wider; for instance, for a convert from float to i8 we would rather
+  // widen according to the float than the i8, so we don't create too wide a
+  // vector of floats.
+  auto packetWidth = getPacketWidthForType(ty, 16u);
+  for (const auto &op : CI->data_ops()) {
+    auto *const vTy = op.get()->getType();
+    if (!vTy->isPointerTy()) {
+      packetWidth = std::max(packetWidth, getPacketWidthForType(vTy, 16u));
+    }
+  }
+
+  auto factor = SimdWidth.divideCoefficientBy(packetWidth);
+
+  // Try to find a unit for this builtin. 
+ auto CalleeVec = Ctx.getVectorizedFunction(*Callee, factor); + if (!CalleeVec) { + // No vectorization strategy found. Fall back on Instantiation. + return results; + } + + // Packetize call operands. + // But not if they have pointer return arguments (handled in vectorizeCall). + for (const auto &TargetArg : CalleeVec.args) { + PACK_FAIL_IF(TargetArg.kind == VectorizationResult::Arg::POINTER_RETURN); + } + + auto *const vecTy = dyn_cast(ty); + const unsigned scalarWidth = vecTy ? vecTy->getNumElements() : 1; + unsigned i = 0; + SmallVector, 4> opPackets; + for (const auto &TargetArg : CalleeVec.args) { + opPackets.emplace_back(); + + // Handle scalar arguments. + Value *scalarOp = CI->getArgOperand(i); + if (TargetArg.kind == VectorizationResult::Arg::SCALAR) { + for (unsigned j = 0; j < packetWidth; ++j) { + opPackets.back().push_back(scalarOp); + } + i++; + continue; + } + + // Vectorize scalar operands. + auto op = packetize(CI->getOperand(i)); + PACK_FAIL_IF(!op); + + // The vector versions of some builtins can have a mix of vector and scalar + // arguments. We need to widen any scalar arguments by sub-splatting. + auto *const scalarTy = scalarOp->getType(); + auto *const argTy = TargetArg.type; + if (vecTy && !scalarTy->isVectorTy()) { + PACK_FAIL_IF(argTy->getScalarType() != scalarTy); + + op.getPacketValues(packetWidth, opPackets.back()); + PACK_FAIL_IF(opPackets.back().empty()); + + // Widen the scalar operands. + PACK_FAIL_IF( + !createSubSplats(Ctx.targetInfo(), B, opPackets.back(), scalarWidth)); + } else { + // Make sure the type is correct for vector arguments. + Type *wideTy = getWideType(scalarOp->getType(), factor); + PACK_FAIL_IF(argTy != wideTy); + + op.getPacketValues(packetWidth, opPackets.back()); + PACK_FAIL_IF(opPackets.back().empty()); + } + i++; + } + + auto numArgs = opPackets.size(); + SmallVector opVals; + opVals.resize(numArgs); + + auto *vecFn = CalleeVec.get(); + for (unsigned i = 0; i < packetWidth; ++i) { + for (unsigned j = 0; j < numArgs; ++j) { + opVals[j] = opPackets[j][i]; + } + + CallInst *newCI = B.CreateCall(vecFn, opVals, CI->getName()); + newCI->setCallingConv(CI->getCallingConv()); + results.push_back(newCI); + } + + return results; +} + +ValuePacket +Packetizer::Impl::packetizeGroupScan(CallInst *CI, + compiler::utils::GroupCollective Scan) { + ValuePacket results; + + Function *callee = CI->getCalledFunction(); + if (!callee) { + return results; + } + + compiler::utils::NameMangler mangler(&CI->getContext()); + + const unsigned ArgOffset = Scan.isWorkGroupScope() ? 1 : 0; + + // The operands and types for the internal builtin + SmallVector Ops = { + packetize(CI->getArgOperand(ArgOffset)).getAsValue()}; + SmallVector Tys = {getWideType(CI->getType(), SimdWidth)}; + + const bool isInclusive = + Scan.Op == compiler::utils::GroupCollective::OpKind::ScanInclusive; + StringRef op = "add"; + // min/max scans are prefixed with s/u if they are signed/unsigned integer + // operations. The value 'None' here represents an operation where the sign + // of the operands is unimportant, such as floating-point operations, or + // integer addition. 
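+  // For example, an inclusive signed-integer min scan selects "smin" below,
+  // producing an internal builtin named along the lines of
+  // "__vecz_b_sub_group_scan_inclusive_smin_<type mangling>" (illustrative;
+  // the exact name is assembled from InternalBuiltinPrefix further down).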
+ bool opIsSignedInt = false; + + switch (Scan.Recurrence) { + default: + assert(false && "Impossible subgroup scan kind"); + return results; + case llvm::RecurKind::Add: + case llvm::RecurKind::FAdd: + op = "add"; + break; + case llvm::RecurKind::SMin: + op = "smin"; + opIsSignedInt = true; + break; + case llvm::RecurKind::UMin: + op = "umin"; + break; + case llvm::RecurKind::FMin: + op = "min"; + break; + case llvm::RecurKind::SMax: + op = "smax"; + opIsSignedInt = true; + break; + case llvm::RecurKind::UMax: + op = "umax"; + break; + case llvm::RecurKind::FMax: + op = "max"; + break; + case llvm::RecurKind::Mul: + case llvm::RecurKind::FMul: + op = "mul"; + break; + case llvm::RecurKind::And: + op = "and"; + break; + case llvm::RecurKind::Or: + op = "or"; + break; + case llvm::RecurKind::Xor: + op = "xor"; + break; + } + + // Now create the mangled builtin function name. + SmallString<128> NameSV; + raw_svector_ostream O(NameSV); + + // We don't bother with VP for fixed vectors, because it doesn't save us + // anything. + const bool VP = VL && SimdWidth.isScalable(); + + O << VectorizationContext::InternalBuiltinPrefix << "sub_group_scan_" + << (isInclusive ? "inclusive" : "exclusive") << "_" << op + << (VP ? "_vp" : "") << "_"; + + const compiler::utils::TypeQualifiers VecQuals( + compiler::utils::eTypeQualNone, opIsSignedInt + ? compiler::utils::eTypeQualSignedInt + : compiler::utils::eTypeQualNone); + if (!mangler.mangleType(O, Tys[0], VecQuals)) { + return results; + } + + // VP operations mangle the extra i32 VL operand. + if (VP) { + Ops.push_back(VL); + Tys.push_back(VL->getType()); + const compiler::utils::TypeQualifiers VLQuals( + compiler::utils::eTypeQualNone); + if (!mangler.mangleType(O, Tys[1], VLQuals)) { + return results; + } + } + + auto *VecgroupScanFnTy = FunctionType::get(Tys[0], Tys, /*isVarArg*/ false); + auto *const VecgroupFn = + Ctx.getOrCreateInternalBuiltin(NameSV, VecgroupScanFnTy); + + IRBuilder<> B(CI); + + auto *VectorScan = B.CreateCall(VecgroupFn, Ops); + + // We've currently got a scan over each vector group, but the full group scan + // is further multiplied by the group size (either the work-group size or the + // 'mux' hardware sub-group size). For example, we may have a vectorization + // factor sized group of 4 and a group size of 2. Together the full group + // size to the user is 4*2 = 8. + // In terms of invocations, we've essentially currently got: + // (invocation 0) + // (invocation 1) + // These two iterations need to be further scanned over the group + // size. We do this by adding the identity to the first invocation, the + // result of the scan over the first invocation to the second, etc. This is + // an exclusive scan over the *reduction* of the input vector: + // (invocation 0) + // (invocation 1) + // -> reduction + // (a0+a1+a2+a3) (invocation 0) + // (a4+a5+a6+a7) (invocation 1) + // -> exclusive group scan + // I (invocation 0) + // (a0+a1+a2+a3) (invocation 1) + // -> adding that to the result of the vector scan: + // (invocation 0) + // <(a0+a1+a2+a3)+a4, (a0+a1+a2+a3)+a4+a5, (invocation 1) + // (a0+a1+a2+a3)+a4+a5+a6, (a0+a1+a2+a3)+a4+a5+a6+a7> + // When viewed as a full 8-element vector, this is our final scan. + // Thus we essentially keep the original group scan, but change it to be an + // exclusive one. + auto *Reduction = Ops.front(); + Reduction = createMaybeVPReduction(B, Reduction, Scan.Recurrence, VL); + + // Now we defer to an *exclusive* scan over the group. 
+ auto ExclScan = Scan; + ExclScan.Op = compiler::utils::GroupCollective::OpKind::ScanExclusive; + + auto ExclScanID = Ctx.builtins().getMuxGroupCollective(ExclScan); + assert(ExclScanID); + + auto *const ExclScanFn = Ctx.builtins().getOrDeclareMuxBuiltin( + *ExclScanID, *F.getParent(), {CI->getType()}); + assert(ExclScanFn); + + SmallVector ExclScanOps = {Reduction}; + if (Scan.isWorkGroupScope()) { + // Forward on the current barrier ID. + ExclScanOps.insert(ExclScanOps.begin(), CI->getArgOperand(0)); + } + auto *const ExclScanCI = B.CreateCall(ExclScanFn, ExclScanOps); + + Value *const Splat = B.CreateVectorSplat(SimdWidth, ExclScanCI); + + auto *const Result = compiler::utils::createBinOpForRecurKind( + B, VectorScan, Splat, Scan.Recurrence); + + results.push_back(Result); + return results; +} + +Value *Packetizer::Impl::vectorizeInstruction(Instruction *Ins) { + if (needsInstantiation(Ctx, *Ins)) { + return nullptr; + } + + // Figure out what kind of instruction it is and try to vectorize it. + Value *Result = nullptr; + switch (Ins->getOpcode()) { + default: + break; + case Instruction::Call: + Result = vectorizeCall(cast(Ins)); + break; + case Instruction::Ret: + Result = vectorizeReturn(cast(Ins)); + break; + case Instruction::Alloca: + Result = vectorizeAlloca(cast(Ins)); + break; + case Instruction::ExtractValue: + Result = vectorizeExtractValue(cast(Ins)); + break; + } + + if (Result) { + vectorizeDI(Ins, Result); + } + return Result; +} + +ValuePacket Packetizer::Impl::packetizeLoad(LoadInst *Load) { + if (auto Op = MemOp::get(Load)) { + return packetizeMemOp(*Op); + } + return ValuePacket{}; +} + +ValuePacket Packetizer::Impl::packetizeStore(StoreInst *Store) { + if (auto Op = MemOp::get(Store)) { + return packetizeMemOp(*Op); + } + return ValuePacket{}; +} + +ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) { + ValuePacket results; + + // Determine the stride of the memory operation. + // Vectorize the pointer if there is no valid stride. + Value *ptr = op.getPointerOperand(); + assert(ptr && "Could not get pointer operand of Op"); + + auto *const dataTy = op.getDataType(); + if (!dataTy->isVectorTy() && !VectorType::isValidElementType(dataTy)) { + return results; + } + + if (auto *const vecTy = dyn_cast(dataTy)) { + const auto elts = vecTy->getNumElements(); + if (elts & (elts - 1)) { + // If the data type is a vector with number of elements not a power of 2, + // it is not safe to widen, because of alignment padding. Reject it and + // let instantiation deal with it.. + return results; + } + } + + const auto packetWidth = getPacketWidthForType(dataTy); + // Note: NOT const because LLVM 11 can't multiply a const ElementCount. + auto factor = SimdWidth.divideCoefficientBy(packetWidth); + + if (factor.isScalar()) { + // not actually widening anything here, so just instantiate it + return results; + } + + if (VL && packetWidth != 1) { + emitVeczRemarkMissed(&F, op.getInstr(), + "Can not vector-predicate packets larger than 1"); + return {}; + } + + IRBuilder<> B(op.getInstr()); + IC.deleteInstructionLater(op.getInstr()); + + const auto name = op.getInstr()->getName(); + auto *const mask = op.getMaskOperand(); + auto *const data = op.getDataOperand(); + auto *const stride = SAR.buildMemoryStride(B, ptr, dataTy); + + auto *const vecPtrTy = dyn_cast(dataTy); + + // If we're vector-predicating a vector access, scale the vector length up by + // the original number of vector elements. + // Adjust the MemOp so that it is VL-predicated, if we must. 
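+  // E.g. a vector length of 3 work-items, each accessing a <4 x float>,
+  // covers 3 * 4 = 12 effective scalar lanes (illustrative).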
+  Value *EVL = VL;
+  if (vecPtrTy && VL) {
+    EVL = B.CreateMul(VL, B.getInt32(vecPtrTy->getNumElements()));
+  }
+
+  auto *const constantStrideVal = dyn_cast_or_null<ConstantInt>(stride);
+  const int constantStride =
+      constantStrideVal ? constantStrideVal->getSExtValue() : 0;
+  const bool validStride =
+      stride && (!constantStrideVal || constantStride != 0);
+  if (!validStride) {
+    if (dataTy->isPointerTy()) {
+      // We do not have vector-of-pointers support in Vecz builtins, hence
+      // instantiate instead of packetize.
+      return results;
+    }
+
+    const bool scalable = SimdWidth.isScalable();
+    if (!mask && dataTy->isVectorTy() && !scalable) {
+      // Unmasked scatter/gathers are better off instantiated..
+      return results;
+    }
+
+    // Assume that individual masked loads/stores are more efficient when the
+    // type does not fit into a native integer. Since instantiation is never an
+    // option for scalable vectors, we do not consider this option.
+    if (vecPtrTy && !scalable &&
+        !Ctx.dataLayout()->fitsInLegalInteger(
+            dataTy->getPrimitiveSizeInBits())) {
+      return results;
+    }
+
+    auto ptrPacket = packetizeAndGet(ptr, packetWidth);
+    PACK_FAIL_IF(ptrPacket.empty());
+
+    auto *const scalarTy = dataTy->getScalarType();
+    auto *const ptrTy = cast<PointerType>(ptr->getType()->getScalarType());
+
+    // When scattering/gathering with a vector type, we can cast it to a
+    // vector of pointers to the scalar type and widen it into a vector
+    // of pointers to all the individual elements, and then gather/scatter
+    // using that.
+    if (vecPtrTy && scalable) {
+      // Scalable requires special codegen that avoids shuffles, but the idea
+      // is the same.
+      // We only handle the one packet right now.
+      PACK_FAIL_IF(ptrPacket.size() != 1);
+      const auto scalarWidth = vecPtrTy->getNumElements();
+      Value *&vecPtr = ptrPacket.front();
+      const ElementCount wideEC = factor * scalarWidth;
+      // Sub-splat the pointers such that we get, e.g.:
+      // <A,B> -> x4 -> <A,A,A,A,B,B,B,B>
+      const bool success =
+          createSubSplats(Ctx.targetInfo(), B, ptrPacket, scalarWidth);
+      PACK_FAIL_IF(!success);
+      auto *const newPtrTy = llvm::VectorType::get(ptrTy, wideEC);
+      // Bitcast the above sub-splat to purely scalar pointers
+      vecPtr = B.CreateBitCast(vecPtr, newPtrTy);
+      // Create an index sequence to start the offsetting process
+      Value *idxVector = createIndexSequence(
+          B, VectorType::get(B.getInt32Ty(), wideEC), "index.vec");
+      PACK_FAIL_IF(!idxVector);
+      // Modulo the indices 0,1,2,.. 
with the original vector width, producing,
+      // e.g., for the above: <0,1,2,3,0,1,2,3>
+      auto *const subVecEltsSplat =
+          B.CreateVectorSplat(wideEC, B.getInt32(scalarWidth));
+      idxVector = B.CreateURem(idxVector, subVecEltsSplat);
+      // Index into the pointer vector with the offsets, e.g.,:
+      // <A,A+1,A+2,A+3,B,B+1,B+2,B+3>
+      vecPtr = B.CreateInBoundsGEP(scalarTy, vecPtr, idxVector);
+    } else if (vecPtrTy && !scalable) {
+      const auto simdWidth = factor.getFixedValue();
+      const auto scalarWidth = vecPtrTy->getNumElements();
+
+      // Build shuffle mask to widen the pointer
+      SmallVector<Constant *, 16> indices;
+      SmallVector<int, 16> widenMask;
+      for (size_t i = 0; i < simdWidth; ++i) {
+        for (size_t j = 0; j < scalarWidth; ++j) {
+          widenMask.push_back(i);
+          indices.push_back(B.getInt32(j));
+        }
+      }
+
+      auto *const newPtrTy = FixedVectorType::get(ptrTy, simdWidth);
+
+      auto *const idxVector = ConstantVector::get(indices);
+      auto *const poison = PoisonValue::get(newPtrTy);
+      for (auto &vecPtr : ptrPacket) {
+        vecPtr = B.CreateBitCast(vecPtr, newPtrTy);
+        vecPtr = B.CreateShuffleVector(vecPtr, poison, widenMask);
+        vecPtr = B.CreateInBoundsGEP(scalarTy, vecPtr, idxVector);
+      }
+    }
+
+    ValuePacket dataPacket;
+    if (data) {
+      auto src = packetize(data);
+      PACK_FAIL_IF(!src);
+      src.getPacketValues(packetWidth, dataPacket);
+      PACK_FAIL_IF(dataPacket.empty());
+    } else {
+      dataPacket.resize(packetWidth, nullptr);
+    }
+
+    // Vector-predicated scatters/gathers are always masked.
+    ValuePacket maskPacket(packetWidth, nullptr);
+    auto *const packetVecTy = getWideType(dataTy, factor);
+    if (mask || EVL) {
+      if (!mask) {
+        // If there's no mask then just splat a trivial one.
+        auto *const trueMask = createAllTrueMask(
+            B, multi_llvm::getVectorElementCount(packetVecTy));
+        std::fill(maskPacket.begin(), maskPacket.end(), trueMask);
+      } else {
+        maskPacket = packetizeAndGet(mask, packetWidth);
+        PACK_FAIL_IF(maskPacket.empty());
+      }
+    }
+
+    // Gather load or scatter store.
+    for (unsigned i = 0; i != packetWidth; ++i) {
+      if (op.isLoad()) {
+        auto *gather =
+            createGather(Ctx, packetVecTy, ptrPacket[i], maskPacket[i], EVL,
+                         op.getAlignment(), name);
+        PACK_FAIL_IF(!gather);
+        gather->insertBefore(op.getInstr()->getIterator());
+        results.push_back(gather);
+      } else {
+        auto *scatter =
+            createScatter(Ctx, dataPacket[i], ptrPacket[i], maskPacket[i], EVL,
+                          op.getAlignment(), name);
+        PACK_FAIL_IF(!scatter);
+        scatter->insertBefore(op.getInstr()->getIterator());
+        results.push_back(scatter);
+      }
+    }
+  } else if (!constantStrideVal || constantStride != 1) {
+    if (dataTy->isPointerTy() || dataTy->isVectorTy()) {
+      // No builtins for memops on pointer types, and we can't do interleaved
+      // memops over vector types.
+      return results;
+    }
+
+    ValuePacket dataPacket;
+    if (data) {
+      auto src = packetize(data);
+      PACK_FAIL_IF(!src);
+      src.getPacketValues(packetWidth, dataPacket);
+      PACK_FAIL_IF(dataPacket.empty());
+    } else {
+      dataPacket.resize(packetWidth, nullptr);
+    }
+
+    Value *packetStride = nullptr;
+    if (packetWidth != 1) {
+      // Make sure the stride is at least as wide as a GEP index needs to be
+      const unsigned indexBits = Ctx.dataLayout()->getIndexSizeInBits(
+          ptr->getType()->getPointerAddressSpace());
+      unsigned strideBits = stride->getType()->getPrimitiveSizeInBits();
+      auto *const elementStride =
+          (indexBits > strideBits)
+              ? 
B.CreateSExt(stride, B.getIntNTy((strideBits = indexBits))) + : stride; + + const auto simdWidth = factor.getFixedValue(); + packetStride = + B.CreateMul(elementStride, B.getIntN(strideBits, simdWidth), + Twine(name, ".packet_stride")); + } + + // Vector-predicated interleaved operations are always masked. + ValuePacket maskPacket(packetWidth, nullptr); + auto *const packetVecTy = getWideType(dataTy, factor); + if (mask || EVL) { + if (!mask) { + // If there's no mask then just splat a trivial one. + auto *const trueMask = createAllTrueMask( + B, multi_llvm::getVectorElementCount(packetVecTy)); + std::fill(maskPacket.begin(), maskPacket.end(), trueMask); + } else { + maskPacket = packetizeAndGet(mask, packetWidth); + PACK_FAIL_IF(maskPacket.empty()); + } + } + + // Interleaved (strided) load or store. + for (unsigned i = 0; i != packetWidth; ++i) { + if (i != 0) { + ptr = B.CreateInBoundsGEP(dataTy, ptr, packetStride, + Twine(name, ".incr")); + } + if (op.isLoad()) { + auto *newLoad = + createInterleavedLoad(Ctx, packetVecTy, ptr, stride, maskPacket[i], + EVL, op.getAlignment(), name); + newLoad->insertBefore(op.getInstr()->getIterator()); + results.push_back(newLoad); + } else { + auto *newStore = + createInterleavedStore(Ctx, dataPacket[i], ptr, stride, + maskPacket[i], EVL, op.getAlignment(), name); + newStore->insertBefore(op.getInstr()->getIterator()); + results.push_back(newStore); + } + } + } else { + ValuePacket dataPacket; + if (data) { + auto src = packetize(data); + PACK_FAIL_IF(!src); + src.getPacketValues(packetWidth, dataPacket); + PACK_FAIL_IF(dataPacket.empty()); + } else if (mask) { + // don't need the data packet for unmasked stores + dataPacket.resize(packetWidth, nullptr); + } + + Value *packetStride = nullptr; + if (packetWidth != 1) { + const auto simdWidth = factor.getFixedValue(); + packetStride = B.getInt64(simdWidth); + } + + // Calculate the alignment. The MemOp's alignment is the original + // alignment, but may be overaligned. After vectorization it can't be + // larger than the pointee element type. + unsigned alignment = op.getAlignment(); + const unsigned sizeInBits = + dataTy->getPrimitiveSizeInBits().getKnownMinValue(); + alignment = std::min(alignment, std::max(sizeInBits, 8u) / 8u); + + // Regular load or store. + if (mask) { + const bool isVectorMask = mask->getType()->isVectorTy(); + auto maskPacket = packetizeAndGet(mask, packetWidth); + PACK_FAIL_IF(maskPacket.empty()); + + // If the original instruction was a vector but the mask was a scalar i1, + // we have to broadcast the mask elements across the data vector. + auto *const vecTy = dyn_cast(dataTy); + if (vecTy && !isVectorMask) { + PACK_FAIL_IF(factor.isScalable()); + const unsigned simdWidth = factor.getFixedValue(); + const unsigned scalarWidth = vecTy->getNumElements(); + + // Build shuffle mask to widen the vector condition. 
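+        // E.g. VF=4 over <2 x T> data: widenMask = 0,0,1,1,2,2,3,3, repeating
+        // each scalar condition bit across its sub-vector's lanes
+        // (illustrative).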
+ SmallVector widenMask; + for (size_t i = 0; i < simdWidth; ++i) { + for (size_t j = 0; j < scalarWidth; ++j) { + widenMask.push_back(i); + } + } + + auto *const poison = PoisonValue::get(maskPacket.front()->getType()); + for (auto &vecMask : maskPacket) { + vecMask = createOptimalShuffle(B, vecMask, poison, widenMask); + } + } + + for (unsigned i = 0; i != packetWidth; ++i) { + if (i != 0) { + ptr = B.CreateInBoundsGEP(dataTy, ptr, packetStride, + Twine(name, ".incr")); + } + if (op.isLoad()) { + auto *newLoad = + createMaskedLoad(Ctx, getWideType(dataTy, factor), ptr, + maskPacket[i], EVL, op.getAlignment(), name); + newLoad->insertBefore(op.getInstr()->getIterator()); + results.push_back(newLoad); + } else { + auto *newStore = + createMaskedStore(Ctx, dataPacket[i], ptr, maskPacket[i], EVL, + op.getAlignment(), name); + newStore->insertBefore(op.getInstr()->getIterator()); + results.push_back(newStore); + } + } + } else { + const TargetInfo &VTI = Ctx.targetInfo(); + if (op.isLoad()) { + auto *const one = B.getInt64(1); + for (unsigned i = 0; i != packetWidth; ++i) { + if (i != 0) { + ptr = B.CreateInBoundsGEP(dataTy, ptr, packetStride, + Twine(name, ".incr")); + } + results.push_back(VTI.createLoad(B, getWideType(dataTy, factor), ptr, + one, alignment, EVL)); + } + } else { + auto *const one = B.getInt64(1); + for (unsigned i = 0; i != packetWidth; ++i) { + if (i != 0) { + ptr = B.CreateInBoundsGEP(dataTy, ptr, packetStride, + Twine(name, ".incr")); + } + results.push_back( + VTI.createStore(B, dataPacket[i], ptr, one, alignment, EVL)); + } + } + } + } + + // Transfer attributes from an old call instruction to a new one. + if (CallInst *oldCI = op.getCall()) { + for (auto *r : results) { + if (CallInst *newCI = dyn_cast_or_null(r)) { + newCI->setCallingConv(oldCI->getCallingConv()); + } + } + } + return results; +} + +ValuePacket Packetizer::Impl::packetizeMaskedAtomic( + CallInst &CI, VectorizationContext::MaskedAtomic AtomicInfo) { + ValuePacket results; + + const bool IsCmpXchg = AtomicInfo.isCmpXchg(); + + Value *const ptrArg = CI.getArgOperand(0); + Value *const valOrCmpArg = CI.getArgOperand(1); + Value *const maskArg = CI.getArgOperand(2 + IsCmpXchg); + + assert(AtomicInfo.ValTy == valOrCmpArg->getType() && "AtomicInfo mismatch"); + const auto packetWidth = getPacketWidthForType(valOrCmpArg->getType()); + + if (VL && packetWidth != 1) { + emitVeczRemarkMissed(&F, &CI, + "Can not vector-predicate packets larger than 1"); + return {}; + } + + ValuePacket valOrCmpPacket; + const Result valResult = packetize(valOrCmpArg); + PACK_FAIL_IF(!valResult); + valResult.getPacketValues(packetWidth, valOrCmpPacket); + PACK_FAIL_IF(valOrCmpPacket.empty()); + + ValuePacket newValPacket; + if (IsCmpXchg) { + Value *const newValArg = CI.getArgOperand(2); + const Result newValResult = packetize(newValArg); + PACK_FAIL_IF(!newValResult); + newValResult.getPacketValues(packetWidth, newValPacket); + PACK_FAIL_IF(newValPacket.empty()); + } + + ValuePacket ptrPacket; + const Result ptrResult = packetize(ptrArg); + PACK_FAIL_IF(!ptrResult); + ptrResult.getPacketValues(packetWidth, ptrPacket); + PACK_FAIL_IF(ptrPacket.empty()); + + ValuePacket maskPacket; + const Result maskResult = packetize(maskArg); + PACK_FAIL_IF(!maskResult); + maskResult.getPacketValues(packetWidth, maskPacket); + PACK_FAIL_IF(maskPacket.empty()); + + IRBuilder<> B(&CI); + IC.deleteInstructionLater(&CI); + + for (unsigned i = 0; i != packetWidth; ++i) { + auto *const ptr = ptrPacket[i]; + auto *const valOrCmp = valOrCmpPacket[i]; 
+ + AtomicInfo.ValTy = valOrCmp->getType(); + AtomicInfo.PointerTy = ptr->getType(); + auto *maskedAtomicF = + Ctx.getOrCreateMaskedAtomicFunction(AtomicInfo, Choices, SimdWidth); + PACK_FAIL_IF(!maskedAtomicF); + + SmallVector args = {ptr, valOrCmp}; + if (IsCmpXchg) { + args.push_back(newValPacket[i]); + } + args.push_back(maskPacket[i]); + if (AtomicInfo.IsVectorPredicated) { + assert(VL && "Missing vector length"); + args.push_back(VL); + } + + results.push_back(B.CreateCall(maskedAtomicF, args)); + } + + return results; +} + +void Packetizer::Impl::vectorizeDI(Instruction *, Value *) { + // FIXME: Reinstate support for vectorizing debug info + return; +} + +ValuePacket Packetizer::Impl::packetizeGEP(GetElementPtrInst *GEP) { + ValuePacket results; + Value *pointer = GEP->getPointerOperand(); + if (isa(pointer)) { + return results; + } + + if (isa(GEP->getType())) { + // instantiate vector GEPs, for safety + return results; + } + + // Work out the packet width from the pointed to type, rather than the + // pointer type itself, because this is the width the memops will be using. + auto *const ty = GEP->getSourceElementType(); + const auto packetWidth = getPacketWidthForType(ty); + + // It is legal to create a GEP with a mixture of scalar and vector operands. + // If any operand is a vector, the result will be a vector of pointers. + ValuePacket pointerPacket; + if (UVR.isVarying(pointer)) { + auto res = packetize(pointer); + PACK_FAIL_IF(!res); + res.getPacketValues(packetWidth, pointerPacket); + PACK_FAIL_IF(pointerPacket.empty()); + } else { + for (unsigned i = 0; i != packetWidth; ++i) { + pointerPacket.push_back(pointer); + } + } + + // Packetize the GEP indices. + SmallVector, 4> opPackets; + for (unsigned i = 0, n = GEP->getNumIndices(); i != n; i++) { + Value *idx = GEP->getOperand(i + 1); + opPackets.emplace_back(); + + // Handle constant indices + if (isa(idx)) { + for (unsigned j = 0; j < packetWidth; ++j) { + opPackets.back().push_back(idx); + } + } else { + auto op = packetize(idx); + PACK_FAIL_IF(!op); + op.getPacketValues(packetWidth, opPackets.back()); + PACK_FAIL_IF(opPackets.back().empty()); + } + } + + IRBuilder<> B(GEP); + IC.deleteInstructionLater(GEP); + + const bool inBounds = GEP->isInBounds(); + const auto name = GEP->getName(); + + const auto numIndices = opPackets.size(); + SmallVector opVals; + opVals.resize(numIndices); + for (unsigned i = 0; i < packetWidth; ++i) { + for (unsigned j = 0; j < numIndices; ++j) { + opVals[j] = opPackets[j][i]; + } + + if (inBounds) { + results.push_back( + B.CreateInBoundsGEP(ty, pointerPacket[i], opVals, name)); + } else { + results.push_back(B.CreateGEP(ty, pointerPacket[i], opVals, name)); + } + } + return results; +} + +ValuePacket Packetizer::Impl::packetizeBinaryOp(BinaryOperator *BinOp) { + ValuePacket results; + auto packetWidth = getPacketWidthForType(BinOp->getType()); + + auto LHS = packetizeAndGet(BinOp->getOperand(0), packetWidth); + auto RHS = packetizeAndGet(BinOp->getOperand(1), packetWidth); + PACK_FAIL_IF(LHS.empty() || RHS.empty()); + + auto opcode = BinOp->getOpcode(); + auto name = BinOp->getName(); + IRBuilder<> B(BinOp); + if (VL) { + auto *const VecTy = LHS[0]->getType(); + // Support for VP legalization is still lacking so fall back to non-VP + // operations in other cases. This support will improve over time. 
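+    // E.g. when legal, an i32 'add' at <vscale x 4 x i32> becomes a call to
+    // llvm.vp.add.nxv4i32(lhs, rhs, <all-true mask>, EVL) (illustrative).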
+ if (Ctx.targetInfo().isVPVectorLegal(F, VecTy)) { + PACK_FAIL_IF(packetWidth != 1); + auto VPId = VPIntrinsic::getForOpcode(opcode); + PACK_FAIL_IF(VPId == Intrinsic::not_intrinsic); + auto *const Mask = createAllTrueMask( + B, multi_llvm::getVectorElementCount(LHS[0]->getType())); + // Scale the base length by the number of vector elements, where + // appropriate. + Value *EVL = VL; + if (auto *const VecTy = dyn_cast(BinOp->getType())) { + EVL = B.CreateMul( + EVL, + B.getInt32( + multi_llvm::getVectorElementCount(VecTy).getKnownMinValue())); + } + auto *const NewBinOp = B.CreateIntrinsic(VPId, {LHS[0]->getType()}, + {LHS[0], RHS[0], Mask, EVL}); + NewBinOp->copyIRFlags(BinOp, true); + NewBinOp->copyMetadata(*BinOp); + results.push_back(NewBinOp); + return results; + } + // If we haven't matched [us]div or [us]rem then we may be executing + // out-of-bounds elements if we don't predicate. Since this isn't safe, + // bail. + PACK_FAIL_IF( + opcode == BinaryOperator::UDiv || opcode == BinaryOperator::SDiv || + opcode == BinaryOperator::URem || opcode == BinaryOperator::SRem); + } + for (unsigned i = 0; i < packetWidth; ++i) { + auto *const NewV = B.CreateBinOp(opcode, LHS[i], RHS[i], name); + if (auto *const NewBinOp = dyn_cast(NewV)) { + NewBinOp->copyIRFlags(BinOp, true); + NewBinOp->copyMetadata(*BinOp); + } + results.push_back(NewV); + } + return results; +} + +ValuePacket Packetizer::Impl::packetizeFreeze(FreezeInst *FreezeI) { + ValuePacket results; + auto resC = packetize(FreezeI->getOperand(0)); + PACK_FAIL_IF(!resC); + + SmallVector src; + resC.getPacketValues(src); + PACK_FAIL_IF(src.empty()); + + const auto packetWidth = src.size(); + const auto name = FreezeI->getName(); + + IRBuilder<> B(FreezeI); + for (unsigned i = 0; i < packetWidth; ++i) { + results.push_back(B.CreateFreeze(src[i], name)); + } + return results; +} + +ValuePacket +Packetizer::Impl::packetizeAtomicCmpXchg(AtomicCmpXchgInst *AtomicI) { + ValuePacket results; + + VectorizationContext::MaskedAtomic MA; + MA.VF = SimdWidth; + MA.IsVectorPredicated = VU.choices().vectorPredication(); + + MA.Align = AtomicI->getAlign(); + MA.BinOp = AtomicRMWInst::BAD_BINOP; + MA.IsWeak = AtomicI->isWeak(); + MA.IsVolatile = AtomicI->isVolatile(); + MA.Ordering = AtomicI->getSuccessOrdering(); + MA.CmpXchgFailureOrdering = AtomicI->getFailureOrdering(); + MA.SyncScope = AtomicI->getSyncScopeID(); + + IRBuilder<> B(AtomicI); + + // Set up the arguments to this function + Value *Ptr = packetize(AtomicI->getPointerOperand()).getAsValue(); + Value *Cmp = packetize(AtomicI->getCompareOperand()).getAsValue(); + Value *New = packetize(AtomicI->getNewValOperand()).getAsValue(); + + MA.ValTy = Cmp->getType(); + MA.PointerTy = Ptr->getType(); + + auto *const TrueMask = createAllTrueMask(B, SimdWidth); + SmallVector MaskedFnArgs = {Ptr, Cmp, New, TrueMask}; + if (VL) { + MaskedFnArgs.push_back(VL); + } + + Function *MaskedAtomicFn = + Ctx.getOrCreateMaskedAtomicFunction(MA, VU.choices(), SimdWidth); + PACK_FAIL_IF(!MaskedAtomicFn); + + CallInst *MaskedCI = B.CreateCall(MaskedAtomicFn, MaskedFnArgs); + + results.push_back(MaskedCI); + + return results; +} + +ValuePacket Packetizer::Impl::packetizeUnaryOp(UnaryOperator *UnOp) { + ValuePacket results; + + auto opcode = UnOp->getOpcode(); + + auto packetWidth = getPacketWidthForType(UnOp->getType()); + auto src = packetizeAndGet(UnOp->getOperand(0), packetWidth); + PACK_FAIL_IF(src.empty()); + + auto name = UnOp->getName(); + IRBuilder<> B(UnOp); + for (unsigned i = 0; i < packetWidth; ++i) { + 
Value *New = B.CreateUnOp(opcode, src[i], name); + auto *NewUnOp = cast(New); + NewUnOp->copyIRFlags(UnOp, true); + results.push_back(NewUnOp); + } + return results; +} + +ValuePacket Packetizer::Impl::packetizeCast(CastInst *CastI) { + ValuePacket results; + + auto *const ty = CastI->getType(); + auto packetWidth = std::max(getPacketWidthForType(ty), + getPacketWidthForType(CastI->getSrcTy())); + + auto src = packetizeAndGet(CastI->getOperand(0), packetWidth); + PACK_FAIL_IF(src.empty()); + + auto *const wideTy = + getWideType(ty, SimdWidth.divideCoefficientBy(packetWidth)); + auto name = CastI->getName(); + IRBuilder<> B(CastI); + for (unsigned i = 0; i < packetWidth; ++i) { + results.push_back(B.CreateCast(CastI->getOpcode(), src[i], wideTy, name)); + } + return results; +} + +ValuePacket Packetizer::Impl::packetizeICmp(ICmpInst *Cmp) { + ValuePacket results; + auto packetWidth = getPacketWidthForType(Cmp->getOperand(0)->getType()); + + auto LHS = packetizeAndGet(Cmp->getOperand(0), packetWidth); + auto RHS = packetizeAndGet(Cmp->getOperand(1), packetWidth); + PACK_FAIL_IF(LHS.empty() || RHS.empty()); + + auto pred = Cmp->getPredicate(); + auto name = Cmp->getName(); + IRBuilder<> B(Cmp); + for (unsigned i = 0; i < packetWidth; ++i) { + auto *const NewICmp = B.CreateICmp(pred, LHS[i], RHS[i], name); + if (isa(NewICmp)) { + cast(NewICmp)->copyIRFlags(Cmp, true); + } + results.push_back(NewICmp); + } + return results; +} + +ValuePacket Packetizer::Impl::packetizeFCmp(FCmpInst *Cmp) { + ValuePacket results; + auto packetWidth = getPacketWidthForType(Cmp->getOperand(0)->getType()); + + auto LHS = packetizeAndGet(Cmp->getOperand(0), packetWidth); + auto RHS = packetizeAndGet(Cmp->getOperand(1), packetWidth); + PACK_FAIL_IF(LHS.empty() || RHS.empty()); + + auto pred = Cmp->getPredicate(); + auto name = Cmp->getName(); + IRBuilder<> B(Cmp); + for (unsigned i = 0; i < packetWidth; ++i) { + auto *NewICmp = cast(B.CreateFCmp(pred, LHS[i], RHS[i], name)); + NewICmp->copyIRFlags(Cmp, true); + results.push_back(NewICmp); + } + return results; +} + +ValuePacket Packetizer::Impl::packetizeSelect(SelectInst *Select) { + ValuePacket results; + auto *const ty = Select->getType(); + if (!ty->isVectorTy() && !VectorType::isValidElementType(ty)) { + // Selects can work on struct/aggregate types, but we can't widen them.. + return results; + } + + auto packetWidth = getPacketWidthForType(ty); + auto vecT = packetizeAndGet(Select->getOperand(1), packetWidth); + auto vecF = packetizeAndGet(Select->getOperand(2), packetWidth); + PACK_FAIL_IF(vecT.empty() || vecF.empty()); + + auto *cond = Select->getOperand(0); + auto resC = packetize(cond); + PACK_FAIL_IF(!resC); + + IRBuilder<> B(Select); + const bool isVectorSelect = cond->getType()->isVectorTy(); + SmallVector vecC; + if (UVR.isVarying(cond)) { + resC.getPacketValues(packetWidth, vecC); + PACK_FAIL_IF(vecC.empty()); + + // If the original select returns a vector, but the condition was scalar, + // and its packet members are widened, we have to sub-broadcast it across + // the lanes of the original vector. 
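+    // E.g. a select returning <2 x float> at VF=4: a packetized scalar
+    // condition <c0,c1,c2,c3> must become <c0,c0,c1,c1,c2,c2,c3,c3> to line
+    // up with the widened data (illustrative).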
+ if (!isVectorSelect && vecC.front()->getType()->isVectorTy()) { + if (auto *vecTy = dyn_cast(Select->getType())) { + PACK_FAIL_IF(!createSubSplats(Ctx.targetInfo(), B, vecC, + vecTy->getNumElements())); + } + } + } else if (isVectorSelect) { + // If the condition is a uniform vector, get its broadcast packets + resC.getPacketValues(packetWidth, vecC); + PACK_FAIL_IF(vecC.empty()); + } else { + // If the condition is a uniform scalar, we can just use it as is + vecC.assign(packetWidth, cond); + } + + auto name = Select->getName(); + for (unsigned i = 0; i < packetWidth; ++i) { + results.push_back(B.CreateSelect(vecC[i], vecT[i], vecF[i], name)); + } + return results; +} + +Value *Packetizer::Impl::vectorizeReturn(ReturnInst *Return) { + IRBuilder<> B(Return); + Value *Op = packetize(Return->getOperand(0)).getAsValue(); + VECZ_FAIL_IF(!Op); + IC.deleteInstructionLater(Return); + return B.CreateRet(Op); +} + +Value *Packetizer::Impl::vectorizeCall(CallInst *CI) { + Function *Callee = CI->getCalledFunction(); + VECZ_STAT_FAIL_IF(!Callee, VeczPacketizeFailCall); + + IRBuilder<> B(CI); + // Handle LLVM intrinsics. + if (Callee->isIntrinsic()) { + Value *Result = nullptr; + auto IntrID = Intrinsic::ID(Callee->getIntrinsicID()); + if (IntrID == Intrinsic::fmuladd || IntrID == Intrinsic::fma) { + SmallVector Ops; + SmallVector Tys; + for (unsigned i = 0; i < 3; ++i) { + Value *P = packetize(CI->getOperand(i)).getAsValue(); + VECZ_FAIL_IF(!P); + Ops.push_back(P); + } + Tys.push_back(getWideType(CI->getType(), SimdWidth)); + Result = B.CreateIntrinsic(IntrID, Tys, Ops, CI, CI->getName()); + } + + if (Result) { + IC.deleteInstructionLater(CI); + return Result; + } + } + + // Handle internal builtins. + if (Ctx.isInternalBuiltin(Callee)) { + // These should have been handled by packetizeCall, if not, off to the + // instantiator they go... + if (auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked)) { + if (MaskedOp->isMaskedMemOp()) { + return nullptr; + } + } + } + + if (VectorizationContext::isVector(*CI)) { + return nullptr; + } + + // Handle external builtins. + const compiler::utils::BuiltinInfo &BI = Ctx.builtins(); + if (const auto Builtin = BI.analyzeBuiltinCall(*CI, Dimension)) { + if (Builtin->properties & compiler::utils::eBuiltinPropertyExecutionFlow) { + return nullptr; + } + if (Builtin->properties & compiler::utils::eBuiltinPropertyWorkItem) { + return vectorizeWorkGroupCall(CI, *Builtin); + } + } + + // Try to find a unit for this builtin. + auto CalleeVec = Ctx.getVectorizedFunction(*Callee, SimdWidth); + if (!CalleeVec) { + // No vectorization strategy found. Fall back on Instantiation. + return nullptr; + } + IC.deleteInstructionLater(CI); + + // Vectorize call operands. + unsigned i = 0; + AllocaInst *PointerRetAlloca = nullptr; + Value *PointerRetAddr = nullptr; + int PointerRetStride = 0; + SmallVector Ops; + for (const auto &TargetArg : CalleeVec.args) { + // Handle scalar arguments. + Value *ScalarOp = CI->getArgOperand(i); + Type *ScalarTy = ScalarOp->getType(); + if (TargetArg.kind == VectorizationResult::Arg::POINTER_RETURN) { + // 'Pointer return' arguments that are not sequential need to be handled + // specially. 
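+      // E.g. a builtin such as fract(x, float *out): the vectorized callee
+      // expects a pointer to <VF x float>, so a strided destination is given
+      // a temporary alloca whose contents are written back after the call
+      // with an interleaved store (illustrative).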
+      auto *const PtrTy = dyn_cast<PointerType>(ScalarOp->getType());
+      auto *const PtrEleTy = TargetArg.pointerRetPointeeTy;
+      Value *Stride = SAR.buildMemoryStride(B, ScalarOp, PtrEleTy);
+      VECZ_STAT_FAIL_IF(!Stride, VeczPacketizeFailStride);
+      bool hasConstantStride = false;
+      int64_t ConstantStride = 0;
+      if (ConstantInt *CInt = dyn_cast<ConstantInt>(Stride)) {
+        ConstantStride = CInt->getSExtValue();
+        hasConstantStride = true;
+      }
+      VECZ_STAT_FAIL_IF(!hasConstantStride || ConstantStride < 1,
+                        VeczPacketizeFailStride);
+      if (ConstantStride == 1) {
+        Ops.push_back(B.CreateBitCast(ScalarOp, TargetArg.type));
+        i++;
+        continue;
+      }
+      // Create an alloca in the function's entry block. The alloca will be
+      // passed instead of the original pointer. After the function call, the
+      // value from the alloca will be loaded sequentially and stored to the
+      // original address using an interleaved store.
+      VECZ_STAT_FAIL_IF(!PtrTy || PointerRetAddr, VeczPacketizeFailPtr);
+      BasicBlock *BB = CI->getParent();
+      VECZ_FAIL_IF(!BB);
+      Function *F = BB->getParent();
+      VECZ_FAIL_IF(!F);
+      BasicBlock &EntryBB = F->getEntryBlock();
+      B.SetInsertPoint(&*EntryBB.getFirstInsertionPt());
+      Type *AllocaTy = getWideType(PtrEleTy, SimdWidth);
+      PointerRetAlloca = B.CreateAlloca(AllocaTy, nullptr, "ptr_ret_temp");
+      Value *NewOp = B.CreateAddrSpaceCast(PointerRetAlloca, PtrTy);
+      PointerRetAddr = ScalarOp;
+      PointerRetStride = ConstantStride;
+      Ops.push_back(NewOp);
+      i++;
+      continue;
+    } else if (TargetArg.kind != VectorizationResult::Arg::VECTORIZED) {
+      Ops.push_back(ScalarOp);
+      i++;
+      continue;
+    }
+
+    // Make sure the type is correct for vector arguments.
+    auto VectorTy = dyn_cast<VectorType>(TargetArg.type);
+    VECZ_STAT_FAIL_IF(!VectorTy || VectorTy->getElementType() != ScalarTy,
+                      VeczPacketizeFailType);
+
+    // Vectorize scalar operands.
+    Value *VecOp = packetize(ScalarOp).getAsValue();
+    VECZ_FAIL_IF(!VecOp);
+    Ops.push_back(VecOp);
+    i++;
+  }
+
+  CallInst *NewCI = B.CreateCall(CalleeVec.get(), Ops, CI->getName());
+  NewCI->setCallingConv(CI->getCallingConv());
+  if (PointerRetAddr) {
+    // Load the 'pointer return' value from the alloca and store it to the
+    // original address using an interleaved store.
+    LoadInst *PointerRetResult =
+        B.CreateLoad(PointerRetAlloca->getAllocatedType(), PointerRetAlloca);
+    Value *Stride = getSizeInt(B, PointerRetStride);
+    auto *Store =
+        createInterleavedStore(Ctx, PointerRetResult, PointerRetAddr, Stride,
+                               /*Mask*/ nullptr, /*EVL*/ nullptr,
+                               PointerRetAlloca->getAlign().value());
+    if (!Store) {
+      return nullptr;
+    }
+    Store->insertBefore(B.GetInsertPoint());
+  }
+  return NewCI;
+}
+
+Value *Packetizer::Impl::vectorizeWorkGroupCall(
+    CallInst *CI, const compiler::utils::BuiltinCall &Builtin) {
+  // Insert instructions after the call to the builtin, since they reference
+  // the result of that call.
+  IRBuilder<> B(buildAfter(CI, F));
+
+  // Do not vectorize ranks other than the vectorization dimension; the value
+  // of get_global_id with the other ranks is uniform.
+
+  Value *IDToSplat = CI;
+  // Multiply the sub-group local ID by the vectorization factor, to vectorize
+  // across the entire sub-group size.
+  // For example, with a vector width of 4 and a mux sub-group size of 2, the
+  // apparent sub-group size is 8 and the sub-group IDs are:
+  //     | mux sub group 0 | mux sub group 1 |
+  //     |-----------------|-----------------|
+  //     |  0   1   2   3  |  4   5   6   7  |
+  if (Builtin.ID == compiler::utils::eMuxBuiltinGetSubGroupLocalId) {
+    IDToSplat = B.CreateMul(
+        IDToSplat, B.CreateElementCount(IDToSplat->getType(), SimdWidth));
+  }
+
+  // Broadcast the builtin's return value.
+  Value *Splat = B.CreateVectorSplat(SimdWidth, IDToSplat);
+
+  // Add an index sequence [0, 1, 2, ...] to the value unless uniform.
+  const auto Uniformity = Builtin.uniformity;
+  if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID ||
+      Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
+    Value *StepVector =
+        createIndexSequence(B, cast<VectorType>(Splat->getType()), "index.vec");
+    VECZ_FAIL_IF(!StepVector);
+
+    Value *Result = B.CreateAdd(Splat, StepVector);
+
+    if (Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
+      Value *Rank = CI->getArgOperand(0);
+
+      // If the Rank is varying, we need to packetize it as well!
+      if (UVR.isVarying(Rank)) {
+        Rank = packetize(Rank).getAsValue();
+        VECZ_FAIL_IF(!Rank);
+      }
+      Value *dim = ConstantInt::get(Rank->getType(), Dimension);
+      Value *Test = B.CreateICmpEQ(Rank, dim);
+      Result = B.CreateSelect(Test, Result, Splat, "maybe_rank");
+    }
+    return Result;
+  } else if (Uniformity == compiler::utils::eBuiltinUniformityNever) {
+    VECZ_FAIL();
+  }
+
+  return Splat;
+}
+
+Value *Packetizer::Impl::vectorizeAlloca(AllocaInst *alloca) {
+  // We create an array allocation here, because the resulting value needs to
+  // represent a vector of pointers, not a pointer to vector. As such, it's a
+  // bit of a trick to handle scalable vectorization factors, since that would
+  // require creating instructions *before* the alloca to get the array
+  // length, which could be a surprise to some of our later passes that expect
+  // allocas to be grouped at the top of the first Basic Block. This is not an
+  // LLVM requirement, however, so it should be investigated.
+  //
+  // Note that normally, an alloca would not be packetized anyway: access is
+  // contiguous, Load and Store operations don't need to packetize their
+  // pointer operand, and the alloca would be widened after packetization,
+  // which has no trouble with scalables. This function is required for the
+  // case that some pointer-dependent instruction unexpectedly fails to
+  // packetize and falls back to instantiation, in which case we need a
+  // pointer per lane. In actual fact, "normal" alloca vectorization is not
+  // very common, since such allocas tend to be easy to remove by the
+  // Mem-to-Reg pass, so this "edge case" is actually the most likely.
+  //
+  VECZ_FAIL_IF(SimdWidth.isScalable());
+  const unsigned fixedWidth = SimdWidth.getFixedValue();
+  IRBuilder<> B(alloca);
+  auto *const ty = alloca->getAllocatedType();
+  AllocaInst *wideAlloca =
+      B.CreateAlloca(ty, getSizeInt(B, fixedWidth), alloca->getName());
+  wideAlloca->setAlignment(alloca->getAlign());
+
+  // Put the GEP after all allocas (see the sketch below).
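The widening performed here can be summarized as: allocate `W` elements instead of one, then GEP over a constant index vector so the result is one pointer per lane. A hedged standalone sketch, fixed width only (matching the `VECZ_FAIL_IF(SimdWidth.isScalable())` guard above); names are illustrative:

```cpp
#include <llvm/IR/Constants.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Instructions.h>

// Illustration: a GEP whose index operand is a vector yields a vector of
// pointers, one per SIMD lane, into the widened alloca.
static llvm::Value *perLanePointers(llvm::IRBuilder<> &B,
                                    llvm::AllocaInst *wide, llvm::Type *eltTy,
                                    unsigned W) {
  llvm::SmallVector<llvm::Constant *, 8> lanes;
  auto *i64 = llvm::Type::getInt64Ty(B.getContext());
  for (unsigned i = 0; i != W; ++i) {
    lanes.push_back(llvm::ConstantInt::get(i64, i));
  }
  llvm::Value *indices = llvm::ConstantVector::get(lanes); // <0,1,...,W-1>
  return B.CreateInBoundsGEP(eltTy, wide, indices, "lanes");
}
```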
+ Instruction *insertPt = alloca; + while (isa(*insertPt)) { + insertPt = insertPt->getNextNode(); + } + B.SetInsertPoint(insertPt); + deleteInstructionLater(alloca); + + auto *const idxTy = Ctx.dataLayout()->getIndexType(wideAlloca->getType()); + Value *const indices = + createIndexSequence(B, VectorType::get(idxTy, SimdWidth)); + + return B.CreateInBoundsGEP(ty, wideAlloca, ArrayRef{indices}, + Twine(alloca->getName(), ".lanes")); +} + +Value *Packetizer::Impl::vectorizeExtractValue(ExtractValueInst *ExtractValue) { + IRBuilder<> B(buildAfter(ExtractValue, F)); + + Value *Aggregate = + packetize(ExtractValue->getAggregateOperand()).getAsValue(); + SmallVector Indices; + Indices.push_back(0); + for (auto Index : ExtractValue->indices()) { + Indices.push_back(Index); + } + + SmallVector Extracts; + + VECZ_FAIL_IF(SimdWidth.isScalable()); + auto Width = SimdWidth.getFixedValue(); + + // Check that the width is non-zero so the zeroth element is initialized. + VECZ_FAIL_IF(Width < 1); + + for (decltype(Width) i = 0; i < Width; i++) { + Indices[0] = i; + Extracts.push_back(B.CreateExtractValue(Aggregate, Indices)); + } + + Type *CompositeTy = getWideType(Extracts[0]->getType(), SimdWidth); + Value *Result = PoisonValue::get(CompositeTy); + for (decltype(Width) i = 0; i < Width; i++) { + Result = B.CreateInsertElement(Result, Extracts[i], B.getInt32(i)); + } + + return Result; +} + +ValuePacket +Packetizer::Impl::packetizeInsertElement(InsertElementInst *InsertElement) { + ValuePacket results; + Value *Result = nullptr; + + Value *Into = InsertElement->getOperand(0); + assert(Into && "Could not get operand 0 of InsertElement"); + const auto ScalarWidth = multi_llvm::getVectorNumElements(Into->getType()); + + Value *Elt = InsertElement->getOperand(1); + Value *Index = InsertElement->getOperand(2); + assert(Elt && "Could not get operand 1 of InsertElement"); + assert(Index && "Could not get operand 2 of InsertElement"); + + if (SimdWidth.isScalable()) { + auto packetWidth = getPacketWidthForType(Into->getType()); + auto intoVals = packetizeAndGet(Into, packetWidth); + // Scalable vectorization (currently) only ever generates 1 packet + PACK_FAIL_IF(intoVals.size() != 1); + Value *packetizedInto = intoVals.front(); + + auto eltPacketWidth = getPacketWidthForType(Elt->getType()); + auto eltVals = packetizeAndGet(Elt, eltPacketWidth); + // Scalable vectorization (currently) only ever generates 1 packet + PACK_FAIL_IF(eltVals.size() != 1); + Value *packetizedElt = eltVals.front(); + + Value *packetizedIndices = packetizeIfVarying(Index); + + auto *packetizedEltTy = packetizedElt->getType(); + auto *packetizedIntoTy = packetizedInto->getType(); + auto *scalarTy = packetizedEltTy->getScalarType(); + + // Compiler support for masked.gather/riscv.vrgather* on i1 vectors is + // lacking, so emit this operation as the equivalent i8 vector instead. 
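The i1-to-i8 workaround referenced in this comment (and again in `packetizeExtractElement` further down) follows a sign-extend/operate/truncate pattern. A minimal sketch, with the hypothetical `doGather` standing in for the target-specific operation:

```cpp
#include <llvm/IR/DerivedTypes.h>
#include <llvm/IR/IRBuilder.h>

// Widen an i1 vector to i8 before an element-manipulation operation with
// poor i1 support, then truncate the result back to i1.
static llvm::Value *gatherAsI8(llvm::IRBuilder<> &B, llvm::Value *i1Vec,
                               llvm::Value *(*doGather)(llvm::IRBuilder<> &,
                                                        llvm::Value *)) {
  auto *origTy = llvm::cast<llvm::VectorType>(i1Vec->getType());
  auto *i8Ty = llvm::VectorType::get(B.getInt8Ty(), origTy->getElementCount());
  llvm::Value *wide = B.CreateSExt(i1Vec, i8Ty); // i1 -> i8
  llvm::Value *res = doGather(B, wide);          // operate on i8 lanes
  return B.CreateTrunc(res, origTy);             // i8 -> i1
}
```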
+    auto *const origPacketizedIntoTy = packetizedIntoTy;
+    const bool upcastI1AsI8 = scalarTy->isIntegerTy(1);
+    IRBuilder<> B(buildAfter(InsertElement, F));
+    if (upcastI1AsI8) {
+      auto *const int8Ty = Type::getInt8Ty(F.getContext());
+      packetizedIntoTy = llvm::VectorType::get(
+          int8Ty, multi_llvm::getVectorElementCount(packetizedIntoTy));
+      packetizedEltTy = llvm::VectorType::get(
+          int8Ty, multi_llvm::getVectorElementCount(packetizedEltTy));
+      packetizedElt = B.CreateSExt(packetizedElt, packetizedEltTy);
+      packetizedInto = B.CreateSExt(packetizedInto, packetizedIntoTy);
+    }
+
+    // If we're vector predicating, scale the vector length up by the original
+    // number of vector elements.
+    auto *const EVL = VL ? B.CreateMul(VL, B.getInt32(ScalarWidth)) : nullptr;
+
+    auto *packetizedInsert = Ctx.targetInfo().createScalableInsertElement(
+        B, Ctx, InsertElement, packetizedElt, packetizedInto, packetizedIndices,
+        EVL);
+
+    // If we've been performing this broadcast as i8, now's the time to
+    // truncate back down to i1.
+    if (upcastI1AsI8) {
+      packetizedInsert = B.CreateTrunc(packetizedInsert, origPacketizedIntoTy);
+    }
+
+    IC.deleteInstructionLater(InsertElement);
+    results.push_back(packetizedInsert);
+    return results;
+  }
+
+  auto Width = SimdWidth.getFixedValue();
+
+  IRBuilder<> B(buildAfter(InsertElement, F));
+
+  const auto Name = InsertElement->getName();
+  if (auto *CIndex = dyn_cast<ConstantInt>(Index)) {
+    auto IdxVal = CIndex->getZExtValue();
+
+    auto packetWidth = getPacketWidthForType(Into->getType());
+    PACK_FAIL_IF(packetWidth == Width);
+
+    auto Intos = packetizeAndGet(Into, packetWidth);
+    PACK_FAIL_IF(Intos.empty());
+
+    auto res = packetize(Elt);
+    PACK_FAIL_IF(!res);
+
+    if (res.info->numInstances == 0) {
+      // If the element was broadcast, it's better just to create more insert
+      // element instructions.
+      const auto instanceWidth =
+          multi_llvm::getVectorNumElements(Intos.front()->getType());
+      for (unsigned i = 0; i < packetWidth; ++i) {
+        results.push_back(Intos[i]);
+        for (unsigned j = IdxVal; j < instanceWidth; j += ScalarWidth) {
+          results.back() =
+              B.CreateInsertElement(results.back(), Elt, B.getInt32(j), Name);
+        }
+      }
+      return results;
+    }
+
+    SmallVector<Value *> Elts;
+    res.getPacketValues(packetWidth, Elts);
+    PACK_FAIL_IF(Elts.empty());
+
+    const auto *VecTy = cast<FixedVectorType>(Intos.front()->getType());
+    const unsigned VecWidth = VecTy->getNumElements();
+    PACK_FAIL_IF(VecWidth == ScalarWidth);
+    {
+      // We can only shuffle two vectors of the same size, so redistribute
+      // the packetized elements vector.
+      SmallVector<int> Mask;
+      for (size_t i = 0; i < VecWidth; ++i) {
+        Mask.push_back(i / ScalarWidth);
+      }
+
+      auto *Undef = PoisonValue::get(Elts.front()->getType());
+      for (unsigned i = 0; i < packetWidth; ++i) {
+        results.push_back(createOptimalShuffle(B, Elts[i], Undef, Mask, Name));
+      }
+    }
+    if (isa<UndefValue>(Into)) {
+      // Inserting into nothing, so we can just use it as is.
+ return results; + } else { + SmallVector Mask; + for (size_t i = 0; i < VecWidth; ++i) { + int j = VecWidth + i; + if (i == IdxVal) { + j = i; + IdxVal += ScalarWidth; + } + Mask.push_back(j); + } + + for (unsigned i = 0; i < packetWidth; ++i) { + results[i] = createOptimalShuffle(B, results[i], Intos[i], Mask, Name); + } + return results; + } + } else { + Into = packetize(Into).getAsValue(); + PACK_FAIL_IF(!Into); + Value *Elts = packetizeIfVarying(Elt); + PACK_FAIL_IF(!Elts); + Value *Indices = packetizeIfVarying(Index); + PACK_FAIL_IF(!Indices); + + Result = Into; + if (Indices != Index) { + Type *IdxTy = Index->getType(); + SmallVector Offsets; + for (size_t i = 0; i < Width; ++i) { + Offsets.push_back(ConstantInt::get(IdxTy, i * ScalarWidth)); + } + Value *Add = B.CreateAdd(Indices, ConstantVector::get(Offsets)); + + for (size_t i = 0; i < Width; ++i) { + Value *ExtractElt = + (Elts != Elt) ? B.CreateExtractElement(Elts, B.getInt32(i)) : Elt; + Value *ExtractIdx = B.CreateExtractElement(Add, B.getInt32(i)); + Result = B.CreateInsertElement(Result, ExtractElt, ExtractIdx, Name); + } + } else { + for (size_t i = 0; i < Width; ++i) { + Value *ExtractElt = + (Elts != Elt) ? B.CreateExtractElement(Elts, B.getInt32(i)) : Elt; + Value *InsertIdx = B.CreateAdd(Index, B.getInt32(i * ScalarWidth)); + Result = B.CreateInsertElement(Result, ExtractElt, InsertIdx, Name); + } + } + } + IC.deleteInstructionLater(InsertElement); + results.push_back(Result); + return results; +} + +ValuePacket +Packetizer::Impl::packetizeExtractElement(ExtractElementInst *ExtractElement) { + ValuePacket results; + Value *Result = nullptr; + + Value *Src = ExtractElement->getOperand(0); + Value *Index = ExtractElement->getOperand(1); + assert(Src && "Could not get operand 0 of ExtractElement"); + assert(Index && "Could not get operand 1 of ExtractElement"); + + if (SimdWidth.isScalable()) { + auto packetWidth = getPacketWidthForType(Src->getType()); + auto srcVals = packetizeAndGet(Src, packetWidth); + // Scalable vectorization (currently) only ever generates 1 packet + PACK_FAIL_IF(srcVals.size() != 1); + Value *packetizedSrc = srcVals.front(); + + Value *packetizedIndices = packetizeIfVarying(Index); + + Value *packetizedExtract = [&]() { + IRBuilder<> B(buildAfter(ExtractElement, F)); + + auto *narrowTy = getWideType(ExtractElement->getType(), SimdWidth); + auto *const origNarrowTy = narrowTy; + auto *origSrc = ExtractElement->getOperand(0); + auto *origTy = origSrc->getType(); + auto *eltTy = origTy->getScalarType()->getScalarType(); + + // Compiler support for masked.gather/riscv.vrgather* on i1 + // vectors is lacking, so emit this operation as the equivalent + // i8 vector instead. 
+ const bool upcastI1AsI8 = eltTy->isIntegerTy(/*BitWidth*/ 1); + if (upcastI1AsI8) { + auto *const int8Ty = B.getInt8Ty(); + auto *wideTy = llvm::VectorType::get( + int8Ty, + multi_llvm::getVectorElementCount(packetizedSrc->getType())); + narrowTy = llvm::VectorType::get( + int8Ty, multi_llvm::getVectorElementCount(narrowTy)); + packetizedSrc = B.CreateSExt(packetizedSrc, wideTy); + } + + Value *extract = Ctx.targetInfo().createScalableExtractElement( + B, Ctx, ExtractElement, narrowTy, packetizedSrc, packetizedIndices, + VL); + + // If we've been performing this broadcast as i8, now's the time to + // truncate back down to i1 + if (extract && upcastI1AsI8) { + extract = B.CreateTrunc(extract, origNarrowTy); + } + + return extract; + }(); + PACK_FAIL_IF(!packetizedExtract); + + IC.deleteInstructionLater(ExtractElement); + results.push_back(packetizedExtract); + return results; + } + + auto Width = SimdWidth.getFixedValue(); + + const auto ScalarWidth = multi_llvm::getVectorNumElements(Src->getType()); + + IRBuilder<> B(buildAfter(ExtractElement, F)); + const auto Name = ExtractElement->getName(); + if (auto *CIndex = dyn_cast(Index)) { + auto IdxVal = CIndex->getZExtValue(); + + auto packetWidth = getPacketWidthForType(ExtractElement->getType()); + auto srcVals = packetizeAndGet(Src, packetWidth); + PACK_FAIL_IF(srcVals.empty()); + + auto resultWidth = Width / packetWidth; + if (packetWidth == 1) { + srcVals.push_back(PoisonValue::get(srcVals.front()->getType())); + } else { + resultWidth *= 2; + } + + SmallVector Mask; + for (size_t i = 0, j = IdxVal; i < resultWidth; ++i, j += ScalarWidth) { + Mask.push_back(j); + } + + for (unsigned i = 0; i < packetWidth; i += 2) { + results.push_back( + createOptimalShuffle(B, srcVals[i], srcVals[i + 1], Mask, Name)); + } + return results; + } else { + Value *Sources = packetizeIfVarying(Src); + PACK_FAIL_IF(!Sources); + Value *Indices = packetizeIfVarying(Index); + PACK_FAIL_IF(!Indices); + + Result = + PoisonValue::get(getWideType(ExtractElement->getType(), SimdWidth)); + if (Indices != Index) { + Type *IdxTy = Index->getType(); + SmallVector Offsets; + for (unsigned i = 0; i < Width; ++i) { + Offsets.push_back(ConstantInt::get(IdxTy, i * ScalarWidth)); + } + + if (Sources != Src) { + Indices = B.CreateAdd(Indices, ConstantVector::get(Offsets)); + } + + for (unsigned i = 0; i < Width; ++i) { + Value *ExtractIdx = B.CreateExtractElement(Indices, B.getInt32(i)); + Value *ExtractElt = B.CreateExtractElement(Sources, ExtractIdx); + Result = B.CreateInsertElement(Result, ExtractElt, B.getInt32(i), Name); + } + } else { + for (unsigned i = 0, j = 0; i < Width; ++i, j += ScalarWidth) { + Value *ExtractIdx = (Sources != Src && i != 0) + ? 
B.CreateAdd(Index, B.getInt32(j)) + : Index; + Value *ExtractElt = B.CreateExtractElement(Sources, ExtractIdx); + Result = B.CreateInsertElement(Result, ExtractElt, B.getInt32(i), Name); + } + } + } + IC.deleteInstructionLater(ExtractElement); + results.push_back(Result); + return results; +} + +ValuePacket +Packetizer::Impl::packetizeInsertValue(InsertValueInst *InsertValue) { + ValuePacket results; + + Value *const Val = InsertValue->getInsertedValueOperand(); + Value *const Aggregate = InsertValue->getAggregateOperand(); + + // We can only packetize literal struct types + if (auto *StructTy = dyn_cast(Aggregate->getType()); + !StructTy || !StructTy->isLiteral()) { + return results; + } + + Value *PackAggregate = packetizeIfVarying(Aggregate); + PACK_FAIL_IF(!PackAggregate); + + Value *PackVal = packetizeIfVarying(Val); + PACK_FAIL_IF(!PackVal); + + const bool IsValVarying = Val != PackVal; + const bool IsAggregateVarying = Aggregate != PackAggregate; + if (!IsAggregateVarying && IsValVarying) { + // If the aggregate wasn't varying but the value was + PackAggregate = packetize(Aggregate).getAsValue(); + } else if (IsAggregateVarying && !IsValVarying) { + // If the aggregate was varying but the value wasn't + PackVal = packetize(Val).getAsValue(); + } else if (!IsAggregateVarying && !IsValVarying) { + // If both were uniform + return results; + } + + IRBuilder<> B(buildAfter(InsertValue, F)); + + results.push_back( + B.CreateInsertValue(PackAggregate, PackVal, InsertValue->getIndices())); + + IC.deleteInstructionLater(InsertValue); + return results; +} + +ValuePacket +Packetizer::Impl::packetizeExtractValue(ExtractValueInst *ExtractValue) { + ValuePacket results; + + Value *const Aggregate = ExtractValue->getAggregateOperand(); + // We can only packetize literal struct types + if (auto *StructTy = dyn_cast(Aggregate->getType()); + !StructTy || !StructTy->isLiteral()) { + return results; + } + + Value *PackAggregate = packetizeIfVarying(Aggregate); + PACK_FAIL_IF(!PackAggregate); + + IRBuilder<> B(buildAfter(ExtractValue, F)); + + results.push_back( + B.CreateExtractValue(PackAggregate, ExtractValue->getIndices())); + + IC.deleteInstructionLater(ExtractValue); + return results; +} + +ValuePacket +Packetizer::Impl::packetizeShuffleVector(ShuffleVectorInst *Shuffle) { + Value *const srcA = Shuffle->getOperand(0); + Value *const srcB = Shuffle->getOperand(1); + assert(srcA && "Could not get operand 0 from Shuffle"); + assert(srcB && "Could not get operand 1 from Shuffle"); + auto *const ty = Shuffle->getType(); + auto *const tyA = srcA->getType(); + auto packetWidth = + std::max(getPacketWidthForType(ty), getPacketWidthForType(tyA)); + + ValuePacket results; + IRBuilder<> B(buildAfter(Shuffle, F)); + const auto scalarWidth = multi_llvm::getVectorNumElements(tyA); + + if (SimdWidth.isScalable()) { + PACK_FAIL_IF(packetWidth != 1); + if (auto *const SplatVal = getSplatValue(Shuffle)) { + // Handle splats as a special case. + auto Splats = packetizeAndGet(SplatVal); + PACK_FAIL_IF(!createSubSplats(Ctx.targetInfo(), B, Splats, scalarWidth)); + return Splats; + } else { + // It isn't safe to do it if it's not a power of 2. + PACK_FAIL_IF(!isPowerOf2_32(scalarWidth)); + const TargetInfo &VTI = Ctx.targetInfo(); + + const auto dstScalarWidth = multi_llvm::getVectorNumElements(ty); + const auto fullWidth = SimdWidth * dstScalarWidth; + + // If we're vector-predicating a vector access, scale the vector length + // up by the original number of vector elements. + auto *const EVL = + VL ? 
B.CreateMul(VL, B.getInt32(dstScalarWidth)) : nullptr; + + auto *const mask = Shuffle->getShuffleMaskForBitcode(); + auto *const vecMask = + VTI.createOuterScalableBroadcast(B, mask, EVL, SimdWidth); + + auto *const idxVector = + createIndexSequence(B, VectorType::get(B.getInt32Ty(), fullWidth)); + + // We need to create offsets into the source operand subvectors, to add + // onto the broadcast shuffle mask, so that each subvector of the + // destination indices into the corresponding subvector of the source. + // That is, for a source vector width of `n` we need the indices + // `[0, n, 2*n, 3*n ...]`, which correspond to the indices of the first + // element of each subvector of the packetized source. For a destination + // vector of width `m` we need `m` instances of each index. + // + // We can compute the offset vector as `offset[i] = floor(i / m) * n`. + Value *offset = nullptr; + if (dstScalarWidth == scalarWidth) { + // If the source and destination are the same size, we have a special + // case and can mask off the LSBs of the index vector instead. i.e. + // `offset[i] = i & -n` + // For instance, for `n == 4` we have offset indices: + // [0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, ... ]. + offset = B.CreateAnd( + idxVector, + ConstantVector::getSplat(fullWidth, B.getInt32(-scalarWidth))); + } else { + auto *const subVecID = B.CreateUDiv( + idxVector, + ConstantVector::getSplat(fullWidth, B.getInt32(dstScalarWidth))); + offset = B.CreateMul(subVecID, ConstantVector::getSplat( + fullWidth, B.getInt32(scalarWidth))); + } + + auto *const vecA = packetizeAndGet(srcA, 1).front(); + if (isa(srcB)) { + auto *const adjust = B.CreateAdd(vecMask, offset, "shuffleMask"); + auto *const shuffleA = VTI.createVectorShuffle(B, vecA, adjust, EVL); + results.push_back(shuffleA); + } else { + // For a two-source shuffle, we shuffle each source separately and then + // select between the results. It might sound tempting to concatenate + // the sources first and use a single shuffle, but since the results + // need to be interleaved, it makes the mask computation somewhat more + // complicated, with indices dependent on the vector scale factor. + auto *const vecB = packetizeAndGet(srcB, 1).front(); + + auto *const whichCmp = B.CreateICmpUGE( + vecMask, + ConstantVector::getSplat(fullWidth, B.getInt32(scalarWidth))); + auto *const safeMask = B.CreateAnd( + vecMask, + ConstantVector::getSplat(fullWidth, B.getInt32(scalarWidth - 1))); + + auto *const adjust = B.CreateAdd(safeMask, offset, "shuffleMask"); + auto *const shuffleA = VTI.createVectorShuffle(B, vecA, adjust, EVL); + auto *const shuffleB = VTI.createVectorShuffle(B, vecB, adjust, EVL); + results.push_back(B.CreateSelect(whichCmp, shuffleB, shuffleA)); + } + + return results; + } + } + + auto srcsA = packetizeAndGet(srcA, packetWidth); + auto srcsB = packetizeAndGet(srcB, packetWidth); + PACK_FAIL_IF(srcsA.empty() || srcsB.empty()); + + auto width = SimdWidth.getFixedValue() / packetWidth; + + // Because up to and including LLVM 10, the IR Builder accepts a mask as a + // vector of uint32_t, but getShuffleMask returns an array of ints. So + // we do it this way. + const auto &origMask = Shuffle->getShuffleMask(); + SmallVector mask(origMask.begin(), origMask.end()); + + // Adjust any indices that select from the second source vector + const auto adjust = + isa(srcB) ? 
-scalarWidth : (width - 1) * scalarWidth; + for (auto &idx : mask) { + if (idx != -1 && idx >= int(scalarWidth)) { + idx += adjust; + } + } + + // Duplicate the mask over the vectorized width + const auto size = mask.size(); + mask.reserve(size * width); + for (unsigned i = 1, k = 0; i < width; ++i, k += size) { + for (unsigned j = 0; j < size; ++j) { + auto maskElem = mask[k + j]; + if (maskElem != -1) { + maskElem += scalarWidth; + } + mask.push_back(maskElem); + } + } + + const auto name = Shuffle->getName(); + for (unsigned i = 0; i < packetWidth; ++i) { + results.push_back(createOptimalShuffle(B, srcsA[i], srcsB[i], mask, name)); + } + return results; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp new file mode 100644 index 0000000000000..a496b0fdb44c1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp @@ -0,0 +1,180 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/passes.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "analysis/control_flow_analysis.h" +#include "analysis/divergence_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "ir_cleanup.h" +#include "memory_operations.h" +#include "vectorization_unit.h" +#include "vecz/vecz_target_info.h" + +#define DEBUG_TYPE "vecz" + +using namespace llvm; + +namespace vecz { +PreservedAnalyses DivergenceCleanupPass::run(Function &F, + FunctionAnalysisManager &AM) { + UniformValueResult &UVR = AM.getResult(F); + + for (BasicBlock &BB : F) { + auto *TI = BB.getTerminator(); + if (BranchInst *Branch = dyn_cast(TI)) { + if (!Branch->isConditional()) { + continue; + } + + if (auto *const call = dyn_cast(Branch->getCondition())) { + compiler::utils::Lexer L(call->getCalledFunction()->getName()); + if (L.Consume(VectorizationContext::InternalBuiltinPrefix) && + L.Consume("divergence_")) { + // uniform reductions can just disappear + auto *const newCond = call->getOperand(0); + if (!UVR.isVarying(newCond)) { + Branch->setCondition(newCond); + if (call->use_empty()) { + UVR.remove(call); + call->eraseFromParent(); + } + } + } + } + } + } + + return PreservedAnalyses::all(); +} + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Try to replace or remove masked memory operations that are trivially +/// not needed or can be converted to non-masked operations. 
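Before the implementation, here is the shape of the two constant-mask rewrites in miniature, assuming a masked-load call `CI` whose mask has already been proven constant (the names and parameters here are assumptions for illustration, not the pass's actual API):

```cpp
#include <llvm/IR/Constants.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Instructions.h>

// All-zero mask: no lane executes, so a masked load's result is never
// meaningful. All-ones mask: every lane executes, so an ordinary aligned
// load suffices.
static void simplifyConstantMaskedLoad(llvm::CallInst *CI,
                                       llvm::Constant *CMask, llvm::Value *Ptr,
                                       uint64_t Alignment) {
  if (CMask->isZeroValue()) {
    CI->replaceAllUsesWith(llvm::PoisonValue::get(CI->getType()));
  } else if (CMask->isAllOnesValue()) {
    llvm::IRBuilder<> B(CI);
    llvm::Value *Load =
        B.CreateAlignedLoad(CI->getType(), Ptr, llvm::Align(Alignment));
    Load->takeName(CI);
    CI->replaceAllUsesWith(Load);
  }
  // The now-dead masked call would be erased by the caller.
}
```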
+PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
+                                                FunctionAnalysisManager &AM) {
+  auto &Ctx = AM.getResult<VectorizationContextAnalysis>(F).getContext();
+
+  const TargetInfo &VTI = Ctx.targetInfo();
+  std::vector<Instruction *> ToDelete;
+  for (Function &Builtin : F.getParent()->functions()) {
+    std::optional<MemOpDesc> BuiltinDesc =
+        MemOpDesc::analyzeMaskedMemOp(Builtin);
+    if (!BuiltinDesc) {
+      continue;
+    }
+    for (User *U : Builtin.users()) {
+      CallInst *CI = dyn_cast<CallInst>(U);
+      if (!CI) {
+        continue;
+      }
+      Function *Parent = CI->getParent()->getParent();
+      if (Parent != &F) {
+        continue;
+      }
+      auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked);
+      if (!MaskedOp || !MaskedOp->isMaskedMemOp()) {
+        continue;
+      }
+      Value *Mask = MaskedOp->getMaskOperand();
+      Constant *CMask = dyn_cast<Constant>(Mask);
+      if (!CMask) {
+        continue;
+      }
+
+      // Handle special constants.
+      if (CMask->isZeroValue()) {
+        // A null mask means no lane executes the memory operation.
+        if (BuiltinDesc->isLoad()) {
+          CI->replaceAllUsesWith(PoisonValue::get(BuiltinDesc->getDataType()));
+        }
+        ToDelete.push_back(CI);
+      } else if (CMask->isAllOnesValue()) {
+        // An 'all ones' mask means all lanes execute the memory operation.
+        IRBuilder<> B(CI);
+        Value *Data = MaskedOp->getDataOperand();
+        Value *Ptr = MaskedOp->getPointerOperand();
+        Type *DataTy = MaskedOp->getDataType();
+        auto Alignment = BuiltinDesc->getAlignment();
+        if (MaskedOp->isLoad()) {
+          Value *Load = nullptr;
+          if (DataTy->isVectorTy()) {
+            // Skip this optimization for scalable vectors for now. It's
+            // theoretically possible to perform, but without scalable-vector
+            // builtins we can't test it; leave any theoretical
+            // scalable-vector masked mem operation unoptimized.
+            if (isa<ScalableVectorType>(DataTy)) {
+              continue;
+            }
+            Load =
+                VTI.createLoad(B, CI->getType(), Ptr, B.getInt64(1), Alignment);
+          } else {
+            Load = B.CreateAlignedLoad(CI->getType(), Ptr, Align(Alignment),
+                                       /*isVolatile*/ false, CI->getName());
+          }
+          Load->takeName(CI);
+          CI->replaceAllUsesWith(Load);
+        } else {
+          if (DataTy->isVectorTy()) {
+            // Skip this optimization for scalable vectors for now. It's
+            // theoretically possible to perform, but without scalable-vector
+            // builtins we can't test it; leave any theoretical
+            // scalable-vector masked mem operation unoptimized.
+            if (isa<ScalableVectorType>(DataTy)) {
+              continue;
+            }
+            VTI.createStore(B, Data, Ptr, B.getInt64(1),
+                            BuiltinDesc->getAlignment());
+          } else {
+            B.CreateAlignedStore(Data, Ptr, Align(Alignment));
+          }
+        }
+        ToDelete.push_back(CI);
+      }
+    }
+  }
+
+  // Clean up.
+  while (!ToDelete.empty()) {
+    Instruction *I = ToDelete.back();
+    IRCleanup::deleteInstructionNow(I);
+    ToDelete.pop_back();
+  }
+
+  PreservedAnalyses Preserved;
+  Preserved.preserve();
+  Preserved.preserve();
+  Preserved.preserve();
+  Preserved.preserve();
+  return Preserved;
+}
+
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
new file mode 100644
index 0000000000000..b72ab38121384
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
@@ -0,0 +1,353 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This pass aims to optimize the CFG by hoisting instructions out of triangle
+// or diamond patterns (i.e. "if" or "if..else" constructs) where it determines
+// that executing all the instructions in all branch targets is cheaper than
+// actually branching. This is especially the case when BOSCC is active, as the
+// BOSCC gadget introduces potentially-expensive AND/OR reduction operations
+// in order to branch to the uniform version of each Basic Block. To that end,
+// the pass needs to use the Uniform Value Analysis result, since only varying
+// branch conditions will be affected by BOSCC in such a way. We also need
+// access to the Target Transform Info result from the Vectorization Unit in
+// order to make target-dependent cost-based decisions.
+//
+// This pass only hoists instructions out of conditional blocks, and does not
+// directly modify the CFG, so the CFG Simplification pass is intended to be
+// run afterwards, in order to eliminate the now-redundant Basic Blocks and
+// transform PHI nodes into select instructions. Therefore, the
+// pre-linearization pass is implemented as an llvm::FunctionPass so it can
+// be run in the middle of the Vecz Preparation Pass.
+//
+// Pre-Linearization is currently unable to hoist memory operations, since
+// doing so would require the correct masked versions to be generated, which
+// would require a lot of special extra handling.
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "transform/passes.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_choices.h"
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+bool isTrivialBlock(const llvm::BasicBlock &BB) {
+  for (const auto &I : BB) {
+    if (I.mayReadOrWriteMemory() || I.mayHaveSideEffects() ||
+        llvm::isa(&I)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// This is an estimate of the cycle count for executing the entire block,
+// not including the terminating branch instruction, obtained by summing
+// the cost (Reciprocal Throughput) of each individual instruction.
+// This assumes sequential execution (no Instruction Level Parallelism)
+// and takes no account of Data Hazards etc., so it is not guaranteed to be
+// entirely accurate.
+InstructionCost calculateBlockCost(const BasicBlock &BB,
+                                   const TargetTransformInfo &TTI) {
+  InstructionCost cost;
+  for (const auto &I : BB) {
+    if (I.isTerminator()) {
+      break;
+    }
+
+    InstructionCost inst_cost =
+        TTI.getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput);
+
+    // When a vector instruction is encountered, we multiply by the vector
+    // width, because it will either be scalarized into that many individual
+    // instructions during scalarization, or packetized by duplication.
+    // This works on the assumption that throughput does not depend on the
+    // vector width. This calculation may need refining in future.
+    if (I.getType()->isVectorTy()) {
+      inst_cost *= multi_llvm::getVectorNumElements(I.getType());
+    }
+
+    cost += inst_cost;
+  }
+  return cost;
+}
+
+// Creates a temporary function in order to build a target-dependent vector
+// AND reduction inside it, so that the cost of that reduction can be
+// calculated.
+InstructionCost calculateBoolReductionCost(LLVMContext &context,
+                                           Module *module,
+                                           const TargetTransformInfo &TTI,
+                                           llvm::ElementCount width) {
+  Type *cond_ty = VectorType::get(Type::getInt1Ty(context), width);
+
+  FunctionType *new_fty =
+      FunctionType::get(Type::getVoidTy(context), {cond_ty}, false);
+
+  // LLVM 11 requires the function to be in a valid (existing) module in
+  // order to create a simple vector reduction with the specified opcode.
+  auto *F = Function::Create(new_fty, Function::InternalLinkage, "tmp", module);
+  auto *BB = BasicBlock::Create(context, "reduce", F);
+  IRBuilder<> B(BB);
+  createSimpleReduction(B, &*F->arg_begin(), RecurKind::And);
+  const InstructionCost cost = calculateBlockCost(*BB, TTI);
+
+  // We don't really need that function in the module anymore, because its
+  // only purpose was to be used for analysis, so we go ahead and remove it.
+  F->removeFromParent();
+  delete F;
+  return cost;
+}
+
+bool hoistInstructions(BasicBlock &BB, BranchInst &Branch, bool exceptions) {
+  const auto &DL = BB.getModule()->getDataLayout();
+  const bool TrueBranch = (Branch.getSuccessor(0) == &BB);
+  DenseMap<Value *, Value *> safeDivisors;
+
+  bool modified = false;
+  while (!BB.front().isTerminator()) {
+    auto &I = BB.front();
+    I.moveBefore(*Branch.getParent(), Branch.getIterator());
+    modified = true;
+
+    if (!exceptions) {
+      // We don't need to mask division operations if they don't trap.
+      continue;
+    }
+
+    if (!isa<BinaryOperator>(&I)) {
+      // Only binary operators (divisions) can need guarding.
+      continue;
+    }
+    auto *binOp = cast<BinaryOperator>(&I);
+    // It is potentially dangerous to hoist division operations, since
+    // the RHS could be zero or INT_MIN on some lanes, unless it's a
+    // constant.
+    bool isUnsigned = false;
+    switch (binOp->getOpcode()) {
+      default:
+        break;
+      case Instruction::UDiv:
+      case Instruction::URem:
+        isUnsigned = true;
+        LLVM_FALLTHROUGH;
+      case Instruction::SDiv:
+      case Instruction::SRem: {
+        auto *divisor = binOp->getOperand(1);
+        if (auto *C = dyn_cast<Constant>(divisor)) {
+          if (C->isZeroValue()) {
+            // Division by constant zero can be a NOP, since there is no
+            // division-by-zero exception in OpenCL.
+            I.replaceAllUsesWith(binOp->getOperand(0));
+            I.eraseFromParent();
+          }
+        } else {
+          // If the divisor could be illegal, we need to guard it with a
+          // select instruction generated from the branch condition.
+          auto &masked = safeDivisors[divisor];
+          if (!masked) {
+            // NOTE this function does not check for the pattern
+            // "select (x eq 0) 1, x" or equivalent, so we might want to
+            // write it ourselves, but Instruction Combining cleans it
+            // up. NOTE that for a signed division, we also have to
+            // consider the potential overflow situation, which is not
+            // so simple.
+            if (isUnsigned && isKnownNonZero(divisor, DL)) {
+              // Static analysis concluded it can't be zero, so we don't
+              // need to do anything.
+              masked = divisor;
+            } else {
+              Value *one = ConstantInt::get(divisor->getType(), 1);
+              Value *cond = Branch.getCondition();
+
+              Instruction *SI;
+              if (TrueBranch) {
+                SI = SelectInst::Create(cond, divisor, one,
+                                        divisor->getName() + ".hoist_guard");
+              } else {
+                SI = SelectInst::Create(cond, one, divisor,
+                                        divisor->getName() + ".hoist_guard");
+              }
+              SI->insertBefore(I.getIterator());
+              masked = SI;
+            }
+          }
+
+          if (masked != divisor) {
+            binOp->setOperand(1, masked);
+          }
+        }
+      } break;
+    }
+  }
+  return modified;
+}
+} // namespace
+
+PreservedAnalyses PreLinearizePass::run(Function &F,
+                                        FunctionAnalysisManager &AM) {
+  VectorizationUnitAnalysis::Result R =
+      AM.getResult<VectorizationUnitAnalysis>(F);
+  const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+  const VectorizationUnit &VU = R.getVU();
+
+  bool modified = false;
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  const bool div_exceptions =
+      VU.choices().isEnabled(VectorizationChoices::eDivisionExceptions);
+
+  InstructionCost boscc_cost;
+  const UniformValueResult *UVR = nullptr;
+  if (VU.choices().linearizeBOSCC()) {
+    boscc_cost = calculateBoolReductionCost(F.getContext(), F.getParent(), TTI,
+                                            VU.width());
+    UVR = &AM.getResult<UniformValueAnalysis>(F);
+  }
+
+  auto dfo = depth_first(&F.getEntryBlock());
+  SmallVector<BasicBlock *> blocks(dfo.begin(), dfo.end());
+
+  DenseMap<BasicBlock *, BasicBlock *> single_succs;
+  for (auto *BB : blocks) {
+    single_succs[BB] = BB->getSingleSuccessor();
+  }
+
+  for (auto BBit = blocks.rbegin(), BBe = blocks.rend(); BBit != BBe; ++BBit) {
+    BasicBlock *BB = *BBit;
+
+    // Check that all hoistable successor blocks are in the same loop.
+    Loop *block_loop = LI.getLoopFor(BB);
+
+    if (succ_size(BB) >= 2) {
+      bool simple = true;
+      SmallPtrSet<BasicBlock *, 4> targets;
+      for (auto *succ : successors(BB)) {
+        if (BasicBlock *target = single_succs[succ]) {
+          targets.insert(target);
+        }
+      }
+
+      SmallVector<BasicBlock *, 4> hoistable;
+      SmallPtrSet<BasicBlock *, 4> new_succs;
+      for (auto *succ : successors(BB)) {
+        if (!targets.contains(succ)) {
+          if (single_succs[succ] == nullptr || pred_size(succ) != 1 ||
+              LI.getLoopFor(succ) != block_loop || !isTrivialBlock(*succ)) {
+            simple = false;
+            break;
+          }
+          hoistable.push_back(succ);
+        } else {
+          // These "bypass" successors are going to stay where they are.
+          new_succs.insert(succ);
+        }
+      }
+      if (!simple || hoistable.empty()) {
+        continue;
+      }
+
+      // The cost of a "bypass" branch is essentially zero. This occurs in a
+      // "triangle" type control structure (i.e. an if with no else).
+      InstructionCost min_cost = new_succs.empty() ? InstructionCost::getMax()
+                                                   : InstructionCost::getMin();
+
+      // The total cost of executing every successor sequentially.
+      InstructionCost total_cost = 0;
+
+      for (auto *succ : hoistable) {
+        const InstructionCost block_cost = calculateBlockCost(*succ, TTI);
+        if (block_cost < min_cost) {
+          min_cost = block_cost;
+        }
+        total_cost += block_cost;
+        new_succs.insert(single_succs[succ]);
+      }
+
+      // One of the successors was going to get executed anyway, so we can
+      // discount the cost of the cheapest one from the total cost.
+      total_cost -= min_cost;
+
+      // The unconditional branches of the successors are going to get
+      // removed if we hoist the contents. We will only execute one successor,
+      // so assume the first successor's branch is representative.
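The cost comparison assembled over the next few statements can be condensed into a toy model. For a diamond (if/else) with block costs {6, 4} and no bypass edge, the effective cost is 10 - 4 = 6, so hoisting pays off whenever the removed branches cost at least 6. A plain-integer sketch (not using `llvm::InstructionCost`):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// blockCosts is assumed non-empty, mirroring the `hoistable.empty()` guard
// above. A bypass edge costs nothing, so with one present the discount is 0.
bool shouldHoist(const std::vector<uint64_t> &blockCosts, bool hasBypass,
                 uint64_t branchCost) {
  uint64_t total = 0;
  uint64_t cheapest = hasBypass ? 0 : UINT64_MAX;
  for (uint64_t c : blockCosts) {
    total += c;
    cheapest = std::min(cheapest, c);
  }
  return total - cheapest <= branchCost; // one block would have run anyway
}
```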
+      auto *succ_term = hoistable.front()->getTerminator();
+      InstructionCost branch_cost =
+          TTI.getInstructionCost(succ_term,
+                                 TargetTransformInfo::TCK_RecipThroughput) +
+          TTI.getInstructionCost(succ_term, TargetTransformInfo::TCK_Latency);
+
+      // If all our successors branch to the same target, the conditional
+      // branch is going to disappear as well, so we can add that to the cost
+      // of the successors' branches in our analysis.
+      auto *T = BB->getTerminator();
+      if (new_succs.size() == 1) {
+        branch_cost +=
+            TTI.getInstructionCost(T, TargetTransformInfo::TCK_RecipThroughput);
+        branch_cost +=
+            TTI.getInstructionCost(T, TargetTransformInfo::TCK_Latency);
+
+        // BOSCC will incur an additional cost on varying branches.
+        if (UVR && UVR->isVarying(T)) {
+          branch_cost += boscc_cost;
+        }
+      }
+
+      // If the cost of executing everything is less than the cost of the
+      // branches that would get removed, then it is beneficial to hoist.
+      // If the costs are the same, then we might as well make the CFG simpler!
+      if (total_cost <= branch_cost) {
+        // The Lower Switch pass ought to guarantee we can only get branch
+        // instructions here, but in case it didn't, we don't want to crash.
+        if (auto *const Branch = dyn_cast<BranchInst>(T)) {
+          for (auto *succ : hoistable) {
+            modified |= hoistInstructions(*succ, *Branch, div_exceptions);
+          }
+
+          if (new_succs.size() == 1) {
+            // We are not going to modify the CFG while we are working on it,
+            // because that is very complex, so we leave it to the Simplify
+            // CFG pass that runs after us and will do a better job. So here
+            // we can just pretend we modified it.
+            single_succs[BB] = *new_succs.begin();
+          }
+        }
+      }
+    }
+  }
+
+  if (!modified) {
+    return PreservedAnalyses::all();
+  }
+  return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
new file mode 100644
index 0000000000000..d59a65037555b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
@@ -0,0 +1,391 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/printf_scalarizer.h" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#define DEBUG_TYPE "VECZ-PRINTF-SCALARIZER" + +using namespace llvm; + +namespace vecz { + +GlobalVariable *GetFormatStringAsValue(Value *op) { + if (isa(op)) { + auto const_string = cast(op); + if (const_string->getOpcode() != Instruction::GetElementPtr) { + return nullptr; + } + return dyn_cast(const_string->getOperand(0)); + } + + if (isa(op)) { + auto gep_string = cast(op); + return dyn_cast(gep_string->getPointerOperand()); + } + + return dyn_cast(op); +} + +std::string GetFormatStringAsString(Value *op) { + if (!op || !isa(op)) { + return ""; + } + + auto *string_global = cast(op); + + if (!string_global->hasInitializer()) { + return ""; + } + + Constant *const string_const = string_global->getInitializer(); + + if (!isa(string_const)) { + return ""; + } + + auto *array_string = cast(string_const); + + if (!array_string->isString()) { + return ""; + } + + return array_string->getAsString().str(); +} + +static bool IncrementPtr(const char **fmt) { + if (*(++(*fmt)) == '\0') { + return true; + } + return false; +} + +GlobalVariable * +GetNewFormatStringAsGlobalVar(Module &module, + GlobalVariable *const string_value, + const std::string &new_format_string) { + const ArrayRef Elts((const uint8_t *)new_format_string.data(), + new_format_string.size()); + Constant *new_format_string_const = + ConstantDataArray::get(module.getContext(), Elts); + + const bool is_constant = string_value->isConstant(); + const bool is_externally_initialized = false; + const uint32_t addr_space = string_value->getType()->getPointerAddressSpace(); + const GlobalValue::LinkageTypes linkage_type = string_value->getLinkage(); + const GlobalValue::ThreadLocalMode thread_local_mode = + string_value->getThreadLocalMode(); + + GlobalVariable *new_var = new GlobalVariable( + module, new_format_string_const->getType(), is_constant, linkage_type, + new_format_string_const, Twine(string_value->getName() + "_"), + string_value, thread_local_mode, addr_space, is_externally_initialized); + + new_var->setAlignment(MaybeAlign(string_value->getAlignment())); + new_var->setUnnamedAddr(string_value->getUnnamedAddr()); + + return new_var; +} + +EnumPrintfError ScalarizeAndCheckFormatString(const std::string &str, + std::string &new_str) { + // Set some sensible defaults in case we return error + new_str = ""; + + const char *fmt = str.c_str(); + + while (*fmt != '\0') { + if (*fmt != '%') { + new_str += *fmt; + } else { + std::string specifier_string(1, *fmt); + + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + + // Parse (zero or more) Flags + const char *flag_chars = "-+ #0"; + while (strchr(flag_chars, *fmt)) { + specifier_string += *fmt; + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + } + + // Parse (optional) Width + if (*fmt == '*') { + specifier_string += *fmt; + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + } else if (isdigit(*fmt)) { + while (isdigit(*fmt)) { + specifier_string += *fmt; + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character 
in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + } + } + + // Parse (optional) Precision + if (*fmt == '.') { + specifier_string += *fmt; + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + + while (isdigit(*fmt)) { + specifier_string += *fmt; + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + } + } + + uint32_t vector_length = 1u; + const bool is_vector = *fmt == 'v'; + // Parse (optional) Vector Specifier + if (is_vector) { + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + switch (*fmt) { + default: + LLVM_DEBUG(dbgs() << "Unexpected character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + case '1': + // Must be 16, else error + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() + << "Expected vector width of 16 in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + if (*fmt != '6') { + LLVM_DEBUG(dbgs() + << "Expected vector width of 16 in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + vector_length = 16u; + break; + case '2': + vector_length = 2u; + break; + case '3': + vector_length = 3u; + // Lookahead for vectors of width 32. We know that we won't go out + // of bounds because worst case scenario there should be a null byte + // after the '3'. + if (*(fmt + 1) == '2') { + IncrementPtr(&fmt); + vector_length = 32u; + } + break; + case '4': + vector_length = 4u; + break; + case '6': + // Must be 64, else error + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() + << "Expected vector width of 64 in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + if (*fmt != '4') { + LLVM_DEBUG(dbgs() + << "Expected vector width of 64 in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + vector_length = 64u; + break; + case '8': + vector_length = 8u; + break; + } + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + } + + // Parse Length Modifier + const char *length_modifier_chars = "hljztL"; + // Length Modifier is required with Vector Specifier + bool has_used_l_length_modifier = false; + const bool has_supplied_length_modifier = + strchr(length_modifier_chars, *fmt); + if (is_vector && !has_supplied_length_modifier) { + LLVM_DEBUG( + dbgs() << "Expected vector width specifier in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + + if (has_supplied_length_modifier) { + bool consume_next_char = true; + switch (*fmt) { + default: + // The 'j', 'z', 't', and 'L' length modifiers are not supported by + // OpenCL C. 
+ LLVM_DEBUG(dbgs() << "Unsupported length modifier '" << *fmt + << "'specifier in format string \"" << str.c_str() + << "\""); + return kPrintfError_invalidFormatString; + case 'h': + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + if (*fmt == 'h') { + specifier_string += "hh"; + } else if (*fmt == 'l') { + // Native printf doesn't recognize 'hl' so we don't + // add it to the new format string. Luckily, 'hl' + // is sizeof(int) - the same as the default on + // native printf! + + // Additionally, 'hl' modifier may only be used in + // conjunction with the vector specifier + if (!is_vector) { + LLVM_DEBUG(dbgs() + << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + } else { + specifier_string += 'h'; + // We've already incremented the ptr and we found nothing; don't + // do it again + consume_next_char = false; + } + break; + case 'l': + specifier_string += *fmt; + // Check ahead to see if the user is using the invalid 'll' length + // modifier + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + if (*fmt == 'l') { + LLVM_DEBUG(dbgs() + << "The 'll' length specifier is invalid in OpenCL " + "printf\n > " + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + // We've already incremented the ptr; don't do it again + + // The 'l' specifier for the OpenCL printf expects 64 bits + // integers, check if the system's long are actually 64 bits wide + // and if not upgrade the format specifier to 'll'. + // + // FIXME: This only works for host based devices, which is fine for + // our current printf implementation, but it should really be + // removed once we have a proper printf implementation. + if (sizeof(long) != 8) { + specifier_string += 'l'; + } + + consume_next_char = false; + has_used_l_length_modifier = true; + break; + } + if (consume_next_char) { + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + } + } + + // Parse Specifier + specifier_string += *fmt; + + switch (*fmt) { + default: + break; + case 'n': + // The 'n' conversion specifier is not supported by OpenCL C. + LLVM_DEBUG( + dbgs() << "The 'n' conversion specifier is invalid in OpenCL " + "printf\n > " + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + case 's': // Intentional fall-through + case 'c': + // The 'l' length modifier followed by the 'c' or 's' conversion + // specifiers is not supported by OpenCL C. + if (has_used_l_length_modifier) { + LLVM_DEBUG(dbgs() + << "The 'l' length modifier followed by the 'c' or " + "'s' conversion is invalid in OpenCL printf\n > " + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + break; + } + + // Output the %specifier for each element of the vector, + // and for every element but the last, follow it by a "," string. 
+ for (uint32_t i = 0; i < vector_length; ++i) { + new_str += specifier_string; + + if (i < (vector_length - 1)) { + new_str += ","; + } + } + } + ++fmt; + } + + new_str += '\0'; + + return kPrintfError_success; +} +} // namespace vecz diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp new file mode 100644 index 0000000000000..419f41649c58d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp @@ -0,0 +1,125 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "transform/passes.h" + +#define DEBUG_TYPE "vecz" + +using namespace llvm; +using namespace vecz; + +/// @brief remove IntPtrs where possible. +PreservedAnalyses RemoveIntPtrPass::run(Function &F, + FunctionAnalysisManager &) { + static const StringRef name = "remove_intptr"; + + SmallVector casts; + for (auto &BB : F) { + for (auto &I : BB) { + if (auto *int_ptr = dyn_cast(&I)) { + casts.push_back(int_ptr); + } + } + } + + if (casts.empty()) { + return PreservedAnalyses::all(); + } + + while (!casts.empty()) { + PtrToIntInst *int_ptr = casts.back(); + casts.pop_back(); + + for (auto usei = int_ptr->use_begin(); usei != int_ptr->use_end();) { + auto &use = *(usei++); + auto *user = use.getUser(); + + if (auto *ptr = dyn_cast(user)) { + IRBuilder<> B(ptr); + Value *new_cast = B.CreatePointerBitCastOrAddrSpaceCast( + int_ptr->getOperand(0), ptr->getDestTy(), name); + ptr->replaceAllUsesWith(new_cast); + ptr->eraseFromParent(); + } else if (auto *phi = dyn_cast(user)) { + // How we deal with PHI nodes is we create another PHI node with the + // pointer type, moving the PtrToInt to the other side of it. We also + // create IntToPtrs on the incoming side, where it does not consume + // the PtrToInt that we are currently looking at. Any new casts will + // hopefully be removed later. 
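The binary-operator case further below rewrites pointer arithmetic that had been flattened to integers back into GEP form, so pointer provenance stays visible to later passes. In miniature (an illustrative helper mirroring the `B.CreateGEP(i8_ty, ...)` call in the code):

```cpp
#include <llvm/IR/IRBuilder.h>

// (ptrtoint %ptr) + %index  ==>  ptrtoint (gep i8, %ptr, %index)
static llvm::Value *addAsByteGEP(llvm::IRBuilder<> &B, llvm::Value *ptr,
                                 llvm::Value *index, llvm::Type *intTy) {
  llvm::Value *gep = B.CreateGEP(B.getInt8Ty(), ptr, index, "remove_intptr");
  return B.CreatePtrToInt(gep, intTy, "remove_intptr");
}
```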
+ auto num_values = phi->getNumIncomingValues(); + PHINode *new_phi = PHINode::Create(int_ptr->getSrcTy(), num_values, + phi->getName() + ".intptr"); + new_phi->insertBefore(phi->getIterator()); + + Instruction *insert = phi; + while (isa(insert)) { + insert = insert->getNextNode(); + } + + // Populate the replacement PHI node + for (decltype(num_values) i = 0; i != num_values; ++i) { + Value *incoming = phi->getIncomingValue(i); + BasicBlock *inb = phi->getIncomingBlock(i); + if (incoming == int_ptr) { + incoming = int_ptr->getOperand(0); + } else { + IRBuilder<> B(inb->getTerminator()); + incoming = B.CreateIntToPtr(incoming, int_ptr->getSrcTy(), name); + } + new_phi->addIncoming(incoming, inb); + } + + // Add the cast back to Int at the other side + IRBuilder<> B(insert); + Value *new_cast = B.CreatePtrToInt(new_phi, phi->getType(), name); + phi->replaceAllUsesWith(new_cast); + phi->eraseFromParent(); + casts.push_back(cast(new_cast)); + } else if (auto *bin_op = dyn_cast(user)) { + auto *i8_ty = IntegerType::getInt8Ty(F.getContext()); + + IRBuilder<> B(bin_op); + Value *index = nullptr; + + auto opcode = bin_op->getOpcode(); + if (opcode == Instruction::Add) { + index = bin_op->getOperand(use.getOperandNo() == 0); + } else if (opcode == Instruction::Sub && use.getOperandNo() == 0) { + index = B.CreateNeg(bin_op->getOperand(1), name); + } + + if (index) { + Value *operand = int_ptr->getOperand(0); + Value *new_gep = B.CreateGEP(i8_ty, operand, index, name); + Value *new_cast = B.CreatePtrToInt(new_gep, bin_op->getType(), name); + bin_op->replaceAllUsesWith(new_cast); + bin_op->eraseFromParent(); + casts.push_back(cast(new_cast)); + } + } + } + + if (int_ptr->use_empty()) { + int_ptr->eraseFromParent(); + } + } + + auto Preserved = PreservedAnalyses::all(); + Preserved.abandon(); + return Preserved; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp new file mode 100644 index 0000000000000..fcb0dfca9e621 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp @@ -0,0 +1,284 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/scalarization_pass.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "analysis/control_flow_analysis.h" +#include "analysis/divergence_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "llvm_helpers.h" +#include "memory_operations.h" +#include "transform/scalarizer.h" +#include "vectorization_unit.h" +#include "vecz/vecz_choices.h" +#include "vecz/vecz_target_info.h" + +#define DEBUG_TYPE "vecz-scalarization" + +using namespace vecz; +using namespace llvm; + +STATISTIC(VeczScalarizeFail, + "Number of kernels that failed to scalarize [ID#S80]"); + +ScalarizationPass::ScalarizationPass() {} + +namespace { +bool needsScalarization(const Type &T) { return T.isVectorTy(); } + +bool needsScalarization(const Instruction &I) { + if (needsScalarization(*I.getType())) { + return true; + } + for (const Use &op : I.operands()) { + if (needsScalarization(*op->getType())) { + return true; + } + } + return false; +} + +bool isValidScalableShuffle(const ShuffleVectorInst &shuffle) { + // 3-element vectors are trouble, so scalarize them. + if (!isPowerOf2_32(cast(shuffle.getType()) + ->getElementCount() + .getFixedValue())) { + return false; + } + if (!isPowerOf2_32(cast(shuffle.getOperand(0)->getType()) + ->getElementCount() + .getFixedValue())) { + return false; + } + return true; +} + +bool shouldScalarize(Instruction *I, bool scalable) { + // Don't scalarize loads or stores.. + if (isa(I) || isa(I)) { + return false; + } + + // We also don't scalarize element manipulations of load instructions + if (auto *Shuffle = dyn_cast(I)) { + if (scalable && !isValidScalableShuffle(*Shuffle)) { + return true; + } + + auto *SrcA = dyn_cast(Shuffle->getOperand(0)); + if (SrcA && !shouldScalarize(SrcA, scalable)) { + return false; + } + auto *SrcB = dyn_cast(Shuffle->getOperand(1)); + if (SrcB && !shouldScalarize(SrcB, scalable)) { + return false; + } + } else if (auto *Extract = dyn_cast(I)) { + auto *SrcA = dyn_cast(Extract->getOperand(0)); + if (SrcA && !shouldScalarize(SrcA, scalable)) { + return false; + } + } + + // We also don't scalarize masked memory operations + if (auto *CI = dyn_cast(I)) { + if (auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked)) { + if (MaskedOp->isMaskedMemOp()) { + return false; + } + } + } + + // Scalarize anything else + return true; +} + +/// @brief Operand Tracer struct +/// The purpose of this helper struct is to trace through the operands of any +/// given instruction, incrementing a usage counter, which we can compare to +/// the total number of uses for an instruction. If any instruction's counter +/// is equal to its total usage count, it has no uses other than ones we have +/// marked. 
+struct OperandTracer {
+  using VisitSet = DenseSet<Instruction *>;
+
+  UniformValueResult &UVR;
+  bool scalable;
+  VisitSet visited;
+  SmallVector<Instruction *> stack;
+
+  OperandTracer(UniformValueResult &uvr, bool sc) : UVR(uvr), scalable(sc) {}
+
+  void count(Instruction *I) {
+    if (visited.insert(I).second) {
+      stack.push_back(I);
+    }
+  }
+
+  void countOperand(Value *V) {
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      countInstruction(I);
+    }
+  }
+
+  void countInstruction(Instruction *I) {
+    if (scalable) {
+      if (auto *const shuffle = dyn_cast<ShuffleVectorInst>(I)) {
+        if (!isValidScalableShuffle(*shuffle)) {
+          return;
+        }
+      }
+    }
+
+    if (I->getType()->isVectorTy() && UVR.isVarying(I)) {
+      count(I);
+    }
+  }
+
+  void countOperands(Instruction *I) {
+    if (auto *Phi = dyn_cast<PHINode>(I)) {
+      for (auto &use : Phi->incoming_values()) {
+        countOperand(use.get());
+      }
+      return;
+    }
+
+    for (auto *V : I->operand_values()) {
+      countOperand(V);
+    }
+  }
+
+  void run() {
+    while (!stack.empty()) {
+      Instruction *I = stack.back();
+      stack.pop_back();
+      countOperands(I);
+    }
+  }
+};
+
+}  // namespace
+
+PreservedAnalyses ScalarizationPass::run(llvm::Function &F,
+                                         llvm::FunctionAnalysisManager &AM) {
+  VectorizationUnit &VU = AM.getResult<VectorizationUnitAnalysis>(F).getVU();
+  auto &Ctx = AM.getResult<VectorizationContextAnalysis>(F).getContext();
+  const auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+  const auto *DI =
+      MAMProxy.getCachedResult<compiler::utils::DeviceInfoAnalysis>(
+          *F.getParent());
+  const bool DoubleSupport = DI && DI->double_capabilities != 0;
+
+  const bool FullScalarization =
+      VU.choices().isEnabled(VectorizationChoices::eFullScalarization);
+  bool NeedsScalarization = false;
+  Scalarizer SR(F, Ctx, DoubleSupport);
+
+  UniformValueResult &UVR = AM.getResult<UniformValueAnalysis>(F);
+
+  // Find vector leaves that need to be scalarized.
+  std::vector<Instruction *> Leaves;
+  UVR.findVectorLeaves(Leaves);
+
+  if (FullScalarization) {
+    // Find varying vector values that need to be scalarized.
+    for (BasicBlock *BB : depth_first(&F)) {
+      for (Instruction &I : *BB) {
+        if (needsScalarization(*I.getType()) && UVR.isVarying(&I)) {
+          SR.setNeedsScalarization(&I);
+          NeedsScalarization = true;
+        }
+      }
+    }
+
+    for (Instruction *Leaf : Leaves) {
+      if (needsScalarization(*Leaf) && getVectorType(Leaf)) {
+        SR.setNeedsScalarization(Leaf);
+        NeedsScalarization = true;
+      }
+    }
+  } else {
+    // We use the tracer to identify instructions that are only used by
+    // scalar instructions (i.e. ExtractElement instructions and reductions).
+    //
+    // Since these instructions don't necessarily use all lanes of their
+    // operands, scalarization can produce dead code, which will get removed
+    // by later cleanup optimizations. Reductions are generally much better
+    // off scalarized.
+    const bool scalable = VU.width().isScalable();
+
+    OperandTracer tracer(UVR, scalable);
+    for (Instruction *Leaf : Leaves) {
+      if (needsScalarization(*Leaf) && getVectorType(Leaf)) {
+        tracer.countOperands(Leaf);
+      }
+    }
+    // Vector-to-scalar bitcasts aren't normally counted as vector leaves, but
+    // in this case we avoid unnecessary scalarization if we do.
+ for (auto &BB : F) { + for (auto &I : BB) { + if (auto *B = dyn_cast(&I)) { + if (B->getSrcTy()->isVectorTy() && !B->getDestTy()->isVectorTy() && + UVR.isVarying(B)) { + tracer.countOperands(B); + } + } + } + } + + tracer.run(); + + for (auto &BB : F) { + for (auto &I : BB) { + if (!shouldScalarize(&I, scalable)) { + continue; + } + + if (I.getType()->isVectorTy() && UVR.isVarying(&I) && + !tracer.visited.contains(&I)) { + SR.setNeedsScalarization(&I); + NeedsScalarization = true; + } + } + } + } + + if (!NeedsScalarization) { + return PreservedAnalyses::all(); + } + + if (!SR.scalarizeAll()) { + ++VeczScalarizeFail; + return VU.setFailed("Failed to scalarize"); + } + + PreservedAnalyses Preserved; + Preserved.preserve(); + Preserved.preserve(); + Preserved.preserve(); + Preserved.preserve(); + return Preserved; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp new file mode 100644 index 0000000000000..af44c92bfd780 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp @@ -0,0 +1,1583 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/scalarizer.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "debugging.h" +#include "llvm_helpers.h" +#include "memory_operations.h" +#include "simd_packet.h" +#include "transform/printf_scalarizer.h" +#include "vectorization_context.h" +#include "vecz/vecz_target_info.h" + +#define DEBUG_TYPE "vecz-scalarization" + +namespace { +/// @brief The maximum vector width that Vecz can handle. +/// +/// The current limitation is due to the masks being used in the SimdPackets +/// being stored as uint64_t. 
+const unsigned MAX_SIMD_WIDTH = 64;
+}  // namespace
+
+using namespace vecz;
+using namespace llvm;
+
+STATISTIC(VeczScalarized, "Number of instructions scalarized [ID#S00]");
+STATISTIC(VeczScalarizeFailCall,
+          "Scalarize: missing function declarations [ID#S81]");
+STATISTIC(VeczScalarizeFailBuiltin,
+          "Scalarize: non-scalarizable builtins [ID#S82]");
+STATISTIC(VeczScalarizeFailPrintf,
+          "Scalarize: failures to scalarize printf [ID#S83]");
+STATISTIC(VeczScalarizeFailCast,
+          "Scalarize: failures to scalarize cast [ID#S84]");
+STATISTIC(VeczScalarizeFailBitcast,
+          "Scalarize: failures to scalarize bitcast [ID#S85]");
+STATISTIC(VeczScalarizeFailReduceIntrinsic,
+          "Scalarize: failures to scalarize vector.reduce intrinsic [ID#S86]");
+
+Scalarizer::Scalarizer(llvm::Function &F, VectorizationContext &ctx,
+                       bool DoubleSupport)
+    : Ctx(ctx), F(F), DoubleSupport(DoubleSupport) {}
+
+SimdPacket *Scalarizer::getPacket(const Value *V, unsigned Width,
+                                  bool Create) {
+  auto infoIt = packets.find(V);
+  if (infoIt != packets.end()) {
+    return infoIt->second.get();
+  }
+
+  if (Create) {
+    auto *P = (packets[V] = std::make_unique<SimdPacket>()).get();
+    P->resize(Width);
+    return P;
+  } else {
+    return nullptr;
+  }
+}
+
+Value *Scalarizer::getGather(Value *V) {
+  auto &Cache = Gathers[V];
+  if (Cache) {
+    return Cache;
+  }
+
+  // Build the gather directly before the original instruction.
+  // If it is not an instruction just return the original.
+  auto *insert = dyn_cast<Instruction>(V);
+  if (!insert) {
+    Cache = V;
+    return V;
+  }
+
+  auto *VecTy = cast<FixedVectorType>(V->getType());
+  const unsigned SimdWidth = VecTy->getNumElements();
+
+  SimdPacket *P = getPacket(V, SimdWidth, false);
+  assert(P);
+
+  // Have to build after any PHI nodes.
+  while (isa<PHINode>(insert)) {
+    insert = insert->getNextNode();
+  }
+  IRBuilder<> B(insert);
+
+  // If every element in the packet is the same, create a vector splat instead
+  // of individually inserting every element.
+  Value *const splat = [](SimdPacket &P) -> Value * {
+    Value *const first = P.at(0);
+    for (unsigned i = 1; i < P.size(); i++) {
+      if (P.at(i) != first) {
+        return nullptr;
+      }
+    }
+    return first;
+  }(*P);
+  if (splat) {
+    return Cache =
+               B.CreateVectorSplat(ElementCount::getFixed(P->size()), splat);
+  }
+
+  Value *Result = PoisonValue::get(V->getType());
+  for (unsigned i = 0; i < P->size(); i++) {
+    if (auto *At = P->at(i)) {
+      if (!isa<UndefValue>(At)) {
+        Result = B.CreateInsertElement(Result, At, B.getInt32(i));
+      }
+    }
+  }
+
+  Cache = Result;
+  return Result;
+}
+
+void Scalarizer::setNeedsScalarization(Value *V) {
+  // Only mark each value once, but preserve the order
+  if (ScalarizeSet.insert(V).second) {
+    ToScalarize.push_back(V);
+  }
+}
+
+bool Scalarizer::scalarizeAll() {
+  // scalar instructions that use values to be scalarized.
+  for (Value *V : ToScalarize) {
+    auto *VecTy = getVectorType(V);
+    assert(VecTy && "Trying to scalarize a non-vector");
+    const unsigned SimdWidth = VecTy->getNumElements();
+    // In the SimdPacket we use a mask that is stored as a uint64_t. Due
+    // to that, there is a limit on the vector size that Vecz can
+    // handle.
+    VECZ_ERROR_IF(SimdWidth > MAX_SIMD_WIDTH, "The SIMD width is too large");
+
+    PacketMask PM;
+    PM.enableAll(SimdWidth);
+    if (!scalarize(V, PM)) {
+      return false;
+    }
+  }
+
+  // Beware of instructions not being processed strictly in dominance order.
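+  // (In other words: collect every scalar leaf in a first walk, then fix up
+  // their operands in a second walk, so the rewrite does not rely on defs
+  // being visited before uses.)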
+  DenseSet<Instruction *> ScalarLeaves;
+  for (Value *V : ToScalarize) {
+    if (Failures.contains(V)) {
+      continue;
+    }
+
+    // Any user of a scalarized instruction that is not itself scalarized needs
+    // its operands fixing up to use the scalarized versions.
+    for (auto *U : V->users()) {
+      if (auto *I = dyn_cast<Instruction>(U)) {
+        if (!ScalarizeSet.contains(I)) {
+          ScalarLeaves.insert(I);
+        }
+      }
+    }
+  }
+
+  for (Instruction *I : ScalarLeaves) {
+    if (!scalarizeOperands(I)) {
+      emitVeczRemarkMissed(&F, I, "Could not scalarize");
+      return false;
+    }
+  }
+
+  IC.deleteInstructions();
+  return true;
+}
+
+Value *Scalarizer::scalarizeOperands(Instruction *I) {
+  // Vector extractions.
+  if (ExtractElementInst *Extract = dyn_cast<ExtractElementInst>(I)) {
+    // In the SimdPacket we use a mask that is stored as a uint64_t. Due to
+    // that, there is a limit on the vector size that Vecz can handle.
+    VECZ_ERROR_IF(multi_llvm::getVectorNumElements(
+                      Extract->getVectorOperandType()) > MAX_SIMD_WIDTH,
+                  "The SIMD width is too large");
+    return scalarizeOperandsExtractElement(Extract);
+  }
+
+  // Vector -> non-vector bitcasts.
+  if (BitCastInst *BC = dyn_cast<BitCastInst>(I)) {
+    if (BC->getSrcTy()->isVectorTy() && !BC->getDestTy()->isVectorTy()) {
+      // In the SimdPacket we use a mask that is stored as a uint64_t. Due to
+      // that, there is a limit on the vector size that Vecz can handle.
+      VECZ_ERROR_IF(multi_llvm::getVectorNumElements(BC->getSrcTy()) >
+                        MAX_SIMD_WIDTH,
+                    "The SIMD width is too large");
+      return scalarizeOperandsBitCast(BC);
+    }
+  }
+
+  // printf or reduction intrinsic calls
+  if (CallInst *CI = dyn_cast<CallInst>(I)) {
+    Function *Callee = CI->getCalledFunction();
+    VECZ_STAT_FAIL_IF(!Callee, VeczScalarizeFailCall);
+
+    // printf calls:
+    if (!Callee->isIntrinsic()) {
+      // Check if this is indeed a printf call
+      const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+      if (auto B = BI.analyzeBuiltin(*Callee)) {
+        if (B->ID == BI.getPrintfBuiltin()) {
+          return scalarizeOperandsPrintf(CI);
+        }
+      }
+    }
+
+    // reduction intrinsics:
+    if (auto *Intrin = dyn_cast<IntrinsicInst>(CI)) {
+      if (auto *reduce = scalarizeReduceIntrinsic(Intrin)) {
+        return reduce;
+      }
+    }
+  }
+
+  // No special-case handling, so just gather any scalarized operands
+  for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
+    auto *Op = I->getOperand(i);
+    if (ScalarizeSet.contains(Op)) {
+      I->setOperand(i, getGather(Op));
+    }
+  }
+
+  return I;
+}
+
+Value *Scalarizer::scalarizeOperandsPrintf(CallInst *CI) {
+  VECZ_STAT_FAIL_IF(CI->arg_empty(), VeczScalarizeFailPrintf);
+
+  // Get the format string as a string
+  GlobalVariable *FmtStringGV = GetFormatStringAsValue(CI->getArgOperand(0));
+  VECZ_STAT_FAIL_IF(!FmtStringGV, VeczScalarizeFailCall);
+  const std::string FmtString = GetFormatStringAsString(FmtStringGV);
+  VECZ_STAT_FAIL_IF(FmtString.empty(), VeczScalarizeFailCall);
+  std::string NewFmtString;
+  const EnumPrintfError err =
+      ScalarizeAndCheckFormatString(FmtString, NewFmtString);
+  // Check that the format string was scalarized successfully
+  VECZ_STAT_FAIL_IF(err != kPrintfError_success, VeczScalarizeFailCall);
+
+  // Create a new global variable out of the new format string
+  GlobalVariable *NewFmtStringGV = GetNewFormatStringAsGlobalVar(
+      *CI->getModule(), FmtStringGV, NewFmtString);
+
+  IRBuilder<> B(CI);
+  // Gather the operands for the new printf call, taking care to scalarize
+  // any vector operands.
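+  // For example (illustrative only): an OpenCL call such as
+  //   printf("%v4f\n", v);
+  // has at this point had its format string rewritten to "%f,%f,%f,%f\n",
+  // so each lane of v is passed below as a separate scalar argument.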
+ llvm::SmallVector NewOps; + for (const Use &Op : CI->args()) { + // The first operand is the new format string + if (Op == *CI->arg_begin()) { + Constant *Zero = B.getInt32(0); + NewOps.push_back(B.CreateGEP(NewFmtStringGV->getValueType(), + NewFmtStringGV, {Zero, Zero})); + continue; + } + // The rest of the operands can either be copied or scalarized + if (!Op->getType()->isVectorTy()) { + // Non-vector operand, just copy + NewOps.push_back(Op.get()); + } else { + // Vector operand, scalarize + // In the SimdPacket we use a mask that is stored as a uint64_t. Due + // to that, there is a limit on the vector size that Vecz can handle. + const uint32_t SimdWidth = + multi_llvm::getVectorNumElements(Op->getType()); + VECZ_ERROR_IF(SimdWidth > MAX_SIMD_WIDTH, "The SIMD width is too large"); + PacketMask PM; + PM.enableAll(SimdWidth); + const SimdPacket *OpPacket = scalarize(Op.get(), PM); + VECZ_STAT_FAIL_IF(!OpPacket, VeczScalarizeFailCall); + for (unsigned i = 0; i < OpPacket->size(); ++i) { + Value *Lane = OpPacket->at(i); + VECZ_STAT_FAIL_IF(!Lane, VeczScalarizeFailCall); + // We need to promote half and floats to doubles, as per 6.5.2.2/6 + // in the C99 standard, but not if the device does not have double + // support, in which case we need to promote them to floats, as per + // 6.12.13.2 in the OpenCL 1.2 standard. + Type *LaneTy = Lane->getType(); + Type *PromotionType = DoubleSupport ? B.getDoubleTy() : B.getFloatTy(); + if (LaneTy->isFloatingPointTy() && + LaneTy->getPrimitiveSizeInBits() < + PromotionType->getPrimitiveSizeInBits()) { + VECZ_ERROR_IF(!LaneTy->isHalfTy() && !LaneTy->isFloatTy(), + "Unexpected floating point type"); + Lane = B.CreateFPExt(Lane, PromotionType); + } + NewOps.push_back(Lane); + } + } + } + // Create the new printf call + Function *Callee = CI->getCalledFunction(); + CallInst *NewCI = B.CreateCall(Callee, NewOps, CI->getName()); + NewCI->setCallingConv(CI->getCallingConv()); + NewCI->setAttributes(CI->getAttributes()); + + // Replace all uses of the old one with the new one + CI->replaceAllUsesWith(NewCI); + IC.deleteInstructionLater(CI); + + return NewCI; +} + +Value *Scalarizer::scalarizeReduceIntrinsic(IntrinsicInst *Intrin) { + // Mark unhandled reduce intrinsics to fail (for now) + bool isHandled = true; + Instruction::BinaryOps BinOpcode; + switch (Intrin->getIntrinsicID()) { + default: + isHandled = false; + break; + case Intrinsic::vector_reduce_and: + BinOpcode = Instruction::And; + break; + case Intrinsic::vector_reduce_or: + BinOpcode = Instruction::Or; + break; + case Intrinsic::vector_reduce_xor: + BinOpcode = Instruction::Xor; + break; + case Intrinsic::vector_reduce_add: + // TODO: Need to handle FP reduce_add (Instruction::FAdd) + if (!Intrin->getType()->isFloatTy()) { + BinOpcode = Instruction::Add; + } else { + isHandled = false; + } + break; + case Intrinsic::vector_reduce_mul: + // TODO: Need to handle FP reduce_mul (Instruction::FMul) + if (!Intrin->getType()->isFloatTy()) { + BinOpcode = Instruction::Mul; + } else { + isHandled = false; + } + break; + case Intrinsic::vector_reduce_fadd: + // TODO: Need to handle FP reduce_add + isHandled = false; + break; + case Intrinsic::vector_reduce_fmul: + // TODO: Need to handle FP reduce_mul + isHandled = false; + break; + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_umax: + // TODO: Need to handle Int (signed/unsigned) Max and FP Max + isHandled = false; + break; + case Intrinsic::vector_reduce_fmin: + case 
Intrinsic::vector_reduce_smin:
+    case Intrinsic::vector_reduce_umin:
+      // TODO: Need to handle Int (signed/unsigned) Min and FP Min
+      isHandled = false;
+      break;
+  }
+  // If it's an intrinsic we don't handle here, return nullptr and fallback
+  // to simple gathering of any scalarized operands.
+  if (!isHandled) {
+    return nullptr;
+  }
+
+  // We still need to handle reduce intrinsics with more than one operand,
+  // such as 'fadd' and 'fmul', where the first operand is a scalar and the
+  // second is the vector. However, the current scalarization analysis won't
+  // let these through and will fail, so the reduce intrinsic scalarization
+  // takes into account only the first (vector) operand, which is the only
+  // operand in the integer reduce cases.
+  Value *Vec = Intrin->getOperand(0);
+  assert(Vec && "Could not get operand 0 of Intrin");
+
+  // In the SimdPacket we use a mask that is stored as a uint64_t. Due to
+  // that, there is a limit on the vector size that Vecz can handle.
+  auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+  VECZ_FAIL_IF(!VecTy);
+  const uint32_t SimdWidth = VecTy->getNumElements();
+  VECZ_ERROR_IF(SimdWidth > MAX_SIMD_WIDTH, "The SIMD width is too large");
+
+  PacketMask PM;
+  IRBuilder<> B(Intrin);
+  PM.enableAll(SimdWidth);
+
+  const SimdPacket *Packet = scalarize(Vec, PM);
+  VECZ_STAT_FAIL_IF(!Packet, VeczScalarizeFailReduceIntrinsic);
+
+  Type *const VecEleTy = VecTy->getElementType();
+  Value *Result = ConstantInt::getNullValue(VecEleTy);
+  for (unsigned i = 0; i < Packet->size(); ++i) {
+    Value *const Lane = Packet->at(i);
+    VECZ_STAT_FAIL_IF(!Lane, VeczScalarizeFailCall);
+    Type *const LaneTy = Lane->getType();
+    VECZ_ERROR_IF(LaneTy->isFloatTy(), "Unexpected floating point type");
+    Result = B.CreateBinOp(BinOpcode, Result, Lane);
+  }
+
+  Intrin->replaceAllUsesWith(Result);
+  IC.deleteInstructionLater(Intrin);
+
+  return Result;
+}
+
+Value *Scalarizer::scalarizeOperandsExtractElement(ExtractElementInst *Extr) {
+  // Determine the extraction index.
+  Value *OrigVec = Extr->getOperand(0);
+  Value *ExtractIndex = Extr->getOperand(1);
+  assert(OrigVec && "Could not get operand 0 of Extr");
+  assert(ExtractIndex && "Could not get operand 1 of Extr");
+  ConstantInt *ConstantExtractIndex = dyn_cast<ConstantInt>(ExtractIndex);
+  PacketMask PM;
+  SimdPacket *OrigVecPacket;
+  Value *ReturnVal;
+
+  if (!ConstantExtractIndex) {
+    // Index of extractElementInst is not a constant
+    // Scalarize the original vector for all lanes.
+    auto *Vec = dyn_cast<FixedVectorType>(OrigVec->getType());
+    const unsigned VecWidth = Vec ? Vec->getNumElements() : 0;
+    PM.enableAll(VecWidth);
+    OrigVecPacket = scalarize(OrigVec, PM);
+    VECZ_FAIL_IF(!OrigVecPacket);
+
+    IRBuilder<> B(Extr);
+    Value *Select = PoisonValue::get(Extr->getType());
+    for (unsigned lane = 0; lane < VecWidth; lane++) {
+      // Check if the lane matches the extract index and select
+      // the corresponding value
+      Value *Cmp = B.CreateICmpEQ(
+          ConstantInt::get(ExtractIndex->getType(), lane), ExtractIndex);
+      Select = B.CreateSelect(Cmp, OrigVecPacket->at(lane), Select);
+    }
+    ReturnVal = Select;
+  } else {
+    // Scalarize the original vector, but only for the lane to extract.
+    const unsigned Lane = ConstantExtractIndex->getZExtValue();
+    PM.enable(Lane);
+    OrigVecPacket = scalarize(OrigVec, PM);
+    VECZ_FAIL_IF(!OrigVecPacket);
+    ReturnVal = OrigVecPacket->at(Lane);
+  }
+
+  // Replace the extraction with the extracted lane value.
+ Extr->replaceAllUsesWith(ReturnVal); + IC.deleteInstructionLater(Extr); + return ReturnVal; +} + +Value *Scalarizer::scalarizeOperandsBitCast(BitCastInst *BC) { + auto *VecSrcTy = dyn_cast(BC->getSrcTy()); + VECZ_FAIL_IF(!VecSrcTy); + const unsigned SimdWidth = VecSrcTy->getNumElements(); + PacketMask PM; + PM.enableAll(SimdWidth); + const SimdPacket *SrcPacket = scalarize(BC->getOperand(0), PM); + VECZ_FAIL_IF(!SrcPacket); + + Type *DstTy = BC->getDestTy(); + Type *DstAsIntTy = DstTy; + Type *SrcEleTy = VecSrcTy->getElementType(); + Type *SrcEleAsIntTy = SrcEleTy; + const uint64_t SrcEleBits = SrcEleTy->getScalarSizeInBits(); + const uint64_t DstBits = DstTy->getPrimitiveSizeInBits(); + if (!DstTy->isIntegerTy()) { + DstAsIntTy = IntegerType::get(BC->getContext(), DstBits); + } + if (!SrcEleTy->isIntegerTy()) { + SrcEleAsIntTy = IntegerType::get(BC->getContext(), SrcEleBits); + } + + // Successively OR each scalarized value together. + IRBuilder<> B(BC); + Value *Result = ConstantInt::getNullValue(DstAsIntTy); + for (unsigned i = 0; i < SimdWidth; i++) { + Value *Lane = SrcPacket->at(i); + if (!SrcEleTy->isIntegerTy()) { + Lane = B.CreateBitCast(Lane, SrcEleAsIntTy); + } + Lane = B.CreateZExt(Lane, DstAsIntTy); + Lane = B.CreateShl(Lane, i * SrcEleBits); + Result = B.CreateOr(Result, Lane); + } + if (!DstTy->isIntegerTy()) { + Result = B.CreateBitCast(Result, DstTy); + } + BC->replaceAllUsesWith(Result); + IC.deleteInstructionLater(BC); + return Result; +} + +SimdPacket *Scalarizer::scalarize(Value *V, PacketMask PM) { + auto *VecTy = getVectorType(V); + VECZ_ERROR_IF(!VecTy, + "We shouldn't be trying to scalarize a non-vector instruction"); + const unsigned SimdWidth = VecTy->getNumElements(); + + // Re-use cached packets, but make sure it contains all the lanes we want. + // If we have a cached packet with missing lanes, it will be fetched by + // getPacket and filled with the new lanes. + SimdPacket *CachedPacket = getPacket(V, SimdWidth, false); + if (CachedPacket && ((CachedPacket->Mask.Value & PM.Value) == PM.Value)) { + return CachedPacket; + } + + // This value hasn't been scheduled for scalarization, so extract instead + if (!V->getType()->isVoidTy() && !ScalarizeSet.contains(V)) { + return extractLanes(V, PM); + } + + // Only instructions can be scalarized at this point. + Instruction *Ins = dyn_cast(V); + if (!Ins) { + if (!V->getType()->isVoidTy()) { + return extractLanes(V, PM); + } else { + return assignScalar(nullptr, V); + } + } + + // Figure out what kind of instruction it is and try to scalarize it. 
+ SimdPacket *Result = nullptr; + switch (Ins->getOpcode()) { + default: + if (Ins->isBinaryOp()) { + Result = scalarizeBinaryOp(cast(V), PM); + } else if (Ins->isCast()) { + Result = scalarizeCast(cast(V), PM); + } else if (Ins->isUnaryOp()) { + Result = scalarizeUnaryOp(cast(V), PM); + } + break; + case Instruction::GetElementPtr: + Result = scalarizeGEP(cast(V), PM); + break; + case Instruction::Store: + Result = scalarizeStore(cast(V), PM); + break; + case Instruction::Load: + Result = scalarizeLoad(cast(V), PM); + break; + case Instruction::Call: + Result = scalarizeCall(cast(V), PM); + break; + case Instruction::ICmp: + Result = scalarizeICmp(cast(V), PM); + break; + case Instruction::FCmp: + Result = scalarizeFCmp(cast(V), PM); + break; + case Instruction::Select: + Result = scalarizeSelect(cast(V), PM); + break; + case Instruction::ShuffleVector: + Result = scalarizeShuffleVector(cast(V), PM); + break; + case Instruction::InsertElement: + Result = scalarizeInsertElement(cast(V), PM); + break; + case Instruction::PHI: + Result = scalarizePHI(cast(V), PM); + break; + // Freeze instruction is not available in LLVM versions prior 10.0 + // and not used in LLVM versions prior to 11.0 + case Instruction::Freeze: + Result = scalarizeFreeze(cast(V), PM); + break; + } + + if (Result) { + scalarizeDI(Ins, Result, SimdWidth); + return assignScalar(Result, V); + } else { + // If an instruction couldn't be scalarized, we can just extract its + // elements, but we also need to remove it from the scalarization set and + // add it to the failures set so any scalar leaves don't try to scalarize + // it again. + ScalarizeSet.erase(Ins); + Failures.insert(Ins); + return extractLanes(V, PM); + } +} + +SimdPacket *Scalarizer::extractLanes(llvm::Value *V, PacketMask PM) { + auto *VecTy = getVectorType(V); + VECZ_FAIL_IF(!VecTy); + const unsigned SimdWidth = VecTy->getNumElements(); + SimdPacket *P = getPacket(V, SimdWidth); + + if (Constant *CVec = dyn_cast(V)) { + assert(isa(CVec->getType()) && "Invalid constant type!"); + SimdPacket *P = getPacket(CVec, SimdWidth); + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + P->set(i, CVec->getAggregateElement(i)); + } + return P; + } + + Instruction *insert = nullptr; + + if (auto *Arg = dyn_cast(V)) { + BasicBlock &Entry = Arg->getParent()->getEntryBlock(); + + // Make sure we start inserting new instructions after any allocas + auto insertAfter = Entry.begin(); + + while (isa(*insertAfter)) { + insertAfter++; + } + insert = &*insertAfter; + } else if (auto *Inst = dyn_cast(V)) { + insert = Inst->getNextNode(); + while (isa(insert)) { + insert = insert->getNextNode(); + } + } else { + return nullptr; + } + + const SimplifyQuery Q(F.getParent()->getDataLayout()); + + IRBuilder<> B(insert); + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + + Value *Idx = B.getInt32(i); + Value *Extract = simplifyExtractElementInst(V, Idx, Q); + if (!Extract) { + Extract = B.CreateExtractElement(V, Idx); + } + P->set(i, Extract); + } + return P; +} + +void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet, + unsigned Width) { + // Don't support scalarizing PHI nodes + if (!Packet || !Original || isa(Original)) { + return; + } + + auto *const LAM = LocalAsMetadata::getIfExists(Original); + if (!LAM) { + return; + } + + // Contains processed SIMD values for which we create scalar debug + // instructions and is used to avoid duplicate LLVM dbg.value's. 
+ SmallPtrSet VectorElements; + + DIBuilder DIB(*Original->getModule(), false); + + for (DbgVariableRecord *const DVR : LAM->getAllDbgVariableRecordUsers()) { + DILocalVariable *DILocal = nullptr; + DebugLoc DILoc; + + switch (DVR->getType()) { + case DbgVariableRecord::LocationType::Value: + case DbgVariableRecord::LocationType::Declare: + DILocal = DVR->getVariable(); + DILoc = DVR->getDebugLoc(); + break; + default: + continue; + } + + // Create new DbgVariableRecord across enabled SIMD lanes + const auto bitSize = Original->getType()->getScalarSizeInBits(); + for (unsigned lane = 0; lane < Width; ++lane) { + Value *LaneVal = Packet->at(lane); + if (LaneVal && !isa(LaneVal)) { + // Check if the LaneVal SIMD Value is already processed + // and a Debug Value Intrinsic has been created for it. + if (VectorElements.contains(LaneVal)) { + continue; + } + // DWARF bit piece expressions are used to describe part of an + // aggregate variable, our vector, which is fragmented across multiple + // values. First argument takes the offset of the piece, and the second + // takes the piece size. + std::optional DIExpr = + DIExpression::createFragmentExpression(DIB.createExpression(), + lane * bitSize, bitSize); + if (DIExpr) { + DIB.insertDbgValueIntrinsic(LaneVal, DILocal, *DIExpr, DILoc, + Original->getIterator()); + VectorElements.insert(LaneVal); + } + } + } + } + + auto *const MDV = MetadataAsValue::getIfExists(Original->getContext(), LAM); + if (!MDV) { + return; + } +} + +SimdPacket *Scalarizer::assignScalar(SimdPacket *P, Value *V) { + if (!P) { + emitVeczRemarkMissed(&F, V, "Could not scalarize"); + } else { + ++VeczScalarized; + if (Instruction *I = dyn_cast(V)) { + IC.deleteInstructionLater(I); + } + } + return P; +} + +SimdPacket *Scalarizer::scalarizeLoad(LoadInst *Load, PacketMask PM) { + Value *PtrBase = Load->getPointerOperand(); + auto *VecDataTy = dyn_cast(Load->getType()); + VECZ_FAIL_IF(!VecDataTy); + const unsigned SimdWidth = VecDataTy->getNumElements(); + + Type *ScalarEleTy = VecDataTy->getElementType(); + + // Absorb redundant bitcasts + GetElementPtrInst *PtrGEP = dyn_cast(PtrBase); + const bool InBounds = (PtrGEP && PtrGEP->isInBounds()); + + IRBuilder<> B(Load); + + SimdPacket PtrPacket; + SimdPacket *P = getPacket(Load, SimdWidth); + PtrPacket.resize(SimdWidth); + + // Emit scalarized pointers. + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || PtrPacket.at(i)) { + continue; + } + + // Re-use GEPs if available + if (P->at(i)) { + LoadInst *LoadI = cast(P->at(i)); + Value *PtrI = LoadI->getPointerOperand(); + if (isa(PtrI)) { + PtrPacket.set(i, PtrI); + continue; + } + } + + Value *Ptr = InBounds + ? B.CreateInBoundsGEP(ScalarEleTy, PtrBase, B.getInt32(i)) + : B.CreateGEP(ScalarEleTy, PtrBase, B.getInt32(i)); + PtrPacket.set(i, Ptr); + } + + // The individual elements may need laxer alignment requirements than the + // whole vector. + const unsigned Alignment = Load->getAlign().value(); + unsigned EleAlign = ScalarEleTy->getPrimitiveSizeInBits() / 8; + EleAlign = std::min(Alignment, EleAlign); + + // Emit scalarized loads. 
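+  // Illustrative example: a load of <4 x i32> from %p becomes, per enabled
+  // lane i,
+  //   %gep.i = getelementptr i32, ptr %p, i32 i
+  //   %ld.i  = load i32, ptr %gep.i
+  // with the element alignment clamped to min(vector alignment, 4).
+  // (Names are illustrative only.)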
+ for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + LoadInst *NewLoad = B.CreateLoad(ScalarEleTy, PtrPacket.at(i), + Load->isVolatile(), Load->getName()); + + NewLoad->copyMetadata(*Load); + NewLoad->setAlignment(MaybeAlign(EleAlign).valueOrOne()); + + P->set(i, NewLoad); + } + return P; +} + +SimdPacket *Scalarizer::scalarizeStore(StoreInst *Store, PacketMask PM) { + Value *PtrBase = Store->getPointerOperand(); + assert(PtrBase && "Could not get pointer operand from Store"); + auto *VecDataTy = + dyn_cast(Store->getValueOperand()->getType()); + VECZ_FAIL_IF(!VecDataTy); + const unsigned SimdWidth = VecDataTy->getNumElements(); + Type *ScalarEleTy = VecDataTy->getElementType(); + Value *VectorData = Store->getValueOperand(); + + // Emit scalarized data values. + const SimdPacket *DataPacket = scalarize(VectorData, PM); + VECZ_FAIL_IF(!DataPacket); + + GetElementPtrInst *PtrGEP = dyn_cast(PtrBase); + const bool InBounds = (PtrGEP && PtrGEP->isInBounds()); + + IRBuilder<> B(Store); + + SimdPacket PtrPacket; + SimdPacket *P = getPacket(Store, SimdWidth); + PtrPacket.resize(SimdWidth); + + // Emit scalarized pointers. + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || PtrPacket.at(i)) { + continue; + } + + // Re-use GEPs if available + if (P->at(i)) { + StoreInst *StoreI = cast(P->at(i)); + Value *PtrI = StoreI->getPointerOperand(); + if (isa(PtrI)) { + PtrPacket.set(i, PtrI); + continue; + } + } + + Value *Ptr = InBounds + ? B.CreateInBoundsGEP(ScalarEleTy, PtrBase, B.getInt32(i)) + : B.CreateGEP(ScalarEleTy, PtrBase, B.getInt32(i)); + PtrPacket.set(i, Ptr); + } + + // See comment at equivalent part of scalarizeLoad() + const unsigned Alignment = Store->getAlign().value(); + unsigned EleAlign = ScalarEleTy->getPrimitiveSizeInBits() / 8; + EleAlign = std::min(Alignment, EleAlign); + + // Emit scalarized stores. 
+ for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + Value *Data = DataPacket->at(i); + if (isa(Data)) { + P->set(i, Data); + } else { + StoreInst *NewStore = + B.CreateStore(Data, PtrPacket.at(i), Store->isVolatile()); + + NewStore->copyMetadata(*Store); + NewStore->setAlignment(MaybeAlign(EleAlign).valueOrOne()); + + P->set(i, NewStore); + } + } + return P; +} + +SimdPacket *Scalarizer::scalarizeBinaryOp(BinaryOperator *BinOp, + PacketMask PM) { + IRBuilder<> B(BinOp); + Value *LHS = BinOp->getOperand(0); + auto *VecDataTy = dyn_cast(LHS->getType()); + VECZ_FAIL_IF(!VecDataTy); + const unsigned SimdWidth = VecDataTy->getNumElements(); + const SimdPacket *LHSPacket = scalarize(LHS, PM); + VECZ_FAIL_IF(!LHSPacket); + Value *RHS = BinOp->getOperand(1); + const SimdPacket *RHSPacket = scalarize(RHS, PM); + VECZ_FAIL_IF(!RHSPacket); + SimdPacket *P = getPacket(BinOp, SimdWidth); + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + Value *New = B.CreateBinOp(BinOp->getOpcode(), LHSPacket->at(i), + RHSPacket->at(i), BinOp->getName()); + if (BinaryOperator *NewBinOp = dyn_cast(New)) { + NewBinOp->copyIRFlags(BinOp); + } + P->set(i, New); + } + return P; +} + +// Freeze instruction is not available in LLVM versions prior 10.0 +// and not used in LLVM versions prior to 11.0 +SimdPacket *Scalarizer::scalarizeFreeze(FreezeInst *FreezeI, PacketMask PM) { + IRBuilder<> B(FreezeI); + Value *Src = FreezeI->getOperand(0); + auto *VecDataTy = dyn_cast(Src->getType()); + VECZ_FAIL_IF(!VecDataTy); + const unsigned SimdWidth = VecDataTy->getNumElements(); + const SimdPacket *SrcPacket = scalarize(Src, PM); + VECZ_FAIL_IF(!SrcPacket); + + // Create scalarized freeze. + SimdPacket *P = getPacket(FreezeI, SimdWidth); + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + Value *New = B.CreateFreeze(SrcPacket->at(i), FreezeI->getName()); + P->set(i, New); + } + return P; +} + +SimdPacket *Scalarizer::scalarizeUnaryOp(UnaryOperator *UnOp, PacketMask PM) { + IRBuilder<> B(UnOp); + Value *Src = UnOp->getOperand(0); + auto *VecDataTy = dyn_cast(Src->getType()); + VECZ_FAIL_IF(!VecDataTy); + const unsigned SimdWidth = VecDataTy->getNumElements(); + const SimdPacket *SrcPacket = scalarize(Src, PM); + VECZ_FAIL_IF(!SrcPacket); + SimdPacket *P = getPacket(UnOp, SimdWidth); + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + Value *New = + B.CreateUnOp(UnOp->getOpcode(), SrcPacket->at(i), UnOp->getName()); + if (UnaryOperator *NewUnOp = dyn_cast(New)) { + NewUnOp->copyIRFlags(UnOp); + } + P->set(i, New); + } + return P; +} + +SimdPacket *Scalarizer::scalarizeCast(CastInst *CastI, PacketMask PM) { + // Make sure we support the cast operation. + const CastInst::CastOps Opc = CastI->getOpcode(); + switch (Opc) { + default: + return nullptr; + case CastInst::BitCast: + return scalarizeBitCast(cast(CastI), PM); + case CastInst::Trunc: + case CastInst::ZExt: + case CastInst::SExt: + case CastInst::FPToUI: + case CastInst::FPToSI: + case CastInst::UIToFP: + case CastInst::SIToFP: + case CastInst::FPTrunc: + case CastInst::FPExt: + case CastInst::AddrSpaceCast: + break; + } + + // Scalarize the source value. 
+ IRBuilder<> B(CastI); + Value *Src = CastI->getOperand(0); + auto *VecSrcTy = dyn_cast(Src->getType()); + VECZ_FAIL_IF(!VecSrcTy); + const unsigned SimdWidth = VecSrcTy->getNumElements(); + auto *VecDstTy = dyn_cast(CastI->getType()); + VECZ_STAT_FAIL_IF(!VecDstTy || (VecDstTy->getNumElements() != SimdWidth), + VeczScalarizeFailCast); + const SimdPacket *SrcPacket = scalarize(Src, PM); + VECZ_FAIL_IF(!SrcPacket); + + // Create scalarized casts. + SimdPacket *P = getPacket(CastI, SimdWidth); + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + auto *const SrcPacketValue = SrcPacket->at(i); + VECZ_FAIL_IF(!SrcPacketValue); + Value *New = B.CreateCast(Opc, SrcPacketValue, VecDstTy->getElementType(), + CastI->getName()); + P->set(i, New); + } + return P; +} + +SimdPacket *Scalarizer::scalarizeBitCast(BitCastInst *BC, PacketMask PM) { + IRBuilder<> B(BC); + Type *SrcTy = BC->getSrcTy(); + Value *Src = BC->getOperand(0); + auto *VecSrcTy = dyn_cast(SrcTy); + auto *VecDstTy = dyn_cast(BC->getDestTy()); + VECZ_FAIL_IF(!VecDstTy); + const unsigned SimdWidth = VecDstTy->getNumElements(); + const bool Vec3Src = VecSrcTy && (VecSrcTy->getNumElements() == 3); + const bool Vec3Dst = (SimdWidth == 3); + VECZ_STAT_FAIL_IF(Vec3Src ^ Vec3Dst, VeczScalarizeFailBitcast); + + // Handle non-vector -> vector casts and vector casts with different widths. + if (!VecSrcTy || (VecSrcTy->getNumElements() != SimdWidth)) { + VECZ_FAIL_IF(BC->getModule()->getDataLayout().isBigEndian()); + + // Treat scalars as vectors of length 1. + SimdPacket SrcScalar{Src}; + SimdPacket &S = + VecSrcTy ? *getPacket(Src, VecSrcTy->getNumElements()) : SrcScalar; + Type *const SrcEleTy = VecSrcTy ? VecSrcTy->getElementType() : SrcTy; + // Source element need not be a primitive if it was a non-vector, but in + // that case we know the size must match the destination vector type. + const size_t SrcEleSize = VecSrcTy ? SrcEleTy->getPrimitiveSizeInBits() + : VecDstTy->getPrimitiveSizeInBits(); + Type *const SrcEleIntTy = + SrcEleTy->isIntegerTy() + ? SrcEleTy + : SrcEleTy->getIntNTy(BC->getContext(), + SrcEleTy->getPrimitiveSizeInBits()); + Type *const DstEleTy = VecDstTy->getElementType(); + const size_t DstEleSize = DstEleTy->getPrimitiveSizeInBits(); + Type *const DstEleIntTy = + DstEleTy->isIntegerTy() + ? 
DstEleTy
+            : DstEleTy->getIntNTy(BC->getContext(),
+                                  DstEleTy->getPrimitiveSizeInBits());
+    SimdPacket *P = getPacket(BC, SimdWidth);
+    PacketMask SPM;
+    for (unsigned i = 0; i < SimdWidth; i++) {
+      if (!PM.isEnabled(i) || P->at(i)) {
+        continue;
+      }
+      if (VecSrcTy) {
+        for (unsigned j = i * DstEleSize / SrcEleSize;
+             j * SrcEleSize < (i + 1) * DstEleSize; ++j) {
+          SPM.enable(j);
+        }
+        const SimdPacket *SrcPacket = scalarize(Src, SPM);
+        VECZ_FAIL_IF(!SrcPacket);
+        assert(SrcPacket == &S &&
+               "Scalarization of Src should update existing packet");
+      }
+      Value *Lane = nullptr;
+      for (unsigned j = i * DstEleSize / SrcEleSize;
+           j * SrcEleSize < (i + 1) * DstEleSize; ++j) {
+        Value *SrcPart = S[j];
+        assert(
+            SrcPart &&
+            "Scalarization of Src failure should have been detected earlier");
+        if (SrcEleIntTy != SrcEleTy) {
+          SrcPart = B.CreateBitCast(SrcPart, SrcEleIntTy);
+        }
+        if (SrcEleIntTy->getIntegerBitWidth() <
+            DstEleIntTy->getIntegerBitWidth()) {
+          SrcPart = B.CreateZExt(SrcPart, DstEleIntTy);
+        }
+        if (i * DstEleSize > j * SrcEleSize) {
+          SrcPart = B.CreateLShr(SrcPart, (i * DstEleSize) - (j * SrcEleSize));
+        } else if (j * SrcEleSize > i * DstEleSize) {
+          SrcPart = B.CreateShl(SrcPart, (j * SrcEleSize) - (i * DstEleSize));
+        }
+        if (SrcEleIntTy->getIntegerBitWidth() >
+            DstEleIntTy->getIntegerBitWidth()) {
+          SrcPart = B.CreateTrunc(SrcPart, DstEleIntTy);
+        }
+        Lane = Lane ? B.CreateOr(Lane, SrcPart) : SrcPart;
+      }
+      assert(Lane && "No bits found for lane");
+      if (DstEleTy != DstEleIntTy) {
+        Lane = B.CreateBitCast(Lane, DstEleTy);
+      }
+      P->set(i, Lane);
+    }
+    return P;
+  }
+
+  // Handle same-width vector -> vector casts, a rather more straightforward
+  // affair.
+  const SimdPacket *SrcPacket = scalarize(Src, PM);
+  VECZ_FAIL_IF(!SrcPacket);
+  Type *DstEleTy = VecDstTy->getElementType();
+  SimdPacket *P = getPacket(BC, SimdWidth);
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || P->at(i)) {
+      continue;
+    }
+    Value *NewVal = B.CreateBitCast(SrcPacket->at(i), DstEleTy);
+    P->set(i, NewVal);
+  }
+  return P;
+}
+
+SimdPacket *Scalarizer::scalarizeICmp(ICmpInst *ICmp, PacketMask PM) {
+  IRBuilder<> B(ICmp);
+  Value *LHS = ICmp->getOperand(0);
+  auto *VecDataTy = dyn_cast<FixedVectorType>(ICmp->getType());
+  VECZ_FAIL_IF(!VecDataTy);
+  const unsigned SimdWidth = VecDataTy->getNumElements();
+  const SimdPacket *LHSPacket = scalarize(LHS, PM);
+  VECZ_FAIL_IF(!LHSPacket);
+  Value *RHS = ICmp->getOperand(1);
+  const SimdPacket *RHSPacket = scalarize(RHS, PM);
+  VECZ_FAIL_IF(!RHSPacket);
+  SimdPacket *P = getPacket(ICmp, SimdWidth);
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || P->at(i)) {
+      continue;
+    }
+    Value *New = B.CreateICmp(ICmp->getPredicate(), LHSPacket->at(i),
+                              RHSPacket->at(i), ICmp->getName());
+    P->set(i, New);
+  }
+  return P;
+}
+
+SimdPacket *Scalarizer::scalarizeFCmp(FCmpInst *FCmp, PacketMask PM) {
+  IRBuilder<> B(FCmp);
+  Value *LHS = FCmp->getOperand(0);
+  auto *VecDataTy = dyn_cast<FixedVectorType>(FCmp->getType());
+  VECZ_FAIL_IF(!VecDataTy);
+  const unsigned SimdWidth = VecDataTy->getNumElements();
+  const SimdPacket *LHSPacket = scalarize(LHS, PM);
+  VECZ_FAIL_IF(!LHSPacket);
+  Value *RHS = FCmp->getOperand(1);
+  const SimdPacket *RHSPacket = scalarize(RHS, PM);
+  VECZ_FAIL_IF(!RHSPacket);
+  SimdPacket *P = getPacket(FCmp, SimdWidth);
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || P->at(i)) {
+      continue;
+    }
+    Value *New = B.CreateFCmp(FCmp->getPredicate(), LHSPacket->at(i),
+                              RHSPacket->at(i), FCmp->getName());
+    P->set(i, New);
+  }
+ return P; +} + +SimdPacket *Scalarizer::scalarizeSelect(SelectInst *Select, PacketMask PM) { + IRBuilder<> B(Select); + Value *Cond = Select->getCondition(); + const SimdPacket *CondPacket = nullptr; + if (Cond->getType()->isVectorTy()) { + CondPacket = scalarize(Cond, PM); + VECZ_FAIL_IF(!CondPacket); + } + Value *TrueVal = Select->getTrueValue(); + auto *VecDataTy = dyn_cast(Select->getType()); + VECZ_FAIL_IF(!VecDataTy); + const unsigned SimdWidth = VecDataTy->getNumElements(); + const SimdPacket *TruePacket = scalarize(TrueVal, PM); + VECZ_FAIL_IF(!TruePacket); + Value *FalseVal = Select->getFalseValue(); + const SimdPacket *FalsePacket = scalarize(FalseVal, PM); + VECZ_FAIL_IF(!FalsePacket); + SimdPacket *P = getPacket(Select, SimdWidth); + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + Value *CondLane = CondPacket ? CondPacket->at(i) : Cond; + Value *New = B.CreateSelect(CondLane, TruePacket->at(i), FalsePacket->at(i), + Select->getName()); + P->set(i, New); + } + return P; +} + +SimdPacket *Scalarizer::scalarizeMaskedMemOp(CallInst *CI, PacketMask PM, + MemOp &MaskedOp) { + Function *Callee = CI->getCalledFunction(); + VECZ_STAT_FAIL_IF(!Callee, VeczScalarizeFailCall); + auto *VecDataTy = getVectorType(CI); + VECZ_FAIL_IF(!VecDataTy); + const unsigned SimdWidth = VecDataTy->getNumElements(); + assert((MaskedOp.isLoad() || MaskedOp.isStore()) && + "Masked op is not a store or load!"); + + // Scalarize mask + Value *MaskOperand = MaskedOp.getMaskOperand(); + VECZ_FAIL_IF(!MaskOperand); + const SimdPacket *MaskPacket = scalarize(MaskedOp.getMaskOperand(), PM); + VECZ_FAIL_IF(!MaskPacket); + + Value *PtrBase = MaskedOp.getPointerOperand(); + VECZ_FAIL_IF(!PtrBase); + + // Scalarize data packet if this is a store + const SimdPacket *DataPacket = nullptr; + if (MaskedOp.isStore()) { + DataPacket = scalarize(MaskedOp.getDataOperand(), PM); + VECZ_FAIL_IF(!DataPacket); + } + + Type *ScalarEleTy = VecDataTy->getElementType(); + + GetElementPtrInst *PtrGEP = dyn_cast(PtrBase); + const bool InBounds = (PtrGEP && PtrGEP->isInBounds()); + + IRBuilder<> B(CI); + + SimdPacket PtrPacket; + SimdPacket *P = getPacket(CI, SimdWidth); + PtrPacket.resize(SimdWidth); + + // Create scalar pointers + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || PtrPacket.at(i)) { + continue; + } + + Value *Ptr = InBounds + ? 
B.CreateInBoundsGEP(ScalarEleTy, PtrBase, B.getInt32(i))
+                     : B.CreateGEP(ScalarEleTy, PtrBase, B.getInt32(i));
+    PtrPacket.set(i, Ptr);
+  }
+
+  const unsigned Alignment = MaskedOp.getAlignment();
+  unsigned EleAlign = ScalarEleTy->getPrimitiveSizeInBits() / 8;
+  EleAlign = std::min(Alignment, EleAlign);
+
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || P->at(i)) {
+      continue;
+    }
+    Instruction *ScalarMemOp = nullptr;
+    if (MaskedOp.isLoad()) {
+      ScalarMemOp =
+          createMaskedLoad(Ctx, ScalarEleTy, PtrPacket.at(i), MaskPacket->at(i),
+                           /*EVL*/ nullptr, EleAlign);
+    } else {
+      ScalarMemOp = createMaskedStore(Ctx, DataPacket->at(i), PtrPacket.at(i),
+                                      MaskPacket->at(i),
+                                      /*EVL*/ nullptr, EleAlign);
+    }
+    VECZ_FAIL_IF(!ScalarMemOp);
+    B.Insert(ScalarMemOp);
+    P->set(i, ScalarMemOp);
+  }
+
+  return P;
+}
+
+SimdPacket *Scalarizer::scalarizeCall(CallInst *CI, PacketMask PM) {
+  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  Function *Callee = CI->getCalledFunction();
+  VECZ_STAT_FAIL_IF(!Callee, VeczScalarizeFailCall);
+  auto *VecDataTy = getVectorType(CI);
+  VECZ_FAIL_IF(!VecDataTy);
+  const unsigned SimdWidth = VecDataTy->getNumElements();
+
+  if (auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked)) {
+    if (MaskedOp->isMaskedMemOp()) {
+      return scalarizeMaskedMemOp(CI, PM, *MaskedOp);
+    }
+  }
+
+  Value *VectorCallMask = nullptr;
+  if (Ctx.isMaskedFunction(Callee)) {
+    // We have a masked call to a function.
+    // Extract the mask from the call, we need to re-apply it later
+    VectorCallMask = CI->getArgOperand(CI->arg_size() - 1);
+
+    // Get the original function call from the masked wrapper function
+    Function *originalFunc = Ctx.getOriginalMaskedFunction(Callee);
+    Callee = originalFunc;
+  }
+
+  const auto Builtin = BI.analyzeBuiltin(*Callee);
+  VECZ_FAIL_IF(!Builtin);
+  Function *ScalarEquiv = BI.getScalarEquivalent(*Builtin, F.getParent());
+  VECZ_STAT_FAIL_IF(!ScalarEquiv, VeczScalarizeFailBuiltin);
+
+  IRBuilder<> B(CI);
+  const auto Props = Builtin->properties;
+  // Ignore the mask if present
+  const unsigned NumArgs = VectorCallMask ? CI->arg_size() - 1 : CI->arg_size();
+  SmallVector<SimdPacket *> OpPackets(NumArgs);
+  SmallVector<Value *> OpScalars(NumArgs);
+  for (unsigned i = 0; i < NumArgs; i++) {
+    Value *OrigOp = CI->getArgOperand(i);
+    Type *OldTy = OrigOp->getType();
+    if (OldTy->isVectorTy()) {
+      SimdPacket *OpPacket = scalarize(OrigOp, PM);
+      VECZ_FAIL_IF(!OpPacket);
+      OpPackets[i] = OpPacket;
+    } else if (PointerType *OldPtrTy = dyn_cast<PointerType>(OldTy)) {
+      auto *const PtrRetPointeeTy =
+          compiler::utils::getPointerReturnPointeeTy(*Callee, Props);
+      if (PtrRetPointeeTy && PtrRetPointeeTy->isVectorTy()) {
+        // Handle 'pointer return' arguments. The old type was Vector*, the new
+        // type is Scalar*. To accommodate the difference we need to have
+        // individual offsets, one for each 'element pointer'.
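+        // Illustrative example: for a builtin such as frexp(float4, int4 *e),
+        // lane j of the scalarized call receives
+        //   %e.j = getelementptr i32, ptr %e, i32 j
+        // so every scalar call writes through its own element pointer.
+        // (Builtin and names are illustrative only.)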
+ auto *OldVecTy = cast(PtrRetPointeeTy); + VECZ_STAT_FAIL_IF(OldVecTy->getNumElements() != SimdWidth, + VeczScalarizeFailBuiltin); + Type *NewTy = OldPtrTy; + Value *ScalarAddrBase = B.CreateBitCast(OrigOp, NewTy); + SimdPacket *OpPacket = getPacket(ScalarAddrBase, SimdWidth); + for (unsigned j = 0; j < SimdWidth; j++) { + if (!PM.isEnabled(j) || OpPacket->at(j)) { + continue; + } + Value *ScalarAddr = B.CreateGEP(OldVecTy->getElementType(), + ScalarAddrBase, B.getInt32(j)); + OpPacket->set(j, ScalarAddr); + OpPackets[i] = OpPacket; + } + } else { + OpScalars[i] = OrigOp; + } + } else { + OpScalars[i] = OrigOp; + } + } + + SimdPacket *P = getPacket(CI, SimdWidth); + for (unsigned j = 0; j < SimdWidth; j++) { + if (!PM.isEnabled(j) || P->at(j)) { + continue; + } + SmallVector Ops; + for (unsigned i = 0; i < NumArgs; i++) { + const SimdPacket *OpPacket = OpPackets[i]; + if (OpPacket) { + Ops.push_back(OpPacket->at(j)); + } else { + Value *OrigOp = OpScalars[i]; + VECZ_FAIL_IF(!OrigOp); + Ops.push_back(OrigOp); + } + } + + CallInst *NewCI = B.CreateCall(ScalarEquiv, Ops, CI->getName()); + NewCI->setCallingConv(CI->getCallingConv()); + NewCI->setAttributes(CI->getAttributes()); + // Re-apply mask. The new CI already has to exist to create the masked + // function which is why it gets updated here. We then need to add the + // mask argument back to the call, but LLVM won't let us update the existing + // one, so recreate the CallInst one last time + if (VectorCallMask) { + Function *MaskedScalarEquiv = Ctx.getOrCreateMaskedFunction(NewCI); + VECZ_FAIL_IF(!MaskedScalarEquiv); + Ops.push_back(VectorCallMask); + CallInst *NewCIMasked = + B.CreateCall(MaskedScalarEquiv, Ops, CI->getName()); + NewCIMasked->setCallingConv(CI->getCallingConv()); + NewCIMasked->setAttributes(CI->getAttributes()); + P->set(j, NewCIMasked); + NewCI->eraseFromParent(); + } else { + P->set(j, NewCI); + } + } + return P; +} + +SimdPacket *Scalarizer::scalarizeShuffleVector(ShuffleVectorInst *Shuffle, + PacketMask PM) { + auto *VecTy = dyn_cast(Shuffle->getType()); + VECZ_FAIL_IF(!VecTy); + Value *LHS = Shuffle->getOperand(0); + Value *RHS = Shuffle->getOperand(1); + assert(LHS && "Could not get operand 0"); + assert(RHS && "Could not get operand 1"); + auto *LHSVecTy = dyn_cast(LHS->getType()); + VECZ_FAIL_IF(!LHSVecTy); + const unsigned SrcWidth = LHSVecTy->getNumElements(); + const unsigned DstWidth = VecTy->getNumElements(); + + // Determine which lanes we need from both vector operands. + PacketMask LHSMask; + PacketMask RHSMask; + for (unsigned i = 0; i < DstWidth; i++) { + if (!PM.isEnabled(i)) { + continue; + } + int MaskLane = Shuffle->getMaskValue(i); + if (MaskLane >= static_cast(SrcWidth)) { + MaskLane -= static_cast(SrcWidth); + RHSMask.enable(static_cast(MaskLane)); + } else if (MaskLane >= 0) { + LHSMask.enable(static_cast(MaskLane)); + } + } + + // Scalarize each vector operand as needed. + const SimdPacket *LHSPacket = nullptr; + if (LHSMask.Value != 0) { + LHSPacket = scalarize(LHS, LHSMask); + VECZ_FAIL_IF(!LHSPacket); + } + const SimdPacket *RHSPacket = nullptr; + if (RHSMask.Value != 0) { + RHSPacket = scalarize(RHS, RHSMask); + VECZ_FAIL_IF(!RHSPacket); + } + + // Copy the scalarized values to the result packet. 
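+  // Illustrative example: for
+  //   shufflevector <4 x i32> %a, <4 x i32> %b, <i32 2, i32 5, i32 -1, i32 0>
+  // lane 0 takes %a lane 2, lane 1 takes %b lane 5 - 4 = 1, lane 2 is poison
+  // (a negative mask value denotes an undefined lane), and lane 3 takes
+  // %a lane 0.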
+ SimdPacket *P = getPacket(Shuffle, DstWidth); + for (unsigned i = 0; i < DstWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + Value *Extracted = nullptr; + int MaskLane = Shuffle->getMaskValue(i); + if (MaskLane < 0) { + Extracted = PoisonValue::get(VecTy->getElementType()); + } else if (MaskLane >= (int)SrcWidth) { + MaskLane -= (int)SrcWidth; + if (RHSPacket) { + Extracted = RHSPacket->at(MaskLane); + } + } else if (MaskLane >= 0) { + if (LHSPacket) { + Extracted = LHSPacket->at(MaskLane); + } + } + P->set(i, Extracted); + } + return P; +} + +SimdPacket *Scalarizer::scalarizeInsertElement(InsertElementInst *Insert, + PacketMask PM) { + Value *Vec = Insert->getOperand(0); + VECZ_FAIL_IF(!Vec); + Value *Ele = Insert->getOperand(1); + assert(Ele && "Could not get operand 1 of Insert"); + Value *Index = Insert->getOperand(2); + assert(Index && "Could not get operand 2 of Insert"); + const ConstantInt *CIndex = dyn_cast(Index); + const auto *VecTy = cast(Vec->getType()); + const unsigned IndexInt = CIndex ? CIndex->getZExtValue() : 0; + const unsigned SimdWidth = VecTy->getNumElements(); + + SimdPacket *P = getPacket(Insert, SimdWidth); + + // Scalarize the vector operand + PacketMask OpMask; + OpMask.enableAll(SimdWidth); + // If we have a constant mask, we can skip the lane we are not going to use + if (CIndex) { + OpMask.disable(IndexInt); + } + const SimdPacket *VecP = scalarize(Vec, OpMask); + VECZ_FAIL_IF(!VecP); + + // For each lane, we need to select either the original vector element (from + // VecP) or the new value Ele. The selection is done based on the Index. + IRBuilder<> B(Insert); + for (unsigned lane = 0; lane < SimdWidth; ++lane) { + if (!PM.isEnabled(lane) || P->at(lane)) { + continue; + } + Value *LaneValue = nullptr; + if (CIndex) { + // If the Index is a Constant, then we can do the selection at compile + // time + LaneValue = (IndexInt == lane) ? Ele : VecP->at(lane); + } else { + // If the Index is a runtime value, then we have to emit select + // instructions to do selection at runtime + Constant *LaneC = ConstantInt::get(Index->getType(), lane); + LaneValue = + B.CreateSelect(B.CreateICmpEQ(Index, LaneC), Ele, VecP->at(lane)); + } + P->set(lane, LaneValue); + } + + return P; +} + +SimdPacket *Scalarizer::scalarizeGEP(GetElementPtrInst *GEP, PacketMask PM) { + auto *const vecDataTy = dyn_cast(GEP->getType()); + VECZ_FAIL_IF(!vecDataTy); + const unsigned simdWidth = vecDataTy->getNumElements(); + + Value *const ptr = GEP->getPointerOperand(); + const SimdPacket *ptrPacket = nullptr; + if (ptr->getType()->isVectorTy()) { + ptrPacket = scalarize(ptr, PM); + VECZ_FAIL_IF(!ptrPacket); + } + + // Scalarize any vector GEP indices. 
+ SmallVector indexPackets; + for (unsigned i = 0, n = GEP->getNumIndices(); i < n; ++i) { + Value *const idx = GEP->getOperand(1 + i); + if (idx->getType()->isVectorTy()) { + SimdPacket *idxP = scalarize(idx, PM); + VECZ_FAIL_IF(!idxP); + indexPackets.push_back(idxP); + } else { + indexPackets.push_back(nullptr); + } + } + + IRBuilder<> B(GEP); + const bool inBounds = GEP->isInBounds(); + const auto name = GEP->getName(); + SimdPacket *const P = getPacket(GEP, simdWidth); + for (unsigned i = 0; i < simdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + + // Get the GEP indices per lane, scalarized or otherwise + SmallVector scalarIndices; + unsigned indexN = 1U; + for (auto *idx : indexPackets) { + if (idx) { + scalarIndices.push_back(idx->at(i)); + } else { + scalarIndices.push_back(GEP->getOperand(indexN)); + } + ++indexN; + } + + auto *const scalarPointer = ptrPacket ? ptrPacket->at(i) : ptr; + Value *const newGEP = + inBounds ? B.CreateInBoundsGEP(GEP->getSourceElementType(), + scalarPointer, scalarIndices, name) + : B.CreateGEP(GEP->getSourceElementType(), scalarPointer, + scalarIndices, name); + + P->set(i, newGEP); + } + return P; +} + +SimdPacket *Scalarizer::scalarizePHI(PHINode *Phi, PacketMask PM) { + auto *PhiTy = cast(Phi->getType()); + const unsigned Width = PhiTy->getNumElements(); + const unsigned NumIncoming = Phi->getNumIncomingValues(); + SmallVector Incoming; + + SimdPacket *P = getPacket(Phi, Width); + IRBuilder<> B(Phi); + + SmallVector ActiveLanes; + + // Start by creating the Phi nodes. This is done before everything else + // because the IR might contain cycles which will cause the scalarization to + // loop back to this Phi node when scalarizing the incoming values. + for (unsigned lane = 0; lane < Width; ++lane) { + if (!PM.isEnabled(lane) || P->at(lane)) { + continue; + } + PHINode *SPhi = + B.CreatePHI(PhiTy->getElementType(), NumIncoming, Phi->getName()); + P->set(lane, SPhi); + ActiveLanes.push_back(lane); + } + + // Scalarize the incoming values + for (auto &In : Phi->incoming_values()) { + SimdPacket *SIn = scalarize(In, PM); + VECZ_FAIL_IF(!SIn); + Incoming.push_back(SIn); + } + + // Assign the scalarized incoming values to the scalarized Phi nodes + for (const unsigned lane : ActiveLanes) { + VECZ_ERROR_IF(!PM.isEnabled(lane), "Active lane should be enabled."); + PHINode *SPhi = cast(P->at(lane)); + for (unsigned i = 0; i < NumIncoming; ++i) { + SPhi->addIncoming(Incoming[i]->at(lane), Phi->getIncomingBlock(i)); + } + } + + return P; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp new file mode 100644 index 0000000000000..d73bdfb33df85 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp @@ -0,0 +1,114 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include + +#include + +#include "debugging.h" +#include "transform/passes.h" + +using namespace llvm; + +PreservedAnalyses +vecz::SimplifyInfiniteLoopPass::run(Loop &L, LoopAnalysisManager &, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + bool modified = false; + + SmallVector loopExitBlocks; + L.getExitBlocks(loopExitBlocks); + + // If we have an infinite loop, create a virtual exit block that will target + // the unique exit block of the function. + if (loopExitBlocks.empty()) { + BasicBlock *latch = L.getLoopLatch(); + assert(latch && "Loop should have a unique latch."); + + Function *F = L.getHeader()->getParent(); + + // Get the return block of the function. + std::vector returnBlocks; + for (BasicBlock &BB : *F) { + if (isa(BB.getTerminator())) { + returnBlocks.push_back(&BB); + } + } + + if (returnBlocks.empty() || returnBlocks.size() > 1) { + assert(false && "Function should have only one exit."); + return PreservedAnalyses::all(); + } + + // The target of the virtual exit block of the infinite loop. + BasicBlock *target = returnBlocks[0]; + + // Replace the terminator of the latch with a fake conditional branch that + // will actually always target the header to maintain the semantic of the + // program. + latch->getTerminator()->eraseFromParent(); + AR.DT.deleteEdge(latch, L.getHeader()); + BasicBlock *virtualExit = + BasicBlock::Create(F->getContext(), L.getName() + ".virtual_exit", F); + AR.DT.addNewBlock(virtualExit, latch); + BranchInst::Create(L.getHeader(), virtualExit, + ConstantInt::getTrue(F->getContext()), latch); + AR.DT.insertEdge(latch, L.getHeader()); + AR.DT.insertEdge(latch, virtualExit); + BranchInst::Create(target, virtualExit); + AR.DT.insertEdge(virtualExit, target); + + assert(AR.DT.verify() && + "SimplifyInfiniteLoopPass: Dominator Tree failed verification"); + + // Update the phi nodes in the return block because we added a new + // predecessor to it. + for (Instruction &I : *target) { + if (auto *PHI = dyn_cast(&I)) { + PHI->addIncoming(PoisonValue::get(PHI->getType()), virtualExit); + } + } + + modified = true; + } else if (loopExitBlocks.size() == 1) { + // Canonicalize any other infinite loops so that the loop header is the + // true condition successor. + auto *const latch = L.getLoopLatch(); + auto *const header = L.getHeader(); + auto *const T = latch->getTerminator(); + if (auto *const branch = dyn_cast(T)) { + if (branch->isConditional()) { + if (auto *const cond = dyn_cast(branch->getCondition())) { + if (branch->getSuccessor(1) == header) { + modified = true; + auto &ctx = latch->getParent()->getContext(); + branch->setCondition(cond->isOneValue() + ? 
ConstantInt::getFalse(ctx) + : ConstantInt::getTrue(ctx)); + branch->swapSuccessors(); + } + } + } + } + } + + if (!modified) { + return PreservedAnalyses::all(); + } + + return getLoopPassPreservedAnalyses(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp new file mode 100644 index 0000000000000..4b09013f07756 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp @@ -0,0 +1,277 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include + +#include "analysis/stride_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "transform/packetization_helpers.h" +#include "transform/passes.h" + +#define DEBUG_TYPE "vecz" + +using namespace llvm; +using namespace vecz; + +/// @brief replace loads of vectors of small vector loads and stores with scalar +/// loads and stores, where the entire vector fits into a legal integer. +/// +/// The rationale here is that if we end up generating a scatter/gather, or +/// interleaved memop, it would be more efficient with the wider type than with +/// the vector of the narrower type. Although it's not trivial to know in +/// advance if we will get a scatter/gather or interleaved or contiguous load, +/// so we just do all of them and not worry too much about doing it when we +/// didn't really need to. +/// +/// Be careful not to run Instruction Combine Pass between this pass and +/// packetization, because it is likely to undo it. +PreservedAnalyses SquashSmallVectorsPass::run(Function &F, + FunctionAnalysisManager &AM) { + bool changed = false; + + const auto &UVR = AM.getResult(F); + const auto &SAR = AM.getResult(F); + auto &DL = F.getParent()->getDataLayout(); + auto &context = F.getContext(); + + // Keep a cache of the bitcasts so we don't create multiple bitcasts for the + // same value in each BasicBlock. + DenseMap squashCasts; + auto getSquashed = [&](Value *vector, Type *intTy, + IRBuilder<> &B) -> Value * { + auto *&bitCast = squashCasts[vector]; + Value *element = bitCast; + if (!element) { + if (auto *const bcast = dyn_cast(vector)) { + // "See through" existing bitcasts. + element = bcast->getOperand(0); + } else { + element = vector; + } + + if (element->getType() != intTy) { + // Note we have to freeze the vector value first, because individual + // elements can be `poison`, which would result in the entire value + // becoming `poison`, which is not a valid transform (it is not valid to + // increase the amount of `poison` in the IR). 
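+        // e.g. (illustrative IR):
+        //   %f = freeze <4 x i8> %v
+        //   %v.squash = bitcast <4 x i8> %f to i32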
+ element = B.CreateBitCast(B.CreateFreeze(element), intTy, + Twine(vector->getName(), ".squash")); + bitCast = dyn_cast(element); + } + } + return element; + }; + + SmallVector toErase; + for (auto &BB : F) { + for (auto &I : BB) { + if (auto *load = dyn_cast(&I)) { + if (!UVR.isVarying(load)) { + continue; + } + + auto *const ty = load->getType(); + auto *const scalarTy = ty->getScalarType(); + const unsigned numBits = ty->getPrimitiveSizeInBits(); + if (isPowerOf2_32(numBits) && scalarTy != ty && + DL.fitsInLegalInteger(numBits)) { + const auto align = load->getAlign(); + auto *const intTy = IntegerType::get(context, numBits); + if (DL.getABITypeAlign(intTy) > align) { + // The alignment of this type is too strict to convert + continue; + } + + auto *const ptr = load->getPointerOperand(); + const auto *const info = SAR.getInfo(ptr); + if (info && info->hasStride() && + info->getConstantMemoryStride(ty, &DL) == 1) { + // No need to perform this transform on contiguous loads + continue; + } + + IRBuilder<> B(load); + const auto name = load->getName(); + auto *newLoad = cast( + B.CreateLoad(intTy, ptr, Twine(name, ".squashed"))); + newLoad->setAlignment(align); + newLoad->copyMetadata(*load); + + auto *const newVec = + B.CreateBitCast(newLoad, ty, Twine(name, ".unsquash")); + + load->replaceAllUsesWith(newVec); + toErase.push_back(load); + changed = true; + } + } else if (auto *store = dyn_cast(&I)) { + if (!UVR.isVarying(store)) { + continue; + } + + auto *const data = store->getValueOperand(); + auto *const ty = data->getType(); + auto *const scalarTy = ty->getScalarType(); + const unsigned numBits = ty->getPrimitiveSizeInBits(); + if (isPowerOf2_32(numBits) && scalarTy != ty && + DL.fitsInLegalInteger(numBits)) { + const auto align = store->getAlign(); + auto *const intTy = IntegerType::get(context, numBits); + if (DL.getABITypeAlign(intTy) > align) { + // The alignment of this type is too strict to convert + continue; + } + + auto *const ptr = store->getPointerOperand(); + const auto *const info = SAR.getInfo(ptr); + if (info && info->hasStride() && + info->getConstantMemoryStride(ty, &DL) == 1) { + // No need to perform this transform on contiguous stores + continue; + } + + IRBuilder<> B(store); + auto *const newData = getSquashed(data, intTy, B); + auto *newStore = cast(B.CreateStore(newData, ptr)); + newStore->setAlignment(align); + newStore->copyMetadata(*store); + + toErase.push_back(store); + changed = true; + } + } else if (auto *zext = dyn_cast(&I)) { + if (!UVR.isVarying(zext)) { + continue; + } + // A zero-extend of an extract element can be squashed, if the source + // vector size is the same as the extended integer size. That is (for + // little-endian systems): + // + // zext i32(extract <4 x i8> data, i32 3) + // + // becomes: + // + // and(lshr(bitcast i32 data), i32 24), 0xFF) + // + // this avoids creating shufflevectors during packetization. + // + // We limit this optimization to vectors no larger than 64 bits in + // size. This is primarily because this optimization focuses on 'small' + // vectors but also, because LLVM's constants are limited to 64-bit + // integers, the masking logic would need to be done with extra + // instructions. 
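+        // For instance: a <16 x i8> source is 128 bits, which exceeds the
+        // 64-bit limit above, so such a zext is left alone.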
+ auto *const srcOp = zext->getOperand(0); + if (auto *const extract = dyn_cast(srcOp)) { + auto *const vector = extract->getVectorOperand(); + auto *const indexOp = extract->getIndexOperand(); + auto *const intTy = zext->getType(); + auto *const vecTy = vector->getType(); + if (vecTy->getPrimitiveSizeInBits() == + intTy->getPrimitiveSizeInBits() && + zext->getSrcTy()->getPrimitiveSizeInBits() <= 32 && + intTy->getScalarSizeInBits() <= 64 && isa(indexOp)) { + IRBuilder<> B(zext); + Value *element = getSquashed(vector, intTy, B); + + const auto bits = zext->getSrcTy()->getScalarSizeInBits(); + const auto scaled = + cast(indexOp)->getZExtValue() * bits; + + // Note on Little Endian systems, element 0 occupies the least + // significant bits of the vector. On Big Endian systems it occupies + // the most significant bits. Thus, we shift by "maximum element + // number minus current element number" times by "number of bits + // per element". + const auto shift = + DL.isBigEndian() + ? intTy->getPrimitiveSizeInBits() - bits - scaled + : scaled; + + if (shift != 0) { + element = + B.CreateLShr(element, ConstantInt::get(intTy, shift), + Twine(extract->getName(), ".squashExtract")); + } + element = B.CreateAnd( + element, + ConstantInt::get(intTy, maskTrailingOnes(bits)), + Twine(zext->getName(), ".squashZExt")); + + zext->replaceAllUsesWith(element); + toErase.push_back(zext); + changed = true; + } + } + } else if (auto *sext = dyn_cast(&I)) { + if (!UVR.isVarying(sext)) { + continue; + } + // We can squash sign extends in-place as well. + // We do this by shifting the required element into most-significant + // position, and then arithmetic-shifting it back down to the least- + // significant position. + auto *const srcOp = sext->getOperand(0); + if (auto *const extract = dyn_cast(srcOp)) { + auto *const vector = extract->getVectorOperand(); + auto *const indexOp = extract->getIndexOperand(); + auto *const intTy = sext->getType(); + auto *const vecTy = vector->getType(); + if (vecTy->getPrimitiveSizeInBits() == + intTy->getPrimitiveSizeInBits() && + isa(indexOp)) { + IRBuilder<> B(sext); + Value *element = getSquashed(vector, intTy, B); + + const auto bits = sext->getSrcTy()->getScalarSizeInBits(); + const auto shiftr = intTy->getPrimitiveSizeInBits() - bits; + const auto scaled = + cast(indexOp)->getZExtValue() * bits; + const auto shiftl = DL.isBigEndian() ? 
scaled : shiftr - scaled;
+
+            if (shiftl != 0) {
+              element =
+                  B.CreateShl(element, ConstantInt::get(intTy, shiftl),
+                              Twine(extract->getName(), ".squashExtract"));
+            }
+            element = B.CreateAShr(element, ConstantInt::get(intTy, shiftr),
+                                   Twine(extract->getName(), ".squashSExt"));
+
+            sext->replaceAllUsesWith(element);
+            toErase.push_back(sext);
+            changed = true;
+          }
+        }
+      }
+    }
+
+    // Only re-use casts within a basic block.
+    squashCasts.clear();
+  }
+
+  for (auto *I : toErase) {
+    I->eraseFromParent();
+  }
+
+  auto preserved = PreservedAnalyses::all();
+  if (changed) {
+    // The IR changed, so the stride and uniformity results are stale.
+    preserved.abandon<UniformValueAnalysis>();
+    preserved.abandon<StrideAnalysis>();
+  }
+  return preserved;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
new file mode 100644
index 0000000000000..b4ceb56dc2cd3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
@@ -0,0 +1,239 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/ternary_transform_pass.h"
+
+#include
+#include
+#include
+#include
+
+#include "analysis/stride_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "ir_cleanup.h"
+#include "memory_operations.h"
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+/// @brief Determine whether the select can and should be transformed. This is
+/// the case when the select has at most one GEP user, that GEP is only
+/// followed by load/store memory ops, and the GEP has no other users.
+/// Additionally, we reject various cases where the transform would not result
+/// in better code.
+bool shouldTransform(SelectInst *Select, const StrideAnalysisResult &SAR) {
+  // The transform only applies to pointer selects.
+  if (!Select->getType()->isPointerTy()) {
+    return false;
+  }
+
+  // There is absolutely no need to transform a uniform select.
+  if (!SAR.UVR.isVarying(Select)) {
+    return false;
+  }
+
+  {
+    // If the select itself is a strided pointer, we don't gain anything by
+    // transforming it into a pair of masked memops.
+    const auto *info = SAR.getInfo(Select);
+    if (info && info->hasStride()) {
+      return false;
+    }
+  }
+
+  // Validate Select operands
+  Value *VecTrue = Select->getOperand(1);
+  Value *VecFalse = Select->getOperand(2);
+
+  assert(VecTrue && VecFalse);
+
+  // If both pointers are uniform, it's worth doing the transform, since we get
+  // only scalar Mask Varying memops, instead of vector memops.
+  if (SAR.UVR.isVarying(VecTrue) || SAR.UVR.isVarying(VecFalse)) {
+    // Both pointers must be either strided or uniform (i.e. not divergent).
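+    // e.g. "cond ? &a[gid] : &b[gid]" with a linear gid is acceptable, but a
+    // pointer loaded per work-item may diverge arbitrarily and is rejected.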
+    const auto *infoT = SAR.getInfo(VecTrue);
+    const auto *infoF = SAR.getInfo(VecFalse);
+    if (!infoT || !infoF || infoT->mayDiverge() || infoF->mayDiverge()) {
+      return false;
+    }
+  }
+
+  // Validate Select users
+  GetElementPtrInst *TheGEP = nullptr;
+  SmallVector<Value *> SelectsUsers;
+  for (User *U : Select->users()) {
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+      // There can be at most one GEP
+      if (TheGEP) {
+        return false;
+      }
+      TheGEP = GEP;
+      SelectsUsers.push_back(GEP);
+    } else {
+      return false;
+    }
+  }
+
+  // Validate GEP users
+  while (!SelectsUsers.empty()) {
+    VECZ_FAIL_IF(!isa<GetElementPtrInst>(SelectsUsers.back()));
+    GetElementPtrInst *GEP =
+        cast<GetElementPtrInst>(SelectsUsers.pop_back_val());
+
+    // Validate the GEP indices
+    for (Value *idx : GEP->indices()) {
+      const auto *info = SAR.getInfo(idx);
+      if (!info || info->mayDiverge()) {
+        return false;
+      }
+    }
+    // We only transform selects used by GEPs that are exclusively used by
+    // scalar loads and stores. Performing this transform on vectors was
+    // historically banned due to internal limitations, but these days we
+    // *should* be able to. It's just that we don't know whether it's
+    // beneficial.
+    for (User *U : GEP->users()) {
+      if (auto *const LI = dyn_cast<LoadInst>(U)) {
+        if (LI->getType()->isVectorTy()) {
+          return false;
+        }
+      } else if (auto *const SI = dyn_cast<StoreInst>(U)) {
+        if (SI->getValueOperand()->getType()->isVectorTy()) {
+          return false;
+        }
+      } else {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// @brief Try to transform the select: remove the GEP & memory op and
+/// replace them with transformed GEPs and masked memory ops.
+void Transform(SelectInst *Select, VectorizationContext &Ctx) {
+  SmallVector<Instruction *> ToDelete;
+
+  auto transformSelect = [&](GetElementPtrInst *GEP, Instruction *Memop,
+                             Value *StoredValue, ArrayRef<Value *> Indices) {
+    // Non-obviously, we need to insert the new instructions at the GEP. The
+    // GEP is a user of the select, so the select is guaranteed to dominate
+    // the GEP; inserting at the GEP also ensures that the GEP's indices
+    // dominate the new instructions that use them.
+    IRBuilder<> B(GEP);
+
+    Value *Condition = Select->getCondition();
+    Value *InvCondition = B.CreateXor(Condition, 1);
+    Value *True = Select->getTrueValue();
+    Value *False = Select->getFalseValue();
+    Value *GepTrue = B.CreateGEP(GEP->getSourceElementType(), True, Indices);
+    Value *GepFalse = B.CreateGEP(GEP->getSourceElementType(), False, Indices);
+    auto MaskedOp = MemOp::get(Memop);
+    assert(MaskedOp);
+    const MemOpDesc Mem = MaskedOp->getDesc();
+
+    // We should have filtered out all vector memory operations earlier.
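+    // The rewrite below produces, roughly:
+    //   %t = masked.load(%gep_true,  mask = %cond)
+    //   %f = masked.load(%gep_false, mask = !%cond)
+    //   %r = select i1 %cond, %t, %f
+    // (and the analogous pair of masked stores for a store).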
+ assert(!Mem.getDataType()->isVectorTy()); + + auto Alignment = Mem.getAlignment(); + if (isa(Memop)) { + // Transform load + auto *LoadTrue = + createMaskedLoad(Ctx, Mem.getDataType(), GepTrue, Condition, + /*VL*/ nullptr, Alignment); + LoadTrue->insertBefore(Memop->getIterator()); + auto *LoadFalse = + createMaskedLoad(Ctx, Mem.getDataType(), GepFalse, InvCondition, + /*VL*/ nullptr, Alignment); + LoadFalse->insertBefore(Memop->getIterator()); + B.SetInsertPoint(Memop); + Value *LoadResult = B.CreateSelect(Condition, LoadTrue, LoadFalse); + + // Replace all uses with new value + Memop->replaceAllUsesWith(LoadResult); + } else if (isa(Memop)) { + // Transform store + createMaskedStore(Ctx, StoredValue, GepTrue, Condition, /*VL*/ nullptr, + Alignment) + ->insertBefore(Memop->getIterator()); + createMaskedStore(Ctx, StoredValue, GepFalse, InvCondition, + /*VL*/ nullptr, Alignment) + ->insertBefore(Memop->getIterator()); + } + }; + + for (User *U : Select->users()) { + if (GetElementPtrInst *GEP = dyn_cast(U)) { + ToDelete.push_back(GEP); + + const SmallVector Indices(GEP->idx_begin(), GEP->idx_end()); + + for (User *G : GEP->users()) { + if (LoadInst *Load = dyn_cast(G)) { + ToDelete.push_back(Load); + transformSelect(GEP, Load, nullptr, Indices); + } else if (StoreInst *Store = dyn_cast(G)) { + ToDelete.push_back(Store); + transformSelect(GEP, Store, Store->getValueOperand(), Indices); + } + } + } + } + + // Clean up instructions bottom-up (users first). + while (!ToDelete.empty()) { + Instruction *I = ToDelete.pop_back_val(); + if (I->use_empty()) { + IRCleanup::deleteInstructionNow(I); + } + } + + IRCleanup::deleteInstructionNow(Select); +} +} // namespace + +PreservedAnalyses TernaryTransformPass::run(llvm::Function &F, + llvm::FunctionAnalysisManager &AM) { + const auto &SAR = AM.getResult(F); + + // Find selects that can be transformed + SmallVector Selects; + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + if (SelectInst *Select = dyn_cast(&I)) { + if (shouldTransform(Select, SAR)) { + Selects.push_back(Select); + } + } + } + } + + if (Selects.empty()) { + return PreservedAnalyses::all(); + } + + auto &Ctx = AM.getResult(F).getContext(); + + // Transform them. + for (SelectInst *Select : Selects) { + Transform(Select, Ctx); + } + + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp new file mode 100644 index 0000000000000..753ec2176b38f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp @@ -0,0 +1,358 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+#include
+
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "transform/passes.h"
+
+#define DEBUG_TYPE "vecz"
+
+// WHAT THIS DOES
+//
+// Where we have some expression involving binary operators over uniform and
+// varying values, it can sometimes be advantageous to re-arrange the terms
+// to reduce the vectorization overhead. For example, we might have:
+//
+//   (Varying + Uniform) + Uniform
+//
+// The above expression requires TWO vector broadcasts of the uniform values,
+// and TWO vector additions. However, we can re-associate the operators to
+// get:
+//
+//   Varying + (Uniform + Uniform)
+//
+// In this new form, we only need a scalar addition and a single broadcast,
+// followed by a single vector addition.
+//
+// We also make the following transformations:
+//
+//   (Varying + Uniform) + Varying  ->  (Varying + Varying) + Uniform
+//   Varying + (Varying + Uniform)  ->  (Varying + Varying) + Uniform
+//
+// Although these transformations don't reduce the number of vector
+// instructions, they may reduce the vector register pressure somewhat. But
+// more importantly, they may enable further transforms on the CFG.
+//
+// A common pattern is a conditional statement like this:
+//
+//   if (uniform_condition && varying_condition) { ... }
+//
+// Control flow conversion quite often replaces the && with an & in order to
+// reduce the number of branches/basic blocks. In this case, however, that is
+// counter-productive for us, since we wish to retain the uniform branch and
+// linearize the varying one. This pass also splits up such branch conditions.
+//
+// POTENTIAL FURTHER WORK
+//
+// Currently, this pass only works on expressions involving a single kind of
+// associative and commutative operator. However, similar transformations
+// are possible with subtracts and mixtures of subtracts and additions.
+
+using namespace llvm;
+
+namespace {
+
+/// @brief Goes through all the PHI nodes in BB and duplicates the incoming
+/// values from the existing incoming block "original" to the new incoming
+/// block "extra".
+void updatePHIs(BasicBlock &BB, BasicBlock *original, BasicBlock *extra) {
+  for (auto &I : BB) {
+    auto *const PHI = dyn_cast<PHINode>(&I);
+    if (!PHI) {
+      break;
+    }
+    PHI->addIncoming(PHI->getIncomingValueForBlock(original), extra);
+  }
+}
+
+} // namespace
+
+namespace vecz {
+class Reassociator {
+public:
+  Reassociator() {}
+
+  /// @brief perform the Branch Split transformation
+  ///
+  /// @param[in] F Function to transform.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  /// @returns true iff any branches were split
+  bool run(llvm::Function &F, llvm::FunctionAnalysisManager &AM);
+
+private:
+  /// @brief classification of a binary operator according to whether its
+  /// operands are Uniform, Varying, both (Varying Op Uniform), or non-
+  /// canonically both (i.e. Uniform Op Varying).
+  enum class OpForm { Uniform, Varying, Mixed, NonCanonical };
+
+  /// @brief tries to transform a Binary Operator into a canonical form, such
+  /// that if only one operand is Uniform, it is the second operand.
+  ///
+  /// @param[in] Op the Binary Operator to transform
+  /// @returns the form of the canonicalized operator
+  OpForm canonicalizeBinOp(llvm::BinaryOperator &Op);
+
+  /// @brief tries to rearrange a binary operator expression to reduce vector
+  /// broadcasts, or to facilitate branch splitting.
+ /// + /// @param[in] Op the Binary Operator to transform + /// @returns true iff the expression was transformed + bool reassociate(llvm::BinaryOperator &Op); + + /// @brief canonicalizes a branch into a form that can be split + /// + /// @param[in] Branch the branch instruction to canonicalize + /// @returns true iff the branch condition is mixed (Varying Op Uniform) + /// and can be split into two separate branches. + bool canSplitBranch(llvm::BranchInst &Branch); + + UniformValueResult *UVR = nullptr; +}; + +Reassociator::OpForm Reassociator::canonicalizeBinOp(llvm::BinaryOperator &Op) { + if (!UVR->isVarying(&Op)) { + // Both operands are uniform + return OpForm::Uniform; + } + + if (!UVR->isVarying(Op.getOperand(0))) { + if (Op.isCommutative()) { + // canonicalize the operator so that operand 1 is uniform + Op.swapOperands(); + return OpForm::Mixed; + } + return OpForm::NonCanonical; + } + + if (!UVR->isVarying(Op.getOperand(1))) { + return OpForm::Mixed; + } + + // Both operands are varying + return OpForm::Varying; +} + +bool Reassociator::reassociate(llvm::BinaryOperator &Op) { + if (!Op.isAssociative() || !Op.isCommutative()) { + return false; + } + + const auto Opcode = Op.getOpcode(); + auto *const LHS = Op.getOperand(0); + auto *const RHS = Op.getOperand(1); + + auto *const A = dyn_cast(LHS); + if (A && A->getOpcode() == Opcode && A->hasNUses(1) && + canonicalizeBinOp(*A) == OpForm::Mixed) { + if (UVR->isVarying(RHS)) { + // Transform (Varying Op Uniform) Op Varying + // into (Varying Op Varying) Op Uniform + auto *const P = BinaryOperator::Create(Opcode, A->getOperand(0), RHS, + "varying.reassoc"); + P->insertBefore(Op.getIterator()); + UVR->setVarying(P); + Op.setOperand(0, P); + Op.setOperand(1, A->getOperand(1)); + UVR->remove(A); + A->eraseFromParent(); + return true; + } else { + // Transform (Varying Op Uniform) Op Uniform + // into Varying Op (Uniform Op Uniform) + auto *const P = BinaryOperator::Create(Opcode, A->getOperand(1), RHS, + "uniform.reassoc"); + P->insertBefore(Op.getIterator()); + Op.setOperand(0, A->getOperand(0)); + Op.setOperand(1, P); + UVR->remove(A); + A->eraseFromParent(); + return true; + } + } + + auto *const B = dyn_cast(RHS); + if (B && B->getOpcode() == Opcode && B->hasNUses(1) && + canonicalizeBinOp(*B) == OpForm::Mixed) { + // Transform Varying Op (Varying Op Uniform) + // into (Varying Op Varying) Op Uniform + auto *const P = BinaryOperator::Create(Opcode, B->getOperand(0), LHS, + "varying.reassoc"); + P->insertBefore(Op.getIterator()); + Op.setOperand(0, P); + Op.setOperand(1, B->getOperand(1)); + UVR->setVarying(P); + UVR->remove(B); + B->eraseFromParent(); + return true; + } + + return false; +} + +bool Reassociator::canSplitBranch(BranchInst &Branch) { + if (auto *Op = dyn_cast(Branch.getCondition())) { + auto Opcode = Op->getOpcode(); + if (Opcode == Instruction::Or || Opcode == Instruction::And) { + auto Form = canonicalizeBinOp(*Op); + if (Form == OpForm::Mixed) { + return true; + } + } + } + return false; +} + +bool Reassociator::run(llvm::Function &F, llvm::FunctionAnalysisManager &AM) { + auto *DT = &AM.getResult(F); + LoopInfo *LI = nullptr; + UVR = &AM.getResult(F); + + // Iterate over all instructions in dominance order, so that we always + // transform an expression before any of its uses. 
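+  // e.g. ((V + U1) + U2) is rewritten to V + (U1 + U2) when first visited;
+  // a dominated use such as (that + U3) then sees a canonical Mixed operand
+  // and can fold its own uniform term the same way in this single walk.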
+ SmallVector Blocks; + DT->getDescendants(&F.getEntryBlock(), Blocks); + + SmallVector SplitBranches; + for (auto *const BB : Blocks) { + for (auto Iit = BB->begin(); Iit != BB->end();) { + auto &I = *(Iit++); + if (auto *BinOp = dyn_cast(&I)) { + const auto form = canonicalizeBinOp(*BinOp); + if (form == OpForm::Varying || form == OpForm::Mixed) { + reassociate(*BinOp); + } + } else if (auto *Branch = dyn_cast(&I)) { + if (Branch->isConditional() && Branch->getNumSuccessors() == 2 && + canSplitBranch(*Branch)) { + // Lazily obtain the Loop Info + if (!LI) { + LI = &AM.getResult(F); + } + + if (auto *const L = LI->getLoopFor(BB)) { + if (L->isLoopExiting(BB)) { + // No need to do this transform on loop exits (?) + continue; + } + } + + SplitBranches.push_back(Branch); + } + } + } + } + + if (SplitBranches.empty()) { + return false; + } + + auto *PDT = &AM.getResult(F); + + do { + auto *Branch = SplitBranches.back(); + SplitBranches.pop_back(); + BasicBlock *BB = Branch->getParent(); + + BasicBlock *newBB = SplitBlock(BB, Branch, DT, LI); + newBB->setName(Twine(BB->getName(), ".cond_split")); + + // update the PostDominatorTree manually.. + PDT->addNewBlock(newBB, PDT->getNode(BB)->getIDom()->getBlock()); + + // Remove the unconditional branch created by splitting.. + BB->getTerminator()->eraseFromParent(); + + auto *Cond = cast(Branch->getCondition()); + auto *varyingCond = Cond->getOperand(0); + auto *uniformCond = Cond->getOperand(1); + + // Create a new Uniform branch condition to the Return block.. + // Note that a conditional branch's successors are returned in reverse + // order, relative to how they appear in the IR, with the "true" target + // last. However, "getSuccessor(n)" also indexes backwards, from the end. + auto Opcode = Cond->getOpcode(); + + if (Opcode == Instruction::Or) { + BasicBlock *SuccT = Branch->getSuccessor(0); + + BranchInst::Create(SuccT, newBB, uniformCond, BB); + Branch->setCondition(varyingCond); + + // If the branch target has PHI nodes, they need to get an extra target + updatePHIs(*SuccT, newBB, BB); + + // Update Dominator and PostDominator trees.. + DT->insertEdge(BB, SuccT); + PDT->insertEdge(BB, SuccT); + } else { + BasicBlock *SuccF = Branch->getSuccessor(1); + + BranchInst::Create(newBB, SuccF, uniformCond, BB); + Branch->setCondition(varyingCond); + + // If the branch target has PHI nodes, they need to get an extra target + updatePHIs(*SuccF, newBB, BB); + + // Update Dominator and PostDominator trees.. + DT->insertEdge(BB, SuccF); + PDT->insertEdge(BB, SuccF); + } + + // If we made the condition dead, we can delete it + if (Cond->use_empty()) { + Cond->eraseFromParent(); + } + + // The branch may still have a mixed condition after splitting.. 
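+    // e.g. br ((V & U1) & U2): the first split peels off the uniform U2,
+    // leaving br (V & U1), which is still Mixed and so is re-queued.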
+    if (canSplitBranch(*Branch)) {
+      SplitBranches.push_back(Branch);
+    }
+  } while (!SplitBranches.empty());
+
+  assert(DT->verify() && "Reassociator: Dominator Tree failed verification");
+
+  assert(PDT->verify() &&
+         "Reassociator: Post-Dominator Tree failed verification");
+
+  if (LI) {
+    // Unlike the dominator trees, LoopInfo::verify() returns void and asserts
+    // internally on failure.
+    LI->verify(*DT);
+  }
+
+  return true;
+}
+
+/// @brief reassociate uniform binary operators and split branches
+PreservedAnalyses UniformReassociationPass::run(Function &F,
+                                                FunctionAnalysisManager &AM) {
+  Reassociator reassociator;
+  const bool changed = reassociator.run(F, AM);
+  (void)changed;
+
+  // DT, PDT and LI are maintained manually during splitting, and the uniform
+  // value results are kept up to date as operators are reassociated.
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<PostDominatorTreeAnalysis>();
+  PA.preserve<LoopAnalysis>();
+  PA.preserve<UniformValueAnalysis>();
+  return PA;
+}
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
new file mode 100644
index 0000000000000..b22b7f1816f30
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -0,0 +1,1340 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "debugging.h"
+#include "memory_operations.h"
+#include "transform/packetization_helpers.h"
+#include "vecz/vecz_target_info.h"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+/// @brief Applies @a EVL to @a Mask, clearing those bits in a position greater
+/// than @a EVL.
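+/// e.g. (illustrative) with EVL = 3 and a 4-wide mask, <1,1,1,1> becomes
+/// <1,1,1,0>: a step vector is compared ULT against the splatted EVL and
+/// ANDed into the mask.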
+Value *applyEVLToMask(IRBuilder<> &B, Value *EVL, Value *Mask) { + if (EVL) { + auto *const IndexVector = B.CreateStepVector(VectorType::get( + EVL->getType(), multi_llvm::getVectorElementCount(Mask->getType()))); + auto *const Splat = B.CreateVectorSplat( + multi_llvm::getVectorElementCount(Mask->getType()), EVL); + auto *const M = B.CreateICmpULT(IndexVector, Splat); + Mask = B.CreateLogicalAnd(Mask, M); + } + return Mask; +} + +bool isLegalMaskedLoad(const TargetTransformInfo &TTI, Type *Ty, + unsigned Alignment, unsigned AddrSpace) { + return multi_llvm::isLegalMaskedLoad(TTI, Ty, Align(Alignment), AddrSpace); +} + +bool isLegalMaskedStore(const TargetTransformInfo &TTI, Type *Ty, + unsigned Alignment, unsigned AddrSpace) { + return multi_llvm::isLegalMaskedStore(TTI, Ty, Align(Alignment), AddrSpace); +} + +bool isLegalMaskedGather(const TargetTransformInfo &TTI, Type *Ty, + unsigned Alignment, unsigned) { + return TTI.isLegalMaskedGather(Ty, Align(Alignment)); +} + +bool isLegalMaskedScatter(const TargetTransformInfo &TTI, Type *Ty, + unsigned Alignment, unsigned) { + return TTI.isLegalMaskedScatter(Ty, Align(Alignment)); +} +} // namespace + +// NOTE the TargetMachine is allowed to be null here; it isn't used in the +// implementation at present, but if it gets used in future it needs to be +// guarded. +TargetInfo::TargetInfo(TargetMachine *tm) : TM_(tm) {} + +Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr, + Value *Stride, unsigned Alignment, + Value *EVL) const { + if (!Ptr || !Stride || !Ty->isVectorTy()) { + return nullptr; + } + + // Validate the pointer type. + PointerType *PtrTy = dyn_cast(Ptr->getType()); + if (!PtrTy) { + return nullptr; + } + Type *EleTy = Ty->getScalarType(); + + // Trivial case: contiguous load. + ConstantInt *CIntStride = dyn_cast(Stride); + if (CIntStride && CIntStride->getSExtValue() == 1) { + if (EVL) { + const Function *F = B.GetInsertBlock()->getParent(); + const auto Legality = + isVPLoadLegal(F, Ty, Alignment, PtrTy->getAddressSpace()); + if (!Legality.isVPLegal()) { + emitVeczRemarkMissed(F, "Could not create a VP load as the target " + "reported it would be illegal"); + VECZ_FAIL(); + } + auto *Mask = createAllTrueMask(B, multi_llvm::getVectorElementCount(Ty)); + const SmallVector Args = {Ptr, Mask, EVL}; + const SmallVector Tys = {Ty, Ptr->getType()}; + return B.CreateIntrinsic(llvm::Intrinsic::vp_load, Tys, Args); + } + return B.CreateAlignedLoad(Ty, Ptr, MaybeAlign(Alignment)); + } + + if (EVL) { + emitVeczRemarkMissed( + B.GetInsertBlock()->getParent(), Ptr, + "Could not create vector-length-predicated interleaved load"); + return nullptr; + } + + auto Elts = multi_llvm::getVectorElementCount(Ty); + if (Elts.isScalable()) { + emitVeczRemarkMissed(B.GetInsertBlock()->getParent(), Ptr, + "Could not create a scalable-vector interleaved load"); + VECZ_FAIL(); + } + const unsigned SimdWidth = Elts.getFixedValue(); + // Load individual values. + SmallVector Values; + Value *Index = B.getInt64(0); + for (unsigned i = 0; i < SimdWidth; i++) { + Value *GEP = B.CreateGEP(EleTy, Ptr, Index); + Values.push_back(B.CreateLoad(EleTy, GEP, false, "interleaved.load")); + Index = B.CreateAdd(Index, Stride); + } + + // Create a vector out of these values. 
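+  // i.e. a chain of insertelement instructions: lane i of the result is
+  // Values[i], starting from a poison vector.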
+ Value *Result = PoisonValue::get(Ty); + for (unsigned i = 0; i < SimdWidth; i++) { + Result = B.CreateInsertElement(Result, Values[i], B.getInt32(i)); + } + return Result; +} + +Value *TargetInfo::createStore(IRBuilder<> &B, Value *Data, Value *Ptr, + Value *Stride, unsigned Alignment, + Value *EVL) const { + if (!Ptr || !Data || !Stride) { + return nullptr; + } + + // Validate the pointer type. + PointerType *PtrTy = dyn_cast(Ptr->getType()); + if (!PtrTy) { + return nullptr; + } + Type *VecTy = Data->getType(); + Type *EleTy = VecTy->getScalarType(); + + // Trivial case: contiguous store. + ConstantInt *CIntStride = dyn_cast(Stride); + if (CIntStride && CIntStride->getSExtValue() == 1) { + if (EVL) { + const Function *F = B.GetInsertBlock()->getParent(); + const auto Legality = + isVPStoreLegal(F, VecTy, Alignment, PtrTy->getAddressSpace()); + if (!Legality.isVPLegal()) { + emitVeczRemarkMissed(F, "Could not create a VP store as the target " + "reported it would be illegal"); + VECZ_FAIL(); + } + auto *Mask = + createAllTrueMask(B, multi_llvm::getVectorElementCount(VecTy)); + const SmallVector Args = {Data, Ptr, Mask, EVL}; + const SmallVector Tys = {Data->getType(), + Ptr->getType()}; + return B.CreateIntrinsic(llvm::Intrinsic::vp_store, Tys, Args); + } + return B.CreateAlignedStore(Data, Ptr, MaybeAlign(Alignment)); + } + + if (EVL) { + emitVeczRemarkMissed( + B.GetInsertBlock()->getParent(), Ptr, + "Could not create vector-length-predicated interleaved store"); + return nullptr; + } + + auto Elts = multi_llvm::getVectorElementCount(VecTy); + if (Elts.isScalable()) { + emitVeczRemarkMissed( + B.GetInsertBlock()->getParent(), Ptr, + "Could not create a scalable-vector interleaved store"); + VECZ_FAIL(); + } + const unsigned SimdWidth = Elts.getFixedValue(); + // Extract values from the vector. + SmallVector Values; + for (unsigned i = 0; i < SimdWidth; i++) { + Values.push_back(B.CreateExtractElement(Data, B.getInt32(i))); + } + + // Store individual values. + Value *Ret = nullptr; + Value *Index = B.getInt64(0); + for (unsigned i = 0; i < SimdWidth; i++) { + Value *GEP = B.CreateGEP(EleTy, Ptr, Index); + Ret = B.CreateStore(Values[i], GEP); + cast(Ret)->setAlignment(MaybeAlign(Alignment).valueOrOne()); + + Index = B.CreateAdd(Index, Stride); + } + return Ret; +} + +Value *TargetInfo::createMaskedLoad(IRBuilder<> &B, Type *Ty, Value *Ptr, + Value *Mask, Value *EVL, + unsigned Alignment) const { + VECZ_FAIL_IF(!Ptr || !Mask); + PointerType *PtrTy = dyn_cast(Ptr->getType()); + VECZ_FAIL_IF(!PtrTy); + Type *EleTy = Ty->getScalarType(); + + // Validate the pointer and mask types. + auto *DataVecTy = dyn_cast(Ty); + auto *MaskVecTy = dyn_cast(Mask->getType()); + if (DataVecTy && MaskVecTy) { + VECZ_ERROR_IF(multi_llvm::getVectorElementCount(DataVecTy) != + multi_llvm::getVectorElementCount(MaskVecTy), + "The mask and the data need to have the same width"); + } + + // Use LLVM intrinsics for masked vector loads. 
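+  // Preference order: llvm.vp.load when an EVL is supplied and the target
+  // reports it legal, else llvm.masked.load with the EVL folded into the
+  // mask, else fail; scalar (non-vector) types use the CFG expansion below.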
+ if (Ty->isVectorTy()) { + const Function *F = B.GetInsertBlock()->getParent(); + const auto Legality = + isVPLoadLegal(F, Ty, Alignment, PtrTy->getAddressSpace()); + if (EVL && Legality.isVPLegal()) { + const SmallVector Args = {Ptr, Mask, EVL}; + const SmallVector Tys = {Ty, PtrTy}; + return B.CreateIntrinsic(llvm::Intrinsic::vp_load, Tys, Args); + } else if (Legality.isMaskLegal()) { + Mask = applyEVLToMask(B, EVL, Mask); + VECZ_FAIL_IF(!Mask); + return B.CreateMaskedLoad(Ty, Ptr, Align(Alignment), Mask); + } else { + emitVeczRemarkMissed(F, "Could not create a masked load as the target " + "reported it would be illegal"); + VECZ_FAIL(); + } + } + + const unsigned Width = 1; + + LLVMContext &Ctx = B.getContext(); + BasicBlock *Entry = B.GetInsertBlock(); + BasicBlock *Exit = nullptr; + Function *F = Entry->getParent(); + VECZ_FAIL_IF(!F || !Ptr || !Mask || EVL); + + // Create all the required blocks. + SmallVector TestBlocks; + SmallVector LoadBlocks; + TestBlocks.push_back(Entry); + LoadBlocks.push_back(BasicBlock::Create(Ctx, "masked_load", F)); + for (unsigned i = 1; i < Width; i++) { + TestBlocks.push_back(BasicBlock::Create(Ctx, "test_mask", F)); + LoadBlocks.push_back(BasicBlock::Create(Ctx, "masked_load", F)); + } + Exit = BasicBlock::Create(Ctx, "masked_load_exit", F); + + Constant *const DefaultEleData = PoisonValue::get(EleTy); + SmallVector LoadedLanes; + SmallVector LanePhis; + for (unsigned i = 0; i < Width; i++) { + BasicBlock *Next = ((i + 1) < Width) ? TestBlocks[i + 1] : Exit; + + // Extract the mask elements and branch. + B.SetInsertPoint(TestBlocks[i]); + if (i > 0) { + PHINode *LanePhi = B.CreatePHI(EleTy, 2, "result_lane"); + LanePhi->addIncoming(LoadedLanes[i - 1], LoadBlocks[i - 1]); + LanePhi->addIncoming(DefaultEleData, TestBlocks[i - 1]); + LanePhis.push_back(LanePhi); + } + + Value *MaskLane = + (Width == 1) ? Mask + : B.CreateExtractElement(Mask, B.getInt32(i), "mask_lane"); + B.CreateCondBr(MaskLane, LoadBlocks[i], Next); + + // Load the element and branch. + B.SetInsertPoint(LoadBlocks[i]); + Value *LanePtr = + i > 0 ? B.CreateGEP(EleTy, Ptr, B.getInt32(i), "lane_ptr") : Ptr; + LoadInst *Load = B.CreateLoad(EleTy, LanePtr, false, "masked_load"); + Load->setAlignment(MaybeAlign(Alignment).valueOrOne()); + LoadedLanes.push_back(Load); + B.CreateBr(Next); + } + + // Aggregate the loaded lanes. + B.SetInsertPoint(Exit); + PHINode *LastLanePhi = B.CreatePHI(EleTy, 2, "result_lane"); + LastLanePhi->addIncoming(LoadedLanes[Width - 1], LoadBlocks[Width - 1]); + LastLanePhi->addIncoming(DefaultEleData, TestBlocks[Width - 1]); + LanePhis.push_back(LastLanePhi); + + Value *Result = nullptr; + if (Width > 1) { + Result = PoisonValue::get(Ty); + for (unsigned i = 0; i < Width; i++) { + Result = B.CreateInsertElement(Result, LanePhis[i], B.getInt32(i)); + } + } else { + Result = LanePhis[Width - 1]; + } + + return Result; +} + +Value *TargetInfo::createMaskedStore(IRBuilder<> &B, Value *Data, Value *Ptr, + Value *Mask, Value *EVL, + unsigned Alignment) const { + PointerType *PtrTy = dyn_cast(Ptr->getType()); + VECZ_FAIL_IF(!PtrTy); + Type *DataTy = Data->getType(); + Type *EleTy = DataTy->getScalarType(); + + auto *DataVecTy = dyn_cast(DataTy); + auto *MaskVecTy = dyn_cast(Mask->getType()); + if (DataVecTy && MaskVecTy) { + VECZ_ERROR_IF(multi_llvm::getVectorElementCount(DataVecTy) != + multi_llvm::getVectorElementCount(MaskVecTy), + "The mask and the data need to have the same width"); + } + + // Use LLVM intrinsics for masked vector Stores. 
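+  // Mirrors createMaskedLoad: prefer llvm.vp.store with an EVL, then
+  // llvm.masked.store with the EVL folded into the mask, else fail; scalar
+  // types use the branchy expansion below.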
+ if (DataTy->isVectorTy()) { + const Function *F = B.GetInsertBlock()->getParent(); + const auto Legality = + isVPStoreLegal(F, DataTy, Alignment, PtrTy->getAddressSpace()); + if (EVL && Legality.isVPLegal()) { + const SmallVector Args = {Data, Ptr, Mask, EVL}; + const SmallVector Tys = {Data->getType(), PtrTy}; + return B.CreateIntrinsic(llvm::Intrinsic::vp_store, Tys, Args); + } else if (Legality.isMaskLegal()) { + Mask = applyEVLToMask(B, EVL, Mask); + VECZ_FAIL_IF(!Mask); + return B.CreateMaskedStore(Data, Ptr, Align(Alignment), Mask); + } else { + emitVeczRemarkMissed(F, "Could not create a masked store as the target " + "reported it would be illegal"); + VECZ_FAIL(); + } + } + + const unsigned Width = 1; + + LLVMContext &Ctx = B.getContext(); + BasicBlock *Entry = B.GetInsertBlock(); + BasicBlock *Exit = nullptr; + StoreInst *FirstStore = nullptr; + Function *F = Entry->getParent(); + VECZ_FAIL_IF(!F || EVL); + + // Create all the required blocks. + SmallVector TestBlocks; + SmallVector StoreBlocks; + TestBlocks.push_back(Entry); + StoreBlocks.push_back(BasicBlock::Create(Ctx, "masked_store", F)); + for (unsigned i = 1; i < Width; i++) { + TestBlocks.push_back(BasicBlock::Create(Ctx, "test_mask", F)); + StoreBlocks.push_back(BasicBlock::Create(Ctx, "masked_store", F)); + } + Exit = BasicBlock::Create(Ctx, "masked_store_exit", F); + + for (unsigned i = 0; i < Width; i++) { + BasicBlock *Next = ((i + 1) < Width) ? TestBlocks[i + 1] : Exit; + + // Extract the mask elements and branch. + B.SetInsertPoint(TestBlocks[i]); + Value *MaskLane = + (Width == 1) ? Mask + : B.CreateExtractElement(Mask, B.getInt32(i), "mask_lane"); + B.CreateCondBr(MaskLane, StoreBlocks[i], Next); + + // Extract the data elements and store. + B.SetInsertPoint(StoreBlocks[i]); + Value *DataLane = + (Width == 1) ? 
Data + : B.CreateExtractElement(Data, B.getInt32(i), "data_lane"); + Value *LanePtr = Ptr; + if (i > 0) { + LanePtr = B.CreateGEP(EleTy, LanePtr, B.getInt32(i), "lane_ptr"); + } + StoreInst *Store = B.CreateStore(DataLane, LanePtr); + if (i == 0) { + FirstStore = Store; + } + Store->setAlignment(MaybeAlign(Alignment).valueOrOne()); + B.CreateBr(Next); + } + + B.SetInsertPoint(Exit); + return FirstStore; +} + +Value *TargetInfo::createInterleavedLoad(IRBuilder<> &B, Type *Ty, Value *Ptr, + Value *Stride, Value *EVL, + unsigned Alignment) const { + auto EC = multi_llvm::getVectorElementCount(Ty); + auto *const Mask = B.CreateVectorSplat(EC, B.getTrue()); + return createMaskedInterleavedLoad(B, Ty, Ptr, Mask, Stride, EVL, Alignment); +} + +Value *TargetInfo::createInterleavedStore(IRBuilder<> &B, Value *Data, + Value *Ptr, Value *Stride, Value *EVL, + unsigned Alignment) const { + auto EC = multi_llvm::getVectorElementCount(Data->getType()); + auto *const Mask = B.CreateVectorSplat(EC, B.getTrue()); + return createMaskedInterleavedStore(B, Data, Ptr, Mask, Stride, EVL, + Alignment); +} + +Value *TargetInfo::createMaskedInterleavedLoad(IRBuilder<> &B, Type *Ty, + Value *Ptr, Value *Mask, + Value *Stride, Value *EVL, + unsigned Alignment) const { + // We only support scalar pointer types + assert(!Ptr->getType()->isVectorTy() && "Unsupported interleaved load"); + + auto EC = multi_llvm::getVectorElementCount(Ty); + Value *BroadcastAddr = B.CreateVectorSplat(EC, Ptr, "BroadcastAddr"); + Value *StrideSplat = B.CreateVectorSplat(EC, Stride); + + Value *IndicesVector = + createIndexSequence(B, cast(StrideSplat->getType())); + VECZ_FAIL_IF(!IndicesVector); + IndicesVector = B.CreateMul(StrideSplat, IndicesVector); + + Value *Address = + B.CreateGEP(Ty->getScalarType(), BroadcastAddr, IndicesVector); + + return createMaskedGatherLoad(B, Ty, Address, Mask, EVL, Alignment); +} + +Value *TargetInfo::createMaskedInterleavedStore(IRBuilder<> &B, Value *Data, + Value *Ptr, Value *Mask, + Value *Stride, Value *EVL, + unsigned Alignment) const { + // We only support scalar pointer types + assert(!Ptr->getType()->isVectorTy() && "Unsupported interleaved store"); + auto EC = multi_llvm::getVectorElementCount(Data->getType()); + Value *BroadcastAddr = B.CreateVectorSplat(EC, Ptr, "BroadcastAddr"); + Value *StrideSplat = B.CreateVectorSplat(EC, Stride); + + Value *IndicesVector = + createIndexSequence(B, cast(StrideSplat->getType())); + VECZ_FAIL_IF(!IndicesVector); + IndicesVector = B.CreateMul(StrideSplat, IndicesVector); + + Value *Address = B.CreateGEP(Data->getType()->getScalarType(), BroadcastAddr, + IndicesVector); + + return createMaskedScatterStore(B, Data, Address, Mask, EVL, Alignment); +} + +Value *TargetInfo::createGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr, + Value *EVL, unsigned Alignment) const { + auto EC = multi_llvm::getVectorElementCount(Ty); + auto *const Mask = B.CreateVectorSplat(EC, B.getTrue()); + return createMaskedGatherLoad(B, Ty, Ptr, Mask, EVL, Alignment); +} + +Value *TargetInfo::createScatterStore(IRBuilder<> &B, Value *Data, Value *Ptr, + Value *EVL, unsigned Alignment) const { + auto EC = multi_llvm::getVectorElementCount(Data->getType()); + auto *const Mask = B.CreateVectorSplat(EC, B.getTrue()); + return createMaskedScatterStore(B, Data, Ptr, Mask, EVL, Alignment); +} + +Value *TargetInfo::createMaskedGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr, + Value *Mask, Value *EVL, + unsigned Alignment) const { + LLVMContext &Ctx = B.getContext(); + BasicBlock *Entry = 
B.GetInsertBlock(); + BasicBlock *Exit = nullptr; + Function *F = Entry->getParent(); + VECZ_FAIL_IF(!F || !Ptr || !Mask); + + auto *VecPtrTy = dyn_cast(Ptr->getType()); + VECZ_FAIL_IF(!VecPtrTy); + PointerType *PtrTy = dyn_cast(VecPtrTy->getElementType()); + VECZ_FAIL_IF(!PtrTy); + Type *EleTy = Ty->getScalarType(); + Constant *DefaultEleData = PoisonValue::get(EleTy); + + if (Ty->isVectorTy()) { + const auto Legality = + isVPGatherLegal(F, Ty, Alignment, PtrTy->getAddressSpace()); + if (EVL && Legality.isVPLegal()) { + const SmallVector Args = {Ptr, Mask, EVL}; + const SmallVector Tys = {Ty, VecPtrTy}; + return B.CreateIntrinsic(llvm::Intrinsic::vp_gather, Tys, Args); + } else if (Legality.isMaskLegal()) { + Function *MaskedGather = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::masked_gather, {Ty, VecPtrTy}); + + if (MaskedGather) { + Mask = applyEVLToMask(B, EVL, Mask); + VECZ_FAIL_IF(!Mask); + // Create the call to the function + Value *Args[] = {Ptr, B.getInt32(Alignment), Mask, + PoisonValue::get(Ty)}; + CallInst *CI = B.CreateCall(MaskedGather, Args); + if (CI) { + CI->setCallingConv(MaskedGather->getCallingConv()); + CI->setAttributes(MaskedGather->getAttributes()); + return CI; + } + } + } else { + emitVeczRemarkMissed(F, "Could not create a masked gather as the target " + "reported it would be illegal"); + VECZ_FAIL(); + } + } + + VECZ_FAIL_IF(EVL); + auto VecWidth = multi_llvm::getVectorElementCount(Ty); + const unsigned Width = VecWidth.getFixedValue(); + + // Fallback scalar function generator + // Create all the required blocks. + SmallVector TestBlocks; + SmallVector LoadBlocks; + TestBlocks.push_back(Entry); + LoadBlocks.push_back(BasicBlock::Create(Ctx, "masked_load", F)); + for (unsigned i = 1; i < Width; i++) { + TestBlocks.push_back(BasicBlock::Create(Ctx, "test_mask", F)); + LoadBlocks.push_back(BasicBlock::Create(Ctx, "masked_load", F)); + } + Exit = BasicBlock::Create(Ctx, "masked_load_exit", F); + + SmallVector LoadedLanes; + SmallVector LanePhis; + for (unsigned i = 0; i < Width; i++) { + BasicBlock *Next = ((i + 1) < Width) ? TestBlocks[i + 1] : Exit; + + // Extract the mask elements and branch. + B.SetInsertPoint(TestBlocks[i]); + if (i > 0) { + PHINode *LanePhi = B.CreatePHI(EleTy, 2, "result_lane"); + LanePhi->addIncoming(LoadedLanes[i - 1], LoadBlocks[i - 1]); + LanePhi->addIncoming(DefaultEleData, TestBlocks[i - 1]); + LanePhis.push_back(LanePhi); + } + + Value *MaskLane = B.CreateExtractElement(Mask, B.getInt32(i), "mask_lane"); + B.CreateCondBr(MaskLane, LoadBlocks[i], Next); + + // Load the element and branch. + B.SetInsertPoint(LoadBlocks[i]); + Value *PtrLane = B.CreateExtractElement(Ptr, B.getInt32(i), "ptr_lane"); + LoadInst *Load = B.CreateLoad(EleTy, PtrLane, false, "masked_load"); + Load->setAlignment(MaybeAlign(Alignment).valueOrOne()); + LoadedLanes.push_back(Load); + B.CreateBr(Next); + } + + // Aggregate the loaded lanes. 
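+  // Each lane's PHI yields either the loaded element or poison (when the
+  // lane was masked off); the lanes are then reassembled via insertelement.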
+ B.SetInsertPoint(Exit); + PHINode *LastLanePhi = B.CreatePHI(EleTy, 2, "result_lane"); + LastLanePhi->addIncoming(LoadedLanes[Width - 1], LoadBlocks[Width - 1]); + LastLanePhi->addIncoming(DefaultEleData, TestBlocks[Width - 1]); + LanePhis.push_back(LastLanePhi); + Value *Result = PoisonValue::get(Ty); + for (unsigned i = 0; i < Width; i++) { + Result = B.CreateInsertElement(Result, LanePhis[i], B.getInt32(i)); + } + return Result; +} + +Value *TargetInfo::createMaskedScatterStore(IRBuilder<> &B, Value *Data, + Value *Ptr, Value *Mask, Value *EVL, + unsigned Alignment) const { + LLVMContext &Ctx = B.getContext(); + BasicBlock *Entry = B.GetInsertBlock(); + BasicBlock *Exit = nullptr; + StoreInst *FirstStore = nullptr; + Function *F = Entry->getParent(); + VECZ_FAIL_IF(!F || !Ptr || !Mask); + auto *DataTy = Data->getType(); + + if (DataTy->isVectorTy()) { + auto *VecPtrTy = dyn_cast(Ptr->getType()); + VECZ_FAIL_IF(!VecPtrTy); + auto *PtrTy = dyn_cast(VecPtrTy->getElementType()); + VECZ_FAIL_IF(!PtrTy); + const auto Legality = + isVPScatterLegal(F, DataTy, Alignment, PtrTy->getAddressSpace()); + if (EVL && Legality.isVPLegal()) { + const SmallVector Args = {Data, Ptr, Mask, EVL}; + const SmallVector Tys = {Data->getType(), VecPtrTy}; + return B.CreateIntrinsic(llvm::Intrinsic::vp_scatter, Tys, Args); + } else if (Legality.isMaskLegal()) { + Function *MaskedScatter = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::masked_scatter, {DataTy, VecPtrTy}); + + if (MaskedScatter) { + Mask = applyEVLToMask(B, EVL, Mask); + VECZ_FAIL_IF(!Mask); + // Create the call to the function + Value *Args[] = {Data, Ptr, B.getInt32(Alignment), Mask}; + CallInst *CI = B.CreateCall(MaskedScatter, Args); + if (CI) { + CI->setCallingConv(MaskedScatter->getCallingConv()); + CI->setAttributes(MaskedScatter->getAttributes()); + return CI; + } + } + } else { + emitVeczRemarkMissed(F, "Could not create a masked scatter as the target " + "reported it would be illegal"); + VECZ_FAIL(); + } + } + + VECZ_FAIL_IF(EVL); + auto VecWidth = multi_llvm::getVectorElementCount(DataTy); + const unsigned Width = VecWidth.getFixedValue(); + + // Fallback scalar function generator + // Create all the required blocks. + SmallVector TestBlocks; + SmallVector StoreBlocks; + TestBlocks.push_back(Entry); + StoreBlocks.push_back(BasicBlock::Create(Ctx, "masked_store", F)); + for (unsigned i = 1; i < Width; i++) { + TestBlocks.push_back(BasicBlock::Create(Ctx, "test_mask", F)); + StoreBlocks.push_back(BasicBlock::Create(Ctx, "masked_store", F)); + } + Exit = BasicBlock::Create(Ctx, "masked_store_exit", F); + + for (unsigned i = 0; i < Width; i++) { + BasicBlock *Next = ((i + 1) < Width) ? TestBlocks[i + 1] : Exit; + + // Extract the mask elements and branch. + B.SetInsertPoint(TestBlocks[i]); + Value *MaskLane = B.CreateExtractElement(Mask, B.getInt32(i), "mask_lane"); + B.CreateCondBr(MaskLane, StoreBlocks[i], Next); + + // Extract the data elements and store. 
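+    // Per lane this is, in effect: if (mask[i]) store data[i] to ptr[i];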
+ B.SetInsertPoint(StoreBlocks[i]); + Value *PtrLane = B.CreateExtractElement(Ptr, B.getInt32(i), "ptr_lane"); + Value *DataLane = B.CreateExtractElement(Data, B.getInt32(i), "data_lane"); + StoreInst *Store = B.CreateStore(DataLane, PtrLane); + if (i == 0) { + FirstStore = Store; + } + Store->setAlignment(MaybeAlign(Alignment).valueOrOne()); + B.CreateBr(Next); + } + + B.SetInsertPoint(Exit); + return FirstStore; +} + +Value *TargetInfo::createScalableExtractElement(IRBuilder<> &B, + VectorizationContext &Ctx, + Instruction *extract, + Type *narrowTy, Value *src, + Value *index, Value *VL) const { + (void)VL; + const auto *origSrc = extract->getOperand(0); + auto *eltTy = src->getType()->getScalarType(); + + auto *wideTy = src->getType(); + + auto it = B.GetInsertPoint(); + + // Insert alloca at the beginning of the function. + auto allocaIt = + B.GetInsertBlock()->getParent()->getEntryBlock().getFirstInsertionPt(); + B.SetInsertPoint(&*allocaIt); + auto *const alloc = B.CreateAlloca(wideTy, nullptr, "fixlen.alloc"); + + // Reset the insertion point to wherever we must insert instructions + B.SetInsertPoint(&*it); + + // Store the packetized vector to the allocation + B.CreateStore(src, alloc); + + const unsigned fixedVecElts = + multi_llvm::getVectorNumElements(origSrc->getType()); + + Instruction *load = nullptr; + if (!index->getType()->isVectorTy()) { + // If the index remains a scalar (is uniform) then we can use a strided load + // starting from the address '&alloc[index]', strided by the original vector + // width: &alloc[index], &alloc[index+N], &alloc[index+2N], ... + auto *const stride = getSizeInt(B, fixedVecElts); + auto alignment = MaybeAlign(eltTy->getScalarSizeInBits() / 8).valueOrOne(); + // Index into the allocation, coming back with the starting offset from + // which to begin our loads. This is either a scalar pointer, or a vector of + // pointers. + auto *const gep = B.CreateInBoundsGEP(eltTy, alloc, index, "vec.alloc"); + + load = ::createInterleavedLoad(Ctx, narrowTy, gep, stride, /*Mask*/ nullptr, + /*EVL*/ nullptr, alignment.value()); + } else { + // Else if we've got a varying, vector index, then we must use a gather. + // Take our indices, and add them to a step multiplied by the original + // vecor width. Use that to create a vector of pointers. + auto alignment = MaybeAlign(eltTy->getScalarSizeInBits() / 8).valueOrOne(); + + index = getGatherIndicesVector( + B, index, index->getType(), + multi_llvm::getVectorNumElements(origSrc->getType()), "idx"); + + // Index into the allocation, coming back with the starting offset from + // which to begin our striding load. 
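+    // e.g. (illustrative): with a fixed source width of N elements, gather
+    // lane j reads alloc[j * N + index[j]].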
+ auto *const gep = B.CreateInBoundsGEP(eltTy, alloc, index, "vec.alloc"); + + load = ::createGather(Ctx, narrowTy, gep, /*Mask*/ nullptr, /*EVL*/ nullptr, + alignment.value()); + } + load->insertBefore(B.GetInsertPoint()); + + return load; +} + +Value *TargetInfo::createOuterScalableBroadcast(IRBuilder<> &builder, + Value *vector, Value *VL, + ElementCount factor) const { + return createScalableBroadcast(builder, vector, VL, factor, + /* URem */ true); +} + +Value *TargetInfo::createInnerScalableBroadcast(IRBuilder<> &builder, + Value *vector, Value *VL, + ElementCount factor) const { + return createScalableBroadcast(builder, vector, VL, factor, + /* URem */ false); +} + +Value *TargetInfo::createScalableBroadcast(IRBuilder<> &B, Value *vector, + Value *VL, ElementCount factor, + bool URem) const { + (void)VL; + auto *const ty = vector->getType(); + auto *const wideTy = ScalableVectorType::get( + multi_llvm::getVectorElementType(ty), + factor.getKnownMinValue() * + multi_llvm::getVectorElementCount(ty).getKnownMinValue()); + auto wideEltCount = multi_llvm::getVectorElementCount(wideTy); + + // The splats must be inserted after any Allocas + auto it = B.GetInsertBlock()->getParent()->getEntryBlock().begin(); + while (isa(*it)) { + ++it; + } + IRBuilder<> AllocaB(&*it); + + auto *const alloc = AllocaB.CreateAlloca(ty, nullptr, "fixlen.alloc"); + + // Store the vector to the allocation. + B.CreateStore(vector, alloc); + + auto *const eltTy = cast(ty)->getElementType(); + + auto *const stepsRem = TargetInfo::createBroadcastIndexVector( + B, + ScalableVectorType::get(B.getInt32Ty(), cast(wideTy)), + factor, URem, "idx1"); + auto *const gep = B.CreateInBoundsGEP(eltTy, alloc, stepsRem, "vec.alloc"); + auto *const boolTrue = ConstantInt::getTrue(B.getContext()); + auto *const mask = B.CreateVectorSplat(wideEltCount, boolTrue, "truemask"); + // Set the alignment to that of vector element type. + auto alignment = MaybeAlign(eltTy->getScalarSizeInBits() / 8).valueOrOne(); + return B.CreateMaskedGather(wideTy, gep, alignment, mask, + PoisonValue::get(wideTy)); +} + +Value *TargetInfo::createBroadcastIndexVector(IRBuilder<> &B, Type *ty, + ElementCount factor, bool URem, + const llvm::Twine &N) { + auto *const steps = B.CreateStepVector(ty, "idx0"); + const auto tyEC = multi_llvm::getVectorElementCount(ty); + const unsigned factorMinVal = factor.getKnownMinValue(); + + unsigned fixedAmt; + Instruction::BinaryOps Opc; + if (URem) { + fixedAmt = tyEC.getKnownMinValue() / factorMinVal; + Opc = BinaryOperator::URem; + } else { + fixedAmt = factorMinVal; + Opc = BinaryOperator::UDiv; + } + auto *const vectorEltsSplat = B.CreateVectorSplat( + tyEC, ConstantInt::get(multi_llvm::getVectorElementType(ty), fixedAmt)); + return B.CreateBinOp(Opc, steps, vectorEltsSplat, N); +} + +Value *TargetInfo::createScalableInsertElement(IRBuilder<> &B, + VectorizationContext &Ctx, + Instruction *insert, Value *elt, + Value *into, Value *index, + Value *VL) const { + (void)VL; + auto *eltTy = elt->getType(); + auto *intoTy = into->getType(); + auto *scalarTy = elt->getType()->getScalarType(); + + // The alloca must be inserted at the beginning of the function. 
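+  // (An entry-block alloca is a static alloca, which later stack
+  // optimizations can promote or fold.)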
+  auto allocaIt =
+      B.GetInsertBlock()->getParent()->getEntryBlock().getFirstInsertionPt();
+  auto it = B.GetInsertPoint();
+
+  B.SetInsertPoint(&*allocaIt);
+  auto *const alloc = B.CreateAlloca(intoTy, nullptr);
+
+  // Reset the insertion point to wherever we must insert instructions.
+  B.SetInsertPoint(&*it);
+
+  // Store the wide vector to the allocation.
+  B.CreateStore(into, alloc);
+
+  const unsigned fixedVecElts =
+      multi_llvm::getVectorNumElements(insert->getOperand(0)->getType());
+
+  // Construct the index, either by packetizing it (if varying) or by
+  // splatting it and combining it with a step vector.
+  Instruction *store;
+  if (!index->getType()->isVectorTy()) {
+    // If the index remains a scalar (is uniform) then we can use a strided
+    // store starting from the address '&alloc[index]', strided by the original
+    // vector width: &alloc[index], &alloc[index+N], &alloc[index+2N], ...
+    auto *const stride = getSizeInt(B, fixedVecElts);
+    auto alignment =
+        MaybeAlign(scalarTy->getScalarSizeInBits() / 8).valueOrOne();
+    // Index into the allocation, coming back with the starting offset from
+    // which to begin our loads. This is either a scalar pointer, or a vector of
+    // pointers.
+    auto *const gep = B.CreateInBoundsGEP(scalarTy, alloc, index, "vec.alloc");
+
+    store = ::createInterleavedStore(Ctx, elt, gep, stride, /*Mask*/ nullptr,
+                                     /*EVL*/ nullptr, alignment.value());
+  } else {
+    // Else if we've got a varying, vector index, then we must use a scatter.
+    // Take our indices, and add them to a step multiplied by the original
+    // vector width. Use that to create a vector of pointers.
+    auto alignment =
+        MaybeAlign(scalarTy->getScalarSizeInBits() / 8).valueOrOne();
+
+    auto narrowEltCount = multi_llvm::getVectorElementCount(eltTy);
+
+    auto *steps = B.CreateStepVector(index->getType(), "idx0");
+    auto *const fixedVecEltsSplat = B.CreateVectorSplat(
+        narrowEltCount,
+        ConstantInt::get(index->getType()->getScalarType(), fixedVecElts));
+    auto *const stepsMul = B.CreateMul(steps, fixedVecEltsSplat, "idx.scale");
+    index = B.CreateAdd(stepsMul, index, "idx");
+
+    // Index into the allocation, coming back with the starting offset from
+    // which to begin our striding load.
+    auto *const gep = B.CreateInBoundsGEP(scalarTy, alloc, index, "vec.alloc");
+
+    store = ::createScatter(Ctx, elt, gep, /*Mask*/ nullptr,
+                            /*EVL*/ nullptr, alignment.value());
+  }
+  VECZ_FAIL_IF(!store);
+  store->insertBefore(B.GetInsertPoint());
+
+  // Load the vector back from the stack.
+  return B.CreateLoad(intoTy, alloc);
+}
+
+bool TargetInfo::isVPVectorLegal(const Function &F, Type *Ty) const {
+  return !TM_ ||
+         TM_->getTargetTransformInfo(F).isElementTypeLegalForScalableVector(
+             multi_llvm::getVectorElementType(Ty));
+}
+
+TargetInfo::VPMemOpLegality TargetInfo::checkMemOpLegality(
+    const Function *F,
+    function_ref<bool(const TargetTransformInfo &, Type *, unsigned, unsigned)>
+        Checker,
+    Type *Ty, unsigned Alignment, unsigned AddrSpace) const {
+  assert(Ty->isVectorTy() && "Expected a vector type");
+  const bool isMaskLegal =
+      !(isa<ScalableVectorType>(Ty) && TM_) ||
+      Checker(TM_->getTargetTransformInfo(*F), Ty, Alignment, AddrSpace);
+  // Assuming a pointer bit width of 64.
+  bool isVPLegal = isMaskLegal && isVPVectorLegal(*F, Ty);
+  if (isVPLegal) {
+    const unsigned PtrBitWidth =
+        TM_ ? TM_->createDataLayout().getPointerSizeInBits(AddrSpace) : 64;
+    auto &Ctx = Ty->getContext();
+    auto *const IntTy = IntegerType::get(Ctx, PtrBitWidth);
+    auto *const IntVecTy =
+        VectorType::get(IntTy, multi_llvm::getVectorElementCount(Ty));
+    isVPLegal = isVPVectorLegal(*F, IntVecTy);
+  }
+  return {isVPLegal, isMaskLegal};
+}
+
+TargetInfo::VPMemOpLegality
+TargetInfo::isVPLoadLegal(const Function *F, Type *Ty, unsigned Alignment,
+                          unsigned AddrSpace) const {
+  return checkMemOpLegality(F, isLegalMaskedLoad, Ty, Alignment, AddrSpace);
+}
+
+TargetInfo::VPMemOpLegality
+TargetInfo::isVPStoreLegal(const Function *F, Type *Ty, unsigned Alignment,
+                           unsigned AddrSpace) const {
+  return checkMemOpLegality(F, isLegalMaskedStore, Ty, Alignment, AddrSpace);
+}
+
+TargetInfo::VPMemOpLegality
+TargetInfo::isVPGatherLegal(const Function *F, Type *Ty, unsigned Alignment,
+                            unsigned AddrSpace) const {
+  return checkMemOpLegality(F, isLegalMaskedGather, Ty, Alignment, AddrSpace);
+}
+
+TargetInfo::VPMemOpLegality
+TargetInfo::isVPScatterLegal(const Function *F, Type *Ty, unsigned Alignment,
+                             unsigned AddrSpace) const {
+  return checkMemOpLegality(F, isLegalMaskedScatter, Ty, Alignment, AddrSpace);
+}
+
+bool TargetInfo::isLegalVPElementType(Type *) const { return true; }
+
+llvm::Value *TargetInfo::createVectorShuffle(llvm::IRBuilder<> &B,
+                                             llvm::Value *src,
+                                             llvm::Value *mask,
+                                             llvm::Value *evl) const {
+  auto *const srcTy = dyn_cast<VectorType>(src->getType());
+  auto *const maskTy = dyn_cast<VectorType>(mask->getType());
+  assert(
+      srcTy && maskTy &&
+      "TargetInfo::createVectorShuffle: source and mask must have vector type");
+
+  if (isa<Constant>(mask)) {
+    // Special case if the mask happens to be a constant.
+    return B.CreateShuffleVector(src, PoisonValue::get(srcTy), mask);
+  }
+
+  // The alloca must be inserted at the beginning of the function.
+  auto *const curBlock = B.GetInsertBlock();
+  auto &entryBlock = curBlock->getParent()->getEntryBlock();
+  const auto allocaIt = entryBlock.getFirstInsertionPt();
+  const auto it = B.GetInsertPoint();
+
+  B.SetInsertPoint(&entryBlock, allocaIt);
+  auto *const alloc = B.CreateAlloca(srcTy, nullptr);
+
+  // Reset the insertion point to wherever we must insert instructions.
+  B.SetInsertPoint(curBlock, it);
+
+  // Store the wide vector to the allocation.
+  B.CreateStore(src, alloc);
+
+  auto *const eltTy = srcTy->getElementType();
+
+  // Index into the allocation.
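+  // A small worked example (illustrative): for src = <a, b, c, d> and
+  // mask = <2, 0, 3, 1>, the gather below reads &alloc[2], &alloc[0],
+  // &alloc[3] and &alloc[1], yielding <c, a, d, b> - in effect a
+  // shufflevector with a run-time mask.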
+  auto *const gep = B.CreateInBoundsGEP(eltTy, alloc, mask, "vec.alloc");
+
+  const auto eltCount = maskTy->getElementCount();
+  auto *const dstTy = VectorType::get(eltTy, eltCount);
+  const auto alignment =
+      MaybeAlign(eltTy->getScalarSizeInBits() / 8).valueOrOne();
+
+  Value *gatherMask = nullptr;
+  if (evl) {
+    const auto EC = srcTy->getElementCount();
+    auto *const IndexTy = VectorType::get(evl->getType(), EC);
+    auto *const step = B.CreateStepVector(IndexTy);
+    gatherMask = B.CreateICmpULT(step, B.CreateVectorSplat(EC, evl));
+  } else {
+    gatherMask = B.CreateVectorSplat(eltCount, B.getTrue());
+  }
+
+  return B.CreateMaskedGather(dstTy, gep, alignment, gatherMask,
+                              PoisonValue::get(dstTy));
+}
+
+llvm::Value *TargetInfo::createVectorSlideUp(llvm::IRBuilder<> &B,
+                                             llvm::Value *src,
+                                             llvm::Value *insert,
+                                             llvm::Value *) const {
+  auto *const srcTy = dyn_cast<VectorType>(src->getType());
+  assert(srcTy &&
+         "TargetInfo::createVectorSlideUp: source must have vector type");
+
+  auto *const poison = PoisonValue::get(srcTy);
+  const auto EC = srcTy->getElementCount();
+  if (!EC.isScalable()) {
+    // Special case for fixed-width vectors.
+    const auto width = EC.getFixedValue();
+    SmallVector<int, 16> mask(width);
+    auto it = mask.begin();
+    *it++ = 0;
+    for (size_t i = 1; i < width; ++i) {
+      *it++ = i - 1;
+    }
+
+    auto *const rotate =
+        createOptimalShuffle(B, src, poison, mask, Twine("slide_up"));
+    return B.CreateInsertElement(rotate, insert, B.getInt64(0), "slide_in");
+  }
+
+  auto *const rotate = B.CreateVectorSplice(poison, src, -1, "slide_up");
+  return B.CreateInsertElement(rotate, insert, B.getInt64(0), "slide_in");
+}
+
+bool TargetInfo::canOptimizeInterleavedGroup(const Instruction &val,
+                                             InterleavedOperation Kind,
+                                             int Stride,
+                                             unsigned GroupSize) const {
+  if ((Stride == 2) || (Stride == 4)) {
+    VECZ_FAIL_IF((int)GroupSize != abs(Stride));
+    VECZ_FAIL_IF((Kind != eInterleavedLoad) && (Kind != eInterleavedStore) &&
+                 (Kind != eMaskedInterleavedLoad) &&
+                 (Kind != eMaskedInterleavedStore));
+    Type *DataType = nullptr;
+    if (Kind == eInterleavedStore || Kind == eMaskedInterleavedStore) {
+      DataType = val.getOperand(0)->getType();
+    } else {
+      DataType = val.getType();
+    }
+    VECZ_FAIL_IF(!DataType);
+    VECZ_FAIL_IF(!isa<FixedVectorType>(DataType));
+    return true;
+  }
+  return false;
+}
+
+bool TargetInfo::optimizeInterleavedGroup(IRBuilder<> &B,
+                                          InterleavedOperation Kind,
+                                          ArrayRef<Value *> Group,
+                                          ArrayRef<Value *> Masks,
+                                          Value *Address, int Stride) const {
+  VECZ_FAIL_IF(Stride < 0);
+
+  // Validate the operations in the group.
+  SmallVector<CallInst *, 4> Calls;
+  for (unsigned i = 0; i < Group.size(); i++) {
+    CallInst *Op = dyn_cast<CallInst>(Group[i]);
+    VECZ_FAIL_IF(!Op);
+    Calls.push_back(Op);
+  }
+  PointerType *PtrTy = dyn_cast<PointerType>(Address->getType());
+  VECZ_FAIL_IF(!PtrTy);
+  CallInst *Op0 = Calls[0];
+  VECZ_FAIL_IF(!canOptimizeInterleavedGroup(*Op0, Kind, Stride, Group.size()));
+
+  // canOptimizeInterleavedGroup() performs several checks, including valid
+  // Kind and Op0 types. Thus, these casts are safe.
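+  // Illustratively (hypothetical IR), a stride-2 load group
+  //   %evens = call <4 x i32> @interleaved_load(ptr %p)   ; p[0],p[2],p[4],p[6]
+  //   %odds  = call <4 x i32> @interleaved_load(ptr %p1)  ; %p1 = %p+1
+  // becomes two contiguous <4 x i32> loads from %p and %p+4, followed by
+  // de-interleaving shuffles built in interleaveVectors() below.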
+  FixedVectorType *VecTy = nullptr;
+  if (Kind == eInterleavedStore || Kind == eMaskedInterleavedStore) {
+    VecTy = cast<FixedVectorType>(Op0->getOperand(0)->getType());
+  } else { // eInterleavedLoad || eMaskedInterleavedLoad
+    VecTy = cast<FixedVectorType>(Op0->getType());
+  }
+
+  auto VecWidth = multi_llvm::getVectorElementCount(VecTy);
+  const unsigned SimdWidth = VecWidth.getFixedValue();
+
+  Type *EleTy = VecTy->getElementType();
+  const unsigned Align = EleTy->getScalarSizeInBits() / 8;
+
+  const bool HasMask =
+      (Kind == eMaskedInterleavedLoad) || (Kind == eMaskedInterleavedStore);
+  SmallVector<Value *, 4> Vectors;
+  SmallVector<Value *, 4> VecMasks(Masks.begin(), Masks.end());
+  if (Kind == eInterleavedLoad || Kind == eMaskedInterleavedLoad) {
+    // Create one regular vector load per interleaved load in the group.
+    if (HasMask) {
+      VECZ_FAIL_IF(!interleaveVectors(B, VecMasks, true));
+    }
+
+    for (unsigned i = 0; i < Group.size(); i++) {
+      Value *AddressN = Address;
+      if (i > 0) {
+        const unsigned Offset = i * SimdWidth;
+        AddressN = B.CreateGEP(EleTy, Address, B.getInt32(Offset));
+      }
+      Value *Load = nullptr;
+      if (!HasMask) {
+        Load = createLoad(B, VecTy, AddressN, getSizeInt(B, 1), Align);
+      } else {
+        Value *Mask = VecMasks[i];
+        Load =
+            createMaskedLoad(B, VecTy, AddressN, Mask, /*EVL*/ nullptr, Align);
+      }
+      VECZ_FAIL_IF(!Load);
+      Vectors.push_back(Load);
+    }
+    // Transpose the loaded vectors and replace the original loads.
+    VECZ_FAIL_IF(!interleaveVectors(B, Vectors, false));
+    for (unsigned i = 0; i < Group.size(); i++) {
+      Value *Vector = Vectors[i];
+      Value *OrigLoad = Group[i];
+      OrigLoad->replaceAllUsesWith(Vector);
+    }
+  } else if (Kind == eInterleavedStore || Kind == eMaskedInterleavedStore) {
+    // Transpose the vectors to store with interleave.
+    for (unsigned i = 0; i < Group.size(); i++) {
+      CallInst *OrigStore = cast<CallInst>(Group[i]);
+      Vectors.push_back(OrigStore->getOperand(0));
+    }
+    VECZ_FAIL_IF(!interleaveVectors(B, Vectors, true));
+    if (HasMask) {
+      VECZ_FAIL_IF(!interleaveVectors(B, VecMasks, true));
+    }
+    // Create one regular vector store per interleaved store in the group.
+    for (unsigned i = 0; i < Group.size(); i++) {
+      Value *Vector = Vectors[i];
+      Value *AddressN = Address;
+      if (i > 0) {
+        const unsigned Offset = i * SimdWidth;
+        AddressN = B.CreateGEP(EleTy, Address, B.getInt32(Offset));
+      }
+      Value *Store = nullptr;
+      if (!HasMask) {
+        Store = createStore(B, Vector, AddressN, getSizeInt(B, 1), Align);
+      } else {
+        Value *Mask = VecMasks[i];
+        Store = createMaskedStore(B, Vector, AddressN, Mask, /*EVL*/ nullptr,
+                                  Align);
+      }
+      VECZ_FAIL_IF(!Store);
+    }
+  }
+
+  return true;
+}
+
+bool TargetInfo::interleaveVectors(IRBuilder<> &B,
+                                   MutableArrayRef<Value *> Vectors,
+                                   bool Forward) const {
+  const unsigned Stride = Vectors.size();
+  if (Stride == 0) {
+    return true;
+  }
+  auto *VecTy = dyn_cast<FixedVectorType>(Vectors[0]->getType());
+  VECZ_FAIL_IF(!VecTy);
+  if (Stride == 1) {
+    return true;
+  }
+  const unsigned Width = VecTy->getNumElements();
+  VECZ_FAIL_IF(Width < Stride);
+  VECZ_FAIL_IF((Width % Stride) != 0);
+  for (unsigned i = 1; i < Stride; i++) {
+    auto *VecTyN = dyn_cast<FixedVectorType>(Vectors[i]->getType());
+    VECZ_FAIL_IF(!VecTyN || (VecTyN != VecTy));
+  }
+
+  // Prepare the masks.
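+  // Illustrative masks for Width = 4, as computed by the loops below:
+  //   forward (interleave):     MaskLow2 = <0,4,1,5>, MaskHigh2 = <2,6,3,7>,
+  //     so <a0,a1,a2,a3> and <b0,b1,b2,b3> shuffle to <a0,b0,a1,b1> and
+  //     <a2,b2,a3,b3>;
+  //   backward (de-interleave): MaskLow2 = <0,2,4,6>, MaskHigh2 = <1,3,5,7>.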
+  SmallVector<uint32_t, 16> MaskLow2;
+  SmallVector<uint32_t, 16> MaskHigh2;
+
+  StringRef Name;
+  if (Forward) {
+    Name = "interleave";
+    const unsigned Width2 = Width >> 1;
+    const unsigned Width3 = Width2 + Width;
+    for (unsigned i = 0; i < Width2; ++i) {
+      MaskLow2.push_back(i);
+      MaskHigh2.push_back(i + Width2);
+      MaskLow2.push_back(i + Width);
+      MaskHigh2.push_back(i + Width3);
+    }
+  } else {
+    Name = "deinterleave";
+    const unsigned Width2 = Width << 1;
+    for (unsigned i = 0; i < Width2; i += 2) {
+      MaskLow2.push_back(i);
+      MaskHigh2.push_back(i + 1);
+    }
+  }
+  Constant *CMaskLow2 = ConstantDataVector::get(B.getContext(), MaskLow2);
+  Constant *CMaskHigh2 = ConstantDataVector::get(B.getContext(), MaskHigh2);
+
+  if (Stride == 2) {
+    Value *Src0 = Vectors[0];
+    Value *Src1 = Vectors[1];
+    Vectors[0] = B.CreateShuffleVector(Src0, Src1, CMaskLow2, Name);
+    Vectors[1] = B.CreateShuffleVector(Src0, Src1, CMaskHigh2, Name);
+
+    return true;
+  } else if (Stride == 4) {
+    // For a 4-way interleave, we need two layers of shuffles.
+    // Starting with vectors a..A : b..B : c..C : d..D
+    // first shuffle layer  -> ab.. : ..AB : cd.. : ..CD
+    // second shuffle layer -> abcd : .... : .... : ABCD
+    Value *Src0 = Vectors[0];
+    Value *Src1 = Vectors[1];
+    Value *Src2 = Vectors[2];
+    Value *Src3 = Vectors[3];
+
+    Constant *CMaskLow4 = nullptr;
+    Constant *CMaskHigh4 = nullptr;
+    if (Forward) {
+      SmallVector<uint32_t, 16> MaskLow4;
+      SmallVector<uint32_t, 16> MaskHigh4;
+      const unsigned Width2 = Width >> 1;
+      const unsigned Width3 = Width2 + Width;
+      for (unsigned i = 0; i < Width2; i += 2) {
+        MaskLow4.push_back(i);
+        MaskLow4.push_back(i + 1);
+        MaskLow4.push_back(i + Width);
+        MaskLow4.push_back(i + 1 + Width);
+        MaskHigh4.push_back(Width2 + i);
+        MaskHigh4.push_back(Width2 + i + 1);
+        MaskHigh4.push_back(Width3 + i);
+        MaskHigh4.push_back(Width3 + i + 1);
+      }
+      CMaskLow4 = ConstantDataVector::get(B.getContext(), MaskLow4);
+      CMaskHigh4 = ConstantDataVector::get(B.getContext(), MaskHigh4);
+    } else {
+      SmallVector<uint32_t, 16> MaskLow4;
+      SmallVector<uint32_t, 16> MaskHigh4;
+      const unsigned Width2 = Width << 1;
+      for (unsigned i = 0; i < Width2; i += 4) {
+        MaskLow4.push_back(i);
+        MaskLow4.push_back(i + 1);
+        MaskHigh4.push_back(i + 2);
+        MaskHigh4.push_back(i + 3);
+      }
+
+      // To perform the de-interleave we reverse the functions of the masks.
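+      // Illustrative, for Width = 4: after the swap the first shuffle layer
+      // uses <0,1,4,5> and <2,3,6,7>, and the second layer uses <0,2,4,6>
+      // and <1,3,5,7>, exactly undoing the two forward shuffle layers.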
+      CMaskLow4 = CMaskLow2;
+      CMaskHigh4 = CMaskHigh2;
+      CMaskLow2 = ConstantDataVector::get(B.getContext(), MaskLow4);
+      CMaskHigh2 = ConstantDataVector::get(B.getContext(), MaskHigh4);
+    }
+
+    Value *Tmp0 = B.CreateShuffleVector(Src0, Src1, CMaskLow2, Name);
+    Value *Tmp1 = B.CreateShuffleVector(Src0, Src1, CMaskHigh2, Name);
+    Value *Tmp2 = B.CreateShuffleVector(Src2, Src3, CMaskLow2, Name);
+    Value *Tmp3 = B.CreateShuffleVector(Src2, Src3, CMaskHigh2, Name);
+    Vectors[0] = B.CreateShuffleVector(Tmp0, Tmp2, CMaskLow4, Name);
+    Vectors[1] = B.CreateShuffleVector(Tmp0, Tmp2, CMaskHigh4, Name);
+    Vectors[2] = B.CreateShuffleVector(Tmp1, Tmp3, CMaskLow4, Name);
+    Vectors[3] = B.CreateShuffleVector(Tmp1, Tmp3, CMaskHigh4, Name);
+
+    return true;
+  }
+  return false;
+}
+
+unsigned TargetInfo::estimateSimdWidth(const TargetTransformInfo &TTI,
+                                       const ArrayRef<const Value *> vals,
+                                       unsigned width) const {
+  const unsigned MaxVecRegBitWidth =
+      TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector)
+          .getFixedValue();
+
+  const unsigned NumVecRegs =
+      TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true));
+
+  unsigned VaryingUsage = 0;
+  for (const auto *VI : vals) {
+    const auto *Ty = VI->getType();
+    VaryingUsage +=
+        Ty->isPointerTy()
+            ? TM_->getPointerSizeInBits(Ty->getPointerAddressSpace())
+            : VI->getType()->getPrimitiveSizeInBits();
+  }
+  const unsigned MaxBits = MaxVecRegBitWidth * NumVecRegs;
+  while (VaryingUsage * width > MaxBits) {
+    width >>= 1;
+  }
+
+  return width;
+}
+
+unsigned TargetInfo::getVectorWidthForType(const llvm::TargetTransformInfo &TTI,
+                                           const llvm::Type &Ty) const {
+  const unsigned MaxVecRegBitWidth =
+      TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector)
+          .getFixedValue();
+
+  if (MaxVecRegBitWidth == 0) {
+    return 0;
+  }
+
+  unsigned BitWidth = 0;
+  if (!Ty.isPtrOrPtrVectorTy()) {
+    BitWidth = Ty.getScalarSizeInBits();
+  } else if (TM_) {
+    BitWidth = TM_->getPointerSizeInBits(Ty.getPointerAddressSpace());
+  }
+
+  if (BitWidth == 0) {
+    // Couldn't work out the vector width.
+    return 0;
+  }
+
+  // The floor of 8 prevents poor double-precision performance; the exact
+  // reason for this is not understood.
+  return std::max(MaxVecRegBitWidth / BitWidth, 8u);
+}
+
+bool TargetInfo::canPacketize(const llvm::Value *, ElementCount) const {
+  return true;
+}
+
+std::unique_ptr<TargetInfo>
+vecz::createTargetInfoFromTargetMachine(TargetMachine *tm) {
+  // The TargetMachine is allowed to be null.
+  if (tm) {
+    const Triple &TT(tm->getTargetTriple());
+    switch (TT.getArch()) {
+      case Triple::arm:
+        return createTargetInfoArm(tm);
+      case Triple::aarch64:
+        return createTargetInfoAArch64(tm);
+      case Triple::riscv32:
+      case Triple::riscv64:
+        return createTargetInfoRISCV(tm);
+      default:
+        // Just use the generic TargetInfo unless we know better.
+        break;
+    }
+  }
+  return std::make_unique<TargetInfo>(tm);
+}
+
+AnalysisKey TargetInfoAnalysis::Key;
+
+TargetInfoAnalysis::TargetInfoAnalysis()
+    : TICallback([](const Module &) {
+        return std::make_unique<TargetInfo>(/*TM*/ nullptr);
+      }) {}
+
+TargetInfoAnalysis::TargetInfoAnalysis(TargetMachine *TM)
+    : TICallback([TM](const Module &) {
+        return vecz::createTargetInfoFromTargetMachine(TM);
+      }) {}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
new file mode 100644
index 0000000000000..bae66eb789260
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
@@ -0,0 +1,407 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/IntrinsicsAArch64.h>
+#include <llvm/IR/IntrinsicsARM.h>
+#include <llvm/Target/TargetMachine.h>
+
+#include "debugging.h"
+#include "vecz/vecz_target_info.h"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace vecz {
+
+class TargetInfoArm final : public TargetInfo {
+public:
+  TargetInfoArm(TargetMachine *tm) : TargetInfo(tm) {}
+
+  ~TargetInfoArm() = default;
+
+  bool canOptimizeInterleavedGroup(const Instruction &val,
+                                   InterleavedOperation kind, int stride,
+                                   unsigned groupSize) const override;
+
+  bool optimizeInterleavedGroup(IRBuilder<> &builder, InterleavedOperation kind,
+                                ArrayRef<Value *> group,
+                                ArrayRef<Value *> masks, Value *baseAddress,
+                                int stride) const override;
+
+private:
+  bool canOptimizeInterleavedGroupImpl(const Instruction &val,
+                                       InterleavedOperation kind, int stride,
+                                       unsigned groupSize,
+                                       unsigned &intrinsicID) const;
+};
+
+class TargetInfoAArch64 final : public TargetInfo {
+public:
+  TargetInfoAArch64(TargetMachine *tm) : TargetInfo(tm) {}
+
+  ~TargetInfoAArch64() = default;
+
+  bool canOptimizeInterleavedGroup(const Instruction &val,
+                                   InterleavedOperation kind, int stride,
+                                   unsigned groupSize) const override;
+
+  bool optimizeInterleavedGroup(IRBuilder<> &builder, InterleavedOperation kind,
+                                ArrayRef<Value *> group,
+                                ArrayRef<Value *> masks, Value *baseAddress,
+                                int stride) const override;
+
+private:
+  bool canOptimizeInterleavedGroupImpl(const Instruction &val,
+                                       InterleavedOperation kind, int stride,
+                                       unsigned groupSize,
+                                       unsigned &intrinsicID) const;
+};
+
+std::unique_ptr<TargetInfo> createTargetInfoArm(TargetMachine *tm) {
+  return std::make_unique<TargetInfoArm>(tm);
+}
+
+std::unique_ptr<TargetInfo> createTargetInfoAArch64(TargetMachine *tm) {
+  return std::make_unique<TargetInfoAArch64>(tm);
+}
+
+} // namespace vecz
+
+bool TargetInfoArm::canOptimizeInterleavedGroup(const Instruction &val,
+                                                InterleavedOperation kind,
+                                                int stride,
+                                                unsigned groupSize) const {
+  unsigned IntrID;
+  return canOptimizeInterleavedGroupImpl(val, kind, stride, groupSize, IntrID);
+}
+
+bool TargetInfoArm::canOptimizeInterleavedGroupImpl(const Instruction &val,
+                                                    InterleavedOperation kind,
+                                                    int stride,
+                                                    unsigned groupSize,
+                                                    unsigned &IntrID) const {
+  IntrID = Intrinsic::not_intrinsic;
+  Type *dataType = nullptr;
+  if (kind == eInterleavedStore) {
+    switch (stride) {
+      default:
+        break;
+      case 2:
+        IntrID = Intrinsic::arm_neon_vst2;
+        break;
+      case 3:
+        IntrID = Intrinsic::arm_neon_vst3;
+        break;
+      case 4:
+        IntrID = Intrinsic::arm_neon_vst4;
+        break;
+    }
+    dataType = val.getOperand(0)->getType();
+  } else if (kind == eInterleavedLoad) {
+    switch (stride) {
+      default:
+        break;
+      case 2:
+        IntrID = Intrinsic::arm_neon_vld2;
+        break;
+      case 3:
+        IntrID = Intrinsic::arm_neon_vld3;
+        break;
+      case 4:
+        IntrID = Intrinsic::arm_neon_vld4;
+        break;
+    }
+    dataType = val.getType();
+  } else {
+    return false;
+  }
+
+  if (IntrID == Intrinsic::not_intrinsic || ((groupSize % stride) != 0)) {
+    return false;
+  }
+
+  if (!dataType) {
+    return false;
+  }
+
+  auto *VecTy = dyn_cast<FixedVectorType>(dataType);
+  if (!VecTy) {
+    return false;
+  }
+
+  const unsigned VecBits = VecTy->getPrimitiveSizeInBits();
+  if ((VecBits != 128) && (VecBits != 64)) {
+    return false;
+  }
+
+  // NEON interleave instructions only allow 8-, 16- and 32-bit elements.
+  const unsigned ElementSize = VecTy->getScalarSizeInBits();
+  if ((ElementSize != 32) && (ElementSize != 16) && (ElementSize != 8)) {
+    return false;
+  }
+
+  return true;
+}
+
+bool TargetInfoArm::optimizeInterleavedGroup(IRBuilder<> &B,
+                                             InterleavedOperation kind,
+                                             ArrayRef<Value *> group,
+                                             ArrayRef<Value *>, Value *address,
+                                             int stride) const {
+  const bool HasMask =
+      (kind == eMaskedInterleavedLoad) || (kind == eMaskedInterleavedStore);
+  // canOptimizeInterleavedGroup() should have returned false in this case.
+  // ARM does not have masked vector load or store instructions.
+  VECZ_FAIL_IF(HasMask);
+  VECZ_FAIL_IF(stride < 0);
+
+  // TODO: fetch this information from SubTargetInfo. Load instructions seem
+  // to be split easily in the backend, whereas stores generate a backend
+  // error because of invalid data types on vector operands. Vector operands
+  // are enabled in the backend only when SubTargetInfo ensures NEON
+  // instructions are supported.
+  const bool subTargetHasNeon = false;
+  if (!subTargetHasNeon && kind == eInterleavedStore) {
+    return false;
+  }
+
+  // Validate the operations in the group.
+  SmallVector<CallInst *, 4> Calls;
+  for (unsigned i = 0; i < group.size(); i++) {
+    CallInst *Op = dyn_cast<CallInst>(group[i]);
+    if (!Op) {
+      return false;
+    }
+    Calls.push_back(Op);
+  }
+
+  PointerType *PtrTy = dyn_cast<PointerType>(address->getType());
+  if (!PtrTy) {
+    return false;
+  }
+
+  CallInst *Op0 = Calls[0];
+  // Determine the intrinsic to emit for this group.
+  unsigned IntrID = Intrinsic::not_intrinsic;
+  if (!canOptimizeInterleavedGroupImpl(*Op0, kind, stride, group.size(),
+                                       IntrID)) {
+    return false;
+  }
+
+  // canOptimizeInterleavedGroup() performs several checks, including valid
+  // Kind and Op0 types. Thus, these casts are safe.
+  FixedVectorType *VecTy = nullptr;
+  if (kind == eInterleavedStore) {
+    VecTy = cast<FixedVectorType>(Op0->getOperand(0)->getType());
+  } else { // eInterleavedLoad
+    VecTy = cast<FixedVectorType>(Op0->getType());
+  }
+
+  Type *EleTy = VecTy->getElementType();
+  const unsigned Alignment = (EleTy->getPrimitiveSizeInBits() / 8);
+
+  // Declare the intrinsic if needed.
+  SmallVector<Type *, 2> Tys;
+  if (kind == eInterleavedStore) {
+    Tys = {PtrTy, VecTy};
+  } else if (kind == eInterleavedLoad) {
+    Tys = {VecTy, PtrTy};
+  }
+
+  Function *IntrFn = Intrinsic::getOrInsertDeclaration(
+      Op0->getModule(), (Intrinsic::ID)IntrID, Tys);
+  if (!IntrFn) {
+    return false;
+  }
+
+  // Create a NEON load or store to replace the interleaved calls.
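+  // For a stride-2 store of <4 x i32> vectors, the group collapses to a
+  // single intrinsic call shaped roughly like this (illustrative values):
+  //   call void @llvm.arm.neon.vst2.p0.v4i32(ptr %addr, <4 x i32> %v0,
+  //                                          <4 x i32> %v1, i32 4)
+  // where the trailing i32 is the element alignment appended below.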
+  SmallVector<Value *, 8> Ops;
+  Ops.push_back(address);
+  if (kind == eInterleavedStore) {
+    for (unsigned i = 0; i < group.size(); i++) {
+      CallInst *Op = Calls[i];
+      Ops.push_back(Op->getOperand(0));
+    }
+  }
+  Ops.push_back(B.getInt32(Alignment));
+  CallInst *CI = B.CreateCall(IntrFn, Ops, Op0->getName());
+  CI->setCallingConv(IntrFn->getCallingConv());
+  if (kind == eInterleavedLoad) {
+    for (unsigned i = 0; i < Calls.size(); i++) {
+      CallInst *Op = Calls[i];
+      const ArrayRef<unsigned> Indices(&i, 1);
+      Value *Extract = B.CreateExtractValue(CI, Indices);
+      Op->replaceAllUsesWith(Extract);
+    }
+  }
+  return true;
+}
+
+bool TargetInfoAArch64::canOptimizeInterleavedGroup(const Instruction &val,
+                                                    InterleavedOperation kind,
+                                                    int stride,
+                                                    unsigned groupSize) const {
+  unsigned IntrID;
+  return canOptimizeInterleavedGroupImpl(val, kind, stride, groupSize, IntrID);
+}
+
+bool TargetInfoAArch64::canOptimizeInterleavedGroupImpl(
+    const Instruction &val, InterleavedOperation kind, int stride,
+    unsigned groupSize, unsigned &IntrID) const {
+  IntrID = Intrinsic::not_intrinsic;
+  Type *dataType = nullptr;
+  if (kind == eInterleavedStore) {
+    switch (stride) {
+      default:
+        break;
+      case 2:
+        IntrID = Intrinsic::aarch64_neon_st2;
+        break;
+      case 3:
+        IntrID = Intrinsic::aarch64_neon_st3;
+        break;
+      case 4:
+        IntrID = Intrinsic::aarch64_neon_st4;
+        break;
+    }
+    dataType = val.getOperand(0)->getType();
+  } else if (kind == eInterleavedLoad) {
+    switch (stride) {
+      default:
+        break;
+      case 2:
+        IntrID = Intrinsic::aarch64_neon_ld2;
+        break;
+      case 3:
+        IntrID = Intrinsic::aarch64_neon_ld3;
+        break;
+      case 4:
+        IntrID = Intrinsic::aarch64_neon_ld4;
+        break;
+    }
+    dataType = val.getType();
+  } else {
+    return false;
+  }
+
+  if (IntrID == Intrinsic::not_intrinsic || ((groupSize % stride) != 0)) {
+    return false;
+  }
+
+  if (!dataType) {
+    return false;
+  }
+
+  auto *VecTy = dyn_cast<FixedVectorType>(dataType);
+  if (!VecTy) {
+    return false;
+  }
+
+  const unsigned VecBits = VecTy->getPrimitiveSizeInBits();
+  if ((VecBits != 128) && (VecBits != 64)) {
+    return false;
+  }
+
+  // NEON interleave instructions only allow 8-, 16- and 32-bit elements.
+  const unsigned ElementSize = VecTy->getScalarSizeInBits();
+  if ((ElementSize != 32) && (ElementSize != 16) && (ElementSize != 8)) {
+    return false;
+  }
+
+  return true;
+}
+
+bool TargetInfoAArch64::optimizeInterleavedGroup(
+    IRBuilder<> &B, InterleavedOperation kind, ArrayRef<Value *> group,
+    ArrayRef<Value *>, Value *address, int stride) const {
+  const bool HasMask =
+      (kind == eMaskedInterleavedLoad) || (kind == eMaskedInterleavedStore);
+  // canOptimizeInterleavedGroup() should have returned false in this case.
+  // AArch64 does not have masked vector load or store instructions.
+  VECZ_FAIL_IF(HasMask);
+  VECZ_FAIL_IF(stride < 0);
+
+  // TODO: fetch this information from SubTargetInfo. Load instructions seem
+  // to be split easily in the backend, whereas stores generate a backend
+  // error because of invalid data types on vector operands. Vector operands
+  // are enabled in the backend only when SubTargetInfo ensures NEON
+  // instructions are supported.
+  const bool subTargetHasNeon = false;
+  if (!subTargetHasNeon && kind == eInterleavedStore) {
+    return false;
+  }
+
+  // Validate the operations in the group.
+  SmallVector<CallInst *, 4> Calls;
+  for (unsigned i = 0; i < group.size(); i++) {
+    CallInst *Op = dyn_cast<CallInst>(group[i]);
+    if (!Op) {
+      return false;
+    }
+    Calls.push_back(Op);
+  }
+
+  PointerType *PtrTy = dyn_cast<PointerType>(address->getType());
+  if (!PtrTy) {
+    return false;
+  }
+
+  CallInst *Op0 = Calls[0];
+  // Determine the intrinsic to emit for this group.
+  unsigned IntrID = Intrinsic::not_intrinsic;
+  if (!canOptimizeInterleavedGroupImpl(*Op0, kind, stride, group.size(),
+                                       IntrID)) {
+    return false;
+  }
+
+  // canOptimizeInterleavedGroup() performs several checks, including valid
+  // Kind and Op0 types. Thus, these casts are safe.
+  FixedVectorType *VecTy = nullptr;
+  if (kind == eInterleavedStore) {
+    VecTy = cast<FixedVectorType>(Op0->getOperand(0)->getType());
+  } else { // eInterleavedLoad
+    VecTy = cast<FixedVectorType>(Op0->getType());
+  }
+
+  Function *IntrFn = Intrinsic::getOrInsertDeclaration(
+      Op0->getModule(), (Intrinsic::ID)IntrID, {VecTy, PtrTy});
+  if (!IntrFn) {
+    return false;
+  }
+
+  // Create a NEON load or store to replace the interleaved calls.
+  SmallVector<Value *, 8> Ops;
+  if (kind == eInterleavedStore) {
+    for (unsigned i = 0; i < group.size(); i++) {
+      CallInst *Op = Calls[i];
+      Ops.push_back(Op->getOperand(0));
+    }
+  }
+  Ops.push_back(address);
+  CallInst *CI = B.CreateCall(IntrFn, Ops, Op0->getName());
+  CI->setCallingConv(IntrFn->getCallingConv());
+  if (kind == eInterleavedLoad) {
+    for (unsigned i = 0; i < Calls.size(); i++) {
+      CallInst *Op = Calls[i];
+      const ArrayRef<unsigned> Indices(&i, 1);
+      Value *Extract = B.CreateExtractValue(CI, Indices);
+      Op->replaceAllUsesWith(Extract);
+    }
+  }
+  return true;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
new file mode 100644
index 0000000000000..8c320bd324ffa
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
@@ -0,0 +1,753 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/IntrinsicsRISCV.h>
+#include <llvm/Support/MathExtras.h>
+#include <llvm/Target/TargetMachine.h>
+
+#include "transform/packetization_helpers.h"
+#include "vecz/vecz_target_info.h"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace vecz {
+
+class TargetInfoRISCV final : public TargetInfo {
+public:
+  TargetInfoRISCV(TargetMachine *tm) : TargetInfo(tm) {}
+
+  ~TargetInfoRISCV() = default;
+
+  bool canPacketize(const llvm::Value *Val, ElementCount Width) const override;
+
+  // These functions should only be overridden in LLVM >= 13.
+  llvm::Value *createScalableExtractElement(
+      llvm::IRBuilder<> &B, vecz::VectorizationContext &Ctx,
+      llvm::Instruction *extract, llvm::Type *narrowTy, llvm::Value *src,
+      llvm::Value *index, llvm::Value *evl) const override;
+
+  llvm::Value *
+  createOuterScalableBroadcast(llvm::IRBuilder<> &builder, llvm::Value *vector,
+                               llvm::Value *VL,
+                               ElementCount factor) const override {
+    return createScalableBroadcast(builder, vector, VL, factor,
+                                   /* URem */ true);
+  }
+
+  llvm::Value *
+  createInnerScalableBroadcast(llvm::IRBuilder<> &builder, llvm::Value *vector,
+                               llvm::Value *VL,
+                               ElementCount factor) const override {
+    return createScalableBroadcast(builder, vector, VL, factor,
+                                   /* URem */ false);
+  }
+
+  llvm::Value *createScalableInsertElement(llvm::IRBuilder<> &builder,
+                                           vecz::VectorizationContext &Ctx,
+                                           llvm::Instruction *insert,
+                                           llvm::Value *elt, llvm::Value *into,
+                                           llvm::Value *index,
+                                           llvm::Value *evl) const override;
+
+  bool isVPVectorLegal(const llvm::Function &F, llvm::Type *Ty) const override;
+
+  llvm::Value *createVectorShuffle(llvm::IRBuilder<> &builder, llvm::Value *src,
+                                   llvm::Value *mask,
+                                   llvm::Value *evl) const override;
+
+  llvm::Value *createVectorSlideUp(llvm::IRBuilder<> &builder, llvm::Value *src,
+                                   llvm::Value *insert,
+                                   llvm::Value *evl) const override;
+
+private:
+  bool isOperationLegal(llvm::Intrinsic::ID ID,
+                        llvm::ArrayRef<llvm::Type *> Tys) const;
+
+  /// @brief Maximum vector type size in bits for VP intrinsics.
+  static constexpr unsigned MaxLegalVectorTypeBits = 8 * 64;
+
+  /// @return Whether the minimum size of a given vector type is less than 64
+  /// bytes and the length is a power of 2.
+  bool isVectorTypeLegal(llvm::Type *Ty) const;
+
+  llvm::Value *createScalableBroadcast(llvm::IRBuilder<> &builder,
+                                       llvm::Value *vector, llvm::Value *VL,
+                                       ElementCount factor, bool URem) const;
+
+  Value *createVPKernelWidth(IRBuilder<> &, Value *, unsigned,
+                             ElementCount) const override;
+};
+
+// LLVM 14 introduced vp intrinsics legalization.
+bool TargetInfoRISCV::isVPVectorLegal(const llvm::Function &F,
+                                      llvm::Type *Ty) const {
+  (void)F;
+  return isVectorTypeLegal(Ty);
+}
+
+// Should be target-dependent. Take RISCV legal types for now.
+// FIXME: LLVM 14 adds better support for legalization of vp intrinsics, but
+// not RISCV ones like vrgather_vv.
+bool TargetInfoRISCV::isVectorTypeLegal(Type *Ty) const {
+  assert(Ty->isVectorTy() && "Expecting a vector type.");
+  // FIXME: VP boolean logical operators (and,or,xor) are not matched in the
+  // LLVM 13 RVV backend: we must backport https://reviews.llvm.org/D115546
+  // before we can enable this for Int1Ty as well.
+  bool isLegal = isLegalVPElementType(multi_llvm::getVectorElementType(Ty));
+  if (isLegal) {
+    const uint32_t MinSize =
+        multi_llvm::getVectorElementCount(Ty).getKnownMinValue();
+    isLegal = isPowerOf2_32(MinSize) &&
+              MinSize * Ty->getScalarSizeInBits() <= MaxLegalVectorTypeBits;
+  }
+  return isLegal;
+}
+
+std::unique_ptr<TargetInfo> createTargetInfoRISCV(TargetMachine *tm) {
+  return std::make_unique<TargetInfoRISCV>(tm);
+}
+
+} // namespace vecz
+
+bool TargetInfoRISCV::canPacketize(const llvm::Value *Val,
+                                   ElementCount Width) const {
+  // If we're not scalable, assume the backend will sort everything out.
+  if (!Width.isScalable()) {
+    return true;
+  }
+  // Do a relatively simple check that instructions aren't defining any types
+  // that can't be legalized when turned into scalable vectors.
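+  // For example (illustrative): an i128 operand would packetize to a
+  // <vscale x N x i128> value, which the RVV backend cannot legalize, so
+  // any integer wider than 64 bits is rejected below.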
+  if (!llvm::isa<llvm::Instruction>(Val)) {
+    return true;
+  }
+  const auto *I = llvm::cast<llvm::Instruction>(Val);
+
+  const auto IsIllegalIntBitwidth = [](const llvm::Type *Ty) {
+    if (!Ty->isIntOrIntVectorTy()) {
+      return false;
+    }
+    auto ScalarBitWidth =
+        llvm::cast<llvm::IntegerType>(Ty->getScalarType())->getBitWidth();
+    return ScalarBitWidth > 64;
+  };
+
+  if (IsIllegalIntBitwidth(I->getType())) {
+    return false;
+  }
+  for (auto *O : I->operand_values()) {
+    if (IsIllegalIntBitwidth(O->getType())) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/// @return Whether RISCV intrinsic @a ID is legal for types @a Tys.
+///
+/// This function does not check whether the intrinsic is being called
+/// with the right argument types, it just tests that all the types
+/// used to call the intrinsic (and its return type) are
+/// isVectorTypeLegal().
+///
+/// @param[in] ID The intrinsic ID
+/// @param[in] Tys A subset of the overloaded types of the intrinsic required to
+/// check whether it's legal.
+bool TargetInfoRISCV::isOperationLegal(llvm::Intrinsic::ID ID,
+                                       llvm::ArrayRef<llvm::Type *> Tys) const {
+  switch (ID) {
+    case Intrinsic::RISCVIntrinsics::riscv_vrgather_vv:
+    case Intrinsic::RISCVIntrinsics::riscv_vrgather_vv_mask:
+      // riscv_vrgather_vv[_mask](RetTy, _IdxTy)
+      // We only need to check the return type here, as it should be greater or
+      // equal to the index type.
+      assert(Tys.size() == 1 &&
+             "Only the return type is needed to check vrgather_vv intrinsics");
+      return isVectorTypeLegal(Tys.front());
+    case Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv:
+    case Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv_mask: {
+      constexpr unsigned MaxVectorSize = MaxLegalVectorTypeBits / 16;
+      // riscv_vrgatherei16_vv[_mask](RetTy, _IdxTy)
+      // Case similar to that of riscv_vrgather_vv[_mask], but we also need to
+      // check that the vector size is no greater than MaxLegalVectorTypeBits /
+      // 16, as the index type will always be i16.
+      assert(
+          Tys.size() == 1 &&
+          "Only the return type is needed to check vrgatherei16_vv intrinsics");
+      auto *const RetTy = Tys.front();
+      return isVectorTypeLegal(RetTy) &&
+             multi_llvm::getVectorElementCount(RetTy).getKnownMinValue() <=
+                 MaxVectorSize;
+    }
+    default:
+      break;
+  }
+  llvm_unreachable("Don't know how to check whether this intrinsic is legal.");
+}
+
+namespace {
+static unsigned getRISCVBits(const TargetMachine *TM) {
+  const auto &Triple = TM->getTargetTriple();
+  return Triple.isArch32Bit() ? 32 : 64;
+}
+
+/// @brief Get the VL to be used as a parameter of a RISCV intrinsic.
+///
+/// The type of this value will depend on the architecture (RISCV32 or
+/// RISCV64).
+///
+/// @return The VL value, zero-extended to the XLen type if required.
+///
+/// @param[in] B Builder to use when creating the VL value.
+/// @param[in] VL Original VL. If non-nullptr, this value (zero-extended for
+/// RISCV64) will be returned.
+/// @param[in] wideTy Type of the vectors which will be used in the intrinsics.
+/// If no VL is provided and `<vscale x N x T>` is used here, `vscale * N` will
+/// be returned.
+/// @param[in] TM Target machine.
+/// @param[in] N Name of the instruction to generate. "xlen" by default.
+llvm::Value *getIntrinsicVL(llvm::IRBuilderBase &B, llvm::Value *VL,
+                            llvm::Type *wideTy, llvm::TargetMachine *TM,
+                            const Twine &N = "xlen") {
+  const unsigned XLenTyWidth = getRISCVBits(TM);
+  Type *XLen = B.getIntNTy(XLenTyWidth);
+
+  if (VL) {
+    // Our incoming VP VL type is always i32, so zero-extend to 64 bits if
+    // required.
+    return XLenTyWidth == 32 ? VL : B.CreateZExt(VL, XLen, N);
+  }
+
+  // Else create a 'default' VL which covers the entire scalable vector.
+  return B.CreateElementCount(
+      XLen, cast<ScalableVectorType>(wideTy)->getElementCount());
+}
+
+/// @brief Returns a pair with the `vrgather` intrinsic variation to use and the
+/// bitwidth of the `vs1` parameter to this intrinsic.
+///
+/// @param[in] vs2Ty Type of the source vector.
+/// @param[in] isMasked Whether the intrinsic should be masked.
+std::pair<Intrinsic::RISCVIntrinsics, unsigned>
+getGatherIntrinsic(llvm::Type *vs2Ty, bool isMasked = false) {
+  assert(!vs2Ty->isPtrOrPtrVectorTy() &&
+         "Cannot get gather intrinsic for a vector of pointers");
+
+  Intrinsic::RISCVIntrinsics Opc;
+  auto *vecTy = multi_llvm::getVectorElementType(vs2Ty);
+  unsigned vs1Width;
+  if (vecTy->isIntegerTy() && vecTy->getIntegerBitWidth() == 8) {
+    Opc = isMasked ? Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv_mask
+                   : Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv;
+
+    vs1Width = 16;
+  } else {
+    Opc = isMasked ? Intrinsic::RISCVIntrinsics::riscv_vrgather_vv_mask
+                   : Intrinsic::RISCVIntrinsics::riscv_vrgather_vv;
+
+    vs1Width = vecTy->getScalarSizeInBits();
+  }
+  return std::make_pair(Opc, vs1Width);
+}
+
+/// @brief Returns the `v?slide1up.v?` intrinsic variation to use.
+///
+/// @param[in] vs2Ty Type of the source vector.
+llvm::Intrinsic::RISCVIntrinsics getSlideUpIntrinsic(llvm::Type *vs2Ty) {
+  assert(!vs2Ty->isPtrOrPtrVectorTy() &&
+         "Cannot get slide1up intrinsic for a vector of pointers");
+
+  Intrinsic::RISCVIntrinsics Opc;
+  auto *vecTy = multi_llvm::getVectorElementType(vs2Ty);
+  if (vecTy->isFloatingPointTy()) {
+    Opc = Intrinsic::RISCVIntrinsics::riscv_vfslide1up;
+  } else {
+    Opc = Intrinsic::RISCVIntrinsics::riscv_vslide1up;
+  }
+  return Opc;
+}
+
+} // namespace
+
+llvm::Value *TargetInfoRISCV::createScalableExtractElement(
+    llvm::IRBuilder<> &B, vecz::VectorizationContext &Ctx,
+    llvm::Instruction *origExtract, llvm::Type *narrowTy, llvm::Value *src,
+    llvm::Value *index, llvm::Value *VL) const {
+  // In RISCV, we can use vrgather_vv and vrgatherei16_vv to avoid going through
+  // memory when creating this operation.
+  // vrgather: vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]];
+  //     or,
+  // vrgather: res[i] = (idxs[i] >= VLMAX) ? 0 : src[idxs[i]];
+  // An example: extractelement <4 x i32>, I - vectorized by <vscale x 1 x i32>
+  // - we receive here as packetized arguments:
+  //   src:  <vscale x 4 x i32> (the packetized <4 x i32> source vectors)
+  //   idxs: <vscale x 1 x i32> (the packetized indices <I, J, K, ...>)
+  // We want to construct operands such that we have:
+  //   srcs: as before
+  //   idxs: <I, J+4, K+8, ...> (<vscale x 1 x i32>)
+  // So that vrgather extracts the Ith element from the first 4 elements, the
+  // Jth element from the second 4, etc.
+  auto *srcTy = cast<VectorType>(src->getType());
+
+  Intrinsic::ID intrinsicID;
+  unsigned intrIdxBitWidth;
+  std::tie(intrinsicID, intrIdxBitWidth) = getGatherIntrinsic(srcTy);
+
+  const auto srcEC = multi_llvm::getVectorElementCount(srcTy);
+  const auto resEC = multi_llvm::getVectorElementCount(narrowTy);
+
+  auto *const indexEltTy = B.getIntNTy(intrIdxBitWidth);
+  Type *const indexVecTy = VectorType::get(indexEltTy, resEC);
+
+  // We cannot use this optimization if the types are not legal in the target
+  // machine.
+  if (!isOperationLegal(intrinsicID, {srcTy})) {
+    return TargetInfo::createScalableExtractElement(B, Ctx, origExtract,
+                                                    narrowTy, src, index, VL);
+  }
+
+  auto *const avl = getIntrinsicVL(B, VL, narrowTy, getTargetMachine());
+
+  auto *indexTy = index->getType();
+  const bool isIdxVector = indexTy->isVectorTy();
+  const unsigned idxBitWidth = indexTy->getScalarSizeInBits();
+
+  // The intrinsic may demand a larger index type than we currently have;
+  // extend up to the right type.
+  if (idxBitWidth != intrIdxBitWidth) {
+    index = B.CreateZExtOrTrunc(index, isIdxVector ? indexVecTy : indexEltTy);
+  }
+
+  // If the index is uniform, it may not be a vector. We need one for the
+  // intrinsic, so splat it here.
+  if (!isIdxVector) {
+    index = B.CreateVectorSplat(resEC, index);
+  }
+
+  // Construct the indices such that each packetized index (still indexing into
+  // the original vector of 4 elements) is spread out such that each index
+  // indexes into its own 4-element slice: e.g., <I, J+4, K+8, ...>.
+  auto *indices = getGatherIndicesVector(
+      B, index, indexVecTy,
+      multi_llvm::getVectorNumElements(origExtract->getOperand(0)->getType()),
+      "vs1");
+
+  auto *const zero = B.getInt64(0);
+
+  // Our indices are still in the narrower vectorized type (e.g.,
+  // <vscale x 1 x i16>), but the vrgather intrinsics need equally-sized vector
+  // types. So insert the indices into a wide dummy vector (e.g.,
+  // <vscale x 4 x i16>), perform the vrgather, and extract the subvector back
+  // out again.
+  auto *const intrIndexTy = VectorType::get(indexEltTy, srcEC);
+  indices = B.CreateInsertVector(intrIndexTy, PoisonValue::get(intrIndexTy),
+                                 indices, zero);
+
+  SmallVector<Value *, 4> ops;
+  // Add the pass-through operand - we set it to poison.
+  ops.push_back(PoisonValue::get(srcTy));
+  ops.push_back(src);
+  ops.push_back(indices);
+  ops.push_back(avl);
+
+  auto *const gather =
+      B.CreateIntrinsic(intrinsicID, {srcTy, avl->getType()}, ops);
+
+  return B.CreateExtractVector(narrowTy, gather, zero);
+}
+
+llvm::Value *TargetInfoRISCV::createScalableBroadcast(llvm::IRBuilder<> &B,
+                                                      llvm::Value *vector,
+                                                      llvm::Value *VL,
+                                                      ElementCount factor,
+                                                      bool URem) const {
+  // Using the RVV instruction:
+  //   vrgather.vv vd, vs2, vs1, vm s.t.
+  //   vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]
+
+  auto *vectorTy = vector->getType();
+  auto *const origElTy = multi_llvm::getVectorElementType(vectorTy);
+
+  // We first check we are not broadcasting a vector of pointers, which is
+  // unsupported by the intrinsic.
+  const bool isVectorOfPointers = origElTy->isPtrOrPtrVectorTy();
+  if (isVectorOfPointers) {
+    vectorTy = VectorType::get(B.getIntNTy(getRISCVBits(getTargetMachine())),
+                               multi_llvm::getVectorElementCount(vectorTy));
+  }
+
+  auto *const wideTy = ScalableVectorType::get(
+      multi_llvm::getVectorElementType(vectorTy),
+      factor.getKnownMinValue() *
+          multi_llvm::getVectorElementCount(vectorTy).getKnownMinValue());
+
+  Intrinsic::RISCVIntrinsics intrinsicID;
+  unsigned vs1Width;
+  std::tie(intrinsicID, vs1Width) = getGatherIntrinsic(wideTy);
+  auto *const vs1ElTy = B.getIntNTy(vs1Width);
+
+  // We cannot use this optimization if the types are not legal in the target
+  // machine.
+  if (!isOperationLegal(intrinsicID, {wideTy})) {
+    return URem
+               ? TargetInfo::createOuterScalableBroadcast(B, vector, VL, factor)
+               : TargetInfo::createInnerScalableBroadcast(B, vector, VL,
+                                                          factor);
+  }
+
+  // Cast the vector of pointers to a vector of integers if needed.
+  if (isVectorOfPointers) {
+    vector = B.CreatePtrToInt(vector, vectorTy);
+  }
+
+  // We grow the fixed vector to consume an entire RVV register.
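+  // Illustrative: broadcasting a fixed <4 x i32> by a factor of vscale x 2
+  // gives wideTy = <vscale x 8 x i32>; the llvm.vector.insert underlying
+  // CreateInsertVector places the fixed vector in lanes [0, 4), leaving the
+  // remaining lanes poison (only the first VL lanes are read by vrgather).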
+  auto *const vs2 = B.CreateInsertVector(wideTy, PoisonValue::get(wideTy),
+                                         vector, B.getInt64(0), "vs2");
+
+  auto *const vs1 = createBroadcastIndexVector(
+      B, VectorType::get(vs1ElTy, wideTy), factor, URem, "vs1");
+
+  auto *const avl = getIntrinsicVL(B, VL, wideTy, getTargetMachine());
+
+  SmallVector<Value *, 4> ops;
+  // Add the pass-through operand - we set it to poison.
+  ops.push_back(PoisonValue::get(vs2->getType()));
+  ops.push_back(vs2);
+  ops.push_back(vs1);
+  ops.push_back(avl);
+
+  Value *gather =
+      B.CreateIntrinsic(intrinsicID, {vs2->getType(), avl->getType()}, ops);
+
+  // If we had to cast the vector before, we do the reverse operation
+  // on the result.
+  if (isVectorOfPointers) {
+    gather = B.CreateIntToPtr(gather, VectorType::get(origElTy, wideTy));
+  }
+
+  return gather;
+}
+
+static CallInst *createRISCVMaskedIntrinsic(IRBuilder<> &B, Intrinsic::ID ID,
+                                            ArrayRef<Type *> Types,
+                                            ArrayRef<Value *> Args,
+                                            unsigned TailPolicy,
+                                            Instruction *FMFSource = nullptr,
+                                            const Twine &Name = "") {
+  SmallVector<Value *, 8> InArgs(Args.begin(), Args.end());
+  InArgs.push_back(
+      B.getIntN(Args.back()->getType()->getIntegerBitWidth(), TailPolicy));
+  return B.CreateIntrinsic(ID, Types, InArgs, FMFSource, Name);
+}
+
+llvm::Value *TargetInfoRISCV::createScalableInsertElement(
+    llvm::IRBuilder<> &B, vecz::VectorizationContext &Ctx,
+    llvm::Instruction *origInsert, llvm::Value *elt, llvm::Value *into,
+    llvm::Value *index, llvm::Value *VL) const {
+  // In RISCV, we can use vrgather_vv and vrgatherei16_vv to avoid going through
+  // memory when creating this operation.
+  // vrgather: vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]];
+  //     or,
+  // vrgather: res[i] = (idxs[i] >= VLMAX) ? 0 : src[idxs[i]];
+  // An example: insertelement <4 x i8>, X, I - vectorized by <vscale x 1 x i8>
+  // - we receive here as packetized arguments:
+  //   into: <vscale x 4 x i8>  (the packetized <4 x i8> vectors)
+  //   elt:  <vscale x 1 x i8>  (the packetized values <X, Y, Z, ...>)
+  //   idxs: <vscale x 1 x i32> (the packetized indices <I, J, K, ...>)
+  // We want to construct operands such that we have:
+  //   into: as before
+  //   elt:  <X, Y, Z, ...> widened to <vscale x 4 x i8>
+  //   mask: true where the elts indices are to be inserted according to the
+  //         indices, e.g.,
+  //         <0,1,0,0, 0,0,0,1, 1,0,0,0, ...>    (<vscale x 4 x i1>)
+  //   idxs: <0,I,0,0, 0,0,0,J+4, K+8,...>       (<vscale x 4 x i32>)
+  // So that vrgather inserts X into the Ith element of the first 4 elements, Y
+  // into the Jth element of the second 4, etc:
+  //   res:  <?,X,?,?, ?,?,?,Y, Z,?,?,?, ...>
+  // If instead we use a masked vrgather with the same mask as before and with
+  // a merge operand of 'into', we expect the blended operation to be correct:
+  //   res:  <a,X,c,d, e,f,g,Y, Z,j,k,l, ...>
+  auto *const eltTy = elt->getType();
+  auto *const intoTy = into->getType();
+
+  Intrinsic::ID intrinsicID;
+  unsigned intrIdxBitWidth;
+  std::tie(intrinsicID, intrIdxBitWidth) =
+      getGatherIntrinsic(intoTy, /*isMasked*/ true);
+
+  const auto eltEC = multi_llvm::getVectorElementCount(eltTy);
+  const auto intoEC = multi_llvm::getVectorElementCount(intoTy);
+  const auto fixedAmt =
+      multi_llvm::getVectorElementCount(origInsert->getType());
+  assert(!fixedAmt.isScalable() && "Scalable pre-packetized value?");
+
+  auto *indexEltTy = B.getIntNTy(intrIdxBitWidth);
+  Type *const indexVecTy = VectorType::get(indexEltTy, eltEC);
+
+  // We cannot use this optimization if the types are not legal in the target
+  // machine.
+  if (!isOperationLegal(intrinsicID, {intoTy})) {
+    return TargetInfo::createScalableInsertElement(B, Ctx, origInsert, elt,
+                                                   into, index, VL);
+  }
+
+  auto *const avl = getIntrinsicVL(B, VL, intoTy, getTargetMachine());
+
+  auto *const indexTy = index->getType();
+  const unsigned idxBitWidth = indexTy->getScalarSizeInBits();
+  const bool indexIsVector = indexTy->isVectorTy();
+
+  // The intrinsic may demand a larger index type than we currently have;
+  // extend up to the right type.
+  if (idxBitWidth != intrIdxBitWidth) {
+    index = B.CreateZExtOrTrunc(index, indexIsVector ? indexVecTy : indexEltTy);
+  }
+
+  // If the index is uniform, it may not be a vector. We need one for the
+  // intrinsic, so splat it here.
+  if (!indexIsVector) {
+    index = B.CreateVectorSplat(intoEC, index);
+  } else {
+    index = createInnerScalableBroadcast(B, index, VL, fixedAmt);
+  }
+
+  auto *const zero = B.getInt64(0);
+
+  auto *const intrEltTy =
+      VectorType::get(multi_llvm::getVectorElementType(elt->getType()), intoEC);
+  elt = B.CreateInsertVector(intrEltTy, PoisonValue::get(intrEltTy), elt, zero,
+                             "vs2");
+
+  auto *steps = B.CreateStepVector(VectorType::get(indexEltTy, intoEC));
+
+  // Create our inner indices, e.g.: <0,1,2,3, 0,1,2,3, 0,1,2,3, ...>
+  auto *const innerIndices = B.CreateURem(
+      steps,
+      ConstantVector::getSplat(
+          intoEC, ConstantInt::get(indexEltTy, fixedAmt.getFixedValue())));
+
+  // Create our outer indices, e.g., <0,0,0,0, 1,1,1,1, 2,2,2,2, ...>
+  auto *const outerIndices = B.CreateUDiv(
+      steps,
+      ConstantVector::getSplat(
+          intoEC, ConstantInt::get(indexEltTy, fixedAmt.getFixedValue())));
+
+  // Now compare the insert indices with the inner index vector: only one per
+  // N-element slice will be 'on', depending on the exact indices, e.g., if we
+  // originally have:
+  //   <1,3,0, ...>
+  // we have prepared it when constructing the indices:
+  //      <1,1,1,1, 3,3,3,3, 0,0,0,0, ...>
+  //   == <0,1,2,3, 0,1,2,3, 0,1,2,3, ...>
+  //   -> <0,1,0,0, 0,0,0,1, 1,0,0,0, ...>
+  auto *const mask = B.CreateICmpEQ(index, innerIndices, "vm");
+
+  return createRISCVMaskedIntrinsic(B, intrinsicID, {intoTy, avl->getType()},
+                                    {into, elt, outerIndices, mask, avl},
+                                    /*TailUndisturbed*/ 1);
+}
+
+llvm::Value *TargetInfoRISCV::createVectorShuffle(llvm::IRBuilder<> &B,
+                                                  llvm::Value *src,
+                                                  llvm::Value *mask,
+                                                  llvm::Value *VL) const {
+  // In RISCV, we can use vrgather_vv and vrgatherei16_vv to avoid going through
+  // memory when creating this operation.
+  assert(isa<VectorType>(src->getType()) &&
+         "TargetInfoRISCV::createVectorShuffle: source must have vector type");
+  assert(isa<VectorType>(mask->getType()) &&
+         "TargetInfoRISCV::createVectorShuffle: mask must have vector type");
+
+  auto *const srcTy = cast<VectorType>(src->getType());
+  if (isa<Constant>(mask)) {
+    // Special case if the mask happens to be a constant.
+    return B.CreateShuffleVector(src, PoisonValue::get(srcTy), mask);
+  }
+
+  if (isa<FixedVectorType>(srcTy)) {
+    // The gather intrinsics don't work with fixed vectors.
+    return TargetInfo::createVectorShuffle(B, src, mask, VL);
+  }
+
+  auto *const maskTy = cast<VectorType>(mask->getType());
+  const auto srcEC = multi_llvm::getVectorElementCount(srcTy);
+  const auto resEC = multi_llvm::getVectorElementCount(maskTy);
+
+  auto *const resTy = VectorType::get(srcTy->getElementType(), resEC);
+
+  // We can't create the intrinsics with a scalar size smaller than 8 bits, so
+  // extend it to i8, perform the shuffle, and truncate the result back.
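+  // e.g. (illustrative): an i1 predicate vector is widened to
+  // <vscale x N x i8>, shuffled via vrgather, and truncated back to i1;
+  // the zext/trunc round trip preserves the 0/1 lane values exactly.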
+  if (srcTy->getScalarSizeInBits() < 8) {
+    auto *const fix = B.CreateZExt(src, VectorType::get(B.getInt8Ty(), srcEC));
+    auto *const res = createVectorShuffle(B, fix, mask, VL);
+    return B.CreateTrunc(res, resTy);
+  }
+
+  Intrinsic::ID intrinsicID;
+  unsigned intrIdxBitWidth;
+  std::tie(intrinsicID, intrIdxBitWidth) = getGatherIntrinsic(srcTy);
+
+  auto *const indexEltTy = B.getIntNTy(intrIdxBitWidth);
+  auto *const indexVecTy = VectorType::get(indexEltTy, resEC);
+
+  // We cannot use this optimization if the types are not legal in the target
+  // machine.
+  if (!isOperationLegal(intrinsicID, {srcTy})) {
+    return TargetInfo::createVectorShuffle(B, src, mask, VL);
+  }
+
+  // The intrinsic may demand a larger index type than we currently have;
+  // extend up to the right type.
+  if (indexVecTy != maskTy) {
+    mask = B.CreateZExtOrTrunc(mask, indexVecTy);
+  }
+
+  auto *const zero = B.getInt64(0);
+
+  const bool same = (resEC == srcEC);
+  const bool narrow = !same && (srcEC.isScalable() || !resEC.isScalable()) &&
+                      resEC.getKnownMinValue() <= srcEC.getKnownMinValue();
+  const bool widen = !same && (resEC.isScalable() || !srcEC.isScalable()) &&
+                     srcEC.getKnownMinValue() <= resEC.getKnownMinValue();
+
+  assert((srcTy == resTy || narrow || widen) &&
+         "TargetInfoRISCV::createVectorShuffle: "
+         "unexpected combination of source and mask vector types");
+
+  auto *gatherTy = resTy;
+  if (narrow) {
+    // The vrgather intrinsics need equally-sized vector types. So insert the
+    // indices into a wide dummy vector (e.g., <vscale x 4 x i16>), perform the
+    // vrgather, and extract the subvector back out again.
+    auto *const wideMaskTy = VectorType::get(indexEltTy, srcEC);
+    mask = B.CreateInsertVector(wideMaskTy, PoisonValue::get(wideMaskTy), mask,
+                                zero);
+    gatherTy = srcTy;
+  } else if (widen) {
+    // The result is wider than the source, so insert the source vector into a
+    // wider vector first.
+    src = B.CreateInsertVector(resTy, PoisonValue::get(resTy), src, zero);
+  }
+
+  auto *const avl = getIntrinsicVL(B, VL, gatherTy, getTargetMachine());
+
+  SmallVector<Value *, 4> ops;
+  // Add the pass-through operand - we set it to poison.
+  ops.push_back(PoisonValue::get(gatherTy));
+  ops.push_back(src);
+  ops.push_back(mask);
+  ops.push_back(avl);
+
+  auto *const gather =
+      B.CreateIntrinsic(intrinsicID, {gatherTy, avl->getType()}, ops);
+
+  if (narrow) {
+    return B.CreateExtractVector(resTy, gather, zero);
+  }
+  return gather;
+}
+
+llvm::Value *TargetInfoRISCV::createVectorSlideUp(llvm::IRBuilder<> &B,
+                                                  llvm::Value *src,
+                                                  llvm::Value *insert,
+                                                  llvm::Value *VL) const {
+  auto *const srcTy = dyn_cast<VectorType>(src->getType());
+  assert(srcTy &&
+         "TargetInfoRISCV::createVectorSlideUp: source must have vector type");
+
+  if (isa<FixedVectorType>(srcTy)) {
+    // The slide1up intrinsics don't work with fixed vectors.
+    return TargetInfo::createVectorSlideUp(B, src, insert, VL);
+  }
+
+  const auto intrinsicID = getSlideUpIntrinsic(srcTy);
+
+  auto *const avl = getIntrinsicVL(B, VL, srcTy, getTargetMachine());
+
+  SmallVector<Value *, 4> ops;
+  // Add the pass-through operand - we set it to poison.
+  ops.push_back(PoisonValue::get(srcTy));
+  ops.push_back(src);
+  ops.push_back(insert);
+  ops.push_back(avl);
+
+  return B.CreateIntrinsic(intrinsicID,
+                           {srcTy, insert->getType(), avl->getType()}, ops);
+}
+
+// This enum was copied from the RISCV backend.
+enum VLMUL : uint8_t {
+  LMUL_1 = 0,
+  LMUL_2,
+  LMUL_4,
+  LMUL_8,
+  LMUL_RESERVED,
+  LMUL_F8,
+  LMUL_F4,
+  LMUL_F2
+};
+
+Value *TargetInfoRISCV::createVPKernelWidth(IRBuilder<> &B,
+                                            Value *RemainingIters,
+                                            unsigned WidestEltTy,
+                                            ElementCount VF) const {
+  // The widest element type can only be one of the supported legal RVV vector
+  // element types.
+  if (WidestEltTy < 8 || WidestEltTy > 64 || !isPowerOf2_32(WidestEltTy)) {
+    return nullptr;
+  }
+  const auto KnownMin = VF.getKnownMinValue();
+  // The vectorization factor must be scalable and a legal vsetvli amount: no
+  // greater than the maximum vector length for each element width:
+  // nxv64i8, nxv32i16, nxv16i32, nxv8i64.
+  if (!VF.isScalable() || !isPowerOf2_32(KnownMin) ||
+      KnownMin > MaxLegalVectorTypeBits / WidestEltTy) {
+    return nullptr;
+  }
+
+  unsigned LMUL = 0;
+  const unsigned MaxLegalElementWidth = 64;
+
+  if ((WidestEltTy * KnownMin) / MaxLegalElementWidth) {
+    // Non-fractional LMULs.
+    LMUL = Log2_64((WidestEltTy * KnownMin) / MaxLegalElementWidth);
+  } else {
+    // Fractional LMULs.
+    const auto Fraction = MaxLegalElementWidth / (WidestEltTy * KnownMin);
+    if (Fraction == 2) {
+      LMUL = LMUL_F2;
+    } else if (Fraction == 4) {
+      LMUL = LMUL_F4;
+    } else if (Fraction == 8) {
+      LMUL = LMUL_F8;
+    } else {
+      return nullptr;
+    }
+  }
+
+  auto *const VLMul = B.getInt64(LMUL);
+  auto *const VSEW = B.getInt64(Log2_64(WidestEltTy) - 3);
+
+  auto *const I32Ty = Type::getInt32Ty(B.getContext());
+  auto *const I64Ty = Type::getInt64Ty(B.getContext());
+
+  auto *const VL = B.CreateIntrinsic(Intrinsic::RISCVIntrinsics::riscv_vsetvli,
+                                     {I64Ty}, {RemainingIters, VSEW, VLMul});
+
+  return B.CreateTrunc(VL, I32Ty);
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
new file mode 100644
index 0000000000000..0c16da1c10106
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
@@ -0,0 +1,172 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/mangling.h>
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/StringSwitch.h>
+#include <llvm/Support/raw_ostream.h>
+
+#include "vecz/vecz_choices.h"
+
+using namespace llvm;
+
+namespace {
+using namespace vecz;
+static const VectorizationChoices::ChoiceInfo choicesArray[] = {
+    {"PacketizeUniform", VectorizationChoices::eOptimizationPacketizeUniform,
+     "Packetizes all packetizable instructions whether they are varying or "
+     "not."},
+
+    {"PacketizeUniformInLoops",
+     VectorizationChoices::eOptimizationPacketizeUniformInLoops,
+     "Packetizes all packetizable instructions in loops, whether they are "
+     "varying or not."},
+
+    {"InstantiateCallsInLoops",
+     VectorizationChoices::eOptimizationInstantiateCallsInLoops,
+     "Uses loops to instantiate call instructions, instead of duplication."},
+
+    {"LinearizeBOSCC", VectorizationChoices::eLinearizeBOSCC,
+     "Control Flow Conversion uses Branch On Superword Condition Code."},
+
+    {"FullScalarization", VectorizationChoices::eFullScalarization,
+     "The scalarization pass scalarizes everything it can, regardless of any "
+     "performance benefit."},
+
+    {"DivisionExceptions", VectorizationChoices::eDivisionExceptions,
+     "Specify this when the target throws hardware exceptions on integer "
+     "division by zero."},
+
+    {"VectorPredication", VectorizationChoices::eVectorPredication,
+     "Generate a vector-predicated kernel safe to run on any workgroup size, "
+     "even those smaller than the vectorization width."},
+
+    {"TargetIndependentPacketization",
+     VectorizationChoices::eTargetIndependentPacketization,
+     "Force target-independent packetization choices (e.g., for testing "
+     "purposes)."},
+};
+
+} // namespace
+
+namespace vecz {
+
+VectorizationChoices::VectorizationChoices() {}
+
+bool VectorizationChoices::parseChoicesString(StringRef Str) {
+  // If the string is empty, our work here is done.
+  if (Str.empty()) {
+    return true;
+  }
+
+  // first = Choice, second = enable
+  using ChoiceValuePair = std::pair<Choice, bool>;
+  // The lexer implementation from the name mangling module is fairly generic,
+  // so we will use it here.
+  compiler::utils::Lexer L(Str);
+  // We support multiple separators in case of platform-dependent issues.
+  const StringRef Separators = ":;,";
+  // All the parsed choices will be stored in a set and will only be
+  // enabled/disabled after the parsing has been completed successfully.
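+  // An example input (illustrative):
+  //   CODEPLAY_VECZ_CHOICES="LinearizeBOSCC;noFullScalarization"
+  // enables LinearizeBOSCC and explicitly disables FullScalarization.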
+  SmallVector<ChoiceValuePair> ParsedChoices;
+
+  // Start by lexing and parsing the Choices string
+
+  bool read_separator = false;
+  do {
+    StringRef ParsedChoice;
+    // Strip any leading whitespace
+    L.ConsumeWhitespace();
+    // If we have reached the end of the string, we are done
+    if (L.Left() == 0) {
+      break;
+    }
+    // Consume the optional "no" prefix, which disables the given Choice
+    const bool disable = L.Consume("no");
+    // Consume the Choice name
+    if (L.ConsumeAlphanumeric(ParsedChoice)) {
+      // Convert the string to a Choice value
+      const Choice C = fromString(ParsedChoice);
+      if (C == eInvalid) {
+        printChoicesParseError(Str, L.CurrentPos() - ParsedChoice.size(),
+                               "Invalid Choice \"" + ParsedChoice + "\"");
+        return false;
+      }
+      ParsedChoices.push_back(std::make_pair(C, !disable));
+    } else {
+      printChoicesParseError(Str, L.CurrentPos(), "Expected Choice");
+      return false;
+    }
+    // Strip any trailing whitespace
+    L.ConsumeWhitespace();
+    // Consume the separator (if any)
+    read_separator = false;
+    auto Current = L.Current();
+    if (Current != -1 && Separators.contains(char(Current))) {
+      L.Consume(1);
+      read_separator = true;
+    }
+  } while (read_separator && L.Left() > 0);
+
+  // If there is any string left, there must be some kind of mistake
+  if (L.Left() != 0) {
+    printChoicesParseError(Str, L.CurrentPos(), "Expected ';'");
+    return false;
+  }
+
+  // Set all the choices parsed in the loop
+
+  for (auto C : ParsedChoices) {
+    if (C.second) {
+      enable(C.first);
+    } else {
+      disable(C.first);
+    }
+  }
+
+  // We have finished successfully
+
+  return true;
+}
+
+VectorizationChoices::Choice VectorizationChoices::fromString(StringRef Str) {
+  auto Choose = StringSwitch<Choice>(Str);
+  for (const auto &info : ArrayRef(choicesArray)) {
+    Choose.Case(info.name, info.number);
+  }
+  return Choose.Default(eInvalid);
+}
+
+ArrayRef<VectorizationChoices::ChoiceInfo>
+VectorizationChoices::queryAvailableChoices() {
+  return ArrayRef(choicesArray);
+}
+
+void VectorizationChoices::printChoicesParseError(StringRef Input,
+                                                  unsigned Position,
+                                                  Twine Msg) {
+  errs() << "CODEPLAY_VECZ_CHOICES parsing error: " << Msg << " at position "
+         << Position << "\n";
+  errs() << " " << Input << "\n ";
+  // We use the range [1, Position) instead of [0, Position - 1) to avoid
+  // an underflow in the case of Position = 0
+  for (unsigned i = 1; i < Position; ++i) {
+    errs() << ' ';
+  }
+  errs() << "^\n";
+}
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
new file mode 100644
index 0000000000000..a90ce8d767048
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -0,0 +1,1283 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "vectorization_context.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "llvm_helpers.h" +#include "memory_operations.h" +#include "transform/packetization_helpers.h" +#include "vectorization_helpers.h" +#include "vectorization_unit.h" +#include "vecz/vecz_choices.h" +#include "vecz/vecz_target_info.h" + +#define DEBUG_TYPE "vecz" + +using namespace llvm; +using namespace vecz; + +STATISTIC(VeczContextFailBuiltin, + "Context: builtins with no vector equivalent [ID#V84]"); +STATISTIC(VeczContextFailScalarizeCall, + "Context: non-scalarizable vector builtin [ID#V86]"); + +/// @brief Prefix used to distinguish internal vecz builtins from OpenCL +/// builtins and user functions. +const char *VectorizationContext::InternalBuiltinPrefix = "__vecz_b_"; + +VectorizationContext::VectorizationContext(llvm::Module &target, + TargetInfo &vti, + compiler::utils::BuiltinInfo &bi) + : VTI(vti), Module(target), BI(bi), DL(&Module.getDataLayout()) {} + +TargetTransformInfo +VectorizationContext::getTargetTransformInfo(Function &F) const { + auto *const TM = targetInfo().getTargetMachine(); + if (TM) { + return TM->getTargetTransformInfo(F); + } else { + return TargetTransformInfo(F.getParent()->getDataLayout()); + } +} + +VectorizationUnit *VectorizationContext::getActiveVU(const Function *F) const { + const auto I = ActiveVUs.find(F); + if (I == ActiveVUs.end()) { + return nullptr; + } + VectorizationUnit *VU = I->second; + assert(VU->vectorizedFunction() == F); + return VU; +} + +compiler::utils::BuiltinInfo &VectorizationContext::builtins() { return BI; } + +const compiler::utils::BuiltinInfo &VectorizationContext::builtins() const { + return BI; +} + +VectorizationUnit *VectorizationContext::createVectorizationUnit( + llvm::Function &F, ElementCount VF, unsigned Dimension, + const VectorizationChoices &Ch) { + KernelUnits.push_back( + std::make_unique(F, VF, Dimension, *this, Ch)); + return KernelUnits.back().get(); +} + +bool VectorizationContext::isVector(const Instruction &I) { + if (I.getType()->isVectorTy()) { + return true; + } + for (const Use &op : I.operands()) { + if (op->getType()->isVectorTy()) { + return true; + } + } + return false; +} + +bool VectorizationContext::canExpandBuiltin(const Function *ScalarFn) const { + // Builtins that return no value must have side-effects. + if (ScalarFn->getReturnType()->isVoidTy()) { + return false; + } + for (const Argument &Arg : ScalarFn->args()) { + // Most builtins that take pointers have side-effects. Be conservative. + if (Arg.getType()->isPointerTy()) { + return false; + } + } + return true; +} + +VectorizationResult & +VectorizationContext::getOrCreateBuiltin(llvm::Function &F, + unsigned SimdWidth) { + compiler::utils::BuiltinInfo &BI = builtins(); + const auto Cached = VectorizedBuiltins.find(&F); + if (Cached != VectorizedBuiltins.end()) { + const auto Found = Cached->second.find(SimdWidth); + if (Found != Cached->second.end()) { + return Found->second; + } + } + + auto &result = VectorizedBuiltins[&F][SimdWidth]; + + const auto Builtin = BI.analyzeBuiltin(F); + if (!Builtin) { + ++VeczContextFailBuiltin; + return result; + } + + // Try to find a vector equivalent for the builtin. + Function *const VectorCallee = + isInternalBuiltin(&F) + ? 
getInternalVectorEquivalent(&F, SimdWidth)
+          : BI.getVectorEquivalent(*Builtin, SimdWidth, &Module);
+
+  if (!VectorCallee) {
+    ++VeczContextFailBuiltin;
+    return result;
+  }
+
+  result.func = VectorCallee;
+
+  // Gather information about the function's arguments.
+  const auto Props = Builtin->properties;
+  unsigned i = 0;
+  for (const Argument &Arg : F.args()) {
+    Type *pointerRetPointeeTy = nullptr;
+    VectorizationResult::Arg::Kind kind = VectorizationResult::Arg::SCALAR;
+
+    if (Arg.getType()->isPointerTy()) {
+      pointerRetPointeeTy =
+          compiler::utils::getPointerReturnPointeeTy(F, Props);
+      kind = VectorizationResult::Arg::POINTER_RETURN;
+    } else {
+      kind = VectorizationResult::Arg::VECTORIZED;
+    }
+    result.args.emplace_back(kind, VectorCallee->getArg(i)->getType(),
+                             pointerRetPointeeTy);
+    i++;
+  }
+  return result;
+}
+
+VectorizationResult
+VectorizationContext::getVectorizedFunction(Function &callee,
+                                            ElementCount factor) {
+  VectorizationResult result;
+  if (factor.isScalable()) {
+    // We can't vectorize builtins by a scalable factor yet.
+    return result;
+  }
+
+  auto simdWidth = factor.getFixedValue();
+  if (auto *vecTy = dyn_cast<FixedVectorType>(callee.getReturnType())) {
+    Function *scalarEquiv = nullptr;
+    if (const auto Builtin = BI.analyzeBuiltin(callee)) {
+      scalarEquiv = builtins().getScalarEquivalent(*Builtin, &Module);
+    }
+    if (!scalarEquiv) {
+      ++VeczContextFailScalarizeCall;
+      return VectorizationResult();
+    }
+
+    auto scalarWidth = vecTy->getNumElements();
+
+    result = getOrCreateBuiltin(*scalarEquiv, simdWidth * scalarWidth);
+  } else {
+    result = getOrCreateBuiltin(callee, simdWidth);
+  }
+  return result;
+}
+
+bool VectorizationContext::isInternalBuiltin(const Function *F) {
+  return F->getName().starts_with(VectorizationContext::InternalBuiltinPrefix);
+}
+
+Function *VectorizationContext::getOrCreateInternalBuiltin(StringRef Name,
+                                                           FunctionType *FT) {
+  Function *F = Module.getFunction(Name);
+  if (!F && FT) {
+    F = dyn_cast_or_null<Function>(
+        Module.getOrInsertFunction(Name, FT).getCallee());
+    if (F) {
+      // Set some default attributes on the function.
+      // We never use exceptions
+      F->addFnAttr(Attribute::NoUnwind);
+      // Recursion is not supported in ComputeMux
+      F->addFnAttr(Attribute::NoRecurse);
+    }
+  }
+
+  return F;
+}
+
+Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) {
+  Function *F = CI->getCalledFunction();
+  if (!F) {
+    F = dyn_cast<Function>(CI->getCalledOperand()->stripPointerCasts());
+  }
+  VECZ_FAIL_IF(!F);  // TODO: Support indirect function calls.
+  LLVMContext &ctx = F->getContext();
+
+  // We will handle printf statements, but handling every possible vararg
+  // function can become too complex, among other things because name mangling
+  // with arbitrary types is non-trivial. printf is the only vararg OpenCL
+  // builtin, so only user functions are affected by this.
+  const bool isVarArg = F->isVarArg();
+  VECZ_FAIL_IF(isVarArg && F->getName() != "printf");
+  // Copy the argument types. This is done from the CallInst instead of the
+  // called Function because the called Function might be a VarArg function, in
+  // which case we need to create the wrapper with the expanded argument list.
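+  // The masked wrapper takes the original call's arguments plus a trailing
+  // i1 mask; when the mask is false, the wrapped call is skipped and a
+  // default value is returned instead (see the blocks created below).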
+  SmallVector<Type *> argTys;
+  for (const auto &U : CI->args()) {
+    argTys.push_back(U->getType());
+  }
+  AttributeList fnAttrs = F->getAttributes();
+  unsigned firstImmArg;
+  const bool hasImmArg =
+      F->isIntrinsic() &&
+      fnAttrs.hasAttrSomewhere(Attribute::ImmArg, &firstImmArg);
+  if (hasImmArg) {
+    firstImmArg -= AttributeList::FirstArgIndex;
+    // We can only handle a single `i1` `ImmArg` parameter. If we outgrow this
+    // limitation we need a different approach to the single inner branch.
+    int count = 0;
+    for (unsigned i = firstImmArg, n = argTys.size(); i < n; ++i) {
+      if (!fnAttrs.hasAttributeAtIndex(AttributeList::FirstArgIndex + i,
+                                       Attribute::ImmArg)) {
+        continue;
+      }
+      // We only support a single ImmArg, and it must be of i1 type
+      if (count++ || argTys[i] != Type::getInt1Ty(ctx)) {
+        return nullptr;
+      }
+      fnAttrs = fnAttrs.removeAttributeAtIndex(
+          ctx, AttributeList::FirstArgIndex + i, Attribute::ImmArg);
+    }
+  }
+  // Add one extra argument for the mask
+  argTys.push_back(Type::getInt1Ty(ctx));
+  // Generate the function name
+  compiler::utils::NameMangler mangler(&ctx);
+  const SmallVector<compiler::utils::TypeQualifiers> quals(
+      argTys.size(), compiler::utils::TypeQualifiers());
+  std::string newFName;
+  raw_string_ostream O(newFName);
+  O << VectorizationContext::InternalBuiltinPrefix << "masked_" << F->getName();
+  // We need to mangle the names of the vararg masked functions, since we will
+  // generate different masked functions for invocations with different
+  // argument types. For non-vararg functions, we don't need the mangling so we
+  // skip it.
+  if (isVarArg) {
+    O << "_";
+    for (auto T : argTys) {
+      VECZ_FAIL_IF(!mangler.mangleType(
+          O, T,
+          compiler::utils::TypeQualifiers(compiler::utils::eTypeQualNone)));
+    }
+  }
+  O.flush();
+  // Check if we have a masked version already
+  auto maskedVersion = MaskedVersions.find(newFName);
+  if (maskedVersion != MaskedVersions.end()) {
+    LLVM_DEBUG(dbgs() << "vecz: Found existing masked function " << newFName
+                      << "\n");
+    return maskedVersion->second;
+  }
+  // Create the function type
+  FunctionType *newFunctionTy =
+      FunctionType::get(F->getReturnType(), argTys, false);
+  Function *newFunction = Function::Create(
+      newFunctionTy, GlobalValue::PrivateLinkage, newFName, F->getParent());
+  const CallingConv::ID cc = CI->getCallingConv();
+  LLVM_DEBUG(dbgs() << "vecz: Created masked function " << newFName << "\n");
+
+  // Create the function's basic blocks
+  BasicBlock *entryBlock = BasicBlock::Create(ctx, "entry", newFunction);
+  BasicBlock *activeBlock = BasicBlock::Create(ctx, "active", newFunction);
+  BasicBlock *mergeBlock = BasicBlock::Create(ctx, "exit", newFunction);
+
+  // Collect the arguments for the call to the original (wrapped) function
+  SmallVector<Value *> CIArgs;
+  for (Value &arg : newFunction->args()) {
+    CIArgs.push_back(&arg);
+  }
+  // Remove the mask argument
+  CIArgs.pop_back();
+
+  FunctionType *FTy = CI->getFunctionType();
+  const AttributeList callAttrs = CI->getAttributes();
+  SmallVector<std::pair<Value *, BasicBlock *>, 4> PhiOperands;
+  if (hasImmArg) {
+    Value *immArg = newFunction->getArg(firstImmArg);
+    BasicBlock *const immTrueBB =
+        BasicBlock::Create(ctx, "active.imm.1", newFunction, mergeBlock);
+    CIArgs[firstImmArg] = ConstantInt::getTrue(ctx);
+    CallInst *c0 =
+        CallInst::Create(FTy, CI->getCalledOperand(), CIArgs, "", immTrueBB);
+    c0->setCallingConv(cc);
+    c0->setAttributes(callAttrs);
+    BranchInst::Create(mergeBlock, immTrueBB);
+
+    CIArgs[firstImmArg] = ConstantInt::getFalse(ctx);
+    // Now the false half
+    BasicBlock *const immFalseBB =
+        BasicBlock::Create(ctx, "active.imm.0", newFunction, mergeBlock);
+
+    CallInst *c1 =
+        CallInst::Create(FTy, CI->getCalledOperand(), CIArgs, "", immFalseBB);
+    c1->setCallingConv(cc);
+    c1->setAttributes(callAttrs);
+    BranchInst::Create(mergeBlock, immFalseBB);
+    BranchInst::Create(immTrueBB, immFalseBB, immArg, activeBlock);
+    PhiOperands.push_back({c0, immTrueBB});
+    PhiOperands.push_back({c1, immFalseBB});
+
+    // Now fix up the new function's signature. It can't be inheriting illegal
+    // attributes; only intrinsics may have the `ImmArg` Attribute. The
+    // verifier complains loudly otherwise, and then comes into our houses at
+    // night, and wrecks up the place...
+    for (unsigned i = 0, n = fnAttrs.getNumAttrSets(); i < n; ++i) {
+      fnAttrs = fnAttrs.removeAttributeAtIndex(ctx, i, Attribute::ImmArg);
+    }
+  } else {
+    // We are using the called Value instead of F because it might contain
+    // a bitcast or something, which makes the function types different.
+    CallInst *c =
+        CallInst::Create(FTy, CI->getCalledOperand(), CIArgs, "", activeBlock);
+    c->setCallingConv(cc);
+    c->setAttributes(callAttrs);
+    PhiOperands.push_back({c, activeBlock});
+    BranchInst::Create(mergeBlock, activeBlock);
+  }
+  newFunction->setCallingConv(cc);
+  newFunction->setAttributes(fnAttrs);
+
+  // Get the last argument (the mask) and use it as the branch predicate that
+  // selects between the live block and a no-op.
+  Value *mask = newFunction->arg_end() - 1;
+  BranchInst::Create(activeBlock, mergeBlock, mask, entryBlock);
+
+  Type *returnTy = F->getReturnType();
+  if (returnTy != Type::getVoidTy(ctx)) {
+    PHINode *result = PHINode::Create(returnTy, 2, "", mergeBlock);
+    for (auto &phiOp : PhiOperands) {
+      result->addIncoming(phiOp.first, phiOp.second);
+    }
+    result->addIncoming(getDefaultValue(returnTy), entryBlock);
+    ReturnInst::Create(ctx, result, mergeBlock);
+  } else {
+    ReturnInst::Create(ctx, mergeBlock);
+  }
+
+  MaskedVersions.insert(std::make_pair(newFName, newFunction));
+  insertMaskedFunction(newFunction, F);
+  return newFunction;
+}
+
+std::optional<VectorizationContext::MaskedAtomic>
+VectorizationContext::isMaskedAtomicFunction(const Function &F) const {
+  auto VFInfo = decodeVectorizedFunctionName(F.getName());
+  if (!VFInfo) {
+    return std::nullopt;
+  }
+  auto [FnNameStr, VF, Choices] = *VFInfo;
+
+  llvm::StringRef FnName = FnNameStr;
+  if (!FnName.consume_front("masked_")) {
+    return std::nullopt;
+  }
+  const bool IsCmpXchg = FnName.consume_front("cmpxchg_");
+  if (!IsCmpXchg && !FnName.consume_front("atomicrmw_")) {
+    return std::nullopt;
+  }
+  VectorizationContext::MaskedAtomic AtomicInfo;
+
+  AtomicInfo.VF = VF;
+  AtomicInfo.IsVectorPredicated = Choices.vectorPredication();
+
+  if (IsCmpXchg) {
+    AtomicInfo.IsWeak = FnName.consume_front("weak_");
+  }
+  AtomicInfo.IsVolatile = FnName.consume_front("volatile_");
+
+  AtomicInfo.BinOp = AtomicRMWInst::BinOp::BAD_BINOP;
+
+  if (!IsCmpXchg) {
+    if (auto BinOp = multi_llvm::consume_binop_with_underscore(FnName)) {
+      AtomicInfo.BinOp = *BinOp;
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  if (!FnName.consume_front("align")) {
+    return std::nullopt;
+  }
+
+  uint64_t Alignment = 0;
+  if (FnName.consumeInteger(/*Radix=*/10, Alignment)) {
+    return std::nullopt;
+  }
+
+  AtomicInfo.Align = Align(Alignment);
+
+  if (!FnName.consume_front("_")) {
+    return std::nullopt;
+  }
+
+  auto demangleOrdering = [&FnName]() -> std::optional<AtomicOrdering> {
+    if (FnName.consume_front("acquire_")) {
+      return AtomicOrdering::Acquire;
+    } else if (FnName.consume_front("acqrel_")) {
+      return AtomicOrdering::AcquireRelease;
+    } else if (FnName.consume_front("monotonic_")) {
+      return AtomicOrdering::Monotonic;
+    } else if (FnName.consume_front("notatomic_")) {
+      return AtomicOrdering::NotAtomic;
+    } else if (FnName.consume_front("release_")) {
+      return AtomicOrdering::Release;
+    } else if (FnName.consume_front("seqcst_")) {
+      return AtomicOrdering::SequentiallyConsistent;
+    } else if (FnName.consume_front("unordered_")) {
+      return AtomicOrdering::Unordered;
+    } else {
+      return std::nullopt;
+    }
+  };
+
+  if (auto Ordering = demangleOrdering()) {
+    AtomicInfo.Ordering = *Ordering;
+  } else {
+    return std::nullopt;
+  }
+
+  if (IsCmpXchg) {
+    if (auto Ordering = demangleOrdering()) {
+      AtomicInfo.CmpXchgFailureOrdering = Ordering;
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  unsigned SyncScopeID = 0;
+  if (FnName.consumeInteger(/*Radix=*/10, SyncScopeID)) {
+    return std::nullopt;
+  }
+
+  AtomicInfo.SyncScope = static_cast<SyncScope::ID>(SyncScopeID);
+
+  if (!FnName.consume_front("_")) {
+    return std::nullopt;
+  }
+
+  // Note - we just assume the rest of the builtin name is okay, here. It
+  // should be mangled types, but vecz builtins use a strange mangling system,
+  // purely for uniqueness and not to infer types. Types are always assumed to
+  // be inferrable from the function parameters.
+  AtomicInfo.PointerTy = F.getFunctionType()->getParamType(0);
+  AtomicInfo.ValTy = F.getFunctionType()->getParamType(1);
+
+  return AtomicInfo;
+}
+
+Function *VectorizationContext::getOrCreateMaskedAtomicFunction(
+    MaskedAtomic &I, const VectorizationChoices &Choices, ElementCount VF) {
+  const bool isCmpXchg = I.isCmpXchg();
+  LLVMContext &ctx = I.ValTy->getContext();
+
+  SmallVector<Type *> argTys;
+
+  argTys.push_back(I.PointerTy);
+  argTys.push_back(I.ValTy);
+  if (isCmpXchg) {
+    argTys.push_back(I.ValTy);
+  }
+  // Add one extra argument for the mask, which is always the same length
+  // (scalar or vector) as the value type.
+  auto *i1Ty = Type::getInt1Ty(ctx);
+  auto *maskTy =
+      !I.ValTy->isVectorTy()
+          ? cast<Type>(i1Ty)
+          : VectorType::get(i1Ty,
+                            cast<VectorType>(I.ValTy)->getElementCount());
+  argTys.push_back(maskTy);
+  if (Choices.vectorPredication()) {
+    argTys.push_back(Type::getInt32Ty(ctx));
+  }
+
+  std::string maskedFnName;
+  raw_string_ostream O(maskedFnName);
"masked_cmpxchg_" : "masked_atomicrmw_"); + + if (I.IsWeak) { + assert(isCmpXchg && "Bad MaskedAtomic state"); + O << "weak_"; + } + + if (I.IsVolatile) { + O << "volatile_"; + } + + if (!isCmpXchg) { + O << multi_llvm::to_string(I.BinOp) << "_"; + } + + O << "align" << I.Align.value() << "_"; + + // Mangle ordering + auto mangleOrdering = [&O](AtomicOrdering Ordering) { + switch (Ordering) { + case AtomicOrdering::Acquire: + O << "acquire"; + return; + case AtomicOrdering::AcquireRelease: + O << "acqrel"; + return; + case AtomicOrdering::Monotonic: + O << "monotonic"; + return; + case AtomicOrdering::NotAtomic: + O << "notatomic"; + return; + case AtomicOrdering::Release: + O << "release"; + return; + case AtomicOrdering::SequentiallyConsistent: + O << "seqcst"; + return; + case AtomicOrdering::Unordered: + O << "unordered"; + return; + } + + O << static_cast(Ordering); + }; + + mangleOrdering(I.Ordering); + // Failure Ordering + if (I.CmpXchgFailureOrdering) { + O << "_"; + mangleOrdering(*I.CmpXchgFailureOrdering); + } + + // Syncscope + O << "_" << static_cast(I.SyncScope) << "_"; + + // Mangle types + compiler::utils::NameMangler mangler(&ctx); + for (auto *ty : argTys) { + VECZ_FAIL_IF(!mangler.mangleType( + O, ty, + compiler::utils::TypeQualifiers(compiler::utils::eTypeQualNone))); + } + + maskedFnName = + getVectorizedFunctionName(maskedFnName, VF, Choices, /*IsBuiltin=*/true); + + Type *maskedFnRetTy = isCmpXchg ? StructType::get(I.ValTy, maskTy) : I.ValTy; + + // Create the function type + FunctionType *maskedFnTy = + FunctionType::get(maskedFnRetTy, argTys, /*isVarArg=*/false); + + return getOrCreateInternalBuiltin(maskedFnName, maskedFnTy); +} + +namespace { +std::optional> +isSubgroupScan(StringRef fnName, Type *const ty) { + compiler::utils::Lexer L(fnName); + if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) { + return std::nullopt; + } + if (!L.Consume("sub_group_scan_")) { + return std::nullopt; + } + const bool isInt = ty->isIntOrIntVectorTy(); + const bool isInclusive = L.Consume("inclusive_"); + if (isInclusive || L.Consume("exclusive_")) { + StringRef OpKind; + if (L.ConsumeAlpha(OpKind)) { + RecurKind opKind; + if (OpKind == "add") { + opKind = isInt ? RecurKind::Add : RecurKind::FAdd; + } else if (OpKind == "min") { + assert(!isInt && "unexpected internal scan builtin"); + opKind = RecurKind::FMin; + } else if (OpKind == "max") { + assert(!isInt && "unexpected internal scan builtin"); + opKind = RecurKind::FMax; + } else if (OpKind == "smin") { + opKind = RecurKind::SMin; + } else if (OpKind == "smax") { + opKind = RecurKind::SMax; + } else if (OpKind == "umin") { + opKind = RecurKind::UMin; + } else if (OpKind == "umax") { + opKind = RecurKind::UMax; + } else if (OpKind == "mul") { + opKind = isInt ? RecurKind::Mul : RecurKind::FMul; + } else if (OpKind == "and") { + opKind = RecurKind::And; + assert(isInt && "unexpected internal scan builtin"); + } else if (OpKind == "or") { + opKind = RecurKind::Or; + assert(isInt && "unexpected internal scan builtin"); + } else if (OpKind == "xor") { + opKind = RecurKind::Xor; + assert(isInt && "unexpected internal scan builtin"); + } else { + return std::nullopt; + } + const bool isVP = L.Consume("_vp"); + return std::make_tuple(isInclusive, opKind, isVP); + } + } + return std::nullopt; +} +} // namespace + +bool VectorizationContext::defineInternalBuiltin(Function *F) { + assert(F->isDeclaration() && "builtin is already defined"); + + // Handle masked memory loads and stores. 
+  if (std::optional<MemOpDesc> Desc = MemOpDesc::analyzeMemOpFunction(*F)) {
+    if (Desc->isMaskedMemOp()) {
+      return emitMaskedMemOpBody(*F, *Desc);
+    }
+
+    // Handle interleaved memory loads and stores.
+    if (Desc->isInterleavedMemOp()) {
+      return emitInterleavedMemOpBody(*F, *Desc);
+    }
+
+    // Handle masked interleaved memory loads and stores
+    if (Desc->isMaskedInterleavedMemOp()) {
+      return emitMaskedInterleavedMemOpBody(*F, *Desc);
+    }
+
+    // Handle scatter stores and gather loads.
+    if (Desc->isScatterGatherMemOp()) {
+      return emitScatterGatherMemOpBody(*F, *Desc);
+    }
+
+    // Handle masked scatter stores and gather loads.
+    if (Desc->isMaskedScatterGatherMemOp()) {
+      return emitMaskedScatterGatherMemOpBody(*F, *Desc);
+    }
+  }
+
+  // Handle subgroup scan operations.
+  if (auto scanInfo = isSubgroupScan(F->getName(), F->getReturnType())) {
+    const bool isInclusive = std::get<0>(*scanInfo);
+    const RecurKind opKind = std::get<1>(*scanInfo);
+    const bool isVP = std::get<2>(*scanInfo);
+    return emitSubgroupScanBody(*F, isInclusive, opKind, isVP);
+  }
+
+  if (auto AtomicInfo = isMaskedAtomicFunction(*F)) {
+    return emitMaskedAtomicBody(*F, *AtomicInfo);
+  }
+
+  return false;
+}
+
+bool VectorizationContext::emitMaskedMemOpBody(Function &F,
+                                               const MemOpDesc &Desc) const {
+  Value *Data = Desc.getDataOperand(&F);
+  Value *Ptr = Desc.getPointerOperand(&F);
+  Value *Mask = Desc.getMaskOperand(&F);
+  Value *VL = Desc.isVLOp() ? Desc.getVLOperand(&F) : nullptr;
+  Type *DataTy = Desc.isLoad() ? F.getReturnType() : Data->getType();
+
+  BasicBlock *Entry = BasicBlock::Create(F.getContext(), "entry", &F);
+  IRBuilder<> B(Entry);
+  Value *Result = nullptr;
+  if (Desc.isLoad()) {
+    Result =
+        VTI.createMaskedLoad(B, DataTy, Ptr, Mask, VL, Desc.getAlignment());
+    B.CreateRet(Result);
+  } else {
+    Result = VTI.createMaskedStore(B, Data, Ptr, Mask, VL, Desc.getAlignment());
+    B.CreateRetVoid();
+  }
+  VECZ_FAIL_IF(!Result);
+  return true;
+}
+
+bool VectorizationContext::emitInterleavedMemOpBody(
+    Function &F, const MemOpDesc &Desc) const {
+  return emitMaskedInterleavedMemOpBody(F, Desc);
+}
+
+bool VectorizationContext::emitMaskedInterleavedMemOpBody(
+    Function &F, const MemOpDesc &Desc) const {
+  Value *Data = Desc.getDataOperand(&F);
+  auto *const Ptr = Desc.getPointerOperand(&F);
+  VECZ_FAIL_IF(!isa<VectorType>(Desc.getDataType()) || !Ptr);
+
+  auto *const Mask = Desc.getMaskOperand(&F);
+  auto *const VL = Desc.isVLOp() ? Desc.getVLOperand(&F) : nullptr;
+  const auto Align = Desc.getAlignment();
+  const auto Stride = Desc.getStride();
+
+  BasicBlock *Entry = BasicBlock::Create(F.getContext(), "entry", &F);
+  IRBuilder<> B(Entry);
+
+  // If the mask is missing, assume that this is a normal interleaved memop
+  // that we want to emit as an unmasked interleaved memop
+  if (Desc.isLoad()) {
+    auto *const Result =
+        Mask ? VTI.createMaskedInterleavedLoad(B, F.getReturnType(), Ptr, Mask,
+                                               Stride, VL, Align)
+             : VTI.createInterleavedLoad(B, F.getReturnType(), Ptr, Stride, VL,
+                                         Align);
+    VECZ_FAIL_IF(!Result);
+    B.CreateRet(Result);
+  } else {
+    auto *const Result =
+        Mask ? VTI.createMaskedInterleavedStore(B, Data, Ptr, Mask, Stride, VL,
+                                                Align)
+             : VTI.createInterleavedStore(B, Data, Ptr, Stride, VL, Align);
+    VECZ_FAIL_IF(!Result);
+    B.CreateRetVoid();
+  }
+  return true;
+}
+
+bool VectorizationContext::emitScatterGatherMemOpBody(
+    Function &F, const MemOpDesc &Desc) const {
+  return emitMaskedScatterGatherMemOpBody(F, Desc);
+}
+
+bool VectorizationContext::emitMaskedScatterGatherMemOpBody(
+    Function &F, const MemOpDesc &Desc) const {
+  Value *Data = Desc.getDataOperand(&F);
+  auto *const VecDataTy = dyn_cast<VectorType>(Desc.getDataType());
+  auto *const Ptr = Desc.getPointerOperand(&F);
+  VECZ_FAIL_IF(!VecDataTy || !Ptr);
+
+  auto *const Mask = Desc.getMaskOperand(&F);
+  auto *const VL = Desc.isVLOp() ? Desc.getVLOperand(&F) : nullptr;
+  const auto Align = Desc.getAlignment();
+
+  BasicBlock *Entry = BasicBlock::Create(F.getContext(), "entry", &F);
+  IRBuilder<> B(Entry);
+
+  // If the mask is missing, assume that this is a normal scatter/gather memop
+  // that we want to emit as an unmasked scatter/gather memop
+  if (Desc.isLoad()) {
+    auto *const Result =
+        Mask ? VTI.createMaskedGatherLoad(B, VecDataTy, Ptr, Mask, VL, Align)
+             : VTI.createGatherLoad(B, VecDataTy, Ptr, VL, Align);
+    VECZ_FAIL_IF(!Result);
+    B.CreateRet(Result);
+  } else {
+    auto *const Result =
+        Mask ? VTI.createMaskedScatterStore(B, Data, Ptr, Mask, VL, Align)
+             : VTI.createScatterStore(B, Data, Ptr, VL, Align);
+    VECZ_FAIL_IF(!Result);
+    B.CreateRetVoid();
+  }
+  return true;
+}
+
+// Emit a subgroup scan operation.
+// If the vectorization factor is fixed, we can do a scan in log2(N) steps,
+// by noting that an inclusive scan can be split into two, and recombined into
+// a single result by adding the last element of the first half onto every
+// element of the second half. To deal with exclusive scans, we rotate the
+// result by one element and insert the neutral element at the beginning.
+//
+// For now, when using a scalable vectorization factor, this takes the form of
+// a simple loop that accumulates the scan operation in scalar form, extracting
+// and inserting elements of the resulting vector on each iteration:
+//   %v = <A, B, C, D>
+// Iteration 0:
+//   %e.0 = extractelement %v, 0            (A)
+//   %s.0 = add N, %e.0                     (A)
+//   %v.0 = insertelement poison, %s.0, 0   (<A, _, _, _>)
+// Iteration 1:
+//   %e.1 = extractelement %v, 1            (B)
+//   %s.1 = add %s.0, %e.1                  (A+B)
+//   %v.1 = insertelement %v.0, %s.1, 1     (<A, A+B, _, _>)
+// Iteration 2:
+//   %e.2 = extractelement %v, 2            (C)
+//   %s.2 = add %s.1, %e.2                  (A+B+C)
+//   %v.2 = insertelement %v.1, %s.2, 2     (<A, A+B, A+B+C, _>)
+// Iteration 3:
+//   %e.3 = extractelement %v, 3            (D)
+//   %s.3 = add %s.2, %e.3                  (A+B+C+D)
+//   %v.3 = insertelement %v.2, %s.3, 3     (<A, A+B, A+B+C, A+B+C+D>)
+// Result:
+//   %v.3 = <A, A+B, A+B+C, A+B+C+D>
+//
+// Exclusive scans operate by pre-filling the vector with the neutral value,
+// looping from 1 onwards, and extracting from one less than the current
+// iteration:
+//   %z = insertelement poison, N, 0
+// Iteration 0:
+//   %e.0 = extractelement %v, 0            (A)
+//   %s.0 = add N, %e.0                     (A)
+//   %v.0 = insertelement %z, %s.0, 1       (<N, A, _, _>)
+// This loop operates up to the VL input, if it is a vector-predicated scan.
+// Elements past the vector length will receive a default zero value.
+// Note: This method is not optimal for fixed-length code, but serves as a way
+// of producing scalable- and fixed-length vector code equivalently.
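+// As an illustration of the fixed-width fast path below: given input
+// <A, B, C, D> and neutral value x, step 1 shuffles in <x, A, x, C> and adds,
+// giving <A, A+B, C, C+D>; step 2 shuffles in <x, x, A+B, A+B> and adds,
+// giving the full inclusive scan <A, A+B, A+B+C, A+B+C+D>.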
+bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive,
+                                                RecurKind OpKind,
+                                                bool IsVP) const {
+  LLVMContext &Ctx = F.getContext();
+
+  auto *const Entry = BasicBlock::Create(Ctx, "entry", &F);
+  IRBuilder<> B(Entry);
+
+  Type *const VecTy = F.getReturnType();
+  Type *const EltTy = multi_llvm::getVectorElementType(VecTy);
+  const ElementCount EC = multi_llvm::getVectorElementCount(VecTy);
+
+  Function::arg_iterator Arg = F.arg_begin();
+
+  Value *const Vec = Arg;
+  Value *const VL = IsVP ? ++Arg : nullptr;
+
+  // If it's not a scalable vector, we can do it the fast way.
+  if (!EC.isScalable() && !IsVP) {
+    auto *const NeutralVal = compiler::utils::getNeutralVal(OpKind, EltTy);
+    const auto Width = EC.getFixedValue();
+    auto *const UndefVal = PoisonValue::get(VecTy);
+
+    // Put the Neutral element in a vector so we can shuffle it in.
+    auto *const NeutralVec =
+        B.CreateInsertElement(UndefVal, NeutralVal, B.getInt64(0));
+
+    auto *Result = Vec;
+    unsigned N = 1u;
+
+    SmallVector<int, 16> mask(Width);
+    while (N < Width) {
+      // Build shuffle mask.
+      // The sequence of masks will be, for a width of 16
+      // (in hexadecimal for concision, where x represents the neutral value
+      // element):
+      //
+      //   x0x2x4x6x8xAxCxE
+      //   xx11xx55xx99xxDD
+      //   xxxx3333xxxxBBBB
+      //   xxxxxxxx77777777
+      //
+      const auto N2 = N << 1u;
+      auto MaskIt = mask.begin();
+      for (size_t i = 0; i < Width; i += N2) {
+        for (size_t j = 0; j < N; ++j) {
+          *MaskIt++ = Width;
+        }
+
+        const auto k = i + N - 1;
+        for (size_t j = 0; j < N; ++j) {
+          *MaskIt++ = k;
+        }
+      }
+      N = N2;
+      auto *const Shuffle =
+          createOptimalShuffle(B, Result, NeutralVec, mask, Twine("scan_impl"));
+      Result =
+          compiler::utils::createBinOpForRecurKind(B, Result, Shuffle, OpKind);
+    }
+
+    if (!IsInclusive) {
+      // If it is an exclusive scan, rotate the result.
+      auto *const IdentityVal = compiler::utils::getIdentityVal(OpKind, EltTy);
+      VECZ_FAIL_IF(!IdentityVal);
+      Result = VTI.createVectorSlideUp(B, Result, IdentityVal, VL);
+    }
+
+    B.CreateRet(Result);
+    return true;
+  }
+
+  // If the vector is scalable, we don't know the number of iterations
+  // required, so we have to use a loop and shuffle masks generated from the
+  // step vector.
+
+  auto *const IVTy = B.getInt32Ty();
+  auto *const IndexTy = VectorType::get(IVTy, EC);
+  auto *const Step = B.CreateStepVector(IndexTy, "step");
+  auto *const VZero = Constant::getNullValue(IndexTy);
+
+  auto *const Loop = BasicBlock::Create(Ctx, "loop", &F);
+  auto *const Exit = BasicBlock::Create(Ctx, "exit", &F);
+
+  // The length of the vector.
+  Value *Width = nullptr;
+  if (IsVP) {
+    Width = VL;
+  } else {
+    Width = B.CreateElementCount(IVTy, EC);
+  }
+
+  B.CreateBr(Loop);
+
+  // Loop induction starts at 1 and doubles each time.
+  auto *const IVStart = ConstantInt::get(IVTy, 1);
+
+  // Create the loop instructions
+  B.SetInsertPoint(Loop);
+
+  // The induction variable (IV) which determines both our loop bounds and our
+  // vector indices.
+  auto *N = B.CreatePHI(IVTy, 2, "iv");
+  N->addIncoming(IVStart, Entry);
+
+  // A vector phi representing the vectorized value we're building up.
+  auto *VecPhi = B.CreatePHI(VecTy, 2, "vec");
+  VecPhi->addIncoming(Vec, Entry);
+
+  // A vector phi representing the shuffle mask indices we're building up.
+ auto *MaskPhi = B.CreatePHI(IndexTy, 2, "mask.phi"); + MaskPhi->addIncoming(Step, Entry); + + // This will create shuffle masks like the following sequence: + // + // 1032547698BADCFE = (0123456789ABCDEF ^ splat(1)) + // 33117755BB99FFDD = (1032547698BADCFE ^ splat(2)) | splat(1) + // 77773333FFFFBBBB = (33117755BB99FFDD ^ splat(4)) | splat(2) + // FFFFFFFF77777777 = (77773333FFFFBBBB ^ splat(8)) | splat(4) + // + // We don't mix the neutral element into the vector in this case, but use a + // Select instruction to choose between the updated or original value, so that + // backends can lower it as a masked binary operation. The select condition + // therefore needs to be like the following sequence: + // + // 0101010101010101 + // 0011001100110011 + // 0000111100001111 + // 0000000011111111 + + auto *const SplatN = B.CreateVectorSplat(EC, N, "splatN"); + auto *const Mask = B.CreateXor(MaskPhi, SplatN, "mask"); + auto *const Shuffle = VTI.createVectorShuffle(B, VecPhi, Mask, VL); + auto *const Accum = + compiler::utils::createBinOpForRecurKind(B, VecPhi, Shuffle, OpKind); + + auto *const NBit = B.CreateAnd(MaskPhi, SplatN, "isolate"); + auto *const Which = B.CreateICmpNE(NBit, VZero, "which"); + auto *const NewVec = B.CreateSelect(Which, Accum, VecPhi, "newvec"); + + auto *const NewMask = B.CreateOr(Mask, SplatN, "newmask"); + auto *const N2 = B.CreateShl(N, ConstantInt::get(IVTy, 1), "N2", + /*HasNUW*/ true, /*HasNSW*/ true); + + VecPhi->addIncoming(NewVec, Loop); + MaskPhi->addIncoming(NewMask, Loop); + N->addIncoming(N2, Loop); + + // Loop exit condition + auto *const Cond = B.CreateICmpULT(N2, Width, "iv.cmp"); + B.CreateCondBr(Cond, Loop, Exit); + + // Function exit instructions: + B.SetInsertPoint(Exit); + + // Create an LCSSA PHI node. + auto *const ResultPhi = B.CreatePHI(VecTy, 1, "res.phi"); + ResultPhi->addIncoming(NewVec, Loop); + + Value *Result = ResultPhi; + if (!IsInclusive) { + // If it is an exclusive scan, rotate the result. + auto *const IdentityVal = compiler::utils::getIdentityVal(OpKind, EltTy); + VECZ_FAIL_IF(!IdentityVal); + Result = VTI.createVectorSlideUp(B, Result, IdentityVal, VL); + } + + B.CreateRet(Result); + return true; +} + +bool VectorizationContext::emitMaskedAtomicBody( + Function &F, const VectorizationContext::MaskedAtomic &MA) const { + LLVMContext &Ctx = F.getContext(); + const bool IsCmpXchg = MA.isCmpXchg(); + + auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F); + + IRBuilder<> B(EntryBB); + + BasicBlock *LoopEntryBB = EntryBB; + if (MA.IsVectorPredicated) { + auto *const VL = F.getArg(3 + IsCmpXchg); + // Early exit if the vector length is zero. We're going to unconditionally + // jump into the loop after this. 
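+    // (A vector-predicated masked atomic must not touch memory when VL is
+    // zero; the early-exit path simply returns poison.)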
+    auto *const EarlyExitBB = BasicBlock::Create(Ctx, "earlyexit", &F);
+    auto *const CmpZero =
+        B.CreateICmpEQ(VL, ConstantInt::get(VL->getType(), 0));
+
+    LoopEntryBB = BasicBlock::Create(Ctx, "loopentry", &F);
+
+    B.CreateCondBr(CmpZero, EarlyExitBB, LoopEntryBB);
+
+    B.SetInsertPoint(EarlyExitBB);
+    B.CreateRet(PoisonValue::get(F.getReturnType()));
+  }
+
+  B.SetInsertPoint(LoopEntryBB);
+
+  auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F);
+
+  auto *const PtrArg = F.getArg(0);
+  auto *const ValArg = F.getArg(1);
+  Value *MaskArg = F.getArg(2 + IsCmpXchg);
+
+  const bool IsVector = ValArg->getType()->isVectorTy();
+
+  Value *const IdxStart = B.getInt32(0);
+  Value *IdxEnd;
+  if (MA.IsVectorPredicated) {
+    IdxEnd = F.getArg(3 + IsCmpXchg);
+  } else {
+    IdxEnd = B.CreateElementCount(B.getInt32Ty(), MA.VF);
+  }
+
+  Value *RetVal = nullptr;
+  Value *RetSuccessVal = nullptr;
+
+  auto CreateLoopBody =
+      [&MA, &F, &ExitBB, PtrArg, ValArg, MaskArg, &RetVal, &RetSuccessVal,
+       IsVector, IsCmpXchg](BasicBlock *BB, Value *Idx, ArrayRef<Value *> IVs,
+                            MutableArrayRef<Value *> IVsNext) -> BasicBlock * {
+    IRBuilder<> IRB(BB);
+
+    Value *MaskElt = MaskArg;
+    if (IsVector) {
+      MaskElt = IRB.CreateExtractElement(MaskArg, Idx, "mask");
+    }
+    auto *const MaskCmp =
+        IRB.CreateICmpNE(MaskElt, IRB.getInt1(false), "mask.cmp");
+
+    auto *const IfBB = BasicBlock::Create(F.getContext(), "if.then", &F);
+    auto *const ElseBB = BasicBlock::Create(F.getContext(), "if.else", &F);
+
+    IRB.CreateCondBr(MaskCmp, IfBB, ElseBB);
+
+    {
+      IRB.SetInsertPoint(IfBB);
+      Value *Ptr = PtrArg;
+      Value *Val = ValArg;
+      if (IsVector) {
+        Ptr = IRB.CreateExtractElement(PtrArg, Idx, "ptr");
+        Val = IRB.CreateExtractElement(ValArg, Idx, "val");
+      }
+
+      if (IsCmpXchg) {
+        Value *NewValArg = F.getArg(2);
+        Value *NewVal = NewValArg;
+        if (IsVector) {
+          NewVal = IRB.CreateExtractElement(NewValArg, Idx, "newval");
+        }
+        auto *const CmpXchg =
+            IRB.CreateAtomicCmpXchg(Ptr, Val, NewVal, MA.Align, MA.Ordering,
+                                    *MA.CmpXchgFailureOrdering, MA.SyncScope);
+        CmpXchg->setWeak(MA.IsWeak);
+        CmpXchg->setVolatile(MA.IsVolatile);
+
+        if (IsVector) {
+          RetVal = IRB.CreateInsertElement(
+              IVs[0], IRB.CreateExtractValue(CmpXchg, 0), Idx, "retvec");
+          RetSuccessVal = IRB.CreateInsertElement(
+              IVs[1], IRB.CreateExtractValue(CmpXchg, 1), Idx, "retsuccess");
+        } else {
+          RetVal = IRB.CreateExtractValue(CmpXchg, 0);
+          RetSuccessVal = IRB.CreateExtractValue(CmpXchg, 1);
+        }
+
+      } else {
+        auto *const AtomicRMW = IRB.CreateAtomicRMW(
+            MA.BinOp, Ptr, Val, MA.Align, MA.Ordering, MA.SyncScope);
+        AtomicRMW->setVolatile(MA.IsVolatile);
+
+        if (IsVector) {
+          RetVal = IRB.CreateInsertElement(IVs[0], AtomicRMW, Idx, "retvec");
+        } else {
+          RetVal = AtomicRMW;
+        }
+      }
+
+      IRB.CreateBr(ElseBB);
+    }
+
+    {
+      IRB.SetInsertPoint(ElseBB);
+
+      auto *MergePhi = IRB.CreatePHI(RetVal->getType(), 2, "merge");
+      MergePhi->addIncoming(IVs[0], BB);
+      MergePhi->addIncoming(RetVal, IfBB);
+      RetVal = MergePhi;
+    }
+    IVsNext[0] = RetVal;
+
+    if (IsCmpXchg) {
+      auto *MergePhi =
+          IRB.CreatePHI(RetSuccessVal->getType(), 2, "mergesuccess");
+      MergePhi->addIncoming(IVs[1], BB);
+      MergePhi->addIncoming(RetSuccessVal, IfBB);
+      RetSuccessVal = MergePhi;
+      IVsNext[1] = RetSuccessVal;
+    }
+
+    // Move the exit block right to the end of the function.
+    ExitBB->moveAfter(ElseBB);
+
+    return ElseBB;
+  };
+
+  compiler::utils::CreateLoopOpts Opts;
+  {
+    Opts.IVs.push_back(PoisonValue::get(MA.ValTy));
+    Opts.loopIVNames.push_back("retvec.prev");
+  }
+  if (IsCmpXchg) {
+    Opts.IVs.push_back(PoisonValue::get(MaskArg->getType()));
+    Opts.loopIVNames.push_back("retsuccess.prev");
+  }
+  compiler::utils::createLoop(LoopEntryBB, ExitBB, IdxStart, IdxEnd, Opts,
+                              CreateLoopBody);
+
+  B.SetInsertPoint(ExitBB);
+  if (IsCmpXchg) {
+    Value *RetStruct = PoisonValue::get(F.getReturnType());
+    RetStruct = B.CreateInsertValue(RetStruct, RetVal, 0);
+    RetStruct = B.CreateInsertValue(RetStruct, RetSuccessVal, 1);
+    B.CreateRet(RetStruct);
+  } else {
+    B.CreateRet(RetVal);
+  }
+  return true;
+}
+
+Function *
+VectorizationContext::getInternalVectorEquivalent(Function *ScalarFn,
+                                                  unsigned SimdWidth) {
+  // Handle masked memory loads and stores.
+  if (!ScalarFn) {
+    return nullptr;
+  }
+  if (auto Desc = MemOpDesc::analyzeMaskedMemOp(*ScalarFn)) {
+    auto *NewDataTy = FixedVectorType::get(Desc->getDataType(), SimdWidth);
+    return getOrCreateMaskedMemOpFn(
+        *this, NewDataTy, cast<PointerType>(Desc->getPointerType()),
+        Desc->getAlignment(), Desc->isLoad(), Desc->isVLOp());
+  }
+
+  return nullptr;
+}
+
+bool VectorizationContext::isMaskedFunction(const llvm::Function *F) const {
+  return MaskedFunctionsMap.count(F) > 0;
+}
+
+bool VectorizationContext::insertMaskedFunction(llvm::Function *F,
+                                                llvm::Function *WrappedF) {
+  auto result = MaskedFunctionsMap.insert({F, WrappedF});
+  return result.second;
+}
+
+llvm::Function *
+VectorizationContext::getOriginalMaskedFunction(llvm::Function *F) {
+  auto Iter = MaskedFunctionsMap.find(F);
+  if (Iter != MaskedFunctionsMap.end()) {
+    return dyn_cast_or_null<Function>(Iter->second);
+  }
+
+  return nullptr;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+char DefineInternalBuiltinsPass::PassID = 0;
+
+PreservedAnalyses DefineInternalBuiltinsPass::run(Module &M,
+                                                  ModuleAnalysisManager &AM) {
+  llvm::FunctionAnalysisManager &FAM =
+      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+  // Remove internal builtins that may not be needed any more.
+  SmallVector<Function *> ToRemove;
+
+  bool NonePreserved = false;
+  // Implement internal builtins that we now know are needed.
+  // We find all declarations that should be builtins, and then define them if
+  // they have users that have associated vectorization units.
+  // On failure to define, we notify those vectorization units of failure
+  // and remove any partially defined body.
+  // Unused declarations are removed.
+  for (Function &F : M.functions()) {
+    if (!F.isDeclaration() || !VectorizationContext::isInternalBuiltin(&F)) {
+      continue;
+    }
+    if (F.use_empty()) {
+      ToRemove.push_back(&F);
+      NonePreserved = true;
+      continue;
+    }
+    llvm::SmallPtrSet<VectorizationUnit *, 4> UserVUs;
+    for (const Use &U : F.uses()) {
+      if (CallInst *CI = dyn_cast<CallInst>(U.getUser())) {
+        auto R = FAM.getResult<VectorizationUnitAnalysis>(*CI->getFunction());
+        if (R.hasResult()) {
+          UserVUs.insert(&R.getVU());
+        }
+      }
+    }
+    if (std::all_of(UserVUs.begin(), UserVUs.end(),
+                    [](VectorizationUnit *VU) { return VU->failed(); })) {
+      // If the vectorization has failed, we do not want to define the internal
+      // builtins, both because it's a waste of time and because we might try
+      // to instantiate some invalid builtin that would have been replaced by
+      // the packetization process.
+      continue;
+    }
+
+    VectorizationContext &Ctx = (*UserVUs.begin())->context();
+    const bool DefinedBuiltin = Ctx.defineInternalBuiltin(&F);
+    if (!DefinedBuiltin) {
+      // If we've failed to define this builtin, ensure we clean up the
+      // half-complete body. We can't simply delete it because it will have
+      // uses in the vector kernel. This will revert it to a declaration, which
+      // will be cleaned up later by the global optimizer.
+      if (!F.isDeclaration()) {
+        // defineInternalBuiltin may have partially defined the function body.
+        // Clean it up. FIXME defineInternalBuiltin should probably clean up
+        // after itself if there is a failure condition
+        F.deleteBody();
+      }
+      for (VectorizationUnit *VU : UserVUs) {
+        VU->setFailed("failed to define an internal builtin");
+      }
+      continue;
+    }
+    NonePreserved = true;
+  }
+
+  for (Function *F : ToRemove) {
+    F->eraseFromParent();
+  }
+
+  return NonePreserved ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
new file mode 100644
index 0000000000000..29d505d28369b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
@@ -0,0 +1,343 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "vectorization_helpers.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "debugging.h"
+#include "vectorization_context.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_choices.h"
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+
+Function *declareFunction(const VectorizationUnit &VU) {
+  Module &Module = VU.context().module();
+  const Function *const ScalarFn = VU.scalarFunction();
+  const ElementCount SimdWidth = VU.width();
+
+  // For kernels, the vectorized function type is the same as the original
+  // scalar function type, since function arguments are uniform. We no longer
+  // use Vectorization Units for builtins.
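+  // e.g. a kernel "foo" vectorized by a fixed factor of 4 is declared as
+  // "__vecz_v4_foo" (see getVectorizedFunctionName below).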
+  FunctionType *VectorizedFnType = VU.scalarFunction()->getFunctionType();
+  VECZ_FAIL_IF(!VectorizedFnType);
+  const std::string VectorizedName =
+      getVectorizedFunctionName(ScalarFn->getName(), SimdWidth, VU.choices());
+  Module.getOrInsertFunction(VectorizedName, VectorizedFnType);
+  auto *const VectorizedFn = Module.getFunction(VectorizedName);
+  if (VectorizedFn) {
+    VectorizedFn->setCallingConv(ScalarFn->getCallingConv());
+  }
+  return VectorizedFn;
+}
+
+/// @brief Clone the OpenCL named metadata node with name NodeName
+/// @param[in] NodeName The name of the node to clone
+///
+/// This function works with nodes that follow a specific pattern,
+/// specifically nodes that have as their operands other metadata nodes, which
+/// in turn have their first operand set to the OpenCL kernel Function. It
+/// searches for the node that contains the scalar kernel, and copies all its
+/// metadata, with the exception of the Function itself, which is replaced by
+/// the vectorized kernel.
+void cloneOpenCLNamedMetadataHelper(const VectorizationUnit &VU,
+                                    const std::string &NodeName) {
+  const Module &M = VU.context().module();
+
+  // Try to get the OpenCL metadata
+  NamedMDNode *KernelsMD = M.getNamedMetadata(NodeName);
+  if (!KernelsMD) {
+    return;
+  }
+
+  // Find which metadata node contains the metadata for the scalar function
+  MDNode *ScalarKernelMD = nullptr;
+  for (auto *KernelMD : KernelsMD->operands()) {
+    // The function name is the first operand
+    if (KernelMD->getNumOperands() > 0) {
+      // Get the Constant containing the function
+      ConstantAsMetadata *KernelNameMD =
+          dyn_cast_or_null<ConstantAsMetadata>(KernelMD->getOperand(0));
+      if (KernelNameMD) {
+        // Check if the function in the metadata is the original OpenCL kernel
+        if (KernelNameMD->getValue() == VU.scalarFunction()) {
+          ScalarKernelMD = KernelMD;
+          break;
+        }
+      }
+    }
+  }
+
+  // Did we find the correct metadata?
+  if (!ScalarKernelMD) {
+    return;
+  }
+
+  // Replace the kernel name and clone the rest of the metadata
+  SmallVector<Metadata *> KernelMDArgs;
+  KernelMDArgs.push_back(
+      llvm::ConstantAsMetadata::get(VU.vectorizedFunction()));
+  auto MDIt = ScalarKernelMD->op_begin() + 1;
+  auto MDEnd = ScalarKernelMD->op_end();
+  for (; MDIt != MDEnd; ++MDIt) {
+    KernelMDArgs.push_back(*MDIt);
+  }
+
+  // Create a new metadata node and add it to the opencl.kernels node
+  llvm::MDNode *KernelMDNode =
+      llvm::MDNode::get(VU.context().module().getContext(), KernelMDArgs);
+  KernelsMD->addOperand(KernelMDNode);
+}
+
+/// @brief Create placeholder instructions for arguments that will be
+/// vectorized. This is necessary to clone the original function's scalar code
+/// into the vectorized function.
+///
+/// @param[in,out] ValueMap Map to update with the arguments.
+SmallVector<Instruction *>
+createArgumentPlaceholders(const VectorizationUnit &VU, Function *VecFunc,
+                           ValueToValueMapTy &ValueMap) {
+  SmallVector<Instruction *> Placeholders;
+  const auto &Arguments = VU.arguments();
+  unsigned i = 0u;
+  for (Argument &DstArg : VecFunc->args()) {
+    Argument *SrcArg = Arguments[i++].OldArg;
+    DstArg.setName(SrcArg->getName());
+    if (DstArg.getType() != SrcArg->getType()) {
+      // Map old argument to a temporary placeholder to work around the
+      // difference in argument types. This usually happens when vectorizing
+      // builtin functions.
+      Type *IndexTy = Type::getInt32Ty(VecFunc->getParent()->getContext());
+      Constant *Index = Constant::getNullValue(IndexTy);
+      auto *const Placeholder = ExtractElementInst::Create(&DstArg, Index);
+      ValueMap[SrcArg] = Placeholder;
+      Placeholders.push_back(Placeholder);
+    } else {
+      ValueMap[SrcArg] = &DstArg;
+    }
+  }
+  return Placeholders;
+}
+
+} // namespace
+
+namespace vecz {
+std::string getVectorizedFunctionName(StringRef ScalarName, ElementCount VF,
+                                      VectorizationChoices Choices,
+                                      bool IsBuiltin) {
+  const Twine Prefix = Twine(VF.isScalable() ? "nxv" : "v");
+  const Twine IsVP = Twine(Choices.vectorPredication() ? "_vp_" : "_");
+  return ((IsBuiltin ? VectorizationContext::InternalBuiltinPrefix
+                     : Twine("__vecz_")) +
+          Prefix + Twine(VF.getKnownMinValue()) + IsVP + ScalarName)
+      .str();
+}
+
+std::optional<std::tuple<std::string, ElementCount, VectorizationChoices>>
+decodeVectorizedFunctionName(StringRef Name) {
+  if (!Name.consume_front(VectorizationContext::InternalBuiltinPrefix)) {
+    if (!Name.consume_front("__vecz_")) {
+      return std::nullopt;
+    }
+  }
+
+  ElementCount VF;
+  bool Scalable = false;
+  if (Name.consume_front("nxv")) {
+    Scalable = true;
+  } else if (!Name.consume_front("v")) {
+    return std::nullopt;
+  }
+
+  unsigned KnownMin = 0;
+  if (Name.consumeInteger(10, KnownMin)) {
+    return std::nullopt;
+  }
+
+  VF = ElementCount::get(KnownMin, Scalable);
+
+  VectorizationChoices Choices;
+  if (Name.consume_front("_vp_")) {
+    Choices.enableVectorPredication();
+  } else if (!Name.consume_front("_")) {
+    return std::nullopt;
+  }
+
+  return std::make_tuple(Name.str(), VF, Choices);
+}
+
+Function *cloneFunctionToVector(const VectorizationUnit &VU) {
+  auto *const VectorizedFn = declareFunction(VU);
+  VECZ_ERROR_IF(!VectorizedFn, "declareFunction failed to initialize");
+
+  auto *const ScalarFn = VU.scalarFunction();
+
+  // Map the old arguments to the new ones.
+  ValueToValueMapTy ValueMap;
+  auto Placeholders = createArgumentPlaceholders(VU, VectorizedFn, ValueMap);
+
+  // Clone the function to preserve instructions that do not need
+  // vectorization.
+  SmallVector<ReturnInst *, 4> Returns;
+
+  // Setting `moduleChanges` to true allows `llvm::CloneFunctionInto()` to do
+  // the work of cloning debug info across translation unit boundaries.
+  // However, there can be issues with inlined kernels if the inlined kernel
+  // still exists in the module and also has a vectorized variant. This value
+  // was set to true in this code since LLVM_VERSION_MAJOR > 4, but as of
+  // LLVM > 12 we need to be more careful with it, as commit 22a52dfddc
+  // introduced more nuance, with requisite assertions.
+  const bool moduleChanges = VectorizedFn->getParent() != ScalarFn->getParent();
+  auto cloneMode = moduleChanges ? CloneFunctionChangeType::DifferentModule
+                                 : CloneFunctionChangeType::LocalChangesOnly;
+  CloneFunctionInto(VectorizedFn, ScalarFn, ValueMap, cloneMode, Returns);
+
+  // Remove unwanted return value attributes.
+  if (VectorizedFn->getReturnType()->isVectorTy()) {
+    LLVMContext &Ctx = VectorizedFn->getContext();
+    AttributeList PAL = VectorizedFn->getAttributes();
+    bool RemovedAttribute = false;
+    for (const Attribute::AttrKind Kind : {Attribute::ZExt, Attribute::SExt}) {
+      if (PAL.hasRetAttr(Kind)) {
+        PAL = PAL.removeRetAttribute(Ctx, Kind);
+        RemovedAttribute = true;
+      }
+    }
+    if (RemovedAttribute) {
+      VectorizedFn->setAttributes(PAL);
+    }
+  }
+
+  // Override the base function name component for the vectorized function.
+  compiler::utils::setBaseFnName(*VectorizedFn, VectorizedFn->getName());
+
+  // Drop any metadata where the scalar kernel serves as the base or result of
+  // vectorization: this vectorized function does not serve as such (not
+  // directly, in the case of 'derived' metadata; that relationship is
+  // transitive).
+  compiler::utils::dropVeczOrigMetadata(*VectorizedFn);
+  compiler::utils::dropVeczDerivedMetadata(*VectorizedFn);
+
+  // Add any 'argument placeholder' instructions to the entry block.
+  // Skip over Alloca instructions if there are any.
+  BasicBlock &BB = VectorizedFn->getEntryBlock();
+  auto InsertPt = BB.getFirstInsertionPt();
+  while (isa<AllocaInst>(*InsertPt)) {
+    ++InsertPt;
+  }
+
+  for (auto *Placeholder : Placeholders) {
+    Placeholder->insertBefore(InsertPt);
+  }
+
+  return VectorizedFn;
+}
+
+static DILocation *getDILocation(unsigned Line, unsigned Column, MDNode *Scope,
+                                 MDNode *InlinedAt = nullptr) {
+  // If no scope is available, this is an unknown location.
+  if (!Scope)
+    return DebugLoc();
+  return DILocation::get(Scope->getContext(), Line, Column, Scope, InlinedAt,
+                         /*ImplicitCode*/ false);
+}
+
+void cloneDebugInfo(const VectorizationUnit &VU) {
+  DISubprogram *const ScalarDI = VU.scalarFunction()->getSubprogram();
+  // We don't have debug info
+  if (!ScalarDI) {
+    return;
+  }
+
+  // Create a DISubprogram entry for the vectorized kernel
+  DIBuilder DIB(*VU.scalarFunction()->getParent(), false);
+  DICompileUnit *CU =
+      DIB.createCompileUnit(dwarf::DW_LANG_OpenCL, ScalarDI->getFile(), "",
+                            ScalarDI->isOptimized(), "", 0);
+  DISubprogram *const VectorDI = DIB.createFunction(
+      CU->getFile(), ScalarDI->getName(),
+      StringRef(), /* Don't need a linkage name */
+      CU->getFile(), ScalarDI->getLine(), ScalarDI->getType(),
+      ScalarDI->getScopeLine(), ScalarDI->getFlags(), ScalarDI->getSPFlags());
+
+  // Point kernel function to a parent compile unit
+  VectorDI->replaceUnit(ScalarDI->getUnit());
+
+  VU.vectorizedFunction()->setSubprogram(VectorDI);
+
+  DIB.finalize();
+
+  // Iterate over all the instructions in the kernel looking for intrinsics
+  // containing debug info metadata that must be updated, changing the scope
+  // to point to the new vectorized function rather than the scalar function.
+
+  for (auto &BBItr : *VU.vectorizedFunction()) {
+    for (auto &InstItr : BBItr) {
+      if (InstItr.getDebugLoc()) {
+        // Update debug info line numbers to have vectorized kernel scope,
+        // taking care to preserve inlined locations.
+        const DebugLoc &ScalarLoc = InstItr.getDebugLoc();
+        DebugLoc VectorLoc;
+        if (DILocation *const InlinedLoc = ScalarLoc.getInlinedAt()) {
+          // Don't support nested inlined locations for now
+          if (!InlinedLoc->getInlinedAt()) {
+            const DebugLoc VectorKernel = getDILocation(
+                InlinedLoc->getLine(), InlinedLoc->getColumn(), VectorDI);
+            VectorLoc = getDILocation(ScalarLoc.getLine(), ScalarLoc.getCol(),
+                                      ScalarLoc.getScope(), VectorKernel);
+          }
+        } else {
+          VectorLoc =
+              getDILocation(ScalarLoc.getLine(), ScalarLoc.getCol(), VectorDI);
+        }
+        InstItr.setDebugLoc(VectorLoc);
+      }
+    }
+  }
+
+  // Replace temporary MDNode with the list of vectorized DILocals we have
+  // created. In LLVM 7.0 the variables attribute of DISubprogram was changed
+  // to retainedNodes.
+  auto *VectorizedKernelVariables = VectorDI->getRetainedNodes().get();
+  assert(VectorizedKernelVariables && "Could not get retained nodes");
+  if (VectorizedKernelVariables->isTemporary()) {
+    auto NewLocals =
+        MDTuple::getTemporary(VectorizedKernelVariables->getContext(), {});
+    VectorizedKernelVariables->replaceAllUsesWith(NewLocals.get());
+  }
+
+  return;
+}
+
+void cloneOpenCLMetadata(const VectorizationUnit &VU) {
+  cloneOpenCLNamedMetadataHelper(VU, "opencl.kernels");
+  cloneOpenCLNamedMetadataHelper(VU, "opencl.kernel_wg_size_info");
+}
+
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
new file mode 100644
index 0000000000000..afa45e4c325c0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
@@ -0,0 +1,394 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "vectorization_heuristics.h"
+
+#include
+#include
+#include
+
+#include
+
+#include "vectorization_context.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+class Heuristics {
+  enum class BrClauseKind { None = 0, True, False };
+
+public:
+  Heuristics(llvm::Function &F, VectorizationContext &Ctx, ElementCount VF,
+             unsigned SimdDimIdx)
+      : F(F), Ctx(Ctx), SimdWidth(VF), SimdDimIdx(SimdDimIdx) {}
+
+  /// @brief Look through the scalar code to find patterns that indicate
+  /// we should not vectorize the kernel; e.g.:
+  /// __kernel Type FuncName(Params) {
+  ///   if (get_global_id(0) == 0) {
+  ///     // Do something.
+  ///   }
+  ///   // Do nothing.
+  /// }
+  /// @return Whether we should vectorize the function or not.
+  bool shouldVectorize();
+
+private:
+  /// @brief Visit a branch condition, forwarding it to the Cmp visitor when
+  /// it is a comparison instruction.
+  ///
+  /// @param[in] Comp The instruction to inspect.
+  ///
+  /// @return The branch's path not to vectorize, if any.
+  BrClauseKind shouldVectorizeVisitBr(const llvm::Value *Comp) const;
+  /// @brief Visit a Cmp to check if it involves a call to an opencl builtin.
+  ///
+  /// @param[in] Cmp The comparison instruction to inspect.
+  ///
+  /// @return The branch's path not to vectorize, if any.
+  BrClauseKind shouldVectorizeVisitCmp(const llvm::CmpInst *Cmp) const;
+  /// @brief Visit the operand of a Cmp to strip it down to a
+  ///        CallInst or ConstantInt, if possible.
+  ///
+  /// @param[in] Val The instruction to inspect.
+  /// @param[in] Cmp The comparison instruction Val belongs to.
+  /// @param[in] Cache A map containing previously generated results.
+  ///
+  /// @return A CallInst or ConstantInt, nullptr otherwise.
+  const llvm::Value *shouldVectorizeVisitCmpOperand(
+      const llvm::Value *Val, const llvm::CmpInst *Cmp,
+      llvm::DenseMap<const llvm::Value *, const llvm::Value *> &Cache) const;
+  /// @brief Inspect the predicate and the operand that is compared against an
+  ///        opencl builtin to determine if it's better not to vectorize the
+  ///        kernel.
+  ///
+  /// @param[in] RHS The operand compared against an opencl builtin.
+  /// @param[in] Pred The kind of comparison.
+  ///
+  /// @return The branch's path not to vectorize, if any.
+  BrClauseKind
+  shouldVectorizeVisitCmpOperands(const llvm::Value *RHS,
+                                  llvm::CmpInst::Predicate Pred) const;
+
+  /// @brief The function to analyze.
+  llvm::Function &F;
+
+  /// @brief The vectorization context.
+  VectorizationContext &Ctx;
+
+  /// @brief Vectorization factor to use.
+  ElementCount SimdWidth;
+
+  /// @brief Vectorization dimension to use.
+  unsigned SimdDimIdx;
+};
+
+Heuristics::BrClauseKind
+Heuristics::shouldVectorizeVisitCmpOperands(const Value *RHS,
+                                            CmpInst::Predicate Pred) const {
+  // If we have an `EQ` comparison, the single lane computation happens on
+  // the true successor.
+  if (Pred == CmpInst::Predicate::ICMP_EQ) {
+    return BrClauseKind::True;
+  }
+
+  // If we have an `NE` comparison, the single lane computation happens on
+  // the false successor.
+  if (Pred == CmpInst::Predicate::ICMP_NE) {
+    return BrClauseKind::False;
+  }
+
+  if (!RHS) {
+    return BrClauseKind::None;
+  }
+
+  // If the value we compare against the opencl builtin call is a constant,
+  // determine whether vectorizing is worthwhile based on the chances of
+  // hitting a branch.
+  if (const ConstantInt *Val = dyn_cast<ConstantInt>(RHS)) {
+    // If we have a branch whose condition only applies for at most half of the
+    // simd width, it is not worth vectorizing it.
+    switch (Pred) {
+      default:
+        break;
+      // If we have a `GT` or `GE` comparison, and the constant we compare the
+      // opencl builtin against is greater than half of the simd width, we will
+      // not take the true branch as often as the false branch.
+      case CmpInst::Predicate::ICMP_UGT:
+      case CmpInst::Predicate::ICMP_UGE:
+      case CmpInst::Predicate::ICMP_SGT:
+      case CmpInst::Predicate::ICMP_SGE:
+        if (SimdWidth.isScalable()) {
+          return BrClauseKind::True;
+        } else if (Val->getValue().sgt(SimdWidth.getFixedValue() / 2)) {
+          return BrClauseKind::True;
+        } else if (Val->getValue().slt(SimdWidth.getFixedValue() / 2)) {
+          return BrClauseKind::False;
+        }
+        break;
+      // If we have an `LT` or `LE` comparison, and the constant we compare the
+      // opencl builtin against is smaller than half of the simd width, we will
+      // not take the true branch as often as the false branch.
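+      // For example (illustrative), with a fixed SIMD width of 8, a guard
+      // such as `if (get_global_id(0) < 2)` holds for only a quarter of the
+      // lanes, so the true successor is the rarely-taken path.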
+      case CmpInst::Predicate::ICMP_ULT:
+      case CmpInst::Predicate::ICMP_ULE:
+      case CmpInst::Predicate::ICMP_SLT:
+      case CmpInst::Predicate::ICMP_SLE:
+        if (SimdWidth.isScalable()) {
+          return BrClauseKind::False;
+        } else if (Val->getValue().slt(SimdWidth.getFixedValue() / 2)) {
+          return BrClauseKind::True;
+        } else if (Val->getValue().sgt(SimdWidth.getFixedValue() / 2)) {
+          return BrClauseKind::False;
+        }
+        break;
+    }
+  }
+
+  return BrClauseKind::None;
+}
+
+const Value *Heuristics::shouldVectorizeVisitCmpOperand(
+    const Value *Val, const CmpInst *Cmp,
+    DenseMap<const Value *, const Value *> &Cache) const {
+  const auto It = Cache.find(Val);
+  if (It != Cache.end()) {
+    return It->second;
+  }
+
+  // If we are visiting a binary operator, inspect both its operands.
+  if (const BinaryOperator *BO = dyn_cast<BinaryOperator>(Val)) {
+    const Value *LHS =
+        shouldVectorizeVisitCmpOperand(BO->getOperand(0), Cmp, Cache);
+    const Value *RHS =
+        shouldVectorizeVisitCmpOperand(BO->getOperand(1), Cmp, Cache);
+
+    auto &Result = Cache[Val];
+
+    // If either of LHS and RHS is null and the comparison instruction is not
+    // an equality, Val is not constant and is used in a relational comparison.
+    // We don't want to work with that.
+    if ((!LHS || !RHS) && !Cmp->isEquality()) {
+      return (Result = nullptr);
+    }
+
+    // If the operands of the BinaryOperator are a CallInst and anything else,
+    // we do not want to keep going. We wish to avoid such comparisons:
+    // if ((get_local_id(0) & Constant) == Constant) {}
+    if (dyn_cast_or_null<CallInst>(LHS)) {
+      return (Result = nullptr);
+    }
+    if (dyn_cast_or_null<CallInst>(RHS)) {
+      return (Result = nullptr);
+    }
+
+    // Up to this point, LHS and RHS are either ConstantInt or null.
+    if (LHS) {
+      return (Result = LHS);
+    }
+    return (Result = RHS);
+  }
+
+  // If we are visiting a unary operator, inspect its operand.
+  if (const UnaryInstruction *UI = dyn_cast<UnaryInstruction>(Val)) {
+    return shouldVectorizeVisitCmpOperand(UI->getOperand(0), Cmp, Cache);
+  }
+
+  if (const CallInst *CI = dyn_cast<CallInst>(Val)) {
+    // We only care if the CallInst does involve a call to a work-item builtin.
+    const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+    if (auto B = BI.analyzeBuiltinCall(*CI, SimdDimIdx)) {
+      const auto Uniformity = B->uniformity;
+      if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID ||
+          Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
+        return (Cache[Val] = CI);
+      }
+    }
+  }
+
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+    return (Cache[Val] = CI);
+  }
+
+  return (Cache[Val] = nullptr);
+}
+
+Heuristics::BrClauseKind
+Heuristics::shouldVectorizeVisitCmp(const CmpInst *Cmp) const {
+  // The following two calls return either a CallInst, a ConstantInt, or
+  // nullptr otherwise. If a call returns a CallInst, it necessarily is a call
+  // to get_{global|local}_id, because otherwise we don't care.
+  DenseMap<const Value *, const Value *> Cache;
+  const Value *LHS =
+      shouldVectorizeVisitCmpOperand(Cmp->getOperand(0), Cmp, Cache);
+  const Value *RHS =
+      shouldVectorizeVisitCmpOperand(Cmp->getOperand(1), Cmp, Cache);
+
+  const CmpInst::Predicate pred = Cmp->getPredicate();
+
+  BrClauseKind vectorize = BrClauseKind::None;
+  // The CmpInst may involve two CallInsts, or it may involve only one, but
+  // we don't know on which side it may be.
+  if (llvm::isa_and_nonnull<CallInst>(LHS)) {
+    vectorize = shouldVectorizeVisitCmpOperands(RHS, pred);
+  }
+  if (llvm::isa_and_nonnull<CallInst>(RHS)) {
+    const BrClauseKind RHSStatus = shouldVectorizeVisitCmpOperands(LHS, pred);
+    // This should never happen but in case it does, we want to "void" the
+    // result and vectorize!
+    if (vectorize != BrClauseKind::None && vectorize != RHSStatus) {
+      return BrClauseKind::None;
+    }
+    vectorize = RHSStatus;
+  }
+  return vectorize;
+}
+
+Heuristics::BrClauseKind
+Heuristics::shouldVectorizeVisitBr(const Value *Comp) const {
+  // If we are visiting a binary operator, inspect both its operands to
+  // perhaps find CmpInsts.
+  // E.g.: %and = and ...
+  //       br i1 %and, ...
+  if (const BinaryOperator *BO = dyn_cast<BinaryOperator>(Comp)) {
+    return (static_cast<BrClauseKind>(
+        static_cast<bool>(shouldVectorizeVisitBr(BO->getOperand(0))) &&
+        static_cast<bool>(shouldVectorizeVisitBr(BO->getOperand(1)))));
+  }
+
+  if (const CmpInst *CI = dyn_cast<CmpInst>(Comp)) {
+    return shouldVectorizeVisitCmp(CI);
+  }
+
+  return BrClauseKind::None;
+}
+
+bool Heuristics::shouldVectorize() {
+  BasicBlock &BB = F.getEntryBlock();
+
+  // Weights computed by the kind of instructions.
+  // For the moment, we only consider stores/loads and function calls as being
+  // expensive, without looking at which function is being called
+  // (except for work-item calls).
+  //
+  // Ultimately, it feels like this check should be done at some point during
+  // the vectorization process, so that we have a better overview of how bad
+  // the vectorized kernel is compared to the scalar one.
+  //
+  // We should most likely check only for instructions that have varying
+  // operands.
+  auto getWeight = [this](BasicBlock &B) {
+    unsigned weight = 0;
+    for (Instruction &I : B) {
+      if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
+        weight++;
+      } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+        const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+        if (Function *Callee = CI->getCalledFunction()) {
+          const auto builtin = BI.analyzeBuiltin(*Callee);
+          if (!builtin || !(builtin->properties &
+                            compiler::utils::eBuiltinPropertyWorkItem)) {
+            weight++;
+          }
+        }
+      }
+    }
+    return weight;
+  };
+
+  // If the program is laid out such that it may not be worth vectorizing
+  // based only on the comparison in the entry block, we also have to make
+  // sure that the entry block does not do as much expensive work as its
+  // successors, in which case it might still be worth vectorizing.
+  // We want to check whether the entry block does some computation and stores
+  // the results. Basically, if the kernel looks like:
+  //
+  // __kernel void FuncName(Params) {
+  //   // (1) Do something.
+  //   // (2) Store that something.
+  //   if (get_global_id(0) == 0) {
+  //     // (3) Do something.
+  //   }
+  //   // (4) Do nothing.
+  // }
+  //
+  // then we might still want to vectorize it because (1) might be eligible for
+  // great vectorization improvements.
+  // If (2) is not present in the kernel, then we will probably not want to
+  // vectorize the kernel as (1) will then either be useless or only be used
+  // in (3). The former implies that it will never be used and the latter
+  // implies that it will be used only once per lane, so not worth vectorizing!
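+  //
+  // Concretely, the code below only refuses to vectorize when the
+  // commonly-taken successor is a bare return and the entry block does less
+  // weighted work than the blocks guarded by the rarely-taken branch.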
+  const unsigned entryBlockWeight = getWeight(BB);
+
+  Instruction *TI = BB.getTerminator();
+  if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+    if (BI->isConditional()) {
+      const BrClauseKind clause = shouldVectorizeVisitBr(BI->getCondition());
+      unsigned succWeight = 0;
+      if (clause != BrClauseKind::None) {
+        BasicBlock *start = nullptr;
+        BasicBlock *terminatingBlock = nullptr;
+        if (clause == BrClauseKind::True) {
+          start = BI->getSuccessor(0);
+          terminatingBlock = BI->getSuccessor(1);
+        } else {
+          start = BI->getSuccessor(1);
+          terminatingBlock = BI->getSuccessor(0);
+        }
+        assert(terminatingBlock &&
+               "Failed to get terminating block of branch inst");
+
+        std::unordered_set<BasicBlock *> visited;
+        std::vector<BasicBlock *> worklist{start};
+        visited.insert(start);
+        while (!worklist.empty()) {
+          BasicBlock *cur = worklist.back();
+          worklist.pop_back();
+          succWeight += getWeight(*cur);
+          for (BasicBlock *succ : successors(cur)) {
+            if (succ == terminatingBlock) {
+              continue;
+            }
+            if (visited.insert(succ).second) {
+              worklist.push_back(succ);
+            }
+          }
+        }
+
+        // We don't want to vectorize if the path that will be taken the most
+        // is the exit block of the function and does nothing else but return.
+        if (isa<ReturnInst>(terminatingBlock->getTerminator()) &&
+            (terminatingBlock->size() == 1) &&
+            // Arbitrary limit.
+            (entryBlockWeight < succWeight)) {
+          return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+} // namespace
+
+namespace vecz {
+bool shouldVectorize(llvm::Function &F, VectorizationContext &Ctx,
+                     ElementCount VF, unsigned SimdDimIdx) {
+  Heuristics VH(F, Ctx, VF, SimdDimIdx);
+  return VH.shouldVectorize();
+}
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
new file mode 100644
index 0000000000000..6516d2f593982
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
@@ -0,0 +1,170 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "vectorization_unit.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "debugging.h"
+#include "vectorization_context.h"
+#include "vectorization_helpers.h"
+#include "vecz/vecz_choices.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace vecz;
+using namespace llvm;
+
+VectorizationUnit::VectorizationUnit(Function &F, ElementCount Width,
+                                     unsigned Dimension,
+                                     VectorizationContext &Ctx,
+                                     const VectorizationChoices &Ch)
+    : Ctx(Ctx), Choices(Ch), ScalarFn(&F), VectorizedFn(nullptr),
+      SimdWidth(ElementCount()), LocalSize(0), AutoSimdWidth(false),
+      SimdDimIdx(Dimension), FnFlags(eFunctionNoFlag) {
+  // Gather information about the function's arguments.
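+  // Each argument starts out unvectorized here; setVectorizedFunction()
+  // fills in NewArg and any placeholder once a vectorized function exists.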
+  for (Argument &Arg : F.args()) {
+    VectorizerTargetArgument TargetArg;
+    TargetArg.OldArg = &Arg;
+    TargetArg.NewArg = nullptr;
+    TargetArg.IsVectorized = false;
+    TargetArg.PointerRetPointeeTy = nullptr;
+    TargetArg.Placeholder = nullptr;
+    Arguments.push_back(TargetArg);
+  }
+
+  // Set the desired SIMD width and try to look up the vectorized function.
+  setWidth(Width);
+}
+
+VectorizationUnit::~VectorizationUnit() {}
+
+Function &VectorizationUnit::function() {
+  if (VectorizedFn) {
+    return *VectorizedFn;
+  } else {
+    return *ScalarFn;
+  }
+}
+
+const Function &VectorizationUnit::function() const {
+  if (VectorizedFn) {
+    return *VectorizedFn;
+  } else {
+    return *ScalarFn;
+  }
+}
+
+void VectorizationUnit::setWidth(ElementCount NewWidth) {
+  if (NewWidth == SimdWidth) {
+    return;
+  }
+  SimdWidth = NewWidth;
+
+  // Determine the vectorized function's name and try to look it up.
+  const std::string VectorizedName =
+      getVectorizedFunctionName(ScalarFn->getName(), SimdWidth, Choices);
+  if (VectorizedFn) {
+    VectorizedFn->setName(VectorizedName);
+  } else {
+    setVectorizedFunction(Ctx.module().getFunction(VectorizedName));
+  }
+}
+
+void VectorizationUnit::setScalarFunction(llvm::Function *NewFunction) {
+  if (!NewFunction) {
+    return;
+  }
+  ScalarFn = NewFunction;
+  unsigned i = 0;
+  for (Argument &Arg : NewFunction->args()) {
+    VectorizerTargetArgument &TargetArg = Arguments[i];
+    TargetArg.OldArg = &Arg;
+    i++;
+  }
+}
+
+void VectorizationUnit::setVectorizedFunction(llvm::Function *NewFunction) {
+  VectorizedFn = NewFunction;
+  ArgumentPlaceholders.clear();
+  if (!NewFunction) {
+    for (unsigned i = 0; i < Arguments.size(); i++) {
+      VectorizerTargetArgument &TargetArg = Arguments[i];
+      TargetArg.NewArg = nullptr;
+      TargetArg.Placeholder = nullptr;
+    }
+  } else {
+    unsigned i = 0;
+    for (Argument &Arg : NewFunction->args()) {
+      VectorizerTargetArgument &TargetArg = Arguments[i];
+      TargetArg.NewArg = &Arg;
+
+      Instruction *Placeholder = nullptr;
+      if (TargetArg.IsVectorized && !TargetArg.PointerRetPointeeTy &&
+          !Arg.user_empty()) {
+        // A vectorized argument will be used only by its placeholder
+        // extractelement instruction.
+        Placeholder = cast<Instruction>(*Arg.user_begin());
+      }
+
+      TargetArg.Placeholder = Placeholder;
+      if (Placeholder) {
+        // Mark the extracts to distinguish them from other instructions.
+        ArgumentPlaceholders.insert(Placeholder);
+      }
+      i++;
+    }
+  }
+}
+
+vecz::internal::AnalysisFailResult
+VectorizationUnit::setFailed(const char *remark, const llvm::Function *F,
+                             const llvm::Value *V) {
+  setFlag(eFunctionVectorizationFailed);
+  emitVeczRemarkMissed(F ? F : &function(), V, remark);
+  return vecz::internal::AnalysisFailResult();
+}
+
+VectorizationResult VectorizationUnit::getResult() const {
+  VectorizationResult res;
+  res.func = VectorizedFn;
+
+  for (const VectorizerTargetArgument &TargetArg : Arguments) {
+    Type *pointerRetPointeeTy = nullptr;
+    VectorizationResult::Arg::Kind kind = VectorizationResult::Arg::SCALAR;
+    if (auto *ty = TargetArg.PointerRetPointeeTy) {
+      pointerRetPointeeTy = ty;
+      kind = VectorizationResult::Arg::POINTER_RETURN;
+    } else if (TargetArg.IsVectorized) {
+      kind = VectorizationResult::Arg::VECTORIZED;
+    }
+    res.args.emplace_back(kind, TargetArg.NewArg->getType(),
+                          pointerRetPointeeTy);
+  }
+  return res;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
new file mode 100644
index 0000000000000..a9c44e44b2cd4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
@@ -0,0 +1,363 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "vectorizer.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "analysis/vectorizable_function_analysis.h"
+#include "debugging.h"
+#include "memory_operations.h"
+#include "vectorization_context.h"
+#include "vectorization_helpers.h"
+#include "vectorization_heuristics.h"
+#include "vectorization_unit.h"
+#include "vecz/pass.h"
+#include "vecz/vecz_choices.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+static cl::opt<bool>
+    VeczDumpReport("vecz-dump-report",
+                   cl::desc("report the post-vectorization status"));
+// static cl options allow us to access these options from other cpp files,
+// such as vectorization_unit.cpp
+
+} // namespace
+
+// Statistics
+STATISTIC(VeczSuccess, "Number of kernels successfully vectorized [ID#V80]");
+STATISTIC(VeczFail, "Number of kernels that failed to vectorize [ID#V81]");
+STATISTIC(VeczBail,
+          "Number of kernels where vectorization was not attempted [ID#V82]");
+
+STATISTIC(ScalarInstructions,
+          "Number of instructions in the scalar kernel [ID#V00]");
+STATISTIC(ScalarLoadStores,
+          "Number of loads and stores in the scalar kernel [ID#V01]");
+STATISTIC(ScalarVectorInsts,
+          "Number of vector instructions in the scalar kernel [ID#V02]");
+STATISTIC(ScalarMaxVectorWidth,
+          "The width of the widest vector instruction found in the scalar "
+          "kernel [ID#V13]");
+STATISTIC(VeczInstructions,
+          "Number of instructions in the vectorized kernel [ID#V03]");
+STATISTIC(VeczScalarInstructions,
+          "Number of scalar instructions in the vectorized kernel [ID#V04]");
+STATISTIC(VeczVectorInstructions,
+          "Number of vector instructions in the vectorized kernel [ID#V05]");
+STATISTIC(VeczInsertExtract,
+          "Number of insert/extractelement instructions in the vectorized "
+          "kernel [ID#V06]");
+STATISTIC(VeczSplats,
+          "Number of vector splats in the vectorized kernel [ID#V07]");
+STATISTIC(
+    VeczScalarMemOp,
+    "Number of scalar loads and stores in the vectorized kernel [ID#V0A]");
+STATISTIC(
+    VeczVectorMemOp,
+    "Number of vector loads and stores in the vectorized kernel [ID#V0B]");
+STATISTIC(
+    VeczMaskedMemOps,
+    "Number of masked memory operations in the vectorized kernel [ID#V0C]");
+STATISTIC(VeczInterleavedMemOps,
+          "Number of interleaved memory operations in the vectorized kernel "
+          "[ID#V0D]");
+STATISTIC(VeczMaskedInterleavedMemOps,
+          "Number of masked interleaved memory operations in the vectorized "
+          "kernel [ID#V0E]");
+STATISTIC(VeczScatterGatherMemOps,
+          "Number of scatter/gather memory operations in the vectorized kernel "
+          "[ID#V10]");
+STATISTIC(VeczMaskedScatterGatherMemOps,
+          "Number of masked scatter/gather operations in the vectorized "
+          "kernel [ID#V11]");
+STATISTIC(VeczVectorWidth, "Vector width of the vectorized kernel [ID#V12]");
+STATISTIC(Ratio, "Normalized ratio of theoretical speedup [ID#V13]");
+
+namespace {
+/// @brief Calculate vectorization related statistics from the kernels
+///
+/// @param[in] VU The Vectorization Unit we are working on
+/// @param[in] Scalar The scalar function that we have vectorized
+/// @param[in] Vectorized The vectorized version of the scalar function
+void collectStatistics(VectorizationUnit &VU, Function *Scalar,
+                       Function *Vectorized) {
+  // Do not gather statistics if we failed to vectorize, if we're doing
+  // scalable vectorization, or if statistics aren't enabled in the first
+  // place.
+  if (!Scalar || !Vectorized || !AreStatisticsEnabled() ||
+      VU.width().isScalable()) {
+    return;
+  }
+
+  VeczVectorWidth = VU.width().getFixedValue();
+
+  // Function to check if an instruction is a vector instruction or not
+  auto isVectorInst = [](Instruction &I) -> bool {
+    Type *Ty = I.getType();
+
+    // Insert/extractelement are not really vector instructions
+    if (isa<InsertElementInst>(I) || isa<ExtractElementInst>(I)) {
+      return false;
+    }
+    // Instructions that return a vector
+    if (isa<VectorType>(Ty)) {
+      return true;
+    }
+    // Store instructions that store a vector value
+    if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+      auto *ValOp = SI->getValueOperand();
+      assert(ValOp && "Could not get value operand");
+      return isa<VectorType>(ValOp->getType());
+    }
+    // Internal builtins that work on vectors. This is relevant for stores
+    // only, as loads return a vector type and will be caught earlier on.
+    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+      if (auto Op = MemOp::get(CI)) {
+        // With the exception of masked loads and stores, every other internal
+        // builtin works with vectors
+        if (!Op->isMaskedMemOp()) {
+          return true;
+        }
+        // Masked loads are handled earlier on as they return a vector type.
+        // We need to check if masked stores are storing vectors or not.
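+        // For example, a masked store of a <4 x i32> value counts as a
+        // vector instruction, while a masked store of a plain i32 does not.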
+        if (Op->isStore() && isa<VectorType>(Op->getDataType())) {
+          return true;
+        }
+      }
+    }
+
+    return false;
+  };
+
+  unsigned MaxScalarVectorWidth = 0;
+  // Collect the scalar kernel's statistics
+  for (auto &BB : *Scalar) {
+    for (auto &I : BB) {
+      ++ScalarInstructions;
+      ScalarLoadStores += (isa<LoadInst>(I) || isa<StoreInst>(I));
+      ScalarVectorInsts += isVectorInst(I);
+      // Find out how wide the widest vector used in the scalar kernel is
+      if (auto *VecTy = dyn_cast<FixedVectorType>(I.getType())) {
+        MaxScalarVectorWidth =
+            std::max(VecTy->getNumElements(), MaxScalarVectorWidth);
+      }
+    }
+  }
+  ScalarMaxVectorWidth = MaxScalarVectorWidth;
+
+  // Collect the vectorized kernel's statistics
+  for (auto &BB : *Vectorized) {
+    for (auto &I : BB) {
+      // Count instructions
+      ++VeczInstructions;
+
+      // Detect vector splats
+      // Count insert/extractelement instructions
+      if (isa<InsertElementInst>(I) || isa<ExtractElementInst>(I)) {
+        if (I.getName().starts_with(".splatinsert")) {
+          ++VeczSplats;
+        }
+        ++VeczInsertExtract;
+      }
+
+      // Count vector and scalar instructions
+      if (isVectorInst(I)) {
+        ++VeczVectorInstructions;
+      } else {
+        ++VeczScalarInstructions;
+      }
+
+      // Count memory operation types
+      if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+        // Normal scalar/vector loads and stores
+        if (isVectorInst(I)) {
+          ++VeczVectorMemOp;
+        } else {
+          ++VeczScalarMemOp;
+        }
+      } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+        Function *F = CI->getCalledFunction();
+        if (!F) {
+          continue;
+        }
+        // Subtract 1 for the call instruction, since we are inlining
+        --VeczInstructions;
+
+        for (auto &BB : *F) {
+          for (auto &Inst : BB) {
+            VeczInstructions += !isa<ReturnInst>(&Inst);
+          }
+        }
+        // Internal builtin memory operations
+        if (auto Op = MemOp::get(&I)) {
+          VeczMaskedMemOps += Op->isMaskedMemOp();
+          VeczInterleavedMemOps += Op->getDesc().isInterleavedMemOp();
+          VeczMaskedInterleavedMemOps += Op->isMaskedInterleavedMemOp();
+          VeczScatterGatherMemOps += Op->getDesc().isScatterGatherMemOp();
+          VeczMaskedScatterGatherMemOps += Op->isMaskedScatterGatherMemOp();
+        }
+      }
+    }
+  }
+
+  // Ratio = Normalized Scalar Insts / Vector Insts
+  // Normalized Scalar Insts = Simd Width * Scalar Insts
+  // IK - Input Kernel
+  // Scalar Insts = IK's Scalar Insts + IK's Vec Insts * IK's VecWidth
+  const unsigned SimdWidth = VU.width().getFixedValue();
+  Ratio = (SimdWidth * (ScalarInstructions - ScalarVectorInsts +
+                        ScalarVectorInsts * MaxScalarVectorWidth)) /
+          VeczInstructions;
+}
+} // namespace
+
+VectorizationUnit *vecz::createVectorizationUnit(VectorizationContext &Ctx,
+                                                 Function *Kernel,
+                                                 const VeczPassOptions &Opts,
+                                                 FunctionAnalysisManager &FAM,
+                                                 bool Check) {
+  const unsigned SimdDimIdx = Opts.vec_dim_idx;
+  const unsigned LocalSize = Opts.local_size;
+  const bool Auto = Opts.vecz_auto;
+  auto VF = Opts.factor;
+
+  if (!Kernel || VF.isScalar()) {
+    ++VeczBail;
+    VECZ_FAIL();
+  }
+
+  // Up to MAX_SIMD_DIM supported dimensions
+  VECZ_ERROR_IF(SimdDimIdx >= MAX_SIMD_DIM,
+                "Specified vectorization dimension is invalid");
+
+  VECZ_ERROR_IF(VF.getKnownMinValue() == 0, "Vectorization factor of zero");
+
+  // Adjust VF if the local size is known, to vectorize more often.
+  if (LocalSize && !VF.isScalable()) {
+    // If we know the vectorized loop will never be entered, because the
+    // vectorization factor is too large, then vectorizing is a waste of time.
+    // It is better instead to vectorize by a smaller factor. Keep on halving
+    // the vector width until a usable value is found (worst case this value
+    // will be 1, because that evenly divides everything).
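+    // For example (illustrative), a requested width of 8 with a known local
+    // size of 6 halves to 4 and we vectorize by 4; with a local size of 1
+    // the width halves all the way down to 1 and we bail out below.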
+    unsigned FixedSimdWidth = VF.getFixedValue();
+    // Note FixedSimdWidth is either a power of two or 3. If FixedSimdWidth
+    // was 1 then we would not enter the body of the loop (as X%1 is 0 for all
+    // X), if FixedSimdWidth is a greater power of two then dividing it by 2
+    // gives another power of two, 3 divided by 2 gives 1, a power of two. Thus
+    // if this loop runs at least once then FixedSimdWidth will be a power of
+    // 2.
+    assert(FixedSimdWidth == 3 || llvm::isPowerOf2_32(FixedSimdWidth));
+    while (FixedSimdWidth != 1 && FixedSimdWidth > LocalSize) {
+      FixedSimdWidth /= 2;
+      assert(FixedSimdWidth > 0 && "Cannot vectorize (or modulo) by 0.");
+    }
+    if (FixedSimdWidth == 1) {
+      ++VeczBail;
+      emitVeczRemarkMissed(Kernel, nullptr,
+                           "requested Vectorization factor of 1");
+      return nullptr;
+    }
+    VF = ElementCount::get(FixedSimdWidth, false);
+  }
+
+  bool canVectorize = true;
+  if (Check) {
+    auto Res = FAM.getResult<VectorizableFunctionAnalysis>(*Kernel);
+    canVectorize = Res.canVectorize;
+  }
+
+  if (canVectorize &&
+      (!Auto || shouldVectorize(*Kernel, Ctx, VF, SimdDimIdx))) {
+    auto VU =
+        Ctx.createVectorizationUnit(*Kernel, VF, SimdDimIdx, Opts.choices);
+    VU->setAutoWidth(Auto);
+    VU->setLocalSize(Opts.local_size);
+    return VU;
+  }
+  return nullptr;
+}
+
+void vecz::trackVeczSuccessFailure(VectorizationUnit &VU) {
+  Function *Fn = VU.scalarFunction();
+  Function *vectorizedFn = VU.vectorizedFunction();
+  const bool failed = VU.failed();
+  VeczFail += failed;
+  VeczSuccess += !failed;
+  collectStatistics(VU, Fn, vectorizedFn);
+
+  if (VeczDumpReport) {
+    const auto VF = VU.width();
+    auto FnName = Fn->getName();
+    if (vectorizedFn) {
+      errs() << "vecz: Vectorization succeeded for kernel '" << FnName
+             << "' (" << (VF.isScalable() ? "scalable-vector" : "SIMD")
+             << " factor: " << VF.getKnownMinValue() << ") "
+             << *vectorizedFn->getType() << "\n";
+    } else {
+      errs() << "vecz: Vectorization failed for kernel '" << FnName << "'\n";
+    }
+  }
+}
+
+bool vecz::createVectorizedFunctionMetadata(VectorizationUnit &vu) {
+  Function *fn = vu.scalarFunction();
+  Function *vectorizedFn = vu.vectorizedFunction();
+  if (vu.failed()) {
+    vectorizedFn = nullptr;
+  } else {
+    // If vectorization succeeded, clone the OpenCL related metadata from the
+    // scalar kernel. We do not do this while cloning the kernel because if
+    // vectorization fails we will have metadata pointing to non-existing
+    // kernels.
+    cloneOpenCLMetadata(vu);
+  }
+  const auto vf = vu.width();
+  const auto dim = vu.dimension();
+
+  // emit output metadata based on vectorization result
+  auto finalVF = vf;
+
+  const compiler::utils::VectorizationInfo info{
+      finalVF, dim, vu.choices().vectorPredication()};
+
+  if (vectorizedFn && vectorizedFn != fn) { // success
+    // Link the original function to the vectorized one.
+    compiler::utils::linkOrigToVeczFnMetadata(*fn, *vectorizedFn, info);
+
+    // Link the vectorized function back to the original one.
+ compiler::utils::linkVeczToOrigFnMetadata(*vectorizedFn, *fn, info); + } else { // fail or bail + compiler::utils::encodeVectorizationFailedMetadata(*fn, info); + } + return vectorizedFn; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp new file mode 100644 index 0000000000000..bcbeabbf9766b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp @@ -0,0 +1,289 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "vecz_pass_builder.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "analysis/control_flow_analysis.h" +#include "analysis/divergence_analysis.h" +#include "analysis/liveness_analysis.h" +#include "analysis/packetization_analysis.h" +#include "analysis/simd_width_analysis.h" +#include "analysis/stride_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "analysis/vectorizable_function_analysis.h" +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "multi_llvm/llvm_version.h" +#include "transform/common_gep_elimination_pass.h" +#include "transform/control_flow_conversion_pass.h" +#include "transform/inline_post_vectorization_pass.h" +#include "transform/interleaved_group_combine_pass.h" +#include "transform/packetization_helpers.h" +#include "transform/packetization_pass.h" +#include "transform/passes.h" +#include "transform/scalarization_pass.h" +#include "transform/ternary_transform_pass.h" + +#define DEBUG_TYPE "vecz" +using namespace llvm; +using namespace vecz; + +VeczPassMachinery::VeczPassMachinery( + llvm::LLVMContext &llvmCtx, llvm::TargetMachine *TM, + VectorizationContext &Ctx, bool verifyEach, + compiler::utils::DebugLogging debugLogLevel) + : compiler::utils::PassMachinery(llvmCtx, TM, verifyEach, debugLogLevel), + Ctx(Ctx) {} + +void VeczPassMachinery::registerPasses() { + // Register standard passes + compiler::utils::PassMachinery::registerPasses(); + + FAM.registerPass([&] { return VectorizationContextAnalysis(Ctx); }); + FAM.registerPass([&] { return VectorizationUnitAnalysis(Ctx); }); + FAM.registerPass([&] { return VectorizableFunctionAnalysis(); }); + FAM.registerPass([] { return StrideAnalysis(); }); + FAM.registerPass([] { return UniformValueAnalysis(); }); + FAM.registerPass([] { return LivenessAnalysis(); }); + FAM.registerPass([] { return PacketizationAnalysis(); }); + FAM.registerPass([] { return CFGAnalysis(); }); + 
FAM.registerPass([] { return DivergenceAnalysis(); });
+
+  if (!TM) {
+    FAM.registerPass([] { return TargetIRAnalysis(); });
+  } else {
+    FAM.registerPass(
+        [this] { return TargetIRAnalysis(TM->getTargetIRAnalysis()); });
+    FAM.registerPass([] { return SimdWidthAnalysis(); });
+  }
+}
+
+void VeczPassMachinery::addClassToPassNames() {
+  {
+#define MODULE_PASS(NAME, CREATE_PASS) \
+  PIC.addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
+#define FUNCTION_PASS(NAME, CREATE_PASS) \
+  PIC.addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
+#define LOOP_PASS(NAME, CREATE_PASS) \
+  PIC.addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
+#include "passes.def"
+  }
+
+  // Register a callback which skips all passes once we've failed to vectorize
+  // a function.
+  PIC.registerShouldRunOptionalPassCallback([&](StringRef, llvm::Any IR) {
+    const Function *const *FPtr = any_cast<const Function *>(&IR);
+    const Function *F = FPtr ? *FPtr : nullptr;
+    if (!F) {
+      if (const auto *const *L = any_cast<const Loop *>(&IR)) {
+        F = (*L)->getHeader()->getParent();
+      } else {
+        // Always run module passes
+        return true;
+      }
+    }
+    // FIXME: This is repeating the job of the VectorizationUnitAnalysis.
+    // We should track 'failure' more directly in the
+    // Function/VectorizationContext?
+    const auto *const VU = Ctx.getActiveVU(F);
+    if (!VU) {
+      // Don't run on anything without a VU since it's not currently being
+      // vectorized.
+      return false;
+    }
+    return !VU->failed();
+  });
+}
+
+void VeczPassMachinery::registerPassCallbacks() {
+  // Add a backwards-compatible way of supporting simplifycfg, which used
+  // to be called simplify-cfg before LLVM 12.
+  PB.registerPipelineParsingCallback(
+      [](StringRef Name, ModulePassManager &PM,
+         ArrayRef<PassBuilder::PipelineElement>) {
+#define MODULE_PASS(NAME, CREATE_PASS) \
+  if (Name == NAME) { \
+    PM.addPass(CREATE_PASS); \
+    return true; \
+  }
+#define FUNCTION_PASS(NAME, CREATE_PASS) \
+  if (Name == NAME) { \
+    PM.addPass(createModuleToFunctionPassAdaptor(CREATE_PASS)); \
+    return true; \
+  }
+#define LOOP_PASS(NAME, CREATE_PASS) \
+  if (Name == NAME) { \
+    PM.addPass(createModuleToFunctionPassAdaptor( \
+        createFunctionToLoopPassAdaptor(CREATE_PASS))); \
+    return true; \
+  }
+#include "passes.def"
+        return false;
+      });
+}
+
+bool vecz::buildPassPipeline(ModulePassManager &PM) {
+  // Preparation passes
+  PM.addPass(BuiltinInliningPass());
+  {
+    FunctionPassManager FPM;
+    // Lower switches after builtin inlining, in case the builtins had
+    // switches.
+    FPM.addPass(LowerSwitchPass());
+    FPM.addPass(FixIrreduciblePass());
+
+    // It's helpful to run SROA in case it opens up more opportunities to
+    // eliminate aggregates in (particularly SYCL) kernels. This is especially
+    // true after inlining - which we've (usually) just performed in the
+    // BuiltinInliningPass - because otherwise SROA is unable to analyze the
+    // lifetime of allocas due to them being "escaped" by the function callee.
+    FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
+    // We have to run LLVM's Mem2Reg pass in case the front end didn't. Note
+    // that SROA usually runs Mem2Reg internally (unless disabled via a
+    // command-line option) though using its own heuristic. We run it
+    // unconditionally regardless, just for good measure.
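+    // (PromotePass is the new pass manager's spelling of Mem2Reg.)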
+    FPM.addPass(PromotePass());
+    // LLVM's own Mem2Reg pass doesn't always get everything
+    FPM.addPass(BasicMem2RegPass());
+
+    FPM.addPass(InstCombinePass());
+    FPM.addPass(AggressiveInstCombinePass());
+    FPM.addPass(DCEPass());
+    FPM.addPass(PreLinearizePass());
+    // If pre-linearization created any unnecessary Hoist Guards,
+    // the Instruction Combining pass will handily clean them up.
+    FPM.addPass(InstCombinePass());
+    FPM.addPass(SimplifyCFGPass());
+    FPM.addPass(DCEPass());
+    FPM.addPass(UnifyFunctionExitNodesPass());
+    FPM.addPass(LoopSimplifyPass());
+    // Lower switches again because CFG simplification can create them.
+    FPM.addPass(LowerSwitchPass());
+    {
+      LoopPassManager LPM;
+      LPM.addPass(VeczLoopRotatePass());
+      // IndVarSimplify can create a lot of duplicate instructions when there
+      // are unrolled loops. EarlyCSE is there to clear them up. However,
+      // this can destroy LCSSA form, so we need to restore it.
+      LPM.addPass(IndVarSimplifyPass());
+      FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));
+    }
+
+    FPM.addPass(EarlyCSEPass());
+    // We run this last because EarlyCSE can actually create infinite loops
+    // (with a "conditional" branch on true)
+    FPM.addPass(createFunctionToLoopPassAdaptor(SimplifyInfiniteLoopPass()));
+
+    FPM.addPass(RemoveIntPtrPass());
+    FPM.addPass(SquashSmallVectorsPass());
+    FPM.addPass(UniformReassociationPass());
+    FPM.addPass(TernaryTransformPass());
+
+    FPM.addPass(BreakCriticalEdgesPass());
+    FPM.addPass(LCSSAPass());
+    FPM.addPass(ControlFlowConversionPass());
+
+    PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+  }
+
+  // Verify that the preparation passes (particularly control-flow conversion)
+  // have left the module in a correct state.
+  PM.addPass(VerifierPass());
+
+  {
+    FunctionPassManager FPM;
+
+    FPM.addPass(DivergenceCleanupPass());
+
+    FPM.addPass(CommonGEPEliminationPass());
+    FPM.addPass(ScalarizationPass());
+
+    FPM.addPass(AggressiveInstCombinePass());
+    FPM.addPass(ADCEPass());
+    FPM.addPass(SimplifyCFGPass());
+    FPM.addPass(SimplifyMaskedMemOpsPass());
+
+    // Having multiple GEP instructions that perform the same operation
+    // greatly amplifies the code generated by the packetizer, as it duplicates
+    // the number of extractelement instructions, so we want to remove what
+    // is unnecessary.
+    FPM.addPass(CommonGEPEliminationPass());
+
+    // The packetizer - the 'main' bit of the vectorization process.
+    FPM.addPass(PacketizationPass());
+
+    FPM.addPass(InlinePostVectorizationPass());
+    FPM.addPass(FlattenCFGPass());
+    FPM.addPass(GVNPass(GVNOptions().setMemDep(true)));
+    FPM.addPass(AggressiveInstCombinePass());
+    FPM.addPass(ADCEPass());
+    FPM.addPass(SinkingPass());
+    FPM.addPass(SimplifyCFGPass());
+    FPM.addPass(AggressiveInstCombinePass());
+
+    FPM.addPass(InterleavedGroupCombinePass(eInterleavedStore));
+    FPM.addPass(InterleavedGroupCombinePass(eInterleavedLoad));
+    FPM.addPass(InstCombinePass());
+    FPM.addPass(InferAlignmentPass());
+    FPM.addPass(DCEPass());
+    FPM.addPass(SimplifyMaskedMemOpsPass());
+
+    PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+  }
+
+  PM.addPass(DefineInternalBuiltinsPass());
+  PM.addPass(VerifierPass());
+
+  return true;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/CMakeLists.txt
new file mode 100644
index 0000000000000..b47f8f35b3df2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/lit)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/CMakeLists.txt
new file mode 100644
index 0000000000000..7f67eb3a1a873
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/CMakeLists.txt
@@ -0,0 +1,26 @@
+configure_lit_site_cfg(
+  ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
+  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
+  MAIN_CONFIG
+  ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
+  PATHS
+  "CMAKE_OSX_SYSROOT"
+  "LLVM_SOURCE_DIR"
+  "LLVM_BINARY_DIR"
+  "LLVM_TOOLS_DIR"
+  "LLVM_LIBS_DIR"
+  "SHLIBDIR"
+  )
+
+# TODO: Consider adding to check-sycl if this is ever moved under llvm/tests.
+# Add a target to invoke tests via Ninja/make.
+add_lit_testsuite(check-sycl-vecz-tests "Running SYCL vecz lit tests"
+  "${CMAKE_CURRENT_BINARY_DIR}"
+
+  DEPENDS
+  veczc
+  FileCheck
+)
+
+add_custom_target(check-sycl-vecz)
+add_dependencies(check-sycl-vecz check-sycl-vecz-tests)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.cfg.py b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.cfg.py
new file mode 100644
index 0000000000000..0c0a2590b6274
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.cfg.py
@@ -0,0 +1,37 @@
+# Copyright (C) Codeplay Software Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+# Exceptions; you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Python configuration file for lit."""
+
+import os
+import lit.formats
+from lit.llvm import llvm_config
+
+
+# Name of the test suite.
+config.name = "LLVM"
+
+# File extensions for testing.
+config.suffixes = [".hlsl", ".ll"]
+
+# The test format used to interpret tests.
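+# ShTest with execute_external=False runs RUN lines with lit's internal
+# shell rather than an external shell.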
+config.test_format = lit.formats.ShTest(execute_external=False) + +# The root path where tests are located. +config.test_source_root = os.path.dirname(__file__) + +llvm_config.with_environment( + "PATH", os.path.abspath(config.llvm_tools_dir), append_path=True +) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.site.cfg.py.in b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.site.cfg.py.in new file mode 100644 index 0000000000000..785ee42143601 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.site.cfg.py.in @@ -0,0 +1,22 @@ +"""Python configuration file for lit.""" + +@LIT_SITE_CFG_IN_HEADER@ + +import os +from lit.llvm.subst import ToolSubst +from lit.llvm import llvm_config + +config.test_exec_root = r"@CURRENT_BINARY_DIR@" + +# Paths to helper utilities +config.tools = [ ToolSubst('veczc') ] + +config.targets = frozenset('@LLVM_TARGETS_TO_BUILD@'.split(';')) + +config.llvm_tools_dir = lit_config.substitute(path(r"@LLVM_TOOLS_DIR@")) + +import lit.llvm +lit.llvm.initialize(lit_config, config) + +# Let the main config do the real work. +lit_config.load_config(config, os.path.join('@CMAKE_CURRENT_SOURCE_DIR@', "lit.cfg.py")) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/lit.local.cfg b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/lit.local.cfg new file mode 100644 index 0000000000000..13f31884ad10f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/lit.local.cfg @@ -0,0 +1,18 @@ +# Copyright (C) Codeplay Software Limited +# +# Licensed under the Apache License, Version 2.0 (the "License") with LLVM +# Exceptions; you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +if not 'AArch64' in config.root.targets: + config.unsupported = True diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll new file mode 100644 index 0000000000000..4a73b10725a00 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll @@ -0,0 +1,55 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %idx.ext = sext i32 %mul3 to i64 + %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idx.ext + %0 = load i32, i32 addrspace(1)* %add.ptr, align 4 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr, i64 1 + %1 = load i32, i32 addrspace(1)* %arrayidx4, align 4 + %add5 = add nsw i32 %1, %0 + %idxprom = sext i32 %add to i64 + %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %add5, i32 addrspace(1)* %arrayidx8, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: define {{(dso_local )?}}spir_kernel void @load16 +; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1 +; CHECK-NOT: load <4 x i32> +; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2 +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 0 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 1 +; CHECK-NOT: extractvalue +; CHECK-NOT: shufflevector +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll new file mode 100644 index 0000000000000..fc0cc97549baf --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %conv4 = sext i32 %mul3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %conv4 + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %add5 = or i64 %conv4, 1 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %add5 + %1 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %add7 = add nsw i32 %1, %0 + %idxprom = sext i32 %add to i64 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %add7, i32 addrspace(1)* %arrayidx10, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: define {{(dso_local )?}}spir_kernel void @load16 +; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1 +; CHECK-NOT: load <4 x i32> +; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2 +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 0 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 1 +; CHECK-NOT: extractvalue +; CHECK-NOT: shufflevector +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll new file mode 100644 index 0000000000000..f000efae816a6 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %idxprom = sext i32 %mul3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %add7 = or i32 %mul3, 1 + %idxprom8 = sext i32 %add7 to i64 + %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom8 + %1 = load i32, i32 addrspace(1)* %arrayidx9, align 4 + %add10 = add nsw i32 %1, %0 + %idxprom13 = sext i32 %add to i64 + %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom13 + store i32 %add10, i32 addrspace(1)* %arrayidx14, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: define {{(dso_local )?}}spir_kernel void @load16 +; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1 +; CHECK-NOT: load <4 x i32> +; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2 +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 0 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 1 +; CHECK-NOT: extractvalue +; CHECK-NOT: shufflevector +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll new file mode 100644 index 0000000000000..82c8454716a5f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %add4 = or i32 %mul3, 1 + %idxprom = sext i32 %add4 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %idxprom8 = sext i32 %mul3 to i64 + %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom8 + %1 = load i32, i32 addrspace(1)* %arrayidx9, align 4 + %sub = sub nsw i32 %0, %1 + %idxprom12 = sext i32 %add to i64 + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom12 + store i32 %sub, i32 addrspace(1)* %arrayidx13, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: define {{(dso_local )?}}spir_kernel void @load16 +; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1 +; CHECK-NOT: load <4 x i32> +; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2 +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 0 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 1 +; CHECK-NOT: extractvalue +; CHECK-NOT: shufflevector +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll new file mode 100644 index 0000000000000..cd0d380e50e54 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll @@ -0,0 +1,69 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %idxprom = sext i32 %mul3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %add7 = or i32 %mul3, 1 + %idxprom8 = sext i32 %add7 to i64 + %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom8 + %1 = load i32, i32 addrspace(1)* %arrayidx9, align 4 + %add13 = add nsw i32 %mul3, 2 + %idxprom14 = sext i32 %add13 to i64 + %arrayidx15 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom14 + %2 = load i32, i32 addrspace(1)* %arrayidx15, align 4 + %add19 = add nsw i32 %mul3, 3 + %idxprom20 = sext i32 %add19 to i64 + %arrayidx21 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom20 + %3 = load i32, i32 addrspace(1)* %arrayidx21, align 4 + %add22 = add nsw i32 %1, %0 + %add23 = add nsw i32 %add22, %2 + %add24 = add nsw i32 %add23, %3 + %idxprom27 = sext i32 %add to i64 + %arrayidx28 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom27 + store i32 %add24, i32 addrspace(1)* %arrayidx28, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: define {{(dso_local )?}}spir_kernel void @load16 +; CHECK: [[LOAD1:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD1]], 0 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD1]], 1 +; CHECK: [[LOAD2:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD2]], 0 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD2]], 1 +; CHECK-NOT: load <4 x i8> +; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2 +; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load +; CHECK-NOT: shufflevector <4 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll new file mode 100644 index 0000000000000..b6327e55775cd --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll @@ -0,0 +1,59 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %add4 = add nsw i32 %mul3, 3 + %idxprom = sext i32 %add4 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %shl = shl i32 %0, 1 + %add8 = add nsw i32 %mul3, 2 + %idxprom9 = sext i32 %add8 to i64 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9 + %1 = load i32, i32 addrspace(1)* %arrayidx10, align 4 + %sub = sub nsw i32 %shl, %1 + %idxprom13 = sext i32 %add to i64 + %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom13 + store i32 %sub, i32 addrspace(1)* %arrayidx14, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: define {{(dso_local )?}}spir_kernel void @load16 +; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1 +; CHECK-NOT: load <4 x i32> +; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2 +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 0 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 1 +; CHECK-NOT: extractvalue +; CHECK-NOT: shufflevector +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll new file mode 100644 index 0000000000000..b6b34d30c45fc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll @@ -0,0 +1,150 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k boscc_killer -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_local_size(i32)
+
+@boscc_killer.shared = internal unnamed_addr addrspace(3) global i32 poison, align 4
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @boscc_killer(float addrspace(1)* %A, float addrspace(1)* %B, i32 %N, i32 %lda) {
+entry:
+  %gid0 = tail call i64 @__mux_get_local_id(i32 0)
+  %cmp0 = icmp eq i64 %gid0, 0
+  br i1 %cmp0, label %if.then, label %if.end
+
+if.then:                                          ; preds = %if.end24
+  store i32 %N, i32 addrspace(3)* @boscc_killer.shared, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %for.end, %if.end24
+  %ldl.a = load i32, i32 addrspace(3)* @boscc_killer.shared, align 4
+  %ldl.b = trunc i64 %gid0 to i32
+  %ldl = add i32 %ldl.a, %ldl.b
+  %cmp1 = icmp eq i32 %ldl, 0
+  br i1 %cmp1, label %if.then2, label %if.else
+
+if.else:                                          ; preds = %if.end
+  %cmp2 = icmp slt i32 %ldl, %N
+  br i1 %cmp2, label %for.body, label %exit
+
+for.body:                                         ; preds = %for.inc, %if.end227
+  %acc = phi i32 [ %update2, %for.inc ], [ 1, %if.else ]
+  %acc_shl = shl nuw nsw i32 %acc, 2
+  %update = add i32 %ldl, %acc_shl
+  %cmp3 = icmp slt i32 %update, %ldl
+  br i1 %cmp3, label %for.if.then, label %for.inc
+
+for.if.then:                                      ; preds = %for.body
+  %mul297.us = mul nsw i32 %update, %lda
+  %add298.us = add nsw i32 %mul297.us, %ldl
+  %idxprom299.us = sext i32 %add298.us to i64
+  %arrayidx300.us = getelementptr inbounds float, float addrspace(1)* %A, i64 %idxprom299.us
+  store float zeroinitializer, float addrspace(1)* %arrayidx300.us, align 16
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.if.then, %for.body
+  %update2 = add nuw nsw i32 %acc, 1
+  %cmp4 = icmp ult i32 %acc, 4
+  br i1 %cmp4, label %for.body, label %exit
+
+if.then2:                                         ; preds = %if.end
+  %gid0_trunc = trunc i64 %gid0 to i32
+  %cmp5 = icmp sgt i32 %ldl, %gid0_trunc
+  br i1 %cmp5, label %if.then3, label %exit
+
+if.then3:                                         ; preds = %for.cond.exit, %if.then53
+  %arrayidxB = getelementptr inbounds float, float addrspace(1)* %B, i64 %gid0
+  %v23 = load float, float addrspace(1)* %arrayidxB, align 16
+  %arrayidxA = getelementptr inbounds float, float addrspace(1)* %A, i64 %gid0
+  store float %v23, float addrspace(1)* %arrayidxA, align 16
+  %call149 = tail call i64 @__mux_get_local_size(i32 0) #6
+  %conv152 = add i64 %call149, %gid0
+  %cmp71 = icmp slt i64 %conv152, 0
+  br label %exit
+
+exit:                                             ; preds = %for.inc, %if.end227, %for.cond.exit, %if.then53, %entry
+  ret void
+}
+
+; We mostly want to check that vectorization succeeded, since this CFG crashed
+; the block ordering algorithm. It does not seem easy to create a UnitCL test
+; for this, because the CFG gets changed into something that no longer causes
+; the crash; the bug was identified from an Ecosystem failure, however, so it
+; must be possible somehow.
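+; Note: with the LinearizeBOSCC choice, cfg-convert keeps a uniform clone of
+; each divergent region (the .uniform blocks below) and inserts .boscc_indir
+; blocks that branch to the predicated clone only when the lanes actually
+; diverge; the checks below pin down that dual-path CFG.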
+; +; CHECK: spir_kernel void @__vecz_v4_boscc_killer +; CHECK: entry: +; CHECK: br i1 %{{.+}}, label %if.then.uniform, label %entry.boscc_indir +; CHECK: if.then.uniform: +; CHECK: br label %if.end +; CHECK: entry.boscc_indir: +; CHECK: br i1 %{{.+}}, label %if.end, label %if.then +; CHECK: if.then: +; CHECK: br label %if.end +; CHECK: if.end: +; CHECK: br i1 %{{.+}}, label %if.then2.uniform, label %if.end.boscc_indir +; CHECK: if.else.uniform: +; CHECK: br i1 %{{.+}}, label %for.body.preheader.uniform, label %if.else.uniform.boscc_indir +; CHECK: for.body.preheader.uniform: +; CHECK: br label %for.body.uniform +; CHECK: if.else.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %exit, label %for.body.preheader +; CHECK: for.body.uniform: +; CHECK: br i1 %{{.+}}, label %for.if.then.uniform, label %for.body.uniform.boscc_indir +; CHECK: for.if.then.uniform: +; CHECK: br label %for.inc.uniform +; CHECK: for.body.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %for.inc.uniform, label %for.body.uniform.boscc_store +; CHECK: for.body.uniform.boscc_store: +; CHECK: br label %for.if.then +; CHECK: for.inc.uniform: +; CHECK: br i1 %{{.+}}, label %for.body.uniform, label %exit.loopexit.uniform +; CHECK: exit.loopexit.uniform: +; CHECK: br label %exit +; CHECK: if.then2.uniform: +; CHECK: br i1 %{{.+}}, label %if.then3.uniform, label %if.then2.uniform.boscc_indir +; CHECK: if.end.boscc_indir: +; CHECK: br i1 %{{.+}}, label %if.else.uniform, label %if.else +; CHECK: if.then3.uniform: +; CHECK: br label %exit +; CHECK: if.then2.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %exit, label %if.then3 +; CHECK: if.else: +; CHECK: br label %for.body.preheader +; CHECK: for.body.preheader: +; CHECK: br label %for.body +; CHECK: for.body: +; CHECK: br label %for.if.then +; CHECK: for.if.then: +; CHECK: br label %for.inc +; CHECK: for.inc: +; CHECK: br i1 %{{.+}}, label %for.body, label %exit.loopexit +; CHECK: if.then2: +; CHECK: br label %if.then3 +; CHECK: if.then3: +; CHECK: br label %exit +; CHECK: exit.loopexit: +; CHECK: br label %if.then2 +; CHECK: exit: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll new file mode 100644 index 0000000000000..014f19594e2b0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll @@ -0,0 +1,303 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k boscc_merge -vecz-passes="function(instcombine),function(simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) #0 +declare i64 @__mux_get_local_size(i32) #0 + +define spir_kernel void @boscc_merge(i32 %n, float addrspace(1)* %out, i64 %x) { +entry: + %lid = tail call i64 @__mux_get_local_id(i32 0) + %lsize = tail call i64 @__mux_get_local_size(i32 0) + %out_ptr = getelementptr inbounds float, float addrspace(1)* %out, i64 %x + %lid_sum_lsize = add i64 %lid, %lsize + %cmp1 = icmp ult i64 %lsize, %x + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %entry + %cmp2 = icmp ult i64 %lid, %x + br i1 %cmp2, label %if.then2, label %if.else2.preheader + +if.else2.preheader: ; preds = %if.then + store float 0.000000e+00, float addrspace(1)* %out_ptr, align 4 ; just so it's non-trivial for BOSCC + br label %if.else2 + +if.then2: ; preds = %if.then + %cmp3 = icmp ugt i64 %lsize, %x + br i1 %cmp3, label %if.then3.preheader, label %if.else3.preheader + +if.else3.preheader: ; preds = %if.then2 + br label %if.else3 + +if.then3.preheader: ; preds = %if.then2 + br label %if.then3 + +if.then3: ; preds = %if.then3.preheader, %if.else5 + %cmp4 = icmp ugt i64 %lid, %x + br i1 %cmp4, label %if.then4.preheader, label %if.else4.preheader + +if.else4.preheader: ; preds = %if.then3 + br label %if.else4 + +if.then4.preheader: ; preds = %if.then3 + br label %if.then4 + +if.else4: ; preds = %if.else4.preheader, %if.else4 + %cmp5 = icmp ult i64 %lid, %x + br i1 %cmp5, label %if.else4, label %if.else5.loopexit1 + +if.else5.loopexit: ; preds = %if.then4 + br label %if.else5 + +if.else5.loopexit1: ; preds = %if.else4 + br label %if.else5 + +if.else5: ; preds = %if.else5.loopexit1, %if.else5.loopexit + %cmp6 = icmp ult i64 %lid, %x + br i1 %cmp6, label %if.then3, label %if.else.loopexit + +if.then4: ; preds = %if.then4.preheader, %if.then4 + %cmp7 = icmp ult i64 %lid_sum_lsize, %x + br i1 %cmp7, label %if.then4, label %if.else5.loopexit + +if.else3: ; preds = %if.else3.preheader, %if.else3 + %cmp8 = icmp ult i64 %lid_sum_lsize, %x + br i1 %cmp8, label %if.else3, label %if.else.loopexit2 + +if.else2: ; preds = %if.else2.preheader, %if.else2 + %cmp9 = icmp ult i64 %lid_sum_lsize, %x + br i1 %cmp9, label %if.else2, label %if.else.loopexit3 + +if.else.loopexit: ; preds = %if.else5 + br label %if.else + +if.else.loopexit2: ; preds = %if.else3 + br label %if.else + +if.else.loopexit3: ; preds = %if.else2 + br label %if.else + +if.else: ; preds = %if.else.loopexit3, %if.else.loopexit2, %if.else.loopexit, %entry + %cmp10 = icmp ult i64 %lid, %x + br i1 %cmp10, label %if.then5, label %if.else6 + +if.then5: ; preds = %if.else + %cmp11 = icmp eq i64 %x, 0 + br i1 %cmp11, label %if.then6, label %if.else7 + +if.else7: ; preds = %if.then5 + %load = load float, float addrspace(1)* %out, align 4 + br label %if.then6 + +if.then6: ; preds = %if.else7, %if.then5 + %ret = phi float [ 0.000000e+00, %if.then5 ], [ %load, %if.else7 ] + store float %ret, float addrspace(1)* %out_ptr, align 4 + br label %if.else6 + +if.else6: ; preds = %if.then6, %if.else + ret void +} + +; CHECK: spir_kernel void @__vecz_v4_boscc_merge +; CHECK: %[[CMP1:.+]] = icmp 
+; CHECK: br i1 %[[CMP1]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: %[[CMP2:.+]] = icmp +; CHECK: br i1 %{{.+}}, label %[[IFTHEN2UNIFORM:.+]], label %[[IFTHENBOSCCINDIR:.+]] + +; CHECK: [[IFELSE2PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[IFELSE2UNIFORM:.+]] + +; CHECK: [[IFELSE2UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE2UNIFORM]], label %[[IFELSE2UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFELSE2UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSELOOPEXIT3UNIFORM:.+]], label %[[IFELSE2UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSE2UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFELSE2:.+]] + +; CHECK: [[IFELSELOOPEXIT3UNIFORM]]: +; CHECK: br label %[[IFELSEUNIFORM:.+]] + +; CHECK: [[IFTHEN2UNIFORM]]: +; CHECK: %[[CMP3UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP3UNIFORM]], label %[[IFTHEN3PREHEADERUNIFORM:.+]], label %[[IFELSE3PREHEADERUNIFORM:.+]] + +; CHECK: [[IFTHENBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE2PREHEADERUNIFORM]], label %[[IFELSE2PREHEADER:.+]] + +; CHECK: [[IFELSE3PREHEADERUNIFORM]]: +; CHECK: br label %[[IFELSE3UNIFORM:.+]] + +; CHECK: [[IFELSE3UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE3UNIFORM]], label %[[IFELSE3UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFELSE3UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSELOOPEXIT2UNIFORM:.+]], label %[[IFELSE3UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSE3UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFELSE3:.+]] + +; CHECK: [[IFELSELOOPEXIT2UNIFORM]]: +; CHECK: br label %[[IFELSEUNIFORM]] + +; CHECK: [[IFTHEN3PREHEADERUNIFORM]]: +; CHECK: br label %[[IFTHEN3UNIFORM:.+]] + +; CHECK: [[IFTHEN3UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN4PREHEADERUNIFORM:.+]], label %[[IFTHEN3UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFELSE5UNIFORMBOSCCINDIR:.+]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSELOOPEXITUNIFORM:.+]], label %[[IFELSE5UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSE5UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFTHEN3:.+]] + +; CHECK: [[IFELSE4PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[IFELSE4UNIFORM:.+]] + +; CHECK: [[IFELSE4UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE4UNIFORM]], label %[[IFELSE4UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFELSE4UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE5LOOPEXIT1UNIFORM:.+]], label %[[IFELSE4UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSE4UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFELSE4:.+]] + +; CHECK: [[IFELSE5LOOPEXIT1UNIFORM]]: +; CHECK: br label %[[IFELSE5UNIFORM:.+]] + +; CHECK: [[IFTHEN4PREHEADERUNIFORM]]: +; CHECK: br label %[[IFTHEN4UNIFORM:.+]] + +; CHECK: [[IFTHEN3UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE4PREHEADERUNIFORM]], label %[[IFTHEN3UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFTHEN3UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFELSE4PREHEADER:.+]] + +; CHECK: [[IFTHEN4UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN4UNIFORM]], label %[[IFTHEN4UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFTHEN4UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE5LOOPEXITUNIFORM:.+]], label %[[IFTHEN4UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFTHEN4UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFTHEN4:.+]] + +; CHECK: [[IFELSE5LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFELSE5UNIFORM]] + +; CHECK: [[IFELSE5UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN3UNIFORM]], label %[[IFELSE5UNIFORMBOSCCINDIR]] + +; CHECK: [[IFELSELOOPEXITUNIFORM]]: +; CHECK: br label %[[IFELSE]] + +; CHECK: [[IFELSE2PREHEADER]]: +; CHECK: br label %[[IFELSE2]] + +; CHECK: [[IFTHEN2:.+]]: +; CHECK: %[[CMP3:.+]] = icmp +; FIXME: We 
shouldn't need to mask this comparison, as it's truly uniform even +; on inactive lanes. +; CHECK: %[[CMP3_ACTIVE:.+]] = select i1 %[[CMP2]], i1 %[[CMP3]], i1 false +; CHECK: %[[CMP3_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP3_ACTIVE]]) +; CHECK: br i1 %[[CMP3_ACTIVE_ANY]], label %[[IFTHEN3PREHEADER:.+]], label %[[IFELSE3PREHEADER:.+]] + +; CHECK: [[IFELSE3PREHEADER]]: +; CHECK: br label %[[IFELSE3]] + +; CHECK: [[IFTHEN3PREHEADER]]: +; CHECK: br label %[[IFTHEN3]] + +; CHECK: [[IFTHEN3]]: +; CHECK: br label %[[IFELSE4PREHEADER]] + +; CHECK: [[IFELSE4PREHEADER]]: +; CHECK: br label %[[IFELSE4]] + +; CHECK: [[IFTHEN4PREHEADER:.+]]: +; CHECK: br label %[[IFTHEN4]] + +; CHECK: [[IFELSE4]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE4]], label %[[IFELSE4PUREEXIT:.+]] + +; CHECK: [[IFELSE4PUREEXIT]]: +; CHECK: br label %[[IFELSE5LOOPEXIT1:.+]] + +; CHECK: [[IFELSE5LOOPEXIT:.+]]: +; CHECK: br label %[[IFELSE5:.+]] + +; CHECK: [[IFELSE5LOOPEXIT1]]: +; CHECK: br label %[[IFTHEN4PREHEADER]] + +; CHECK: [[IFELSE5]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN3]], label %[[IFTHEN3PUREEXIT:.+]] + +; CHECK: [[IFTHEN3PUREEXIT]]: +; CHECK: br label %[[IFELSELOOPEXIT:.+]] + +; CHECK: [[IFTHEN4]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN4]], label %[[IFTHEN4PUREEXIT:.+]] + +; CHECK: [[IFTHEN4PUREEXIT]]: +; CHECK: br label %[[IFELSE5LOOPEXIT]] + +; CHECK: [[IFELSE3]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE3]], label %[[IFELSE3PUREEXIT:.+]] + +; CHECK: [[IFELSE3PUREEXIT]]: +; CHECK: br label %[[IFELSELOOPEXIT2:.+]] + +; CHECK: [[IFELSE2]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE2]], label %[[IFELSE2PUREEXIT:.+]] + +; CHECK: [[IFELSE2PUREEXIT]]: +; CHECK: br label %[[IFELSELOOPEXIT3:.+]] + +; CHECK: [[IFELSELOOPEXIT]]: +; CHECK: br label %[[IFELSE]] + +; CHECK: [[IFELSELOOPEXIT2]]: +; CHECK: br label %[[IFELSE]] + +; CHECK: [[IFELSELOOPEXIT3]]: +; CHECK: br label %[[IFTHEN2]] + +; CHECK: [[IFELSE]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE7UNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFELSE7UNIFORM]]: +; CHECK: br label %[[IFELSE6:.+]] + +; CHECK: [[IFELSEUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE6]], label %[[IFELSE7:.+]] + +; CHECK: [[IFELSE7]]: +; CHECK: br label %[[IFELSE6]] + +; CHECK: [[IFELSE6]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll new file mode 100644 index 0000000000000..bdaf96b9903fe --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll @@ -0,0 +1,173 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k boscc_merge2 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare float @llvm.fmuladd.f32(float, float, float) #2 +declare void @__mux_work_group_barrier(i32, i32, i32) #3 +declare spir_func float @_Z3maxff(float, float) #1 +declare i64 @__mux_get_local_id(i32) #1 +declare i64 @__mux_get_group_id(i32) #1 + +@fuse_conv2d_broadcast_add_relu_1_kernel0.pad_temp_shared = internal addrspace(3) global [640 x float] poison, align 4 +@fuse_conv2d_broadcast_add_relu_1_kernel0.input1_shared = internal addrspace(3) global [1152 x float] poison, align 4 + +; Function Attrs: convergent nounwind +define spir_kernel void @boscc_merge2(float addrspace(1)* noalias %input0, float addrspace(1)* noalias %input1, float addrspace(1)* noalias %tensor, float addrspace(1)* noalias %input2) #2 { +entry: + %compute = alloca [28 x float], align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 0, %entry ], [ %inc2, %for.inc ] + %cmp1 = icmp ult i32 %storemerge, 16 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %for.cond + %call1 = call i64 @__mux_get_local_id(i32 0) #5 + %call2 = call i64 @__mux_get_group_id(i32 1) #5 + %idx1 = getelementptr inbounds [640 x float], [640 x float] addrspace(3)* @fuse_conv2d_broadcast_add_relu_1_kernel0.pad_temp_shared, i64 0, i64 %call1 + store float 0.000000e+00, float addrspace(3)* %idx1, align 4 + %cmp2 = icmp sgt i64 %call2, %call1 + br i1 %cmp2, label %if.then2, label %land.lhs.true1 + +land.lhs.true1: ; preds = %if.then + %call3 = call i64 @__mux_get_group_id(i32 1) #5 + %call4 = call i64 @__mux_get_local_id(i32 0) #5 + %cmp3 = icmp slt i64 %call3, %call4 + br i1 %cmp3, label %land.lhs.true2, label %if.then2 + +land.lhs.true2: ; preds = %land.lhs.true1 + %call5 = call i64 @__mux_get_local_id(i32 0) #5 + %call6 = call i64 @__mux_get_group_id(i32 0) #5 + %cmp4 = icmp sgt i64 %call6, %call5 + br i1 %cmp4, label %if.then2, label %land.lhs.true3 + +land.lhs.true3: ; preds = %land.lhs.true2 + %call7 = call i64 @__mux_get_group_id(i32 0) #5 + %call8 = call i64 @__mux_get_local_id(i32 0) #5 + %cmp5 = icmp slt i64 %call7, %call8 + br i1 %cmp5, label %cond.true4, label %if.then2 + +cond.true4: ; preds = %land.lhs.true3 + %call9 = call i64 @__mux_get_local_id(i32 1) #5 + %idx2 = getelementptr inbounds float, float addrspace(1)* %input0, i64 %call9 + br label %if.then2 + +if.then2: ; preds = %cond.true4, %land.lhs.true3, %land.lhs.true2, %land.lhs.true1, %if.then + %call10 = call i64 @__mux_get_local_id(i32 0) #5 + %conv = trunc i64 %call10 to i32 + %idx3 = sext i32 %conv to i64 + %idx4 = getelementptr inbounds [1152 x float], [1152 x float] addrspace(3)* @fuse_conv2d_broadcast_add_relu_1_kernel0.input1_shared, i64 0, i64 %idx3 + %idx5 = getelementptr inbounds float, float addrspace(1)* %input1, i64 %idx3 + %load1 = load float, float addrspace(1)* %idx5, align 4 + store float %load1, float addrspace(3)* %idx4, align 4 + call void @__mux_work_group_barrier(i32 0, i32 1, i32 272) #4 + br label %for.cond2 + +for.cond2: ; preds = %for.body, %if.then2 + %storemerge1 = phi i32 [ 0, %if.then2 ], [ %inc1, %for.body ] + %cmp6 = icmp ult i32 %storemerge1, 4 + br i1 %cmp6, label %for.body, 
label %for.inc + +for.body: ; preds = %for.cond2 + %load2 = load float, float addrspace(3)* %idx4, align 4 + %fmul = call float @llvm.fmuladd.f32(float %load2, float %load2, float %load2) + %idx6 = getelementptr inbounds [28 x float], [28 x float]* %compute, i64 0, i64 27 + store float %fmul, float* %idx6, align 4 + %inc1 = add nuw nsw i32 %storemerge1, 1 + br label %for.cond2 + +for.inc: ; preds = %for.cond2 + %inc2 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +if.else: ; preds = %for.cond + %idx7 = getelementptr inbounds [28 x float], [28 x float]* %compute, i64 0, i64 0 + %load3 = load float, float* %idx7, align 4 + %storemerge_sext = sext i32 %storemerge to i64 + %idx8 = getelementptr inbounds float, float addrspace(1)* %tensor, i64 %storemerge_sext + store float %load3, float addrspace(1)* %idx8, align 4 + ret void +} + +attributes #0 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +; CHECK: spir_kernel void @__vecz_v4_boscc_merge2 +; CHECK: br label %[[IFTHEN:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN2:.+]], label %[[IFTHENBOSCCINDIR:.+]] + +; CHECK: [[LANDLHSTRUE1UNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[LANDLHSTRUE2UNIFORM:.+]], label %[[LANDLHSTRUE1UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[LANDLHSTRUE2UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN2]], label %[[LANDLHSTRUE2UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[LANDLHSTRUE1UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN2]], label %[[LANDLHSTRUE2:.+]] + +; CHECK: [[LANDLHSTRUE3UNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[CONDTRUE4UNIFORM:.+]], label %[[LANDLHSTRUE3UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[CONDTRUE4UNIFORM]]: +; CHECK: br label %[[IFTHEN2]] + +; CHECK: [[LANDLHSTRUE3UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN2]], label %[[CONDTRUE4:.+]] + +; CHECK: [[LANDLHSTRUE1:.+]]: +; CHECK: br label %[[LANDLHSTRUE2]] + +; CHECK: [[LANDLHSTRUE2]]: +; CHECK: br label %[[LANDLHSTRUE3:.+]] + +; CHECK: [[LANDLHSTRUE3]]: +; CHECK: br label %[[CONDTRUE4]] + +; CHECK: [[CONDTRUE4]]: +; CHECK: br label %[[IFTHEN2]] + +; CHECK: [[IFTHEN2]]: +; CHECK: br label %[[FORCOND2:.+]] + +; CHECK: [[LANDLHSTRUE2UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[LANDLHSTRUE3UNIFORM]], label %[[LANDLHSTRUE3]] + +; CHECK: [[IFTHENBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[LANDLHSTRUE1UNIFORM]], label %[[LANDLHSTRUE1]] + +; CHECK: [[FORCOND2]]: +; CHECK: %[[EXITCOND:.+]] = icmp +; CHECK: br i1 %[[EXITCOND]], label %[[FORBODY:.+]], label %[[FORINC:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND2]] + +; CHECK: [[FORINC]]: +; CHECK: %[[EXITCOND4:.+]] 
= icmp +; CHECK: br i1 %[[EXITCOND4]], label %[[IFTHEN]], label %[[IFELSE:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll new file mode 100644 index 0000000000000..c73edafd0548d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll @@ -0,0 +1,130 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k boscc_merge3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind readnone +declare i64 @__mux_get_global_id(i32) #0 + +; Function Attrs: nounwind readnone +declare spir_func <4 x float> @_Z6vload4mPU3AS1Kf(i64, float addrspace(1)*) + +define spir_kernel void @boscc_merge3(float addrspace(1)* %out, i64 noundef %n, float noundef %m) { +entry: + %gid0 = tail call i64 @__mux_get_global_id(i32 0) #0 + %gid1 = tail call i64 @__mux_get_global_id(i32 1) #0 + %cmp1 = icmp slt i64 %gid0, %n + br i1 %cmp1, label %if.then1, label %end + +if.then1: ; preds = %entry + %gep1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %gid1 + %cmp2 = fcmp une float %m, 0.000000e+00 + br i1 %cmp2, label %if.then2, label %if.end1 + +if.then2: ; preds = %if.then1 + %cmp3 = icmp sge i64 %gid1, %n + %gep2 = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %gid0 + br i1 %cmp3, label %if.then3, label %if.else3 + +if.then3: ; preds = %x51 + %load1 = load float, float addrspace(1)* %gep2, align 4 + %ie_load1 = insertelement <4 x float> poison, float %load1, i32 0 + br label %if.end2 + +if.else3: ; preds = %x51 + %vload1 = tail call spir_func <4 x float> @_Z6vload4mPU3AS1Kf(i64 0, float addrspace(1)* %gep2) + %cmp4 = icmp slt i64 %gid0, %n + br i1 %cmp4, label %if.then4, label %if.end2 + +if.then4: ; preds = %x175 + %vload2 = tail call spir_func <4 x float> @_Z6vload4mPU3AS1Kf(i64 4, float addrspace(1)* %gep2) + br label %if.end2 + +if.end2: ; preds = %x274, %x271, %if.then4, %x175, %x155, %x132 + %phi_gep2_load = phi <4 x float> [ %ie_load1, %if.then3 ], [ %vload2, %if.then4 ], [ %vload1, %if.else3 ] + %ie_m = insertelement <4 x float> poison, float %m, i32 0 + %shuffle_ie_m = shufflevector <4 x float> %ie_m, <4 x float> poison, <4 x i32> zeroinitializer + %fmul = fmul <4 x float> %shuffle_ie_m, %phi_gep2_load + br label %if.end1 + +if.end1: ; preds = %if.end2, %if.then1 + %phi_fmul = phi <4 x float> [ %fmul, %if.end2 ], [ zeroinitializer, %if.then1 ] + %ee0 = extractelement 
<4 x float> %phi_fmul, i32 0
+  store float %ee0, float addrspace(1)* %gep1, align 4
+  br label %end
+
+end:
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+
+; CHECK: spir_kernel void @__vecz_v4_boscc_merge3
+; CHECK: entry:
+; CHECK: %[[BOSCC:.+]] = call i1 @__vecz_b_divergence_all(i1 %cmp1)
+; CHECK: br i1 %[[BOSCC]], label %if.then1.uniform, label %entry.boscc_indir

+; CHECK: if.then1.uniform:
+; CHECK: %gep1.uniform =
+; CHECK: br i1 %cmp2.uniform, label %if.then2.uniform, label %if.end1.uniform

+; CHECK: if.else3.uniform:
+; CHECK: %[[BOSCC2:.+]] = call i1 @__vecz_b_divergence_all(i1 %{{if.then4.uniform.exit_mask|cmp4.uniform}})
+; CHECK: br i1 %[[BOSCC2]], label %if.then4.uniform, label %if.else3.uniform.boscc_indir

+; CHECK: if.else3.uniform.boscc_indir:
+; CHECK: %[[BOSCC3:.+]] = call i1 @__vecz_b_divergence_all(i1 %if.end2.uniform.exit_mask)
+; CHECK: br i1 %[[BOSCC3]], label %if.end2.uniform, label %if.then4

+; CHECK: if.then1:
+; CHECK: %gep1 =
+; CHECK: br i1 %cmp2, label %if.then2, label %if.end1

+; The expected %cmp3 value is generalized because the 'icmp' can end up one BB
+; away, or be inverted into %cmp3.not, between LLVM versions.
+; CHECK: if.then2:
+; CHECK: br i1 %cmp3{{(.+)?}}, label %if.else3, label %if.then3

+; CHECK: if.then3:
+; CHECK: br label %if.end2

+; CHECK: if.else3:
+; CHECK: br label %if.then4

+; CHECK: if.then4:
+; CHECK: %gep1.boscc_blend = phi ptr addrspace(1) [ %gep1.uniform, %if.else3.uniform.boscc_indir ], [ %gep1, %if.else3 ]
+; CHECK: br label %if.end2

+; CHECK: if.end2:

+; Check we have correctly blended the instruction during the BOSCC connection
+; rather than while repairing the SSA form.
+; CHECK-NOT: %gep1.boscc_blend.merge{{.*}} = phi
+; CHECK: %gep1.boscc_blend{{[0-9]*}} = phi ptr addrspace(1) [ %gep1.boscc_blend{{[0-9]*}}, %if.then4 ], [ %gep1, %if.then3 ]
+; CHECK: br label %if.end1

+; CHECK: if.end1:

+; Check we have correctly blended the instruction during the BOSCC connection
+; rather than while repairing the SSA form.
+; CHECK-NOT: %gep1.boscc_blend.merge{{.*}} = phi
+; CHECK: %gep1.boscc_blend{{[0-9]*}} = phi ptr addrspace(1) [ %gep1.boscc_blend{{[0-9]*}}, %if.end2 ], [ %gep1, %if.then1 ]
+; CHECK: br label %end
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
new file mode 100644
index 0000000000000..64c5d6e7cbc28
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
@@ -0,0 +1,134 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; This test checks that we create a new preheader that blends the preheader
+; of the uniform and the predicated paths for a loop that has not been
+; duplicated (because of the barrier in it).
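+; The loop cannot be duplicated because __mux_work_group_barrier is declared
+; noduplicate, so the uniform and predicated paths have to rejoin in a single
+; blended preheader before entering it.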
+ +; RUN: veczc -k duplicate_preheader -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: noduplicate +declare void @__mux_work_group_barrier(i32, i32, i32) #1 +; Function Attrs: nounwind readnone +declare i64 @__mux_get_local_id(i32) + +define spir_kernel void @duplicate_preheader(i32 addrspace(1)* %out, i32 %n) { +entry: + %id = tail call i64 @__mux_get_local_id(i32 0) + %cmp = icmp sgt i64 %id, 3 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + br label %for.cond + +for.cond: + %ret.0 = phi i64 [ 0, %if.then ], [ %inc, %for.body ] + %storemerge8 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp uge i32 %storemerge8, %mul + br i1 %cmp2, label %for.body, label %if.end + +for.body: + %inc = add nsw i64 %ret.0, 1 + %inc4 = add nsw i32 %storemerge8, 1 + br label %for.cond + +if.end: ; preds = %if.then, %entry + %idx.blend = phi i64 [ %id, %entry ], [ %ret.0, %for.cond ] + %gep_var = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idx.blend + br label %barrier + +barrier: ; preds = %latch, %if.end + call void @__mux_work_group_barrier(i32 0, i32 1, i32 272) + br i1 %cmp, label %body, label %latch + +body: ; preds = %barrier + %gep_uni = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + %ret = load i32, i32 addrspace(1)* %gep_uni, align 16 + store i32 %ret, i32 addrspace(1)* %gep_var, align 16 + br label %latch + +latch: ; preds = %body, %barrier + %cmp3 = icmp sgt i32 %n, 10 + br i1 %cmp3, label %exit, label %barrier + +exit: ; preds = %latch + ret void +} + +attributes #1 = { noduplicate } + +; CHECK: spir_kernel void @__vecz_v4_duplicate_preheader +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[ENTRYBOSCCINDIR:.+]] + +; Make sure we have both the uniform and non-uniform versions of the for loop. +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[ENTRYBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFEND:.+]], label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%([0-9A-Za-z\.])+)|(false)}}, label %[[IFENDLOOPEXITUNIFORM:.+]], label %[[FORBODYUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[IFENDLOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%([0-9A-Za-z\.])+)|(false)}}, label %[[IFENDLOOPEXIT:.+]], label %[[FORBODY:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[IFENDLOOPEXIT]]: +; CHECK: br label %[[IFEND]] + +; Make sure we're reconverging here from the uniform and predicated paths before +; branching to the barrier. 
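+; A work-group barrier must be reached by every work-item together, so neither
+; the uniform nor the predicated path may branch to it on its own.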
+; CHECK: [[IFEND]]:{{.*}}preds +; CHECK-DAG: %[[IFENDLOOPEXIT]] +; CHECK-DAG: %[[IFENDLOOPEXITUNIFORM]] +; CHECK: br label %[[BARRIER:.+]] + +; CHECK: [[BARRIER]]: +; CHECK: br i1 %{{.+}}, label %[[BODYUNIFORM:.+]], label %[[BARRIERBOSCCINDIR:.+]] + +; CHECK: [[BODYUNIFORM]]: +; CHECK: br label %[[LATCHUNIFORM:.+]] + +; CHECK: [[BARRIERBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[LATCH:.+]], label %[[BODY:.+]] + +; CHECK: [[BODY]]: +; CHECK: br label %[[LATCH]] + +; CHECK: [[LATCH]]: +; CHECK: %[[CMP3:.+]] = icmp +; CHECK: br i1 %[[CMP3]], label %[[EXIT:.+]], label %[[BARRIER]] + +; CHECK: [[EXIT]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll new file mode 100644 index 0000000000000..33ea2580691fb --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll @@ -0,0 +1,198 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k nested_loops1 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind readnone +declare i64 @__mux_get_global_id(i32) #0 + +; Function Attrs: nounwind readnone +declare i64 @__mux_get_global_size(i32) #0 + +; Function Attrs: nounwind readnone +declare spir_func float @_Z3madfff(float, float, float) #0 + +; Function Attrs: nounwind +define spir_kernel void @nested_loops1(i32 %n, float addrspace(1)* %out) #1 { +entry: + %gid = tail call i64 @__mux_get_global_id(i32 0) #0 + %gsize = tail call i64 @__mux_get_global_size(i32 0) #0 + %trunc_gid = trunc i64 %gid to i32 + %trunc_gsize = trunc i64 %gsize to i32 + %cmp1 = icmp slt i32 %trunc_gid, %n + br i1 %cmp1, label %if.then1, label %end + +if.then1: ; preds = %16 + %cmp2 = icmp slt i32 %n, 0 + %cmp3 = icmp slt i32 %n, 0 + %cmp4 = icmp sgt i32 %n, 0 + %cmp5 = icmp slt i32 %n, 1 + br label %for.cond + +for.cond: ; preds = %if.else4, %if.then1 + %trunc_gid_phi = phi i32 [ %trunc_gid, %if.then1 ], [ %add3, %if.else4 ] + %cmp6 = icmp eq i32 %trunc_gid_phi, -2147483648 + %select1 = select i1 %cmp6, i32 1, i32 %n + %div1 = sdiv i32 %trunc_gid_phi, %select1 + br i1 %cmp2, label %if.then2, label %if.else2 + +if.else2: ; preds = %for.cond + %cmp7 = icmp eq i32 %n, 0 + %select2 = select i1 %cmp7, i32 1, i32 %n + %div2 = sdiv i32 %n, %select2 + br label %if.then2 + +if.then2: ; preds = %if.else2, %for.cond + br i1 %cmp3, label %if.then3, label %if.else3 + +if.else3: ; preds = %if.then2 + %cmp8 = icmp eq i32 %n, 0 + %select3 = select i1 %cmp8, i32 1, i32 %n + %div3 = sdiv i32 %n, %select3 
+  br label %if.then3
+
+if.then3:                                         ; preds = %if.else3, %if.then2
+  br i1 %cmp4, label %if.then4, label %if.else4
+
+if.then4:                                         ; preds = %if.then3
+  br i1 %cmp5, label %if.else4, label %if.else5
+
+if.else5:                                         ; preds = %if.then4
+  %sext_div1 = sext i32 %div1 to i64
+  %gep1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %sext_div1
+  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %sext_div1
+  br label %for.cond2
+
+for.cond2:                                        ; preds = %if.else6, %if.else5
+  %float_idx = phi float [ 0.000000e+00, %if.else5 ], [ %phi_phi_mad, %if.else6 ]
+  %phi_div1_1 = phi i32 [ %div1, %if.else5 ], [ %add2, %if.else6 ]
+  %i32_idx = phi i32 [ 0, %if.else5 ], [ %add2, %if.else6 ]
+  %cmp9 = icmp slt i32 %phi_div1_1, %n
+  br i1 %cmp9, label %if.then6, label %if.else6
+
+if.then6:                                         ; preds = %for.cond2
+  br label %for.cond3
+
+for.cond3:                                        ; preds = %if.else7, %if.then6
+  %phi_float_idx = phi float [ %float_idx, %if.then6 ], [ %phi_mad, %if.else7 ]
+  %phi_div1_2 = phi i32 [ %div1, %if.then6 ], [ %add1, %if.else7 ]
+  %phi_i32_idx = phi i32 [ %i32_idx, %if.then6 ], [ %add1, %if.else7 ]
+  %cmp10 = icmp sgt i32 %phi_div1_2, -1
+  br i1 %cmp10, label %if.then7, label %if.else7
+
+if.then7:                                         ; preds = %for.cond3
+  %sext_phi_div1_2 = sext i32 %phi_div1_2 to i64
+  %gep3 = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %sext_phi_div1_2
+  %load1 = load float, float addrspace(1)* %gep3, align 4
+  %sext_phi_i32_idx = sext i32 %phi_i32_idx to i64
+  %gep4 = getelementptr inbounds float, float addrspace(1)* %gep2, i64 %sext_phi_i32_idx
+  %load2 = load float, float addrspace(1)* %gep4, align 4
+  %mad = tail call spir_func float @_Z3madfff(float %load1, float %load2, float %phi_float_idx) #0
+  br label %if.else7
+
+if.else7:                                         ; preds = %if.then7, %for.cond3
+  %phi_mad = phi float [ %mad, %if.then7 ], [ %phi_float_idx, %for.cond3 ]
+  %add1 = add nsw i32 %phi_i32_idx, %n
+  %cmp11 = icmp slt i32 %add1, %div1
+  br i1 %cmp11, label %for.cond3, label %if.else6
+
+if.else6:                                         ; preds = %if.else7, %for.cond2
+  %phi_phi_mad = phi float [ %float_idx, %for.cond2 ], [ %phi_mad, %if.else7 ]
+  %add2 = add nsw i32 %i32_idx, %div1
+  %cmp12 = icmp slt i32 %add2, %div1
+  br i1 %cmp12, label %for.cond2, label %if.else4
+
+if.else4:                                         ; preds = %if.else8, %if.then4, %if.then3
+  %phi_phi_float_idx = phi float [ 0.000000e+00, %if.then3 ], [ 0.000000e+00, %if.then4 ], [ %phi_phi_mad, %if.else6 ]
+  %sext_trunc_gid_phi = sext i32 %trunc_gid_phi to i64
+  %gep5 = getelementptr inbounds float, float addrspace(1)* %out, i64 %sext_trunc_gid_phi
+  store float %phi_phi_float_idx, float addrspace(1)* %gep5, align 4
+  %add3 = add nsw i32 %trunc_gid_phi, %trunc_gsize
+  %cmp13 = icmp slt i32 %add3, %n
+  br i1 %cmp13, label %for.cond, label %end
+
+end:                                              ; preds = %if.else4, %16
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+; The purpose of this test is to make sure we correctly blend all the values
+; that are live through the divergent loops at each of their entry points, and
+; don't create merge instructions for them.
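+; Each live-through value gets a single .boscc_blend PHI choosing between its
+; uniform and predicated definitions; the CHECK-NOTs below guard against
+; redundant .merge copies of those blends.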
+ +; CHECK: spir_kernel void @__vecz_v4_nested_loops1 +; CHECK: entry: +; CHECK: br i1 %{{.+}}, label %if.then1.uniform, label %entry.boscc_indir + +; CHECK: if.then1.uniform: +; CHECK: br label %for.cond.uniform + +; CHECK: entry.boscc_indir: +; CHECK: br i1 %{{.+}}, label %end, label %if.then1 + +; CHECK: for.cond2.uniform: +; CHECK: br i1 %{{.+}}, label %for.cond3.preheader.uniform, label %for.cond2.uniform.boscc_indir + +; CHECK: for.cond2.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %if.else6.uniform, label %for.cond2.uniform.boscc_store + +; CHECK: for.cond2.uniform.boscc_store: +; CHECK: br label %for.cond3.preheader + +; CHECK: for.cond3.uniform: +; CHECK: br i1 %{{.+}}, label %if.then7.uniform, label %for.cond3.uniform.boscc_indir + +; CHECK: for.cond3.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %if.else7.uniform, label %for.cond3.uniform.boscc_store + +; CHECK: for.cond3.uniform.boscc_store: +; CHECK: br label %if.then7 + +; CHECK: end.loopexit.uniform: +; CHECK: br label %end + +; CHECK: for.cond: +; CHECK-NOT: %{{.+}}.boscc_blend{{.+}}.merge{{.+}} = +; CHECK: br + +; CHECK: for.cond2: +; CHECK-NOT: %{{.+}}.boscc_blend{{.+}}.merge{{.+}} = +; CHECK: br + +; CHECK: for.cond3: +; CHECK-NOT: %{{.+}}.boscc_blend{{.+}}.merge{{.+}} = +; CHECK: br + +; CHECK: if.then7: +; CHECK-NOT: %{{.+}}.boscc_blend{{.+}}.merge{{.+}} = +; CHECK: br + +; CHECK: if.else4: +; CHECK-NOT: %{{.+}}.boscc_blend{{.+}}.merge{{.+}} = +; CHECK: br + +; CHECK: end.loopexit: +; CHECK: br label %end + +; CHECK: end: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll new file mode 100644 index 0000000000000..cca12985f5031 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll @@ -0,0 +1,140 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k nested_loops2 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +source_filename = "kernel.opencl" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @nested_loops2(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %conv, 16 + br i1 %cmp, label %if.then, label %if.end25 + +if.then: ; preds = %entry + %mul2 = mul nsw i32 %conv, %n + %0 = icmp eq i32 %mul2, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div3 = sdiv i32 %mul2, %5 + %add = add nsw i32 %div3, %conv + br label %for.cond + +for.cond: ; preds = %for.inc, %if.then + %ret.0 = phi i32 [ 0, %if.then ], [ %ret.2, %for.inc ] + %storemerge = phi i32 [ 0, %if.then ], [ %inc24, %for.inc ] + %cmp7 = icmp slt i32 %storemerge, %n + br i1 %cmp7, label %for.body, label %if.end25 + +for.body: ; preds = %for.cond + %cmp9 = icmp slt i32 %conv, 9 + br i1 %cmp9, label %while.body, label %for.inc + +while.body: ; preds = %while.body, %for.body + %ret.1 = phi i32 [ %ret.0, %for.body ], [ %add17, %while.body ] + %j.0 = phi i32 [ 0, %for.body ], [ %inc18, %while.body ] + %mul13 = mul nsw i32 %mul2, %mul2 + %6 = icmp eq i32 %n, 0 + %7 = select i1 %6, i32 1, i32 %n + %div14 = sdiv i32 %mul13, %7 + %reass.add = add i32 %div14, %add + %reass.mul = mul i32 %reass.add, 8 + %add6 = add i32 %mul2, 1 + %add16 = add i32 %add6, %add + %inc = add i32 %add16, %ret.1 + %add17 = add i32 %inc, %reass.mul + %inc18 = add nuw nsw i32 %j.0, 1 + %add19 = add nsw i32 %j.0, %conv + %cmp20 = icmp sgt i32 %add19, 3 + br i1 %cmp20, label %for.inc, label %while.body + +for.inc: ; preds = %for.body, %while.body + %ret.2 = phi i32 [ %ret.0, %for.body ], [ %add17, %while.body ] + %inc24 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +if.end25: ; preds = %for.cond, %entry + %ret.3 = phi i32 [ 0, %entry ], [ %ret.0, %for.cond ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind 
readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+!opencl.kernels = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{!"clang version 8.0.0 (https://github.com/llvm-mirror/clang.git bfbe338a893dde6ba65b2bed6ffea1652a592819) (https://github.com/llvm-mirror/llvm.git a99d6d2122ca2f208e1c4bcaf02ff5930f244f34)"}
+!3 = !{void (i32 addrspace(1)*, i32)* @nested_loops2, !4, !5, !6, !7, !8, !9}
+!4 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!5 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!6 = !{!"kernel_arg_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!8 = !{!"kernel_arg_type_qual", !"", !""}
+!9 = !{!"kernel_arg_name", !"out", !"n"}
+
+; The purpose of this test is to make sure we correctly add a BOSCC connection
+; at a div-causing (divergence-causing) latch from the uniform region.
+
+; CHECK: spir_kernel void @__vecz_v4_nested_loops2
+; CHECK: entry:
+; CHECK: %[[BOSCC:.+]] = call i1 @__vecz_b_divergence_all(i1 %cmp)
+; CHECK: br i1 %[[BOSCC]], label %if.then.uniform, label %entry.boscc_indir

+; CHECK: if.then.uniform:
+; CHECK: br i1 %cmp71.uniform, label %for.body.lr.ph.uniform, label %if.end25.loopexit.uniform

+; CHECK: entry.boscc_indir:
+; CHECK: %[[BOSCC2:.+]] = call i1 @__vecz_b_divergence_all(i1 %cmp.not{{.*}})
+; CHECK: br i1 %[[BOSCC2]], label %if.end25, label %if.then

+; CHECK: for.body.lr.ph.uniform:
+; CHECK: br label %for.body.uniform

+; CHECK: for.body.uniform:
+; CHECK: br i1 %[[LBLCOND:.+]], label %while.body.preheader.uniform, label %for.body.uniform.boscc_indir

+; CHECK: while.body.preheader.uniform:
+; CHECK: br label %while.body.uniform

+; CHECK: for.body.uniform.boscc_indir:
+; CHECK: %[[BOSCC3:.+]] = call i1 @__vecz_b_divergence_all(i1 %for.inc.uniform.exit_mask)
+; CHECK: br i1 %[[BOSCC3]], label %for.inc.uniform, label %for.body.uniform.boscc_store

+; CHECK: while.body.uniform:
+; CHECK: %cmp20.uniform = icmp sgt i32 %add19.uniform, 3
+; CHECK-NOT: br i1 %[[LBLCOND3:.+]], label %for.inc.loopexit.uniform, label %while.body.uniform
+; CHECK: br i1 %[[LBLCOND2:.+]], label %for.inc.loopexit.uniform, label %while.body.uniform.boscc_indir
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
new file mode 100644
index 0000000000000..e39f0e1361850
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
@@ -0,0 +1,149 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k nested_loops3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #0 + +; Function Attrs: nounwind readnone speculatable +declare float @llvm.fmuladd.f32(float, float, float) #1 + +; Function Attrs: convergent nounwind +define spir_kernel void @nested_loops3(float addrspace(1)* %symmat, float addrspace(1)* %data, i32 %m, i32 %n) #2 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %conv = trunc i64 %call to i32 + %sub = add nsw i32 %m, -1 + %cmp = icmp sgt i32 %sub, %conv + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %mul = mul nsw i32 %conv, %m + %add = add nsw i32 %mul, %conv + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds float, float addrspace(1)* %symmat, i64 %idxprom + store float 1.000000e+00, float addrspace(1)* %arrayidx, align 4 + br label %for.cond + +for.cond: ; preds = %for.end, %if.then + %storemerge.in = phi i32 [ %conv, %if.then ], [ %storemerge, %for.end ] + %storemerge = add nsw i32 %storemerge.in, 1 + %cmp3 = icmp slt i32 %storemerge, %m + br i1 %cmp3, label %for.cond5, label %if.end + +for.cond5: ; preds = %for.body8, %for.cond + %storemerge1 = phi i32 [ %inc, %for.body8 ], [ 0, %for.cond ] + %cmp6 = icmp slt i32 %storemerge1, %n + br i1 %cmp6, label %for.body8, label %for.end + +for.body8: ; preds = %for.cond5 + %mul9 = mul nsw i32 %storemerge1, %m + %add10 = add nsw i32 %mul9, %conv + %idxprom11 = sext i32 %add10 to i64 + %arrayidx12 = getelementptr inbounds float, float addrspace(1)* %data, i64 %idxprom11 + %0 = load float, float addrspace(1)* %arrayidx12, align 4 + %mul13 = mul nsw i32 %storemerge1, %m + %add14 = add nsw i32 %mul13, %storemerge + %idxprom15 = sext i32 %add14 to i64 + %arrayidx16 = getelementptr inbounds float, float addrspace(1)* %data, i64 %idxprom15 + %1 = load float, float addrspace(1)* %arrayidx16, align 4 + %mul18 = mul nsw i32 %conv, %m + %add19 = add nsw i32 %storemerge, %mul18 + %idxprom20 = sext i32 %add19 to i64 + %arrayidx21 = getelementptr inbounds float, float addrspace(1)* %symmat, i64 %idxprom20 + %2 = load float, float addrspace(1)* %arrayidx21, align 4 + %3 = call float @llvm.fmuladd.f32(float %0, float %1, float %2) + store float %3, float addrspace(1)* %arrayidx21, align 4 + %inc = add nuw nsw i32 %storemerge1, 1 + br label %for.cond5 + +for.end: ; preds = %for.cond5 + %mul22 = mul nsw i32 %conv, %m + %add23 = add nsw i32 %storemerge, %mul22 + %idxprom24 = sext i32 %add23 to i64 + %arrayidx25 = getelementptr inbounds float, float addrspace(1)* %symmat, i64 %idxprom24 + %4 = load float, float addrspace(1)* %arrayidx25, align 4 + %mul26 = mul nsw i32 %storemerge, %m + %add27 = add nsw i32 %mul26, %conv + %idxprom28 = sext i32 %add27 to i64 + %arrayidx29 = getelementptr inbounds float, float addrspace(1)* %symmat, i64 %idxprom28 + store float %4, float addrspace(1)* %arrayidx29, align 4 + br label %for.cond + +if.end: ; preds = %for.cond, %entry + ret void +} + +; The purpose of this test is to make sure we correctly set the incoming value +; of a boscc_blend instruction (in a loop header) from the latch 
as being the
+; value defined in the latch itself.

+; CHECK: spir_kernel void @__vecz_v4_nested_loops3
+; CHECK: entry:
+; CHECK: br i1 %{{.+}}, label %if.then.uniform, label %entry.boscc_indir

+; CHECK: if.then.uniform:
+; CHECK: br i1 %{{.+}}, label %for.cond5.preheader.lr.ph.uniform, label %if.then.uniform.boscc_indir

+; CHECK: entry.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %if.end, label %if.then

+; CHECK: for.cond5.preheader.lr.ph.uniform:
+; CHECK: br label %for.cond5.preheader.uniform

+; CHECK: if.then.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %if.end.loopexit.uniform, label %for.cond5.preheader.lr.ph

+; CHECK: for.cond5.preheader.uniform:
+; CHECK: br label %for.cond5.uniform

+; CHECK: for.end.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %for.cond.if.end.loopexit_crit_edge.uniform, label %for.end.uniform.boscc_store

+; CHECK: for.end.uniform.boscc_store:
+; CHECK: br label %for.cond5.preheader

+; CHECK: if.then:
+; CHECK: br label %for.cond5.preheader.lr.ph

+; CHECK: for.cond5.preheader.lr.ph:
+; CHECK: br label %for.cond5.preheader

+; CHECK: for.cond5.preheader:

+; This is the important bit of the test
+; Note that the LCSSA PHI node got cleaned up!
+; For some reason LIT needs these checks to be split across two lines
+; CHECK: %[[LATCH_VALUE1:.*\.boscc_blend[0-9]*]] = phi i{{32|64}} [ %{{.+}}, %for.end.uniform.boscc_store ],
+; CHECK-SAME: [ %[[LATCH_VALUE1]], %for.end ], [ %{{.+}}, %for.cond5.preheader.lr.ph ]

+; CHECK: %[[LATCH_VALUE2:.*\.boscc_blend[0-9]*]] = phi i{{32|64}} [ %{{.+}}, %for.end.uniform.boscc_store ],
+; CHECK-SAME: [ %[[LATCH_VALUE2]], %for.end ], [ %{{.+}}, %for.cond5.preheader.lr.ph ]

+; CHECK: %[[LATCH_VALUE3:.*\.boscc_blend[0-9]*]] = phi i{{32|64}} [ %{{.+}}, %for.end.uniform.boscc_store ],
+; CHECK-SAME: [ %[[LATCH_VALUE3]], %for.end ], [ %{{.+}}, %for.cond5.preheader.lr.ph ]

+; CHECK: %[[LATCH_VALUE4:.*\.boscc_blend[0-9]*]] = phi i{{32|64}} [ %{{.+}}, %for.end.uniform.boscc_store ],
+; CHECK-SAME: [ %[[LATCH_VALUE4]], %for.end ], [ %{{.+}}, %for.cond5.preheader.lr.ph ]

+; CHECK: %[[LATCH_VALUE5:.+\.boscc_blend[0-9]*]] = phi i1 [ true, %for.end.uniform.boscc_store ],
+; CHECK-SAME: [ %[[LATCH_VALUE5]], %for.end ], [ %{{.+}}, %for.cond5.preheader.lr.ph ]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
new file mode 100644
index 0000000000000..a967bec0aed5d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
@@ -0,0 +1,190 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k nested_loops4 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind readnone +declare i64 @__mux_get_global_id(i32) #0 + +; Function Attrs: nounwind readnone +declare i64 @__mux_get_global_size(i32) #0 + +; Function Attrs: nounwind readnone +declare spir_func float @_Z3dotDv2_fS_(<2 x float>, <2 x float>) #0 + +declare spir_func <2 x float> @_Z6vload2mPU3AS1Kf(i64, float addrspace(1)*) + +; Function Attrs: nounwind readnone +declare spir_func i32 @_Z6mul_hijj(i32, i32) #0 + +define spir_kernel void @nested_loops4(i32 %n, float addrspace(1)* %out) { +entry: + %gid = tail call i64 @__mux_get_global_id(i32 0) #0 + %gsize = tail call i64 @__mux_get_global_size(i32 0) #0 + %trunc_gid = trunc i64 %gid to i32 + %trunc_gsize = trunc i64 %gsize to i32 + %cmp1 = icmp slt i32 %trunc_gid, %n + br i1 %cmp1, label %for.cond1, label %end + +for.cond1: ; preds = %entry, %for.cond1.end + %phi_trunc_gid = phi i32 [ %trunc_gid, %entry ], [ %add2, %for.cond1.end ] + %mul_hi = tail call spir_func i32 @_Z6mul_hijj(i32 %phi_trunc_gid, i32 %n) #0 + %wrong = sdiv i32 %mul_hi, %n + %sext_mul_hi = sext i32 %mul_hi to i64 + %gep1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %sext_mul_hi + %cmp2 = icmp slt i32 %mul_hi, %n + br i1 %cmp2, label %for.cond2, label %for.cond1.end + +for.cond2: ; preds = %for.cond1, %for.cond2.end + %phi4_fadd = phi float [ %phi3_fadd, %for.cond2.end ], [ 0.000000e+00, %for.cond1 ] + %cmp3 = icmp slt i32 %mul_hi, %n + br i1 %cmp3, label %for.cond3.preheader, label %for.cond2.end + +for.cond3.preheader: ; preds = %for.cond2 + %add1 = add nsw i32 %mul_hi, %wrong + br label %for.cond3 + +for.cond3: ; preds = %for.cond3.preheader, %for.cond3.end + %phi_wrong_correct_correct = phi i32 [ %wrong, %for.cond3.preheader ], [ %correct, %for.cond3.end ] + %phi_add1 = phi i32 [ %add1, %for.cond3.preheader ], [ %phi_add1, %for.cond3.end ] + %phi2_fadd = phi float [ %phi4_fadd, %for.cond3.preheader ], [ %phi1_fadd, %for.cond3.end ] + %cmp4 = icmp slt i32 %phi_wrong_correct_correct, %n + br i1 %cmp4, label %for.cond3.body, label %for.cond3.end + +for.cond3.body: ; preds = %for.cond3 + %sext_phi_add1 = sext i32 %phi_add1 to i64 + %gep2 = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %sext_phi_add1 + %vload = tail call spir_func <2 x float> @_Z6vload2mPU3AS1Kf(i64 0, float addrspace(1)* %gep2) + %dot = tail call spir_func float @_Z3dotDv2_fS_(<2 x float> %vload, <2 x float> %vload) #0 + %fadd = fadd float %phi2_fadd, %dot + br label %for.cond3.end + +for.cond3.end: ; preds = %for.cond3.body, %for.cond3 + %phi1_fadd = phi float [ %phi2_fadd, %for.cond3 ], [ %fadd, %for.cond3.body ] + %correct = add nsw i32 %phi_wrong_correct_correct, 1 + %cmp5 = icmp slt i32 %wrong, %n + br i1 %cmp5, label %for.cond3, label %for.cond2.end + +for.cond2.end: ; preds = %for.cond3.end, %for.cond2 + %phi3_fadd = phi float [ %phi4_fadd, %for.cond2 ], [ %phi1_fadd, %for.cond3.end ] + %cmp6 = icmp slt i32 %mul_hi, %n + br i1 %cmp6, label %for.cond2, label %for.cond1.end + +for.cond1.end: ; preds = %for.cond2.end, %for.cond1 + %ret = phi float [ 0.000000e+00, %for.cond1 ], [ %phi3_fadd, %for.cond2.end ] + %sext_phi_trunc_gid = sext i32 %phi_trunc_gid to i64 + %gep3 = getelementptr inbounds float, float addrspace(1)* 
%out, i64 %sext_phi_trunc_gid + store float %ret, float addrspace(1)* %gep3, align 4 + %add2 = add nsw i32 %phi_trunc_gid, %trunc_gsize + %cmp7 = icmp slt i32 %add2, %n + br i1 %cmp7, label %for.cond1, label %end + +end: ; preds = %for.cond1.end, %entry + ret void +} + +attributes #0 = { nounwind readnone } + +; The purpose of this test is to make sure we choose the correct incoming value +; for a boscc blend instruction. + +; CHECK: spir_kernel void @__vecz_v4_nested_loops4 +; CHECK: entry: +; CHECK: br i1 %{{.+}}, label %for.cond1.preheader.uniform, label %entry.boscc_indir + +; CHECK: for.cond1.preheader.uniform: +; CHECK: br label %for.cond1.uniform + +; CHECK: entry.boscc_indir: +; CHECK: br i1 %{{.+}}, label %end, label %for.cond1.preheader + +; CHECK: for.cond1.uniform: +; CHECK: %wrong.uniform = sdiv i32 %mul_hi.uniform, %n +; CHECK: br i1 %{{.+}}, label %for.cond2.preheader.uniform, label %for.cond1.uniform.boscc_indir + +; CHECK: for.cond1.end.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %end.loopexit.uniform, label %for.cond1.end.uniform.boscc_store + +; CHECK: for.cond1.end.uniform.boscc_store: +; CHECK: br label %for.cond1 + +; CHECK: for.cond2.preheader.uniform: +; CHECK: br label %for.cond2.uniform + +; CHECK: for.cond1.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %for.cond1.end.uniform, label %for.cond1.uniform.boscc_store + +; CHECK: for.cond1.uniform.boscc_store: +; LCSSA PHI nodes got cleaned up: +; CHECK-NOT: %{{.*\.boscc_lcssa.*}} +; CHECK: br label %for.cond2.preheader + +; CHECK: for.cond2.uniform: +; CHECK: br i1 %{{.+}}, label %for.cond3.preheader.uniform, label %for.cond2.uniform.boscc_indir + +; CHECK: for.cond2.end.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %for.cond1.end.loopexit.uniform, label %for.cond2.end.uniform.boscc_store + +; CHECK: for.cond3.preheader.uniform: +; CHECK: br label %for.cond3.uniform + +; CHECK: for.cond2.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %for.cond2.end.uniform, label %for.cond2.uniform.boscc_store + +; CHECK: for.cond3.uniform: +; CHECK: br i1 %{{.+}}, label %for.cond3.body.uniform, label %for.cond3.uniform.boscc_indir + +; CHECK: for.cond3.end.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %for.cond2.end.loopexit.uniform, label %for.cond3.end.uniform.boscc_store + +; CHECK: for.cond3.end.uniform.boscc_store: +; LCSSA PHI nodes got cleaned up: +; CHECK-NOT: %{{.*\.boscc_lcssa.*}} +; CHECK: br label %for.cond3 + +; CHECK: for.cond3.body.uniform: +; CHECK: br label %for.cond3.end.uniform + +; CHECK: for.cond3.uniform.boscc_indir: +; CHECK: %[[BOSCC:.+]] = call i1 @__vecz_b_divergence_all(i1 %for.cond3.end.uniform.exit_mask) +; CHECK: br i1 %[[BOSCC]], label %for.cond3.end.uniform, label %for.cond3.uniform.boscc_store + +; CHECK: for.cond3.end.uniform: +; CHECK: br i1 %{{.+}}, label %for.cond3.uniform, label %for.cond3.end.uniform.boscc_indir + +; CHECK: for.cond1.preheader: +; CHECK: br label %for.cond1 + +; CHECK: for.cond1: +; CHECK: br label %for.cond2.preheader + +; CHECK: for.cond2.preheader: +; CHECK: br label %for.cond2 + +; CHECK: for.cond2: +; CHECK: br label %for.cond3.preheader + +; CHECK: for.cond3.preheader: +; CHECK: br label %for.cond3 + +; CHECK: for.cond3: + +; This is the important part of the test. 
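+; Concretely: the PHI must take %correct (the value defined in the latch
+; %for.cond3.end) on its latch edge and %correct.uniform on the edge from
+; the uniform region, while only the preheader edge carries a blend of
+; %wrong; the name %phi_wrong_correct_correct encodes that expected
+; wrong/correct/correct pattern, which the CHECK below spells out in full.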
+; CHECK: %phi_wrong_correct_correct = phi i32 [ %wrong.boscc_blend{{.+}}, %for.cond3.preheader ], [ %correct, %for.cond3.end ], [ %correct.uniform, %for.cond3.end.uniform.boscc_store ] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll new file mode 100644 index 0000000000000..3f8e7f2b3a395 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll @@ -0,0 +1,117 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k nested_loops5 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) + +declare i64 @__mux_get_local_size(i32) + +define spir_kernel void @nested_loops5(float addrspace(1)*) { +entry: + %lid = tail call i64 @__mux_get_local_id(i32 0) + %lsize = tail call i64 @__mux_get_local_size(i32 0) + %cmp1 = icmp ult i64 %lid, %lsize + br i1 %cmp1, label %loop, label %end + +loop: ; preds = %if.end, %entry + %livethrough = phi i64 [ %add2, %if.end ], [ %lsize, %entry ] + %add1 = add i64 %livethrough, %lsize + %cmp2 = icmp ult i64 %add1, %lsize + br i1 %cmp2, label %if.then, label %if.else + +if.then: ; preds = %if.then, %loop + %phi = phi i64 [ %add3, %if.then ], [ %lid, %loop ] + %add3 = add i64 %phi, %lsize + %cmp4 = icmp ult i64 %add3, %lsize + br i1 %cmp4, label %if.then, label %if.end + +if.else: ; preds = %loop + %gep = getelementptr inbounds float, float addrspace(1)* %0, i64 %add1 + store float 0.000000e+00, float addrspace(1)* %gep, align 4 + br label %if.end + +if.end: ; preds = %if.then, %if.else + %add2 = add i64 %livethrough, %lsize + %cmp3 = icmp ult i64 %add2, %lsize + br i1 %cmp3, label %loop, label %end + +end: ; preds = %if.end, %entry + ret void +} + +; The purpose of this test is to make sure we choose the correct incoming value +; for a boscc blend instruction. 
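+; In particular, the loop-carried %livethrough must reach %if.end as itself
+; along the %if.else edge. A bad blend that substituted a placeholder
+; constant there, e.g. the hypothetical
+;
+;   %bad = phi i64 [ %livethrough.boscc_blend, %if.end.loopexit ], [ 0, %if.else ]
+;
+; is ruled out by the CHECK-NOT near the end, while the final CHECK requires
+; the [ %livethrough, %if.else ] incoming value instead.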
+ +; CHECK: spir_kernel void @__vecz_v4_nested_loops5 +; CHECK: entry: +; CHECK: br i1 %{{.+}}, label %loop.preheader.uniform, label %entry.boscc_indir + +; CHECK: loop.preheader.uniform: +; CHECK: br label %loop.uniform + +; CHECK: entry.boscc_indir: +; CHECK: br i1 %{{.+}}, label %end, label %loop.preheader + +; CHECK: loop.uniform: +; CHECK: %livethrough.uniform = phi i64 [ %add2.uniform, %if.end.uniform ], [ %lsize, %loop.preheader.uniform ] +; CHECK: br i1 %{{.+}}, label %if.then.preheader.uniform, label %if.else.uniform + +; CHECK: if.then.preheader.uniform: +; CHECK: br label %if.then.uniform + +; CHECK: if.then.uniform: +; CHECK: br i1 %{{.+}}, label %if.then.uniform, label %if.then.uniform.boscc_indir + +; CHECK: if.then.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %if.end.loopexit.uniform, label %if.then.uniform.boscc_store + +; CHECK: if.then.uniform.boscc_store: +; LCSSA PHI nodes got cleaned up: +; CHECK-NOT: %{{.*\.boscc_lcssa.*}} +; CHECK: br label %if.then + +; CHECK: loop.preheader: +; CHECK: br label %loop + +; CHECK: loop: +; CHECK: %livethrough = phi i64 [ %add2, %if.end ], [ %lsize, %loop.preheader ] +; CHECK: br i1 %{{.+}}, label %if.then.preheader, label %if.else + +; CHECK: if.then.preheader: +; CHECK: br label %if.then + +; CHECK: if.then: +; CHECK: %livethrough.boscc_blend = phi i64 [ %livethrough.uniform, %if.then.uniform.boscc_store ], [ %livethrough.boscc_blend, %if.then ], [ %livethrough, %if.then.preheader ] +; CHECK: br i1 %{{.+}}, label %if.then, label %if.then.pure_exit + +; CHECK: if.then.pure_exit: +; CHECK: br label %if.end.loopexit + +; CHECK: if.else: +; CHECK: br label %if.end + +; CHECK: if.end.loopexit: +; CHECK: br label %if.end + +; CHECK: if.end: +; CHECK-NOT: %livethrough.boscc_blend{{.+}}.merge = phi i64 [ %livethrough.boscc_blend, %if.end.loopexit ], [ 0, %if.else ] +; CHECK: %livethrough.boscc_blend{{.+}} = phi i64 [ %livethrough.boscc_blend, %if.end.loopexit ], [ %livethrough, %if.else ] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll new file mode 100644 index 0000000000000..96261d872a3df --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll @@ -0,0 +1,436 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization0 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; \ / +; d +; | +; e +; / \ +; / \ +; f g +; / \ / \ +; h i j k +; \ / \ / +; l m +; \ / +; \ / +; n +; +; * where node e is a uniform branch, and nodes a, f and g are varying +; branches. 
+; * where nodes b, c, d, h, i, j, k, l, m are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a___ +; / \ \ +; b c c' +; \ / | +; d b' +; | | +; | d' +; | / +; \ / +; e +; / \ +; / \ +; ___f g___ +; / / \ / \ \ +; i' h i j k k' +; | \ / \ / | +; h' l m j' +; | | | | +; l' | | m' +; \ | | / +; \ / \ / +; & -> n <- & +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization0(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (id % 5 == 0) { +; for (int i = 0; i < n * 2; i++) ret++; +; } else { +; for (int i = 0; i < n / 4; i++) ret++; +; } +; +; if (n > 10) { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n + 10; i++) ret++; +; } else { // varying +; for (int i = 0; i < n + 10; i++) ret *= 2; +; } +; ret += id * 10; +; } else { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n + 8; i++) ret++; +; } else { // varying +; for (int i = 0; i < n + 8; i++) ret *= 2; +; } +; ret += id / 2; +; } +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization0(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %rem = srem i32 %conv, 5 + %cmp = icmp eq i32 %rem, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ 0, %if.then ], [ %inc, %for.body ] + %storemerge8 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge8, %mul + br i1 %cmp2, label %for.body, label %if.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.0, 1 + %inc4 = add nsw i32 %storemerge8, 1 + br label %for.cond + +if.else: ; preds = %entry + br label %for.cond6 + +for.cond6: ; preds = %for.body9, %if.else + %ret.1 = phi i32 [ 0, %if.else ], [ %inc10, %for.body9 ] + %storemerge = phi i32 [ 0, %if.else ], [ %inc12, %for.body9 ] + %div = sdiv i32 %n, 4 + %cmp7 = icmp slt i32 %storemerge, %div + br i1 %cmp7, label %for.body9, label %if.end + +for.body9: ; preds = %for.cond6 + %inc10 = add nsw i32 %ret.1, 1 + %inc12 = add nsw i32 %storemerge, 1 + br label %for.cond6 + +if.end: ; preds = %for.cond6, %for.cond + %ret.2 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond6 ] + %cmp14 = icmp sgt i32 %n, 10 + %rem175 = and i32 %conv, 1 + %cmp18 = icmp eq i32 %rem175, 0 + br i1 %cmp14, label %if.then16, label %if.else44 + +if.then16: ; preds = %if.end + br i1 %cmp18, label %if.then20, label %if.else30 + +if.then20: ; preds = %if.then16 + br label %for.cond22 + +for.cond22: ; preds = %for.body25, %if.then20 + %ret.3 = phi i32 [ %ret.2, %if.then20 ], [ %inc26, %for.body25 ] + %storemerge7 = phi i32 [ 0, %if.then20 ], [ %inc28, %for.body25 ] + %add = add nsw i32 %n, 10 + %cmp23 = icmp slt i32 %storemerge7, %add + br i1 %cmp23, label %for.body25, label %if.end41 + +for.body25: ; preds = %for.cond22 + %inc26 = add nsw i32 %ret.3, 1 + %inc28 = add nsw i32 %storemerge7, 1 + br label %for.cond22 + +if.else30: ; preds = %if.then16 + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.else30 + %ret.4 = phi i32 [ %ret.2, %if.else30 ], [ %mul37, %for.body36 ] + %storemerge6 = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ] + %add33 = add nsw i32 %n, 10 + 
%cmp34 = icmp slt i32 %storemerge6, %add33 + br i1 %cmp34, label %for.body36, label %if.end41 + +for.body36: ; preds = %for.cond32 + %mul37 = shl nsw i32 %ret.4, 1 + %inc39 = add nsw i32 %storemerge6, 1 + br label %for.cond32 + +if.end41: ; preds = %for.cond32, %for.cond22 + %ret.5 = phi i32 [ %ret.3, %for.cond22 ], [ %ret.4, %for.cond32 ] + %mul42 = mul nsw i32 %conv, 10 + %add43 = add nsw i32 %ret.5, %mul42 + br label %if.end73 + +if.else44: ; preds = %if.end + br i1 %cmp18, label %if.then48, label %if.else59 + +if.then48: ; preds = %if.else44 + br label %for.cond50 + +for.cond50: ; preds = %for.body54, %if.then48 + %ret.6 = phi i32 [ %ret.2, %if.then48 ], [ %inc55, %for.body54 ] + %storemerge4 = phi i32 [ 0, %if.then48 ], [ %inc57, %for.body54 ] + %add51 = add nsw i32 %n, 8 + %cmp52 = icmp slt i32 %storemerge4, %add51 + br i1 %cmp52, label %for.body54, label %if.end70 + +for.body54: ; preds = %for.cond50 + %inc55 = add nsw i32 %ret.6, 1 + %inc57 = add nsw i32 %storemerge4, 1 + br label %for.cond50 + +if.else59: ; preds = %if.else44 + br label %for.cond61 + +for.cond61: ; preds = %for.body65, %if.else59 + %ret.7 = phi i32 [ %ret.2, %if.else59 ], [ %mul66, %for.body65 ] + %storemerge2 = phi i32 [ 0, %if.else59 ], [ %inc68, %for.body65 ] + %add62 = add nsw i32 %n, 8 + %cmp63 = icmp slt i32 %storemerge2, %add62 + br i1 %cmp63, label %for.body65, label %if.end70 + +for.body65: ; preds = %for.cond61 + %mul66 = shl nsw i32 %ret.7, 1 + %inc68 = add nsw i32 %storemerge2, 1 + br label %for.cond61 + +if.end70: ; preds = %for.cond61, %for.cond50 + %ret.8 = phi i32 [ %ret.6, %for.cond50 ], [ %ret.7, %for.cond61 ] + %div71 = sdiv i32 %conv, 2 + %add72 = add nsw i32 %ret.8, %div71 + br label %if.end73 + +if.end73: ; preds = %if.end70, %if.end41 + %storemerge3 = phi i32 [ %add72, %if.end70 ], [ %add43, %if.end41 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization0, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void 
@__vecz_v4_partial_linearization0 +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[ENTRYBOSCCINDIR:.+]] + +; CHECK: [[FORCOND6PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND6UNIFORM:.+]] + +; CHECK: [[FORCOND6UNIFORM]]: +; CHECK: %[[CMP7UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP7UNIFORM]], label %[[FORBODY9UNIFORM:.+]], label %[[IFENDLOOPEXIT3UNIFORM:.+]] + +; CHECK: [[FORBODY9UNIFORM]]: +; CHECK: br label %[[FORCOND6UNIFORM]] + +; CHECK: [[IFENDLOOPEXIT3UNIFORM]]: +; CHECK: br label %[[IFEND:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[ENTRYBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND6PREHEADERUNIFORM]], label %[[FORCOND6PREHEADER:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[IFENDLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[IFENDLOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND]] + +; CHECK: [[FORCOND6PREHEADER]]: +; CHECK: br label %[[FORCOND6:.+]] + +; CHECK: [[FORCONDPREHEADER:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[IFENDLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND6]]: +; CHECK: %[[CMP7:.+]] = icmp +; CHECK: br i1 %[[CMP7]], label %[[FORBODY9:.+]], label %[[IFENDLOOPEXIT3:.+]] + +; CHECK: [[FORBODY9]]: +; CHECK: br label %[[FORCOND6]] + +; CHECK: [[IFENDLOOPEXIT]]: +; CHECK: br label %[[IFEND]] + +; CHECK: [[IFENDLOOPEXIT3]]: +; CHECK: br label %[[FORCONDPREHEADER]] + +; CHECK: [[IFEND]]: +; CHECK: %[[CMP14:.+]] = icmp +; CHECK: br i1 %[[CMP14]], label %[[IFTHEN16:.+]], label %[[IFELSE44:.+]] + +; CHECK: [[IFTHEN16]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND22PREHEADERUNIFORM:.+]], label %[[IFTHEN16BOSCCINDIR:.+]] + +; CHECK: [[FORCOND32PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND32UNIFORM:.+]] + +; CHECK: [[FORCOND32UNIFORM]]: +; CHECK: %[[CMP34UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP34UNIFORM]], label %[[FORBODY36UNIFORM:.+]], label %[[IFEND41LOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODY36UNIFORM]]: +; CHECK: br label %[[FORCOND32UNIFORM]] + +; CHECK: [[IFEND41LOOPEXIT1UNIFORM]]: +; CHECK: br label %[[IFEND41UNIFORM:.+]] + +; CHECK: [[FORCOND22PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND22UNIFORM:.+]] + +; CHECK: [[IFTHEN16BOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND32PREHEADERUNIFORM]], label %[[FORCOND32PREHEADER:.+]] + +; CHECK: [[FORCOND22UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY25UNIFORM:.+]], label %[[IFEND41LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY25UNIFORM]]: +; CHECK: br label %[[FORCOND22UNIFORM]] + +; CHECK: [[IFEND41LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND41:.+]] + +; CHECK: [[FORCOND32PREHEADER]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[FORCOND22PREHEADER:.+]]: +; CHECK: br label %[[FORCOND22:.+]] + +; CHECK: [[FORCOND22]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY25:.+]], label %[[IFEND41LOOPEXIT:.+]] + +; CHECK: [[FORBODY25]]: +; CHECK: br label %[[FORCOND22]] + +; CHECK: [[FORCOND32]]: +; CHECK: %[[CMP34:.+]] = icmp +; CHECK: br i1 %[[CMP34]], label %[[FORBODY36:.+]], label %[[IFEND41LOOPEXIT1:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[IFEND41LOOPEXIT]]: +; CHECK: br label %[[IFEND41]] + +; CHECK: [[IFEND41LOOPEXIT1]]: +; CHECK: br label 
%[[FORCOND22PREHEADER]] + +; CHECK: [[IFEND41]]: +; CHECK: br label %[[IFEND73:.+]] + +; CHECK: [[IFELSE44]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND50PREHEADERUNIFORM:.+]], label %[[IFELSE44BOSCCINDIR:.+]] + +; CHECK: [[FORCOND61PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND61UNIFORM:.+]] + +; CHECK: [[FORCOND61UNIFORM]]: +; CHECK: %[[CMP63UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP63UNIFORM]], label %[[FORBODY65UNIFORM:.+]], label %[[IFEND70LOOPEXIT2UNIFORM:.+]] + +; CHECK: [[FORBODY65UNIFORM]]: +; CHECK: br label %[[FORCOND61UNIFORM]] + +; CHECK: [[IFEND70LOOPEXIT2UNIFORM]]: +; CHECK: br label %[[IFEND70UNIFORM:.+]] + +; CHECK: [[FORCOND50PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND50UNIFORM:.+]] + +; CHECK: [[IFELSE44BOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND61PREHEADERUNIFORM]], label %[[FORCOND61PREHEADER:.+]] + +; CHECK: [[FORCOND50UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY54UNIFORM:.+]], label %[[IFEND70LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY54UNIFORM]]: +; CHECK: br label %[[FORCOND50UNIFORM]] + +; CHECK: [[IFEND70LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND70:.+]] + +; CHECK: [[FORCOND61PREHEADER]]: +; CHECK: br label %[[FORCOND61:.+]] + +; CHECK: [[FORCOND50PREHEADER:.+]]: +; CHECK: br label %[[FORCOND50:.+]] + +; CHECK: [[FORCOND50]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY54:.+]], label %[[IFEND70LOOPEXIT:.+]] + +; CHECK: [[FORBODY54]]: +; CHECK: br label %[[FORCOND50]] + +; CHECK: [[FORCOND61]]: +; CHECK: %[[CMP63:.+]] = icmp +; CHECK: br i1 %[[CMP63]], label %[[FORBODY65:.+]], label %[[IFEND70LOOPEXIT2:.+]] + +; CHECK: [[FORBODY65]]: +; CHECK: br label %[[FORCOND61]] + +; CHECK: [[IFEND70LOOPEXIT]]: +; CHECK: br label %[[IFEND70]] + +; CHECK: [[IFEND70LOOPEXIT2]]: +; CHECK: br label %[[FORCOND50PREHEADER]] + +; CHECK: [[IFEND70]]: +; CHECK: br label %[[IFEND73]] + +; CHECK: [[IFEND73]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll new file mode 100644 index 0000000000000..acc9bee5af397 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll @@ -0,0 +1,320 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization1 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-. +; / \ | +; c d | +; / \ / | +; e f --' +; \ | +; \ g +; \| +; h +; +; * where nodes c and f are uniform branches, and node b is a varying +; branch. +; * where nodes c, d, e, f, g and h are divergent. 
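+;
+; (Here "uniform" means every lane of the vectorized work-group takes the
+; same branch direction, "varying" means lanes may disagree, and blocks
+; reachable only under a varying condition are divergent. In the kernel
+; below, the branch on %cmp = icmp eq i32 %conv, 0 depends on the work-item
+; id and is varying, while %cmp2 = icmp sgt i32 %n, 2 tests the uniform
+; kernel argument %n.)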
+; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <-. b' <. +; / \__|_ | | +; c d | `d' | +; / \ / | | | +; e f --' c' | +; \ | | | +; \ g f' -' +; \| | +; h g' +; | | +; | e' +; | | +; | h' +; \ / +; \ / +; \ / +; & +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization1(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (id + i % 2 == 0) { +; if (n > 2) { +; goto e; +; } +; } else { +; for (int i = 0; i < n + 10; i++) ret++; +; } +; if (n <= 2) break; +; } +; +; ret += n * 2; +; for (int i = 0; i < n * 2; i++) ret -= i; +; ret /= n; +; goto early; +; +; e: +; for (int i = 0; i < n + 5; i++) ret /= 2; +; ret -= n; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization1(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end14, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %ret.2, %if.end14 ] + %cmp = icmp eq i32 %conv, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + %cmp2 = icmp sgt i32 %n, 2 + br i1 %cmp2, label %e, label %if.end10 + +if.else: ; preds = %while.body + br label %for.cond + +for.cond: ; preds = %for.body, %if.else + %ret.1 = phi i32 [ %ret.0, %if.else ], [ %inc, %for.body ] + %storemerge = phi i32 [ 0, %if.else ], [ %inc9, %for.body ] + %add6 = add nsw i32 %n, 10 + %cmp7 = icmp slt i32 %storemerge, %add6 + br i1 %cmp7, label %for.body, label %if.end10 + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.1, 1 + %inc9 = add nsw i32 %storemerge, 1 + br label %for.cond + +if.end10: ; preds = %for.cond, %if.then + %ret.2 = phi i32 [ %ret.0, %if.then ], [ %ret.1, %for.cond ] + %cmp11 = icmp slt i32 %n, 3 + br i1 %cmp11, label %while.end, label %if.end14 + +if.end14: ; preds = %if.end10 + br label %while.body + +while.end: ; preds = %if.end10 + %mul = mul i32 %n, 2 + %add15 = add nsw i32 %ret.2, %mul + br label %for.cond17 + +for.cond17: ; preds = %for.body21, %while.end + %ret.3 = phi i32 [ %add15, %while.end ], [ %sub, %for.body21 ] + %storemerge1 = phi i32 [ 0, %while.end ], [ %inc23, %for.body21 ] + %mul18 = shl nsw i32 %n, 1 + %cmp19 = icmp slt i32 %storemerge1, %mul18 + br i1 %cmp19, label %for.body21, label %for.end24 + +for.body21: ; preds = %for.cond17 + %sub = sub nsw i32 %ret.3, %storemerge1 + %inc23 = add nsw i32 %storemerge1, 1 + br label %for.cond17 + +for.end24: ; preds = %for.cond17 + %0 = icmp eq i32 %ret.3, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %ret.3, %5 + br label %early + +e: ; preds = %if.then + br label %for.cond26 + +for.cond26: ; preds = %for.body30, %e + %ret.4 = phi i32 [ %ret.0, %e ], [ %div31, %for.body30 ] + %storemerge3 = phi i32 [ 0, %e ], [ %inc33, %for.body30 ] + %add27 = add nsw i32 %n, 5 + %cmp28 = icmp slt i32 %storemerge3, %add27 + br i1 %cmp28, label %for.body30, label %for.end34 + +for.body30: ; preds = %for.cond26 + %div31 = sdiv i32 %ret.4, 2 + %inc33 = add nsw i32 %storemerge3, 1 + br label %for.cond26 + +for.end34: ; preds = %for.cond26 + %sub35 = sub nsw i32 %ret.4, %n + br label %early + +early: ; preds = 
%for.end34, %for.end24 + %storemerge2 = phi i32 [ %div, %for.end24 ], [ %sub35, %for.end34 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization1, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization1 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[WHILEBODYUNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHENUNIFORM:.+]], label %[[WHILEBODYUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[IFEND10LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[IFEND10LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND10UNIFORM:.+]] + +; CHECK: [[IFTHENUNIFORM]]: +; CHECK: %[[CMP2UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP2UNIFORM]], label %[[FORCOND26PREHEADERUNIFORM:.+]], label %[[IFEND10UNIFORM]] + +; CHECK: [[WHILEBODYUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM]], label %[[WHILEBODYUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[WHILEBODYUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[FORCONDPREHEADER]] + +; CHECK: [[IFEND10UNIFORM]]: +; CHECK: %[[CMP11UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP11UNIFORM]], label %[[WHILEENDUNIFORM:.+]], label %[[WHILEBODYUNIFORM]] + +; CHECK: [[WHILEENDUNIFORM]]: +; CHECK: br label %[[FORCOND17UNIFORM:.+]] + +; CHECK: [[FORCOND17UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY21UNIFORM:.+]], label %[[FOREND24UNIFORM:.+]] + +; CHECK: [[FORBODY21UNIFORM]]: +; CHECK: br label %[[FORCOND17UNIFORM]] + +; CHECK: [[FOREND24UNIFORM]]: +; CHECK: br label %[[EARLYUNIFORM:.+]] + +; CHECK: [[FORCOND26PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND26UNIFORM:.+]] + +; CHECK: [[FORCOND26UNIFORM]]: +; CHECK: 
%[[CMP29UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP29UNIFORM]], label %[[FORBODY30UNIFORM:.+]], label %[[FOREND34UNIFORM:.+]] + +; CHECK: [[FORBODY30UNIFORM]]: +; CHECK: br label %[[FORCOND26UNIFORM]] + +; CHECK: [[FOREND34UNIFORM]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[IFTHEN:.+]]: +; CHECK: br label %[[IFEND10:.+]] + +; CHECK: [[FORCOND26PREHEADER:.+]]: +; CHECK: br label %[[FORCOND26:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[IFEND10LOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[IFEND10LOOPEXIT]]: +; CHECK: br label %[[IFTHEN]] + +; CHECK: [[IFEND10]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[WHILEEND:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: br label %[[FORCOND17:.+]] + +; CHECK: [[WHILEENDELSE:.+]]: +; CHECK: br label %[[FORCOND26PREHEADER]] + +; CHECK: [[FORCOND17]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY21:.+]], label %[[FOREND24:.+]] + +; CHECK: [[FORBODY21]]: +; CHECK: br label %[[FORCOND17]] + +; CHECK: [[FOREND24]]: +; CHECK: br label %[[WHILEENDELSE]] + +; CHECK: [[FORCOND26]]: +; CHECK: %[[CMP29:.+]] = icmp +; CHECK: br i1 %[[CMP29]], label %[[FORBODY30:.+]], label %[[FOREND34:.+]] + +; CHECK: [[FORBODY30]]: +; CHECK: br label %[[FORCOND26]] + +; CHECK: [[FOREND34]]: +; CHECK: br label %[[EARLY]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll new file mode 100644 index 0000000000000..1a07de7f75123 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll @@ -0,0 +1,568 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization10 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; / g <---. | +; / / \ | | +; / h i | | +; f / \ / \ | | +; | j k l | | +; | /| / \ / | | +; | m | n o --' | +; | / |/ | +; |/ q ----------' +; p | +; \ r +; \ / +; s +; +; * where nodes b, c, g, h, j, k and q are uniform branches, and node i is a +; varying branch. +; * where nodes k, l, o, n, m, p, q, r and s are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <-----. b' <-----. +; / \ | / \ | +; c d | c' d' | +; / \ / | / \ / | +; / e | / e' | +; / | | / | | +; / g <---. | / g' <---. 
| +; / / \ | | f' / \ | | +; / h i___|_|_|____/__ \ | | +; f / \ / \ | | | h' \ i' | | +; | j k l | | | / \ \| | | +; | /| / \ / | | | j' | l' | | +; | m | n o --' | | | \ / | | +; | / |/ | | | k' | | +; |/ q ----------' | \ | | | +; p | | \ o' ---' | +; \ r | \ / | +; \ / | n' | +; s \ | | +; | \ q' -------' +; | \ / +; | m' +; | | +; | r' +; | | +; | p' +; | | +; `-------> & <------ s' +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization10(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0) { // b +; // c +; for (int i = 0; i < n * 2; i++) ret++; +; if (n <= 10) { +; // f +; goto f; +; } +; } else { +; // d +; for (int i = 0; i < n / 4; i++) ret++; +; } +; // e +; ret++; +; while (1) { +; if (n & 1) { // g +; // h +; if (n < 3) { +; // j +; goto j; +; } +; } else { +; // i +; if (ret + id >= n) { +; // l +; ret /= n * n + ret; +; goto o; +; } +; } +; // k +; if (n & 1) { +; // n +; ret += n * ret; +; goto n; +; } +; // o +; o: +; ret++; +; } +; j: +; if (n < 2) { +; // m +; ret += n * 2 + 20; +; goto p; +; } else { +; goto q; +; } +; n: +; ret *= 4; +; q: +; if (n & 1) { +; // r +; ret++; +; goto r; +; } +; } +; +; r: +; for (int i = 0; i < n / 4; i++) ret++; +; goto s; +; +; f: +; ret /= n; +; goto p; +; +; p: +; for (int i = 0; i < n * 2; i++) ret++; +; +; s: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization10(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end55, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %ret.5, %if.end55 ] + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.1 = phi i32 [ %ret.0, %if.then ], [ %inc, %for.body ] + %storemerge5 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge5, %mul + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.1, 1 + %inc4 = add nsw i32 %storemerge5, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp5 = icmp slt i32 %n, 11 + br i1 %cmp5, label %f, label %if.end17 + +if.else: ; preds = %while.body + br label %for.cond9 + +for.cond9: ; preds = %for.body12, %if.else + %ret.2 = phi i32 [ %ret.0, %if.else ], [ %inc13, %for.body12 ] + %storemerge = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div = sdiv i32 %n, 4 + %cmp10 = icmp slt i32 %storemerge, %div + br i1 %cmp10, label %for.body12, label %if.end17 + +for.body12: ; preds = %for.cond9 + %inc13 = add nsw i32 %ret.2, 1 + %inc15 = add nsw i32 %storemerge, 1 + br label %for.cond9 + +if.end17: ; preds = %for.cond9, %for.end + %ret.3 = phi i32 [ %ret.1, %for.end ], [ %ret.2, %for.cond9 ] + br label %while.body20 + +while.body20: ; preds = %o, %if.end17 + %storemerge1.in = phi i32 [ %ret.3, %if.end17 ], [ %ret.4, %o ] + %storemerge1 = add nsw i32 %storemerge1.in, 1 + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %if.else26, label %if.then21 + +if.then21: ; preds = %while.body20 + %cmp22 = icmp slt i32 %n, 3 + br i1 %cmp22, label %j, label %if.end34 + +if.else26: ; preds = 
%while.body20 + %add = add nsw i32 %storemerge1, %conv + %cmp27 = icmp slt i32 %add, %n + br i1 %cmp27, label %if.end34, label %if.then29 + +if.then29: ; preds = %if.else26 + %mul30 = mul nsw i32 %n, %n + %add31 = add nsw i32 %storemerge1, %mul30 + %0 = icmp eq i32 %add31, 0 + %1 = select i1 %0, i32 1, i32 %add31 + %div32 = sdiv i32 %storemerge1, %1 + br label %o + +if.end34: ; preds = %if.else26, %if.then21 + %and35 = and i32 %n, 1 + %tobool36 = icmp eq i32 %and35, 0 + br i1 %tobool36, label %o, label %if.then37 + +if.then37: ; preds = %if.end34 + %mul38 = mul nsw i32 %storemerge1, %n + %add39 = add nsw i32 %mul38, %storemerge1 + %mul50 = shl nsw i32 %add39, 2 + br label %q + +o: ; preds = %if.end34, %if.then29 + %ret.4 = phi i32 [ %div32, %if.then29 ], [ %storemerge1, %if.end34 ] + br label %while.body20 + +j: ; preds = %if.then21 + %cmp42 = icmp eq i32 %n, 2 + br i1 %cmp42, label %q, label %if.then44 + +if.then44: ; preds = %j + %mul45 = mul i32 %n, 2 + %add46 = add nsw i32 %mul45, 20 + %add47 = add nsw i32 %add46, %storemerge1 + br label %p + +q: ; preds = %j, %if.then37 + %ret.5 = phi i32 [ %mul50, %if.then37 ], [ %storemerge1, %j ] + %and51 = and i32 %n, 1 + %tobool52 = icmp eq i32 %and51, 0 + br i1 %tobool52, label %if.end55, label %if.then53 + +if.then53: ; preds = %q + br label %for.cond57 + +if.end55: ; preds = %q + br label %while.body + +for.cond57: ; preds = %for.body61, %if.then53 + %ret.6.in = phi i32 [ %ret.5, %if.then53 ], [ %ret.6, %for.body61 ] + %storemerge2 = phi i32 [ 0, %if.then53 ], [ %inc64, %for.body61 ] + %ret.6 = add nsw i32 %ret.6.in, 1 + %div58 = sdiv i32 %n, 4 + %cmp59 = icmp slt i32 %storemerge2, %div58 + br i1 %cmp59, label %for.body61, label %s + +for.body61: ; preds = %for.cond57 + %inc64 = add nsw i32 %storemerge2, 1 + br label %for.cond57 + +f: ; preds = %for.end + %2 = icmp eq i32 %ret.1, -2147483648 + %3 = icmp eq i32 %n, -1 + %4 = and i1 %3, %2 + %5 = icmp eq i32 %n, 0 + %6 = or i1 %5, %4 + %7 = select i1 %6, i32 1, i32 %n + %div66 = sdiv i32 %ret.1, %7 + br label %p + +p: ; preds = %f, %if.then44 + %storemerge3 = phi i32 [ %add47, %if.then44 ], [ %div66, %f ] + br label %for.cond68 + +for.cond68: ; preds = %for.body72, %p + %ret.7 = phi i32 [ %storemerge3, %p ], [ %inc73, %for.body72 ] + %storemerge4 = phi i32 [ 0, %p ], [ %inc75, %for.body72 ] + %mul69 = shl nsw i32 %n, 1 + %cmp70 = icmp slt i32 %storemerge4, %mul69 + br i1 %cmp70, label %for.body72, label %s + +for.body72: ; preds = %for.cond68 + %inc73 = add nsw i32 %ret.7, 1 + %inc75 = add nsw i32 %storemerge4, 1 + br label %for.cond68 + +s: ; preds = %for.cond68, %for.cond57 + %ret.8 = phi i32 [ %ret.6, %for.cond57 ], [ %ret.7, %for.cond68 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.8, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" 
"no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization10, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization10 +; CHECK: br i1 true, label %[[WHILBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]] + +; CHECK: [[FORCOND9PREHEADER]]: +; CHECK: br label %[[FORCOND9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]] + +; CHECK: [[FORCOND9]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND9]] + +; CHECK: [[IFEND17LOOPEXIT]]: +; CHECK: br label %[[IFEND17]] + +; CHECK: [[IFEND17]]: +; CHECK: br label %[[WHILEBODY20:.+]] + +; CHECK: [[WHILEBODY20]]: +; CHECK: %[[TOBOOL:.+]] = icmp +; CHECK: br i1 %[[TOBOOL]], label %[[IFELSE26:.+]], label %[[IFTHEN21:.+]] + +; CHECK: [[IFTHEN21]]: +; CHECK: %[[CMP22:.+]] = icmp +; CHECK: br i1 %[[CMP22]], label %[[J:.+]], label %[[IFEND34:.+]] + +; CHECK: [[IFELSE26]]: +; CHECK: br label %[[IFTHEN29:.+]] + +; CHECK: [[WHILEBODYUNIFORM:.+]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[FORCOND9PREHEADERUNIFORM:.+]] + +; CHECK: [[FORCOND9PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND9UNIFORM:.+]] + +; CHECK: [[FORCOND9UNIFORM]]: +; CHECK: %[[CMP10UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FORBODY12UNIFORM:.+]], label %[[IFEND17LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY12UNIFORM]]: +; CHECK: br label %[[FORCOND9UNIFORM]] + +; CHECK: [[IFEND17LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND17UNIFORM:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[FORENDUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[FORENDUNIFORM]]: +; CHECK: %[[CMP5UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP5UNIFORM]], label %[[FUNIFORM:.+]], label %[[IFEND17UNIFORM]] + +; CHECK: [[IFEND17UNIFORM]]: +; CHECK: br label %[[WHILEBODY20UNIFORM:.+]] + +; CHECK: [[WHILEBODY20UNIFORM]]: +; CHECK: %[[TOBOOLUNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOLUNIFORM]], label %[[IFELSE26UNIFORM:.+]], label %[[IFTHEN21UNIFORM:.+]] + +; CHECK: [[IFTHEN21UNIFORM]]: +; CHECK: %[[CMP22UNIFORM:.+]] = icmp +; CHECK: 
br i1 %[[CMP22UNIFORM]], label %[[JUNIFORM:.+]], label %[[IFEND34UNIFORM:.+]] + +; CHECK: [[IFELSE26UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFEND34UNIFORM]], label %[[IFELSE26UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFTHEN29UNIFORM:.+]]: +; CHECK: br label %[[OUNIFORM:.+]] + +; CHECK: [[IFEND34UNIFORM]]: +; CHECK: %[[TOBOOL36UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOL36UNIFORM]], label %[[OUNIFORM]], label %[[IFTHEN37UNIFORM:.+]] + +; CHECK: [[IFELSE26UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN29UNIFORM]], label %[[IFELSE26UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSE26UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFTHEN29]] + +; CHECK: [[OUNIFORM]]: +; CHECK: br label %[[WHILEBODY20UNIFORM]] + +; CHECK: [[JUNIFORM]]: +; CHECK: %[[CMP42UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP42UNIFORM]], label %[[QUNIFORM:.+]], label %[[IFTHEN44UNIFORM:.+]] + +; CHECK: [[IFTHEN37UNIFORM]]: +; CHECK: br label %[[QUNIFORM]] + +; CHECK: [[QUNIFORM]]: +; CHECK: %[[TOBOOL52UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOL52UNIFORM]], label %[[WHILEBODYUNIFORM]], label %[[FORCOND57PREHEADERUNIFORM:.+]] + +; CHECK: [[IFTHEN44UNIFORM]]: +; CHECK: br label %[[PUNIFORM:.+]] + +; CHECK: [[FORCOND57PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND57UNIFORM:.+]] + +; CHECK: [[FORCOND57UNIFORM]]: +; CHECK: %[[CMP59UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP59UNIFORM]], label %[[FORBODY61UNIFORM:.+]], label %[[SLOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODY61UNIFORM]]: +; CHECK: br label %[[FORCOND57UNIFORM]] + +; CHECK: [[SLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[SUNIFORM:.+]] + +; CHECK: [[FUNIFORM]]: +; CHECK: br label %[[PUNIFORM]] + +; CHECK: [[PUNIFORM]]: +; CHECK: br label %[[FORCOND68UNIFORM:.+]] + +; CHECK: [[FORCOND68UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY72UNIFORM:.+]], label %[[SLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY72UNIFORM]]: +; CHECK: br label %[[FORCOND68UNIFORM]] + +; CHECK: [[SLOOPEXITUNIFORM]]: +; CHECK: br label %[[S:.+]] + +; CHECK: [[IFTHEN29]]: +; CHECK: br label %[[IFEND34]] + +; CHECK: [[IFEND34]]: +; CHECK: br label %[[O:.+]] + +; CHECK: [[IFTHEN37:.+]]: +; CHECK: br label %[[IFTHEN37ELSE:.+]] + +; CHECK: [[IFTHEN37ELSE]]: +; CHECK: br i1 %{{.+}}, label %[[JELSE:.+]], label %[[JSPLIT:.+]] + +; CHECK: [[O]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]] + +; CHECK: [[WHILEBODY20PUREEXIT]]: +; CHECK: br label %[[IFTHEN37]] + +; CHECK: [[J]]: +; CHECK: br label %[[WHILEBODY20PUREEXIT]] + +; CHECK: [[JELSE]]: +; CHECK: br label %[[Q:.+]] + +; CHECK: [[JSPLIT]]: +; CHECK: br label %[[Q]] + +; CHECK: [[IFTHEN44:.+]]: +; CHECK: br label %[[IFTHEN44ELSE:.+]] + +; CHECK: [[IFTHEN44ELSE]]: +; CHECK: br label %[[FORCOND57PREHEADER:.+]] + +; CHECK: [[Q]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFTHEN44]] + +; CHECK: [[FORCOND57PREHEADER]]: +; CHECK: br label %[[FORCOND57:.+]] + +; CHECK: [[FORCOND57PREHEADERELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]] + +; CHECK: [[FORCOND57]]: +; CHECK: %[[CMP59:.+]] = icmp +; CHECK: br i1 %[[CMP59]], label %[[FORBODY61:.+]], label %[[SLOOPEXIT1:.+]] + +; CHECK: [[FORBODY61]]: +; CHECK: br label %[[FORCOND57]] + +; CHECK: [[F]]: +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[FELSE]]: +; CHECK: br label %[[P:.+]] + +; CHECK: [[FSPLIT]]: +; CHECK: br label %[[P]] + +; CHECK: [[P]]: +; CHECK: br label %[[FORCOND68:.+]] + +; CHECK: [[FORCOND68]]: +; 
CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY72:.+]], label %[[SLOOPEXIT:.+]] + +; CHECK: [[FORBODY72]]: +; CHECK: br label %[[FORCOND68]] + +; CHECK: [[SLOOPEXIT]]: +; CHECK: br label %[[S]] + +; CHECK: [[SLOOPEXIT1]]: +; CHECK: br label %[[FORCOND57PREHEADERELSE]] + +; CHECK: [[S]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll new file mode 100644 index 0000000000000..4b423f2d3f079 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll @@ -0,0 +1,425 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization11 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-------. +; | | +; c <---. | +; / \ | | +; d e | | +; / \ / \ | | +; i f g | | +; | / \ / \| | +; | j h --' | +; | | \ | +; | | k | +; | \ / | +; | \ / | +; | \ / | +; | \ / | +; | l -----' +; | / +; \ m +; \ / +; n +; +; * where nodes c, d, f, g, and l are uniform branches, and node e is a +; varying branch. +; * where nodes i, f, g, j, h, k, l, m and n are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <-------. b' <----. +; | | | | +; c <---. | c' <--. | +; / \ | | / \ | | +; d e___|___|_ d' e' | | +; / \ / \ | | \|__ | | | +; i f g | | | `g' | | +; | / \ / \| | \ / | | +; | j h --' | f' | | +; | | \ | | | | +; | | k | h' ---' | +; | \ / | | | +; | \ / | k' | +; | \ / | | | +; | \ / | j' | +; | l -----' | | +; | / l' -----' +; \ m | +; \ / m' +; n | +; | i' +; | | +; `-----> & <---- n' +; +; where '&' represents merge blocks of BOSCC regions. 
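+;
+; (BOSCC duplicates the divergent region into a "uniform" clone and guards
+; entry to it with a dynamic check that all active lanes agree, conceptually:
+;
+;   %all = call i1 @__vecz_b_divergence_all(i1 %mask)
+;   br i1 %all, label %region.uniform, label %region
+;
+; The boscc_indir blocks in the CHECK lines below perform this check, and
+; the merge blocks marked '&' rejoin the two versions. The guard intrinsic
+; sketched here appears explicitly in the nested_loops4 checks earlier in
+; this patch; the %mask and %region names are illustrative only.)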
+; +; __kernel void partial_linearization11(__global int *out, int n) { +; // a +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; // b +; while (1) { +; if (n < 5) { // c +; // d +; for (int i = 0; i < n * 2; i++) ret++; +; if (n <= 3) { +; // i +; goto i; +; } +; } else { +; // e +; if (ret + id >= n) { +; // g +; ret /= n * n + ret; +; if (n <= 10) { +; goto k; +; } else { +; goto h; +; } +; } +; } +; // f +; ret *= n; +; if (n & 1) { +; goto j; +; } +; +; // h +; h: +; ret++; +; } +; +; j: +; ret += n * 2 + 20; +; goto l; +; +; k: +; ret *= n; +; goto l; +; +; l: +; if (n & 1) { +; // m +; ret++; +; goto m; +; } +; } +; +; m: +; for (int i = 0; i < n / 4; i++) ret++; +; goto n; +; +; i: +; ret /= n; +; +; n: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization11(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end33, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %storemerge, %if.end33 ] + br label %while.body2 + +while.body2: ; preds = %h, %while.body + %ret.1 = phi i32 [ %ret.0, %while.body ], [ %inc24, %h ] + %cmp = icmp slt i32 %n, 5 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body2 + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.2 = phi i32 [ %ret.1, %if.then ], [ %inc, %for.body ] + %storemerge2 = phi i32 [ 0, %if.then ], [ %inc6, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp4 = icmp slt i32 %storemerge2, %mul + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.2, 1 + %inc6 = add nsw i32 %storemerge2, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp7 = icmp slt i32 %n, 4 + br i1 %cmp7, label %i44, label %if.end20 + +if.else: ; preds = %while.body2 + %add = add nsw i32 %ret.1, %conv + %cmp10 = icmp slt i32 %add, %n + br i1 %cmp10, label %if.end20, label %if.then12 + +if.then12: ; preds = %if.else + %mul13 = mul nsw i32 %n, %n + %add14 = add nsw i32 %ret.1, %mul13 + %0 = icmp eq i32 %ret.1, -2147483648 + %1 = icmp eq i32 %add14, -1 + %2 = and i1 %0, %1 + %3 = icmp eq i32 %add14, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %add14 + %div = sdiv i32 %ret.1, %5 + %cmp15 = icmp slt i32 %n, 11 + br i1 %cmp15, label %k, label %h + +if.end20: ; preds = %if.else, %for.end + %ret.3 = phi i32 [ %ret.2, %for.end ], [ %ret.1, %if.else ] + %mul21 = mul nsw i32 %ret.3, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %h, label %j + +h: ; preds = %if.end20, %if.then12 + %ret.4 = phi i32 [ %div, %if.then12 ], [ %mul21, %if.end20 ] + %inc24 = add nsw i32 %ret.4, 1 + br label %while.body2 + +j: ; preds = %if.end20 + %mul25 = mul i32 %n, 2 + %add26 = add nsw i32 %mul25, 20 + %add27 = add nsw i32 %add26, %mul21 + br label %l + +k: ; preds = %if.then12 + %mul28 = mul nsw i32 %div, %n + br label %l + +l: ; preds = %k, %j + %storemerge = phi i32 [ %add27, %j ], [ %mul28, %k ] + %and29 = and i32 %n, 1 + %tobool30 = icmp eq i32 %and29, 0 + br i1 %tobool30, label %if.end33, label %if.then31 + +if.then31: ; preds = %l + br label %for.cond35 + +if.end33: ; preds = %l + br label %while.body + +for.cond35: ; preds = %for.body39, %if.then31 + %ret.5.in = phi i32 [ %storemerge, %if.then31 ], [ 
%ret.5, %for.body39 ] + %storemerge1 = phi i32 [ 0, %if.then31 ], [ %inc42, %for.body39 ] + %ret.5 = add nsw i32 %ret.5.in, 1 + %div36 = sdiv i32 %n, 4 + %cmp37 = icmp slt i32 %storemerge1, %div36 + br i1 %cmp37, label %for.body39, label %n46 + +for.body39: ; preds = %for.cond35 + %inc42 = add nsw i32 %storemerge1, 1 + br label %for.cond35 + +i44: ; preds = %for.end + %6 = icmp eq i32 %ret.2, -2147483648 + %7 = icmp eq i32 %n, -1 + %8 = and i1 %7, %6 + %9 = icmp eq i32 %n, 0 + %10 = or i1 %9, %8 + %11 = select i1 %10, i32 1, i32 %n + %div45 = sdiv i32 %ret.2, %11 + br label %n46 + +n46: ; preds = %i44, %for.cond35 + %ret.6 = phi i32 [ %div45, %i44 ], [ %ret.5, %for.cond35 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization11, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization11 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[WHILEBODY2:.+]] + +; CHECK: [[WHILEBODY2]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[IFELSE:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: br label %[[IFEND20:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[IFTHEN12:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: br label %[[WHILEBODY2UNIFORM:.+]] + +; CHECK: [[WHILEBODY2UNIFORM]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[IFELSEUNIFORM:.+]] + +; CHECK: [[IFELSEUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFEND20UNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFTHEN12UNIFORM:.+]]: +; CHECK: %[[CMP15UNIFORM:cmp.+]] = icmp +; CHECK: br i1 %[[CMP15UNIFORM]], label %[[KUNIFORM:.+]], 
label %[[HUNIFORM:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[FORENDUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[FORENDUNIFORM]]: +; CHECK: %[[CMP7UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP7UNIFORM]], label %[[I44UNIFORM:.+]], label %[[IFEND20UNIFORM]] + +; CHECK: [[IFEND20UNIFORM]]: +; CHECK: %[[TOBOOLUNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOLUNIFORM]], label %[[HUNIFORM]], label %[[JUNIFORM:.+]] + +; CHECK: [[IFELSEUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN12UNIFORM]], label %[[IFELSEUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSEUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFTHEN12]] + +; CHECK: [[HUNIFORM]]: +; CHECK: br label %[[WHILEBODY2UNIFORM]] + +; CHECK: [[KUNIFORM]]: +; CHECK: br label %[[LUNIFORM:.+]] + +; CHECK: [[JUNIFORM]]: +; CHECK: br label %[[LUNIFORM]] + +; CHECK: [[LUNIFORM]]: +; CHECK: %[[TOBOOL30UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOL30UNIFORM]], label %[[WHILEBODYUNIFORM]], label %[[FORCOND35PREHEADERUNIFORM:.+]] + +; CHECK: [[FORCOND35PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND35UNIFORM:.+]] + +; CHECK: [[FORCOND35UNIFORM]]: +; CHECK: %[[CMP37UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP37UNIFORM]], label %[[FORBODY39UNIFORM:.+]], label %[[N46LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY39UNIFORM]]: +; CHECK: br label %[[FORCOND35UNIFORM]] + +; CHECK: [[N46LOOPEXITUNIFORM]]: +; CHECK: br label %[[N46UNIFORM:.+]] + +; CHECK: [[I44UNIFORM]]: +; CHECK: br label %[[N46:.+]] + +; CHECK: [[IFTHEN12]]: +; CHECK: br label %[[IFEND20]] + +; CHECK: [[IFEND20]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[H]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY2]], label %[[WHILEBODY2PUREEXIT:.+]] + +; CHECK: [[WHILEBODY2PUREEXIT]]: +; CHECK: br label %[[K:.+]] + +; CHECK: [[J:.+]]: +; CHECK: br label %[[L:.+]] + +; CHECK: [[K]]: +; CHECK: br label %[[KELSE:.+]] + +; CHECK: [[KELSE]]: +; CHECK: br label %[[J]] + +; CHECK: [[L]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[FORCOND35PREHEADER:.+]] + +; CHECK: [[FORCOND35PREHEADER]]: +; CHECK: br label %[[FORCOND35:.+]] + +; CHECK: [[FORCOND35PREHEADERELSE:.+]]: +; CHECK: br label %[[I44:.+]] + +; CHECK: [[FORCOND35]]: +; CHECK: %[[CMP37:.+]] = icmp +; CHECK: br i1 %[[CMP37]], label %[[FORBODY39:.+]], label %[[N46LOOPEXIT:.+]] + +; CHECK: [[FORBODY39]]: +; CHECK: br label %[[FORCOND35]] + +; CHECK: [[I44]]: +; CHECK: br label %[[N46]] + +; CHECK: [[N46LOOPEXIT]]: +; CHECK: br label %[[FORCOND35PREHEADERELSE]] + +; CHECK: [[N46]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll new file mode 100644 index 0000000000000..270774ef0c142 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll @@ -0,0 +1,782 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization12 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; / g <---. | +; f / \ | | +; | h i | | +; | / / \ | | +; | / k l | | +; | / |\ /| | | +; |/ |/ \| | | +; j m n | | +; /| / \ / | | +; / | o p --' | +; / | / / | +; | | / r | +; | | / | | +; | |/ s ------' +; | | / +; | /| t +; | / | / +; |/ | / +; q | / +; | |/ +; | u +; \ / +; v +; +; * where nodes b, c, g, j, k, l, m, p and s are uniform branches, +; and node i is a varying branch. +; * where nodes k, l, o, n, m, p, q, s, r, t and v are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <-----. b' <----. +; / \ | / \ | +; c d | c' d' | +; / \ / | / \ / | +; / e | / e' | +; / | | / | | +; / g <---. | f' g' <--. | +; f / \ | | | / \ | | +; | h i___|_|__|_ h i' | | +; | / / \ | | | \/___ | | | +; | / k l | | | / `l' | | +; | / |\ /| | | |/ | | | +; |/ |/ \| | | j' k' | | +; j m n | | |\ | | | +; /| / \ / | | | \ n' | | +; / | o p --' | | \ | | | +; / | / / | | | m' | | +; | | / r | | | | | | +; | | / | | | | p' -' | +; | |/ s ------' | | / | +; | | / | | r' | +; | /| t | | | | +; | / | / | | s' -----' +; |/ | / | |/ +; q | / | o' +; | |/ | / +; | u | t' +; \ / |/ +; v u' +; | | +; | q' +; | | +; `-------> & <------ v' +; +; where '&' represents merge blocks of BOSCC regions. 
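+;
+; Reading aid for the *BOSCCINDIR / *BOSCCSTORE labels matched below (an
+; interpretation, not itself checked): at a varying branch inside a uniform
+; clone, an indirection block seems to either keep execution on the clone or,
+; once lanes diverge, fall back through a store block into the linearized
+; path. With illustrative block and value names:
+;
+;   if.else35.uniform.boscc_indir:           ; still on the uniform clone
+;     br i1 %lanes.agree, label %if.then38.uniform, label %if.else35.uniform.boscc_store
+;   if.else35.uniform.boscc_store:           ; lanes diverged: hand off to the
+;     br label %if.then38                    ; linearized BOSCC region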
+; +; __kernel void partial_linearization12(__global int *out, int n) { +; // a +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0) { // b +; // c +; for (int i = 0; i < n * 2; i++) ret++; +; if (n < 5) { +; // f +; goto f; +; } +; } else { +; // d +; for (int i = 0; i < n / 4; i++) ret++; +; } +; // e +; ret++; +; while (1) { +; if (n <= 2) { // g +; // h +; ret -= n * ret; +; for (int i = 0; i < n * 2; i++) ret++; +; // j +; goto j; +; } else { +; // i +; if (ret + id >= n) { +; // k +; ret /= n * n + ret; +; if (n < 5) { +; // m +; ret -= n; +; goto m; +; } else { +; // n +; ret += n; +; goto n; +; } +; } else { +; // l +; if (n >= 5) { +; // m +; ret += n; +; goto m; +; } else { +; // n +; ret -= n; +; goto n; +; } +; } +; } +; // m +; m: +; if (n & 1) { +; // o +; ret *= n; +; goto q; +; } else { +; // p +; goto p; +; } +; +; // n +; n: +; ret *= ret; +; // p +; p: +; if (n > 3) { +; goto r; +; } +; ret++; +; } +; +; // r +; r: +; ret *= 4; +; for (int i = 0; i < n / 4; i++) ret++; +; +; // s +; if (n & 1) { +; goto t; +; } +; ret++; +; } +; +; f: +; ret /= n; +; goto j; +; +; j: +; if (n == 2) { +; goto q; +; } else { +; goto u; +; } +; +; t: +; for (int i = 0; i < n + 1; i++) ret++; +; goto u; +; +; q: +; for (int i = 0; i < n / 4; i++) ret++; +; goto v; +; +; u: +; for (int i = 0; i < n * 2; i++) ret++; +; +; v: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization12(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end79, %entry + %storemerge = phi i32 [ 0, %entry ], [ %inc80, %if.end79 ] + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ %storemerge, %if.then ], [ %inc, %for.body ] + %storemerge10 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge10, %mul + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.0, 1 + %inc4 = add nsw i32 %storemerge10, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp5 = icmp slt i32 %n, 5 + br i1 %cmp5, label %f, label %if.end17 + +if.else: ; preds = %while.body + br label %for.cond9 + +for.cond9: ; preds = %for.body12, %if.else + %ret.1 = phi i32 [ %storemerge, %if.else ], [ %inc13, %for.body12 ] + %storemerge1 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div = sdiv i32 %n, 4 + %cmp10 = icmp slt i32 %storemerge1, %div + br i1 %cmp10, label %for.body12, label %if.end17 + +for.body12: ; preds = %for.cond9 + %inc13 = add nsw i32 %ret.1, 1 + %inc15 = add nsw i32 %storemerge1, 1 + br label %for.cond9 + +if.end17: ; preds = %for.cond9, %for.end + %ret.2 = phi i32 [ %ret.0, %for.end ], [ %ret.1, %for.cond9 ] + br label %while.body20 + +while.body20: ; preds = %if.end63, %if.end17 + %storemerge2.in = phi i32 [ %ret.2, %if.end17 ], [ %ret.4, %if.end63 ] + %storemerge2 = add nsw i32 %storemerge2.in, 1 + %cmp21 = icmp slt i32 %n, 3 + br i1 %cmp21, label %if.then23, label %if.else35 + +if.then23: ; preds = %while.body20 + %mul24 = mul nsw i32 %storemerge2, %n + %sub = sub nsw i32 %storemerge2, %mul24 + br label 
%for.cond26 + +for.cond26: ; preds = %for.body30, %if.then23 + %ret.3 = phi i32 [ %sub, %if.then23 ], [ %inc31, %for.body30 ] + %storemerge9 = phi i32 [ 0, %if.then23 ], [ %inc33, %for.body30 ] + %mul27 = shl nsw i32 %n, 1 + %cmp28 = icmp slt i32 %storemerge9, %mul27 + br i1 %cmp28, label %for.body30, label %j + +for.body30: ; preds = %for.cond26 + %inc31 = add nsw i32 %ret.3, 1 + %inc33 = add nsw i32 %storemerge9, 1 + br label %for.cond26 + +if.else35: ; preds = %while.body20 + %add = add nsw i32 %storemerge2, %conv + %cmp36 = icmp slt i32 %add, %n + br i1 %cmp36, label %if.else48, label %if.then38 + +if.then38: ; preds = %if.else35 + %mul39 = mul nsw i32 %n, %n + %add40 = add nsw i32 %storemerge2, %mul39 + %0 = icmp eq i32 %add40, 0 + %1 = select i1 %0, i32 1, i32 %add40 + %div41 = sdiv i32 %storemerge2, %1 + %cmp42 = icmp slt i32 %n, 5 + br i1 %cmp42, label %if.then44, label %if.else46 + +if.then44: ; preds = %if.then38 + %sub45 = sub nsw i32 %div41, %n + br label %m + +if.else46: ; preds = %if.then38 + %add47 = add nsw i32 %div41, %n + br label %n58 + +if.else48: ; preds = %if.else35 + %cmp49 = icmp sgt i32 %n, 4 + br i1 %cmp49, label %if.then51, label %if.else53 + +if.then51: ; preds = %if.else48 + %add52 = add nsw i32 %storemerge2, %n + br label %m + +if.else53: ; preds = %if.else48 + %sub54 = sub nsw i32 %storemerge2, %n + br label %n58 + +m: ; preds = %if.then51, %if.then44 + %storemerge7 = phi i32 [ %add52, %if.then51 ], [ %sub45, %if.then44 ] + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %p, label %if.then55 + +if.then55: ; preds = %m + %mul56 = mul nsw i32 %storemerge7, %n + br label %q + +n58: ; preds = %if.else53, %if.else46 + %storemerge3 = phi i32 [ %sub54, %if.else53 ], [ %add47, %if.else46 ] + %mul59 = mul nsw i32 %storemerge3, %storemerge3 + br label %p + +p: ; preds = %n58, %m + %ret.4 = phi i32 [ %mul59, %n58 ], [ %storemerge7, %m ] + %cmp60 = icmp sgt i32 %n, 3 + br i1 %cmp60, label %r, label %if.end63 + +if.end63: ; preds = %p + br label %while.body20 + +r: ; preds = %p + %mul65 = shl nsw i32 %ret.4, 2 + br label %for.cond67 + +for.cond67: ; preds = %for.body71, %r + %ret.5 = phi i32 [ %mul65, %r ], [ %inc72, %for.body71 ] + %storemerge4 = phi i32 [ 0, %r ], [ %inc74, %for.body71 ] + %div68 = sdiv i32 %n, 4 + %cmp69 = icmp slt i32 %storemerge4, %div68 + br i1 %cmp69, label %for.body71, label %for.end75 + +for.body71: ; preds = %for.cond67 + %inc72 = add nsw i32 %ret.5, 1 + %inc74 = add nsw i32 %storemerge4, 1 + br label %for.cond67 + +for.end75: ; preds = %for.cond67 + %and76 = and i32 %n, 1 + %tobool77 = icmp eq i32 %and76, 0 + br i1 %tobool77, label %if.end79, label %t + +if.end79: ; preds = %for.end75 + %inc80 = add nsw i32 %ret.5, 1 + br label %while.body + +f: ; preds = %for.end + %2 = icmp eq i32 %n, 0 + %3 = select i1 %2, i32 1, i32 %n + %div81 = sdiv i32 %ret.0, %3 + br label %j + +j: ; preds = %f, %for.cond26 + %ret.6 = phi i32 [ %div81, %f ], [ %ret.3, %for.cond26 ] + %cmp82 = icmp eq i32 %n, 2 + br i1 %cmp82, label %q, label %u + +t: ; preds = %for.end75 + br label %for.cond87 + +for.cond87: ; preds = %for.body91, %t + %ret.7 = phi i32 [ %ret.5, %t ], [ %inc92, %for.body91 ] + %storemerge5 = phi i32 [ 0, %t ], [ %inc94, %for.body91 ] + %cmp89 = icmp sgt i32 %storemerge5, %n + br i1 %cmp89, label %u, label %for.body91 + +for.body91: ; preds = %for.cond87 + %inc92 = add nsw i32 %ret.7, 1 + %inc94 = add nsw i32 %storemerge5, 1 + br label %for.cond87 + +q: ; preds = %j, %if.then55 + %ret.8 = phi i32 [ %mul56, %if.then55 ], [ 
%ret.6, %j ] + br label %for.cond97 + +for.cond97: ; preds = %for.body101, %q + %ret.9 = phi i32 [ %ret.8, %q ], [ %inc102, %for.body101 ] + %storemerge8 = phi i32 [ 0, %q ], [ %inc104, %for.body101 ] + %div98 = sdiv i32 %n, 4 + %cmp99 = icmp slt i32 %storemerge8, %div98 + br i1 %cmp99, label %for.body101, label %v + +for.body101: ; preds = %for.cond97 + %inc102 = add nsw i32 %ret.9, 1 + %inc104 = add nsw i32 %storemerge8, 1 + br label %for.cond97 + +u: ; preds = %for.cond87, %j + %ret.10 = phi i32 [ %ret.6, %j ], [ %ret.7, %for.cond87 ] + br label %for.cond107 + +for.cond107: ; preds = %for.body111, %u + %ret.11 = phi i32 [ %ret.10, %u ], [ %inc112, %for.body111 ] + %storemerge6 = phi i32 [ 0, %u ], [ %inc114, %for.body111 ] + %mul108 = shl nsw i32 %n, 1 + %cmp109 = icmp slt i32 %storemerge6, %mul108 + br i1 %cmp109, label %for.body111, label %v + +for.body111: ; preds = %for.cond107 + %inc112 = add nsw i32 %ret.11, 1 + %inc114 = add nsw i32 %storemerge6, 1 + br label %for.cond107 + +v: ; preds = %for.cond107, %for.cond97 + %ret.12 = phi i32 [ %ret.9, %for.cond97 ], [ %ret.11, %for.cond107 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.12, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization12, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization12 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]] + +; CHECK: [[FORCOND9PREHEADER]]: +; CHECK: br label %[[FORCOND9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]] + +; CHECK: [[FORCOND9]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: 
br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND9]] + +; CHECK: [[IFEND17LOOPEXIT]]: +; CHECK: br label %[[IFEND17]] + +; CHECK: [[IFEND17]]: +; CHECK: br label %[[WHILEBODY20:.+]] + +; CHECK: [[WHILEBODY20]]: +; CHECK: %[[CMP21:.+]] = icmp +; CHECK: br i1 %[[CMP21]], label %[[IFTHEN23:.+]], label %[[IFELSE35:.+]] + +; CHECK: [[IFTHEN23]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[IFTHEN23ELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]] + +; CHECK: [[IFTHEN23SPLIT:.+]]: +; CHECK: br label %[[FORCOND26:.+]] + +; CHECK: [[FORCOND26]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30:.+]], label %[[JLOOPEXIT:.+]] + +; CHECK: [[FORBODY30]]: +; CHECK: br label %[[FORCOND26]] + +; CHECK: [[IFELSE35]]: +; CHECK: br label %[[IFTHEN38:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[FORCOND9PREHEADERUNIFORM:.+]] + +; CHECK: [[FORCOND9PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND9UNIFORM:.+]] + +; CHECK: [[FORCOND9UNIFORM]]: +; CHECK: %[[CMP10UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FORBODY12UNIFORM:.+]], label %[[IFEND17LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY12UNIFORM]]: +; CHECK: br label %[[FORCOND9UNIFORM]] + +; CHECK: [[IFEND17LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND17UNIFORM:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[FORENDUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[FORENDUNIFORM]]: +; CHECK: %[[CMP5UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP5UNIFORM]], label %[[FUNIFORM:.+]], label %[[IFEND17UNIFORM]] + +; CHECK: [[IFEND17UNIFORM]]: +; CHECK: br label %[[WHILEBODY20UNIFORM:.+]] + +; CHECK: [[WHILEBODY20UNIFORM]]: +; CHECK: %[[CMP21UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP21UNIFORM]], label %[[IFTHEN23UNIFORM:.+]], label %[[IFELSE35UNIFORM:.+]] + +; CHECK: [[IFELSE35UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE48UNIFORM:.+]], label %[[IFELSE35UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFTHEN38UNIFORM:.+]]: +; CHECK: %[[CMP42UNIFORM:cmp.+]] = icmp +; CHECK: br i1 %[[CMP42UNIFORM]], label %[[IFTHEN44UNIFORM:.+]], label %[[IFELSE46UNIFORM:.+]] + +; CHECK: [[IFELSE46UNIFORM]]: +; CHECK: br label %[[N58UNIFORM:.+]] + +; CHECK: [[IFTHEN44UNIFORM]]: +; CHECK: br label %[[MUNIFORM:.+]] + +; CHECK: [[IFELSE48UNIFORM]]: +; CHECK: %[[CMP49UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP49UNIFORM]], label %[[IFTHEN51UNIFORM:.+]], label %[[IFELSE53UNIFORM:.+]] + +; CHECK: [[IFELSE35UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN38UNIFORM]], label %[[IFELSE35UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSE35UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFTHEN38]] + +; CHECK: [[IFELSE53UNIFORM]]: +; CHECK: br label %[[N58UNIFORM]] + +; CHECK: [[IFTHEN51UNIFORM]]: +; CHECK: br label %[[MUNIFORM]] + +; CHECK: [[N58UNIFORM]]: +; CHECK: br label %[[PUNIFORM:.+]] + +; CHECK: [[MUNIFORM]]: +; CHECK: %[[TOBOOLUNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOLUNIFORM]], label %[[PUNIFORM]], label %[[IFTHEN55UNIFORM:.+]] + +; CHECK: [[PUNIFORM]]: +; CHECK: %[[CMP60UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP60UNIFORM]], label %[[RUNIFORM:.+]], label %[[WHILEBODY20UNIFORM]] + +; CHECK: [[RUNIFORM]]: +; CHECK: br label 
%[[FORCOND67UNIFORM:.+]] + +; CHECK: [[FORCOND67UNIFORM]]: +; CHECK: %[[CMP69UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP69UNIFORM]], label %[[FORBODY71UNIFORM:.+]], label %[[FOREND75UNIFORM:.+]] + +; CHECK: [[FORBODY71UNIFORM]]: +; CHECK: br label %[[FORCOND67UNIFORM]] + +; CHECK: [[FOREND75UNIFORM]]: +; CHECK: %[[TOBOOL77UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOL77UNIFORM]], label %[[IFEND79UNIFORM:.+]], label %[[FORCOND87PREHEADERUNIFORM:.+]] + +; CHECK: [[IFEND79UNIFORM]]: +; CHECK: br label %[[WHILEBODYUNIFORM]] + +; CHECK: [[IFEND55UNIFORM:.+]]: +; CHECK: br label %[[QUNIFORM:.+]] + +; CHECK: [[FORCOND87PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND87UNIFORM:.+]] + +; CHECK: [[FORCOND87UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[ULOOPEXITUNIFORM:.+]], label %[[FORBODY91UNIFORM:.+]] + +; CHECK: [[FORBODY91UNIFORM]]: +; CHECK: br label %[[FORCOND87UNIFORM]] + +; CHECK: [[ULOOPEXITUNIFORM]]: +; CHECK: br label %[[UUNIFORM:.+]] + +; CHECK: [[IFTHEN23UNIFORM]]: +; CHECK: br label %[[FORCOND26UNIFORM:.+]] + +; CHECK: [[FORCOND26UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30UNIFORM:.+]], label %[[JLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY30UNIFORM]]: +; CHECK: br label %[[FORCOND26UNIFORM]] + +; CHECK: [[JLOOPEXITUNIFORM]]: +; CHECK: br label %[[JUNIFORM:.+]] + +; CHECK: [[FUNIFORM]]: +; CHECK: br label %[[JUNIFORM]] + +; CHECK: [[JUNIFORM]]: +; CHECK: %[[CMP82UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP82UNIFORM]], label %[[QUNIFORM]], label %[[UUNIFORM]] + +; CHECK: [[UUNIFORM]]: +; CHECK: br label %[[FORCOND107UNIFORM:.+]] + +; CHECK: [[FORCOND107UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY111UNIFORM:.+]], label %[[ULOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODY111UNIFORM]]: +; CHECK: br label %[[FORCOND107UNIFORM]] + +; CHECK: [[VLOOPEXIT1UNIFORM:.+]]: +; CHECK: br label %[[VUNIFORM:.+]] + +; CHECK: [[QUNIFORM]]: +; CHECK: br label %[[FORCOND97UNIFORM:.+]] + +; CHECK: [[FORCOND97UNIFORM]]: +; CHECK: %[[CMP99UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP99UNIFORM]], label %[[FORBODY101UNIFORM:.+]], label %[[VLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY101UNIFORM]]: +; CHECK: br label %[[FORCOND97UNIFORM]] + +; CHECK: [[VLOOPEXITUNIFORM]]: +; CHECK: br label %[[V:.+]] + +; CHECK: [[IFTHEN38]]: +; CHECK: %[[CMP42:cmp.+]] = icmp +; CHECK: br i1 %[[CMP42]], label %[[IFTHEN44:.+]], label %[[IFELSE46:.+]] + +; CHECK: [[IFTHEN44]]: +; CHECK: br label %[[IFELSE48:.+]] + +; CHECK: [[IFELSE46]]: +; CHECK: br label %[[IFELSE48]] + +; CHECK: [[IFELSE48]]: +; CHECK: %[[CMP49:.+]] = icmp +; CHECK: br i1 %[[CMP49]], label %[[IFTHEN51:.+]], label %[[IFELSE53:.+]] + +; CHECK: [[IFTHEN51]]: +; CHECK: br label %[[N58:.+]] + +; CHECK: [[IFELSE53]]: +; CHECK: br label %[[N58]] + +; CHECK: [[M:.+]]: +; CHECK: br label %[[P:.+]] + +; CHECK: [[IFTHEN55:.+]]: +; CHECK: br label %[[IFTHEN55ELSE:.+]] + +; CHECK: [[IFTHEN55ELSE]]: +; CHECK: br label %[[FORCOND87PREHEADER:.+]] + +; CHECK: [[N58]]: +; CHECK: br label %[[M]] + +; CHECK: [[P]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]] + +; CHECK: [[WHILEBODY20PUREEXIT]]: +; CHECK: br label %[[R:.+]] + +; CHECK: [[R]]: +; CHECK: br label %[[FORCOND67:.+]] + +; CHECK: [[FORCOND67]]: +; CHECK: %[[CMP69:.+]] = icmp +; CHECK: br i1 %[[CMP69]], label %[[FORBODY71:.+]], label %[[FOREND75:.+]] + +; CHECK: [[FORBODY71]]: +; CHECK: br label %[[FORCOND67]] + +; CHECK: [[FOREND75]]: +; CHECK: br label %[[IFEND79:.+]] + +; CHECK: [[FORCOND87PREHEADER]]: +; CHECK: br 
label %[[FORCOND87:.+]] + +; CHECK: [[FORCOND87PREHEADERELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN23ELSE]], label %[[IFTHEN23SPLIT]] + +; CHECK: [[IFEND79]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFTHEN55]] + +; CHECK: [[F]]: +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[FELSE]]: +; CHECK: br label %[[U:.+]] + +; CHECK: [[FSPLIT]]: +; CHECK: br label %[[J:.+]] + +; CHECK: [[JLOOPEXIT]]: +; CHECK: br label %[[J]] + +; CHECK: [[J]]: +; CHECK: %[[CMP82:.+]] = icmp +; CHECK: br i1 %[[CMP82]], label %[[Q:.+]], label %[[U]] + +; CHECK: [[FORCOND87]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[ULOOPEXIT:.+]], label %[[FORBODY91:.+]] + +; CHECK: [[FORBODY91]]: +; CHECK: br label %[[FORCOND87]] + +; CHECK: [[Q]]: +; CHECK: br label %[[FORCOND97:.+]] + +; CHECK: [[FORCOND97]]: +; CHECK: %[[CMP99:.+]] = icmp +; CHECK: br i1 %[[CMP99]], label %[[FORBODY101:.+]], label %[[VLOOPEXIT:.+]] + +; CHECK: [[FORBODY101]]: +; CHECK: br label %[[FORCOND97]] + +; CHECK: [[ULOOPEXIT]]: +; CHECK: br label %[[FORCOND87PREHEADERELSE]] + +; CHECK: [[U]]: +; CHECK: br label %[[FORCOND107:.+]] + +; CHECK: [[FORCOND107]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY111:.+]], label %[[VLOOPEXIT1:.+]] + +; CHECK: [[FORBODY111]]: +; CHECK: br label %[[FORCOND107]] + +; CHECK: [[VLOOPEXIT]]: +; CHECK: br label %[[V]] + +; CHECK: [[VLOOPEXIT1]]: +; CHECK: br label %[[Q]] + +; CHECK: [[V]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll new file mode 100644 index 0000000000000..67d4e6542cdb5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll @@ -0,0 +1,251 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization13 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; \ / \ +; | \ +; | d +; | / \ +; | | e +; | \ / +; | f +; | / \ +; | | g +; | \ / +; \ h +; \ / +; i +; +; * where nodes d and f are uniform branches, and nodes a and c are varying +; branches. +; * where nodes b, c, i are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a___________ +; / \ \ +; b c_________ c' +; \ / \ \| +; | \ d' +; | d / \ +; | / \ | e' +; | | e \ / +; | \ / f' +; | f / \ +; | / \ | g' +; | | g \ / +; | \ / h' +; \ h | +; \ / b' +; i | +; `--> & <- i' +; +; where '&' represents merge blocks of BOSCC regions. 
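+;
+; Note that the switches in the kernel below reach this test already lowered
+; to two-way branches. For example, the first switch on
+; leftovers = 1 + (size & 1) appears in the IR as:
+;
+;   %0 = and i64 %call1, 1
+;   %trunc = icmp eq i64 %0, 0
+;   br i1 %trunc, label %sw.bb8, label %sw.bb
+;
+; i.e. a direct branch between the "case 1" (%sw.bb8) and "case 2" (%sw.bb)
+; bodies.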
+; +; __kernel void partial_linearization13(__global int *out, int n) { +; size_t tid = get_global_id(0); +; size_t size = get_global_size(0); +; // a +; if (tid + 1 < size) { +; // b +; out[tid] = n; +; } else if (tid + 1 == size) { // c +; size_t leftovers = 1 + (size & 1); +; switch (leftovers) { // d +; case 2: // e +; out[tid] = 2 * n + 1; +; // fall through +; case 1: // f +; out[tid] += 3 * n - 1; +; break; +; } +; switch (leftovers) { // g +; case 2: +; out[tid] /= n; +; // fall through +; case 1: // h +; out[tid]--; +; break; +; } +; } +; // i +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization13(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %call1 = call i64 @__mux_get_global_size(i32 0) #2 + %add = add i64 %call, 1 + %cmp = icmp ult i64 %add, %call1 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %n, i32 addrspace(1)* %arrayidx, align 4 + br label %if.end17 + +if.else: ; preds = %entry + %add2 = add i64 %call, 1 + %cmp3 = icmp eq i64 %add2, %call1 + br i1 %cmp3, label %if.then4, label %if.end17 + +if.then4: ; preds = %if.else + %0 = and i64 %call1, 1 + %trunc = icmp eq i64 %0, 0 + br i1 %trunc, label %sw.bb8, label %sw.bb + +sw.bb: ; preds = %if.then4 + %mul = shl nsw i32 %n, 1 + %add6 = or i32 %mul, 1 + %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %add6, i32 addrspace(1)* %arrayidx7, align 4 + br label %sw.bb8 + +sw.bb8: ; preds = %sw.bb, %if.then4 + %mul9 = mul nsw i32 %n, 3 + %sub = add nsw i32 %mul9, -1 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx10, align 4 + %add11 = add nsw i32 %sub, %1 + store i32 %add11, i32 addrspace(1)* %arrayidx10, align 4 + %2 = and i64 %call1, 1 + %trunc2 = icmp ne i64 %2, 0 + %trunc2.off = add i1 %trunc2, true + %switch = icmp ult i1 %trunc2.off, true + br i1 %switch, label %sw.bb12, label %sw.bb14 + +sw.bb12: ; preds = %sw.bb8 + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + %3 = load i32, i32 addrspace(1)* %arrayidx13, align 4 + %4 = icmp eq i32 %3, -2147483648 + %5 = icmp eq i32 %n, -1 + %6 = and i1 %5, %4 + %7 = icmp eq i32 %n, 0 + %8 = or i1 %7, %6 + %9 = select i1 %8, i32 1, i32 %n + %div = sdiv i32 %3, %9 + store i32 %div, i32 addrspace(1)* %arrayidx13, align 4 + br label %sw.bb14 + +sw.bb14: ; preds = %sw.bb12, %sw.bb8 + %arrayidx15 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + %10 = load i32, i32 addrspace(1)* %arrayidx15, align 4 + %dec = add nsw i32 %10, -1 + store i32 %dec, i32 addrspace(1)* %arrayidx15, align 4 + br label %if.end17 + +if.end17: ; preds = %sw.bb14, %if.else, %if.then + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_size(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" 
"unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization13, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization13 +; CHECK: br i1 %{{.+}}, label %[[IFTHENUNIFORM:.+]], label %[[ENTRYBOSCCINDIR:.+]] + +; CHECK: [[IFELSEUNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN4UNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFTHEN4UNIFORM]]: +; CHECK: %[[TRUNCUNIFORM:.+]] = icmp +; CHECK: br i1 %[[TRUNCUNIFORM]], label %[[SWBB8UNIFORM:.+]], label %[[SWBBUNIFORM:.+]] + +; CHECK: [[IFELSEUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFEND17UNIFORM:.+]], label %[[IFTHEN4:.+]] + +; CHECK: [[SWBBUNIFORM]]: +; CHECK: br label %[[SWBB8UNIFORM]] + +; CHECK: [[SWBB8UNIFORM]]: +; CHECK: %[[TRUNC2UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TRUNC2UNIFORM]], label %[[SWBB14UNIFORM:.+]], label %[[SWBB12UNIFORM:.+]] + +; CHECK: [[SWBB12UNIFORM]]: +; CHECK: br label %[[SWBB14UNIFORM]] + +; CHECK: [[SWBB14UNIFORM]]: +; CHECK: br label %[[IFEND17UNIFORM]] + +; CHECK: [[IFTHENUNIFORM]]: +; CHECK: br label %[[IFEND17:.+]] + +; CHECK: [[ENTRYBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSEUNIFORM]], label %[[IFELSE:.+]] + +; CHECK: [[IFTHEN:.+]]: +; CHECK: br label %[[IFEND17]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[IFTHEN4]] + +; CHECK: [[IFTHEN4]]: +; CHECK: %[[TRUNC:.+]] = icmp +; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even +; on inactive lanes. 
+; CHECK: %[[TRUNC_ACTIVE:.+]] = select i1 {{%.*}}, i1 %[[TRUNC]], i1 false +; CHECK: %[[TRUNC_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[TRUNC_ACTIVE]]) +; CHECK: br i1 %[[TRUNC_ACTIVE_ANY]], label %[[SWBB8:.+]], label %[[SWBB:.+]] + +; CHECK: [[SWBB]]: +; CHECK: br label %[[SWBB8]] + +; CHECK: [[SWBB8]]: +; CHECK: %[[TRUNC2:.+]] = icmp +; CHECK: br i1 %[[TRUNC2]], label %[[SWBB14:.+]], label %[[SWBB12:.+]] + +; CHECK: [[SWBB12]]: +; CHECK: br label %[[SWBB14]] + +; CHECK: [[SWBB14]]: +; CHECK: br label %[[IFTHEN]] + +; CHECK: [[IFEND17]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll new file mode 100644 index 0000000000000..1a3e5764611b1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll @@ -0,0 +1,356 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization14 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c <-. +; | / \ | +; | d e | +; |/ \ / | +; f g --' +; \ | +; \ h +; \| +; i +; +; * where nodes a, d and g are uniform branches, and node c is a varying +; branch. +; * where nodes d, e, f, g, h and i are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; / \ +; b c <-. c' <. +; / / \__|__ | | +; | d e | `e' | +; | / \ / | | | +; | f g --' d' | +; \ \ | | | +; \ \ h g' -' +; \ \| | +; \ i h' +; \| / +; \ / +; / \ / +; | \ / +; | f' +; | | +; | i' +; \ / +; & +; +; where '&' represents merge blocks of BOSCC regions. 
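+;
+; Reading aid for the *PUREEXIT labels matched below (an interpretation, not
+; itself checked): after the divergent while loop is linearized, the vector
+; loop keeps iterating while any lane still wants to continue, and all lanes
+; leave together through a single "pure exit" block. With illustrative names:
+;
+;   if.end24:                                ; linearized latch
+;     br i1 %any.lane.continues, label %while.body, label %while.body.pure_exit
+;   while.body.pure_exit:                    ; sole exit once every lane is done
+;     br label %if.end29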
+; +; __kernel void partial_linearization14(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; if (n < 5) { +; for (int i = 0; i < n + 10; i++) ret++; +; goto f; +; } else { +; while (1) { +; if (id + i % 2 == 0) { +; if (n > 2) { +; goto f; +; } +; } else { +; for (int i = 0; i < n + 10; i++) ret++; +; } +; if (n <= 2) break; +; } +; } +; +; ret += n * 2; +; for (int i = 0; i < n * 2; i++) ret -= i; +; ret /= n; +; goto early; +; +; f: +; for (int i = 0; i < n + 5; i++) ret /= 2; +; ret -= n; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization14(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 5 + br i1 %cmp, label %for.cond, label %while.body + +for.cond: ; preds = %for.body, %entry + %ret.0 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %storemerge4 = phi i32 [ %inc5, %for.body ], [ 0, %entry ] + %add = add nsw i32 %n, 10 + %cmp3 = icmp slt i32 %storemerge4, %add + br i1 %cmp3, label %for.body, label %f + +for.body: ; preds = %for.cond + %inc = add nuw nsw i32 %ret.0, 1 + %inc5 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond + +while.body: ; preds = %if.end24, %entry + %ret.1 = phi i32 [ 0, %entry ], [ %ret.3, %if.end24 ] + %cmp7 = icmp eq i32 %conv, 0 + br i1 %cmp7, label %if.then9, label %for.cond15 + +if.then9: ; preds = %while.body + %cmp10 = icmp sgt i32 %n, 2 + br i1 %cmp10, label %f, label %if.end24 + +for.cond15: ; preds = %for.body19, %while.body + %ret.2 = phi i32 [ %inc20, %for.body19 ], [ %ret.1, %while.body ] + %storemerge = phi i32 [ %inc22, %for.body19 ], [ 0, %while.body ] + %add16 = add nsw i32 %n, 10 + %cmp17 = icmp slt i32 %storemerge, %add16 + br i1 %cmp17, label %for.body19, label %if.end24 + +for.body19: ; preds = %for.cond15 + %inc20 = add nsw i32 %ret.2, 1 + %inc22 = add nuw nsw i32 %storemerge, 1 + br label %for.cond15 + +if.end24: ; preds = %for.cond15, %if.then9 + %ret.3 = phi i32 [ %ret.1, %if.then9 ], [ %ret.2, %for.cond15 ] + %cmp25 = icmp slt i32 %n, 3 + br i1 %cmp25, label %if.end29, label %while.body + +if.end29: ; preds = %if.end24 + %mul = mul i32 %n, 2 + %add30 = add nsw i32 %ret.3, %mul + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.end29 + %ret.4 = phi i32 [ %add30, %if.end29 ], [ %sub, %for.body36 ] + %storemerge1 = phi i32 [ 0, %if.end29 ], [ %inc38, %for.body36 ] + %mul33 = shl nsw i32 %n, 1 + %cmp34 = icmp slt i32 %storemerge1, %mul33 + br i1 %cmp34, label %for.body36, label %for.end39 + +for.body36: ; preds = %for.cond32 + %sub = sub nsw i32 %ret.4, %storemerge1 + %inc38 = add nuw nsw i32 %storemerge1, 1 + br label %for.cond32 + +for.end39: ; preds = %for.cond32 + %0 = icmp eq i32 %ret.4, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %ret.4, %5 + br label %early + +f: ; preds = %if.then9, %for.cond + %ret.5 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %if.then9 ] + br label %for.cond41 + +for.cond41: ; preds = %for.body45, %f + %ret.6 = phi i32 [ %ret.5, %f ], [ %div46, %for.body45 ] + %storemerge3 = phi i32 [ 0, %f ], [ %inc48, %for.body45 ] + %add42 = add nsw i32 %n, 5 + %cmp43 = icmp slt i32 %storemerge3, %add42 + br i1 
%cmp43, label %for.body45, label %for.end49 + +for.body45: ; preds = %for.cond41 + %div46 = sdiv i32 %ret.6, 2 + %inc48 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond41 + +for.end49: ; preds = %for.cond41 + %sub50 = sub nsw i32 %ret.6, %n + br label %early + +early: ; preds = %for.end49, %for.end39 + %storemerge2 = phi i32 [ %div, %for.end39 ], [ %sub50, %for.end49 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization14, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization14 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODYPREHEADER:.+]] + +; CHECK: [[WHILEBODYPREHEADER]]: +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCOND15PREHEADER:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN9UNIFORM:.+]], label %[[WHILEBODYUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[FORCOND15PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND15UNIFORM:.+]] + +; CHECK: [[FORCOND15UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19UNIFORM:.+]], label %[[IFEND24LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY19UNIFORM]]: +; CHECK: br label %[[FORCOND15UNIFORM]] + +; CHECK: [[IFEND24LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND24UNIFORM:.+]] + +; CHECK: [[IFTHEN9UNIFORM]]: +; CHECK: %[[CMP10UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FLOOPEXIT1UNIFORM:.+]], label %[[IFEND24UNIFORM]] + +; CHECK: [[WHILEBODYUNIFORMBOSCCINDIR]]: +; 
CHECK: br i1 %{{.+}}, label %[[FORCOND15PREHEADERUNIFORM]], label %[[WHILEBODYUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[WHILEBODYUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[FORCOND15PREHEADER]] + +; CHECK: [[IFEND24UNIFORM]]: +; CHECK: %[[CMP25UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP25UNIFORM]], label %[[IFEND29UNIFORM:.+]], label %[[WHILEBODYUNIFORM]] + +; CHECK: [[IFEND29UNIFORM]]: +; CHECK: br label %[[FORCOND32UNIFORM:.+]] + +; CHECK: [[FORCOND32UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36UNIFORM:.+]], label %[[FOREND39UNIFORM:.+]] + +; CHECK: [[FORBODY36UNIFORM]]: +; CHECK: br label %[[FORCOND32UNIFORM]] + +; CHECK: [[FOREND39UNIFORM]]: +; CHECK: br label %[[EARLYUNIFORM:.+]] + +; CHECK: [[FLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[FUNIFORM:.+]] + +; CHECK: [[FUNIFORM]]: +; CHECK: br label %[[FORCOND41UNIFORM:.+]] + +; CHECK: [[FORCOND41UNIFORM]]: +; CHECK: %[[CMP43UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP43UNIFORM]], label %[[FORBODY45UNIFORM:.+]], label %[[FOREND49UNIFORM:.+]] + +; CHECK: [[FORBODY45UNIFORM]]: +; CHECK: br label %[[FORCOND41UNIFORM]] + +; CHECK: [[FOREND49UNIFORM]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[FORCOND15PREHEADER]]: +; CHECK: br label %[[FORCOND15:.+]] + +; CHECK: [[IFTHEN9:.+]]: +; CHECK: br label %[[IFEND24:.+]] + +; CHECK: [[FORCOND15]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19:.+]], label %[[IFEND24LOOPEXIT:.+]] + +; CHECK: [[FORBODY19]]: +; CHECK: br label %[[FORCOND15]] + +; CHECK: [[IFEND24LOOPEXIT]]: +; CHECK: br label %[[IFTHEN9]] + +; CHECK: [[IFEND24]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFEND29:.+]] + +; CHECK: [[IFEND29]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[IFEND29ELSE:.+]]: +; CHECK: br label %[[FLOOPEXIT2:.+]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36:.+]], label %[[FOREND39:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[FOREND39]]: +; CHECK: br label %[[IFEND29ELSE]] + +; CHECK: [[FLOOPEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[FLOOPEXIT2]]: +; CHECK: br label %[[F]] + +; CHECK: [[F]]: +; CHECK: br label %[[FORCOND41:.+]] + +; CHECK: [[FORCOND41]]: +; CHECK: %[[CMP43:.+]] = icmp +; CHECK: br i1 %[[CMP43]], label %[[FORBODY45:.+]], label %[[FOREND49:.+]] + +; CHECK: [[FORBODY45]]: +; CHECK: br label %[[FORCOND41]] + +; CHECK: [[FOREND49]]: +; CHECK: br label %[[EARLY]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll new file mode 100644 index 0000000000000..b1626a8d0c7cc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll @@ -0,0 +1,415 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization15 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; / g <---. | +; / / \ | | +; f h i | | +; | / \ / \ | | +; | | j k | | +; | \ / \ / | | +; | l m --' | +; | / | +; | o ----------' +; | | +; n p +; \ / +; q +; +; * where nodes b, c, g, h, j and o are uniform branches, and node i is a +; varying branch. +; * where nodes j, k, m, l, and o are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <------------------. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; f g <---. g' <---. | +; | / \ | / \ | | +; | h i___|__ h' i' | | +; | / \ / \ | \|__ | | | +; | | j k | | `k' | | +; | \ / \ / | \ / | | +; | l m --' j' | | +; | | | | | +; | | m'-----' | +; \ | | | +; \ `----> & <--- l' | +; \ / | +; \ o ----------------' +; | | +; n p +; \ / +; q +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization15(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0) { // b +; // c +; for (int i = 0; i < n * 2; i++) ret++; +; if (n <= 10) { +; // f +; goto f; +; } +; } else { +; // d +; for (int i = 0; i < n / 4; i++) ret++; +; } +; // e +; ret++; +; while (1) { +; if (n & 1) { // g +; // h +; if (n < 3) { +; goto l; +; } +; } else { +; // i +; if (ret + id >= n) { +; // k +; ret /= n * n + ret; +; goto m; +; } +; } +; // j +; if (n & 1) { +; goto l; +; } +; // m +; m: +; ret++; +; } +; l: +; ret *= 4; +; o: +; if (n & 1) { +; // p +; ret++; +; goto p; +; } +; } +; +; p: +; for (int i = 0; i < n / 4; i++) ret++; +; goto q; +; +; f: +; ret /= n; +; goto n; +; +; n: +; for (int i = 0; i < n * 2; i++) ret++; +; +; q: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization15(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %l, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %mul40, %l ] + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %for.cond, label %for.cond9 + +for.cond: ; preds = %for.body, %while.body + %ret.1 = phi i32 [ %inc, %for.body ], [ %ret.0, %while.body ] + %storemerge3 = phi i32 [ %inc4, %for.body ], [ 0, %while.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge3, %mul + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.1, 1 + %inc4 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp5 = icmp slt i32 %n, 11 + br i1 %cmp5, label %f, label %if.end17 + +for.cond9: ; preds = %for.body12, %while.body + %ret.2 = phi i32 [ %inc13, %for.body12 ], [ %ret.0, %while.body ] + %storemerge = phi i32 [ %inc15, %for.body12 ], [ 0, %while.body ] + %div = sdiv i32 %n, 4 + %cmp10 = icmp slt i32 %storemerge, %div + br i1 %cmp10, label %for.body12, label %if.end17 + +for.body12: ; preds = 
%for.cond9 + %inc13 = add nsw i32 %ret.2, 1 + %inc15 = add nuw nsw i32 %storemerge, 1 + br label %for.cond9 + +if.end17: ; preds = %for.cond9, %for.end + %ret.3 = phi i32 [ %ret.1, %for.end ], [ %ret.2, %for.cond9 ] + br label %while.body20 + +while.body20: ; preds = %m, %if.end17 + %storemerge1.in = phi i32 [ %ret.3, %if.end17 ], [ %ret.4, %m ] + %storemerge1 = add nsw i32 %storemerge1.in, 1 + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %if.else26, label %if.then21 + +if.then21: ; preds = %while.body20 + %cmp22 = icmp slt i32 %n, 3 + br i1 %cmp22, label %l, label %if.end34 + +if.else26: ; preds = %while.body20 + %add = add nsw i32 %storemerge1, %conv + %cmp27 = icmp slt i32 %add, %n + br i1 %cmp27, label %if.end34, label %if.then29 + +if.then29: ; preds = %if.else26 + %mul30 = mul nsw i32 %n, %n + %add31 = add nsw i32 %storemerge1, %mul30 + %0 = icmp eq i32 %add31, 0 + %1 = select i1 %0, i32 1, i32 %add31 + %div32 = sdiv i32 %storemerge1, %1 + br label %m + +if.end34: ; preds = %if.else26, %if.then21 + %and35 = and i32 %n, 1 + %tobool36 = icmp eq i32 %and35, 0 + br i1 %tobool36, label %m, label %l + +m: ; preds = %if.end34, %if.then29 + %ret.4 = phi i32 [ %div32, %if.then29 ], [ %storemerge1, %if.end34 ] + br label %while.body20 + +l: ; preds = %if.end34, %if.then21 + %mul40 = shl nsw i32 %storemerge1, 2 + %and41 = and i32 %n, 1 + %tobool42 = icmp eq i32 %and41, 0 + br i1 %tobool42, label %while.body, label %if.then43 + +if.then43: ; preds = %l + %inc44 = or i32 %mul40, 1 + br label %for.cond47 + +for.cond47: ; preds = %for.body51, %if.then43 + %ret.5 = phi i32 [ %inc44, %if.then43 ], [ %inc52, %for.body51 ] + %storemerge2 = phi i32 [ 0, %if.then43 ], [ %inc54, %for.body51 ] + %div48 = sdiv i32 %n, 4 + %cmp49 = icmp slt i32 %storemerge2, %div48 + br i1 %cmp49, label %for.body51, label %q + +for.body51: ; preds = %for.cond47 + %inc52 = add nsw i32 %ret.5, 1 + %inc54 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond47 + +f: ; preds = %for.end + %2 = icmp eq i32 %ret.1, -2147483648 + %3 = icmp eq i32 %n, -1 + %4 = and i1 %3, %2 + %5 = icmp eq i32 %n, 0 + %6 = or i1 %5, %4 + %7 = select i1 %6, i32 1, i32 %n + %div56 = sdiv i32 %ret.1, %7 + br label %for.cond59 + +for.cond59: ; preds = %for.body63, %f + %ret.6 = phi i32 [ %div56, %f ], [ %inc64, %for.body63 ] + %storemerge4 = phi i32 [ 0, %f ], [ %inc66, %for.body63 ] + %mul60 = shl nsw i32 %n, 1 + %cmp61 = icmp slt i32 %storemerge4, %mul60 + br i1 %cmp61, label %for.body63, label %q + +for.body63: ; preds = %for.cond59 + %inc64 = add nsw i32 %ret.6, 1 + %inc66 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond59 + +q: ; preds = %for.cond59, %for.cond47 + %ret.7 = phi i32 [ %ret.5, %for.cond47 ], [ %ret.6, %for.cond59 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.7, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 
= { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization15, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization15 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]] + +; CHECK: [[FORCOND9PREHEADER]]: +; CHECK: br label %[[FORCOND9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 false, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]] + +; CHECK: [[FORCOND9]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND9]] + +; CHECK: [[IFEND17LOOPEXIT]]: +; CHECK: br label %[[IFEND17]] + +; CHECK: [[IFEND17]]: +; CHECK: br i1 true, label %[[WHILEBODY20UNIFORM:.+]], label %[[WHILEBODY20:.+]] + +; CHECK: [[WHILEBODY20]]: +; CHECK: %[[TOBOOL:.+]] = icmp +; CHECK: br i1 %[[TOBOOL]], label %[[IFELSE26:.+]], label %[[IFTHEN21:.+]] + +; CHECK: [[IFTHEN21]]: +; CHECK: br label %[[M:.+]] + +; CHECK: [[IFELSE26]]: +; CHECK: br label %[[IFTHEN29:.+]] + +; CHECK: [[WHILEBODY20UNIFORM]]: +; CHECK: %[[TOBOOLUNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOLUNIFORM]], label %[[IFELSE26UNIFORM:.+]], label %[[IFTHEN21UNIFORM:.+]] + +; CHECK: [[IFTHEN21UNIFORM]]: +; CHECK: %[[CMP22UNIFORM:.+]] = icmp +; CHECK: %[[TOBOOLNEW36UNIFORM:.+]] = icmp +; CHECK: %[[ORCONDUNIFORM:.+]] = and i1 %[[CMP22UNIFORM]], %[[TOBOOLNEW36UNIFORM]] +; CHECK: br i1 %[[ORCONDUNIFORM]], label %[[MUNIFORM:.+]], label %[[L:.+]] + +; CHECK: [[IFELSE26UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFEND34UNIFORM:.+]], label %[[IFELSE26UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFTHEN29UNIFORM:.+]]: +; CHECK: br label %[[MUNIFORM:.+]] + +; CHECK: [[IFEND34UNIFORM]]: +; CHECK: %[[TOBOOL36UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOL36UNIFORM]], label %[[MUNIFORM]], label %[[L:.+]] + +; CHECK: [[IFELSE26UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN29UNIFORM]], label %[[IFELSE26UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSE26UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFTHEN29]] + +; CHECK: [[MUNIFORM]]: +; CHECK: br label %[[WHILEBODY20UNIFORM]] + +; CHECK: [[IFTHEN29]]: +; CHECK: br label %[[IFEND34:.+]] + +; CHECK: [[IFEND34]]: +; CHECK: br label %[[M:.+]] + +; CHECK: [[M]]: +; CHECK: br i1 %{{.+}}, label 
%[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]] + +; CHECK: [[WHILEBODY20PUREEXIT]]: +; CHECK: br label %[[L]] + +; CHECK: [[L]]: +; CHECK: %[[TOBOOL42:.+]] = icmp +; CHECK: br i1 %[[TOBOOL42]], label %[[WHILEBODY]], label %[[IFTHEN43:.+]] + +; CHECK: [[IFTHEN43]]: +; CHECK: br label %[[FORCOND47:.+]] + +; CHECK: [[FORCOND47]]: +; CHECK: %[[CMP49:.+]] = icmp +; CHECK: br i1 %[[CMP49]], label %[[FORBODY51:.+]], label %[[QLOOPEXIT2:.+]] + +; CHECK: [[FORBODY51]]: +; CHECK: br label %[[FORCOND47]] + +; CHECK: [[F]]: +; CHECK: br label %[[FORCOND59:.+]] + +; CHECK: [[FORCOND59]]: +; CHECK: br i1 false, label %[[FORBODY63:.+]], label %[[QLOOPEXIT:.+]] + +; CHECK: [[FORBODY63]]: +; CHECK: br label %[[FORCOND59]] + +; CHECK: [[QLOOPEXIT]]: +; CHECK: br label %[[Q:.+]] + +; CHECK: [[QLOOPEXIT2]]: +; CHECK: br label %[[Q]] + +; CHECK: [[Q]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll new file mode 100644 index 0000000000000..e9567cc00d194 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll @@ -0,0 +1,394 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization16 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c <-. +; / / \ | +; | d e | +; | / \ / | +; | f g --' +; |/ | +; h i +; \ / +; \ / +; j +; +; * where nodes a, d and g are uniform branches, and node c is a varying +; branch. +; * where nodes d, e, f, g, i and j are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; / \ +; b c <-. c' <. +; / / \__|_ | | +; / d e | `e' | +; / / \ / | | | +; / f g --' d' | +; | / | | | +; \ h i g' -' +; \ \ / | +; \ \ / i' +; \ j | +; \| f' +; \ | +; /\ / +; | \ / +; | \ / +; | \ / +; | h' +; | | +; | j' +; \ / +; \ / +; & +; +; where '&' represents merge blocks of BOSCC regions. 
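+;
+; Each primed (uniform) region is entered through a guard that checks
+; dynamically whether every lane agrees on the branch condition. A
+; minimal sketch of such a guard, with illustrative value and block
+; names only (the pass derives the real check from its own masks, and
+; it can fold to the literal `br i1 true` seen on the preheader in the
+; CHECK lines below when the condition is known uniform):
+;
+;   %all.lanes = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %mask)
+;   br i1 %all.lanes, label %while.body.uniform, label %while.body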
+; +; __kernel void partial_linearization16(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; if (n < 5) { +; for (int i = 0; i < n + 10; i++) ret++; +; goto h; +; } else { +; while (1) { +; if (id + i % 2 == 0) { +; if (n > 2) { +; goto f; +; } +; } else { +; for (int i = 0; i < n + 10; i++) ret++; +; } +; if (n <= 2) break; +; } +; } +; +; ret += n * 2; +; for (int i = 0; i < n * 2; i++) ret -= i; +; ret /= n; +; goto early; +; +; f: +; for (int i = 0; i < n + 5; i++) ret /= 2; +; ret -= n; +; +; h: +; for (int i = 0; i < n * 2; i++) ret -= i; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization16(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 5 + br i1 %cmp, label %for.cond, label %while.body + +for.cond: ; preds = %for.body, %entry + %ret.0 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %storemerge4 = phi i32 [ %inc5, %for.body ], [ 0, %entry ] + %add = add nsw i32 %n, 10 + %cmp3 = icmp slt i32 %storemerge4, %add + br i1 %cmp3, label %for.body, label %h + +for.body: ; preds = %for.cond + %inc = add nuw nsw i32 %ret.0, 1 + %inc5 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond + +while.body: ; preds = %if.end24, %entry + %ret.1 = phi i32 [ 0, %entry ], [ %ret.3, %if.end24 ] + %cmp7 = icmp eq i32 %conv, 0 + br i1 %cmp7, label %if.then9, label %for.cond15 + +if.then9: ; preds = %while.body + %cmp10 = icmp sgt i32 %n, 2 + br i1 %cmp10, label %for.cond41, label %if.end24 + +for.cond15: ; preds = %for.body19, %while.body + %ret.2 = phi i32 [ %inc20, %for.body19 ], [ %ret.1, %while.body ] + %storemerge = phi i32 [ %inc22, %for.body19 ], [ 0, %while.body ] + %add16 = add nsw i32 %n, 10 + %cmp17 = icmp slt i32 %storemerge, %add16 + br i1 %cmp17, label %for.body19, label %if.end24 + +for.body19: ; preds = %for.cond15 + %inc20 = add nsw i32 %ret.2, 1 + %inc22 = add nuw nsw i32 %storemerge, 1 + br label %for.cond15 + +if.end24: ; preds = %for.cond15, %if.then9 + %ret.3 = phi i32 [ %ret.1, %if.then9 ], [ %ret.2, %for.cond15 ] + %cmp25 = icmp slt i32 %n, 3 + br i1 %cmp25, label %if.end29, label %while.body + +if.end29: ; preds = %if.end24 + %mul = mul i32 %n, 2 + %add30 = add nsw i32 %ret.3, %mul + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.end29 + %ret.4 = phi i32 [ %add30, %if.end29 ], [ %sub, %for.body36 ] + %storemerge1 = phi i32 [ 0, %if.end29 ], [ %inc38, %for.body36 ] + %mul33 = shl nsw i32 %n, 1 + %cmp34 = icmp slt i32 %storemerge1, %mul33 + br i1 %cmp34, label %for.body36, label %for.end39 + +for.body36: ; preds = %for.cond32 + %sub = sub nsw i32 %ret.4, %storemerge1 + %inc38 = add nuw nsw i32 %storemerge1, 1 + br label %for.cond32 + +for.end39: ; preds = %for.cond32 + %0 = icmp eq i32 %ret.4, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %ret.4, %5 + br label %early + +for.cond41: ; preds = %for.body45, %if.then9 + %ret.5 = phi i32 [ %div46, %for.body45 ], [ %ret.1, %if.then9 ] + %storemerge2 = phi i32 [ %inc48, %for.body45 ], [ 0, %if.then9 ] + %add42 = add nsw i32 %n, 5 + %cmp43 = icmp slt i32 %storemerge2, %add42 + br i1 %cmp43, label %for.body45, label %for.end49 + 
+for.body45: ; preds = %for.cond41 + %div46 = sdiv i32 %ret.5, 2 + %inc48 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond41 + +for.end49: ; preds = %for.cond41 + %sub50 = sub nsw i32 %ret.5, %n + br label %h + +h: ; preds = %for.end49, %for.cond + %ret.6 = phi i32 [ %sub50, %for.end49 ], [ %ret.0, %for.cond ] + br label %for.cond52 + +for.cond52: ; preds = %for.body56, %h + %ret.7 = phi i32 [ %ret.6, %h ], [ %sub57, %for.body56 ] + %storemerge3 = phi i32 [ 0, %h ], [ %inc59, %for.body56 ] + %mul53 = shl nsw i32 %n, 1 + %cmp54 = icmp slt i32 %storemerge3, %mul53 + br i1 %cmp54, label %for.body56, label %early + +for.body56: ; preds = %for.cond52 + %sub57 = sub nsw i32 %ret.7, %storemerge3 + %inc59 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond52 + +early: ; preds = %for.cond52, %for.end39 + %ret.8 = phi i32 [ %div, %for.end39 ], [ %ret.7, %for.cond52 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.8, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization16, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization16 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODYPREHEADER:.+]] + +; CHECK: [[WHILEBODYPREHEADER]]: +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCOND15PREHEADER:.+]] + +; CHECK: [[WHILEBODYUNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN9UNIFORM:.+]], label %[[WHILEBODYUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[FORCOND15PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND15UNIFORM:.+]] + +; CHECK: 
[[FORCOND15UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19UNIFORM:.+]], label %[[IFEND24LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY19UNIFORM]]: +; CHECK: br label %[[FORCOND15UNIFORM]] + +; CHECK: [[IFEND24LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND24UNIFORM:.+]] + +; CHECK: [[IFTHEN9UNIFORM:.+]]: +; CHECK: %[[CMP10UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FORCOND41PREHEADERUNIFORM:.+]], label %[[IFEND24UNIFORM]] + +; CHECK: [[WHILEBODYUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND15PREHEADERUNIFORM]], label %[[WHILEBODYUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[WHILEBODYUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[FORCOND15PREHEADER]] + +; CHECK: [[IFEND24UNIFORM]]: +; CHECK: %[[CMP25UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP25UNIFORM]], label %[[IFEND29UNIFORM:.+]], label %[[WHILEBODYUNIFORM]] + +; CHECK: [[IFEND29UNIFORM]]: +; CHECK: br label %[[FORCOND32UNIFORM:.+]] + +; CHECK: [[FORCOND32UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36UNIFORM:.+]], label %[[FOREND39UNIFORM:.+]] + +; CHECK: [[FORBODY36UNIFORM]]: +; CHECK: br label %[[FORCOND32UNIFORM]] + +; CHECK: [[FOREND39UNIFORM]]: +; CHECK: br label %[[EARLYUNIFORM:.+]] + +; CHECK: [[FORCOND41PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND41UNIFORM:.+]] + +; CHECK: [[FORCOND41UNIFORM]]: +; CHECK: %[[CMP43UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP43UNIFORM]], label %[[FORBODY45UNIFORM:.+]], label %[[FOREND49UNIFORM:.+]] + +; CHECK: [[FORBODY45UNIFORM]]: +; CHECK: br label %[[FORCOND41UNIFORM]] + +; CHECK: [[FOREND49UNIFORM]]: +; CHECK: br label %[[HUNIFORM:.+]] + +; CHECK: [[HUNIFORM]]: +; CHECK: br label %[[FORCOND52UNIFORM:.+]] + +; CHECK: [[FORCOND52UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY56UNIFORM:.+]], label %[[EARLYLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY56UNIFORM]]: +; CHECK: br label %[[FORCOND52UNIFORM]] + +; CHECK: [[EARLYLOOPEXITUNIFORM]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[FORCOND15PREHEADER]]: +; CHECK: br label %[[FORCOND15:.+]] + +; CHECK: [[IFTHEN9:.+]]: +; CHECK: br label %[[IFEND24:.+]] + +; CHECK: [[FORCOND41PREHEADER:.+]]: +; CHECK: br label %[[FORCOND41:.+]] + +; CHECK: [[FORCOND15]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19:.+]], label %[[IFEND24LOOPEXIT:.+]] + +; CHECK: [[FORBODY19]]: +; CHECK: br label %[[FORCOND15]] + +; CHECK: [[IFEND24LOOPEXIT]]: +; CHECK: br label %[[IFTHEN9]] + +; CHECK: [[IFEND24]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFEND29:.+]] + +; CHECK: [[IFEND29]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[IFEND29ELSE:.+]]: +; CHECK: br label %[[FORCOND41PREHEADER]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36:.+]], label %[[FOREND39:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[FOREND39]]: +; CHECK: br label %[[IFEND29ELSE]] + +; CHECK: [[FORCOND41]]: +; CHECK: %[[CMP43:.+]] = icmp +; CHECK: br i1 %[[CMP43]], label %[[FORBODY45:.+]], label %[[FOREND49:.+]] + +; CHECK: [[FORBODY45]]: +; CHECK: br label %[[FORCOND41]] + +; CHECK: [[FOREND49]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[H]]: +; CHECK: br label %[[FORCOND52:.+]] + +; CHECK: [[FORCOND52]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY56:.+]], label %[[EARLYLOOPEXIT:.+]] + +; CHECK: [[FORBODY56]]: +; CHECK: br label 
%[[FORCOND52]] + +; CHECK: [[EARLYLOOPEXIT]]: +; CHECK: br label %[[EARLY]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll new file mode 100644 index 0000000000000..2c25911eeba63 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll @@ -0,0 +1,470 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization17 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <----. +; / \ | +; c d | +; / / \ | +; e f g -' +; / \ | | +; .--> h | i j +; | / \ | \ / +; '- k l '-> m +; | \ / +; n \ / +; \ o +; \ / +; \ / +; p +; +; * where nodes b, d, and h are uniform branches, and nodes e and g are varying +; branches. +; * where nodes h, j, m, o, and p are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <----. .-----------> b' <----. +; / \ | | / \ | +; c d | | c' d' | +; / / \ | | / / \ | +; e f g -' | e' f' g' -' +; / \__|___|\___' _____ / | | +; .--> h | i j\____/ .-->`h' i' | +; | / \ | \ / | / \ | | +; '- k l '-> m '- k' l' | | +; | \ / \ \ | / +; n \ / n' \ | / +; \ o \ \|/ +; \ / `-> j' +; \ / | +; p m' +; | | +; | o' +; | | +; `----------> & <--------- p' +; +; where '&' represents merge blocks of BOSCC regions. 
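+;
+; A note for reading the CHECK lines below (inferred from the branch
+; structure in this test, not a specification of vecz's naming): plain
+; labels such as %[[WHILEBODY]] form the linearized left-hand CFG,
+; *UNIFORM labels are the primed duplicates, and each *BOSCCINDIR block
+; either keeps execution in the uniform region or hands it over to the
+; predicated one through its *BOSCCSTORE twin once lanes diverge. The
+; shape, with made-up names, is:
+;
+;   if.end5.uniform.boscc_indir:
+;     br i1 %still.uniform, label %while.body.uniform, label %if.end5.uniform.boscc_store
+;   if.end5.uniform.boscc_store:
+;     br label %while.body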
+; +; __kernel void partial_linearization17(__global int *out, int n, int x) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (n > 10) { +; goto c; +; } else if (n < 5) { +; goto f; +; } +; if (id + i++ % 2 == 0) { +; break; +; } +; } +; +; // j +; for (int i = 0; i < n + 10; i++) ret++; +; goto m; +; +; f: +; ret += x / 2; +; for (int i = 0; i < x / 2; i++) ret += i; +; goto m; +; +; c: +; for (int i = 0; i < n - 5; i++) ret += 2; +; // e +; if (id % 2 == 0) { +; goto h; +; } else { +; goto m; +; } +; +; m: +; ret <<= 2; +; goto o; +; +; h: +; for (int i = 0; i < x / 2; i++) { +; if (x < 5) { +; goto l; +; } +; } +; // n +; ret += id << 3; +; goto p; +; +; l: +; ret += id << 3; +; +; o: +; for (int i = 0; i < x / 2; i++) ret += i; +; +; p: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 noundef %n, i32 noundef %x) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end5, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end5 ] + %cmp = icmp sgt i32 %n, 10 + br i1 %cmp, label %for.cond28, label %if.else + +if.else: ; preds = %while.body + %cmp2 = icmp slt i32 %n, 5 + br i1 %cmp2, label %f, label %if.end5 + +if.end5: ; preds = %if.else + %inc = add nuw nsw i32 %i.0, 1 + %rem = and i32 %i.0, 1 + %add = sub nsw i32 0, %rem + %cmp6 = icmp eq i32 %conv, %add + br i1 %cmp6, label %for.cond, label %while.body + +for.cond: ; preds = %for.body, %if.end5 + %ret.0 = phi i32 [ %inc14, %for.body ], [ 0, %if.end5 ] + %storemerge = phi i32 [ %inc15, %for.body ], [ 0, %if.end5 ] + %add11 = add nsw i32 %n, 10 + %cmp12 = icmp slt i32 %storemerge, %add11 + br i1 %cmp12, label %for.body, label %m + +for.body: ; preds = %for.cond + %inc14 = add nuw nsw i32 %ret.0, 1 + %inc15 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +f: ; preds = %if.else + %div = sdiv i32 %x, 2 + br label %for.cond18 + +for.cond18: ; preds = %for.body22, %f + %ret.1 = phi i32 [ %div, %f ], [ %add23, %for.body22 ] + %storemerge3 = phi i32 [ 0, %f ], [ %inc25, %for.body22 ] + %div19 = sdiv i32 %x, 2 + %cmp20 = icmp slt i32 %storemerge3, %div19 + br i1 %cmp20, label %for.body22, label %m + +for.body22: ; preds = %for.cond18 + %add23 = add nsw i32 %storemerge3, %ret.1 + %inc25 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond18 + +for.cond28: ; preds = %for.body32, %while.body + %ret.2 = phi i32 [ %add33, %for.body32 ], [ 0, %while.body ] + %storemerge4 = phi i32 [ %inc35, %for.body32 ], [ 0, %while.body ] + %add29 = add nsw i32 %n, 5 + %cmp30 = icmp slt i32 %storemerge4, %add29 + br i1 %cmp30, label %for.body32, label %for.end36 + +for.body32: ; preds = %for.cond28 + %add33 = add nuw nsw i32 %ret.2, 2 + %inc35 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond28 + +for.end36: ; preds = %for.cond28 + %rem375 = and i32 %conv, 1 + %cmp38 = icmp eq i32 %rem375, 0 + br i1 %cmp38, label %for.cond43, label %m + +m: ; preds = %for.end36, %for.cond18, %for.cond + %ret.3 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond18 ], [ %ret.2, %for.end36 ] + %shl = shl i32 %ret.3, 2 + br label %o + +for.cond43: ; preds = %for.inc52, %for.end36 + %storemerge6 = phi i32 [ %inc53, %for.inc52 ], [ 0, %for.end36 ] + %div44 = sdiv i32 %x, 2 + %cmp45 = 
icmp slt i32 %storemerge6, %div44 + br i1 %cmp45, label %for.body47, label %for.end54 + +for.body47: ; preds = %for.cond43 + %cmp48 = icmp slt i32 %x, 5 + br i1 %cmp48, label %l, label %for.inc52 + +for.inc52: ; preds = %for.body47 + %inc53 = add nuw nsw i32 %storemerge6, 1 + br label %for.cond43 + +for.end54: ; preds = %for.cond43 + %shl55 = mul i32 %conv, 8 + %add56 = add nsw i32 %ret.2, %shl55 + br label %p + +l: ; preds = %for.body47 + %shl57 = mul i32 %conv, 8 + %add58 = add nsw i32 %ret.2, %shl57 + br label %o + +o: ; preds = %l, %m + %storemerge1 = phi i32 [ %shl, %m ], [ %add58, %l ] + br label %for.cond60 + +for.cond60: ; preds = %for.body64, %o + %ret.4 = phi i32 [ %storemerge1, %o ], [ %add65, %for.body64 ] + %storemerge2 = phi i32 [ 0, %o ], [ %inc67, %for.body64 ] + %div61 = sdiv i32 %x, 2 + %cmp62 = icmp slt i32 %storemerge2, %div61 + br i1 %cmp62, label %for.body64, label %p + +for.body64: ; preds = %for.cond60 + %add65 = add nsw i32 %storemerge2, %ret.4 + %inc67 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond60 + +p: ; preds = %for.cond60, %for.end54 + %ret.5 = phi i32 [ %add56, %for.end54 ], [ %ret.4, %for.cond60 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.5, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization17, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization17 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCOND28PREHEADER:.+]], label %[[IFELSE:.+]] + +; CHECK: [[FORCOND28PREHEADER]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[FORCOND28PREHEADERELSE:.+]]: +; CHECK: br label %[[M:.+]] + +; CHECK: [[FORCOND28PREHEADERSPLIT:.+]]: +; CHECK: br label %[[FORCOND28:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: %[[CMP2:.+]] = icmp +; CHECK: br 
i1 %[[CMP2]], label %[[F:.+]], label %[[IFEND5:.+]] + +; CHECK: [[IFEND5]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[FORCOND28PREHEADERUNIFORM:.+]], label %[[IFELSEUNIFORM:.+]] + +; CHECK: [[IFELSEUNIFORM]]: +; CHECK: %[[CMP2UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP2UNIFORM]], label %[[FUNIFORM:.+]], label %[[IFEND5UNIFORM:.+]] + +; CHECK: [[IFEND5UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[IFEND5UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[IFEND5UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODYUNIFORM]], label %[[IFEND5UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFEND5UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[WHILEBODY]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[MLOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[MLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[MUNIFORM:.+]] + +; CHECK: [[FUNIFORM]]: +; CHECK: br label %[[FORCOND18UNIFORM:.+]] + +; CHECK: [[FORCOND18UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22UNIFORM:.+]], label %[[MLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY22UNIFORM]]: +; CHECK: br label %[[FORCOND18UNIFORM]] + +; CHECK: [[MLOOPEXITUNIFORM]]: +; CHECK: br label %[[MUNIFORM]] + +; CHECK: [[FORCOND28PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND28UNIFORM:.+]] + +; CHECK: [[FORCOND28UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY32UNIFORM:.+]], label %[[FOREND36UNIFORM:.+]] + +; CHECK: [[FORBODY32UNIFORM]]: +; CHECK: br label %[[FORCOND28UNIFORM]] + +; CHECK: [[FOREND36UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND43PREHEADERUNIFORM:.+]], label %[[FOREND36UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[FORCOND43PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND43UNIFORM:.+]] + +; CHECK: [[FOREND36UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[MUNIFORM]], label %[[FORCOND43PREHEADER:.+]] + +; CHECK: [[FORCOND43UNIFORM]]: +; CHECK: %[[CMP45UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP45UNIFORM]], label %[[FORBODY47UNIFORM:.+]], label %[[FOREND54UNIFORM:.+]] + +; CHECK: [[FORBODY47UNIFORM]]: +; CHECK: %[[CMP48UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP48UNIFORM]], label %[[LUNIFORM:.+]], label %[[FORINC52UNIFORM:.+]] + +; CHECK: [[FORINC52UNIFORM]]: +; CHECK: br label %[[FORCOND43UNIFORM]] + +; CHECK: [[FOREND54UNIFORM]]: +; CHECK: br label %[[PUNIFORM:.+]] + +; CHECK: [[LUNIFORM]]: +; CHECK: br label %[[OUNIFORM:.+]] + +; CHECK: [[MUNIFORM]]: +; CHECK: br label %[[OUNIFORM]] + +; CHECK: [[OUNIFORM]]: +; CHECK: br label %[[FORCOND60UNIFORM:.+]] + +; CHECK: [[FORCOND60UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY64UNIFORM:.+]], label %[[PLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY64UNIFORM]]: +; CHECK: br label %[[FORCOND60UNIFORM]] + +; CHECK: [[PLOOPEXITUNIFORM]]: +; CHECK: br label %[[P:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCONDPREHEADERELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[MLOOPEXIT2:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br 
label %[[FORCOND]] + +; CHECK: [[F]]: +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[FELSE]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND28PREHEADERELSE]], label %[[FORCOND28PREHEADERSPLIT]] + +; CHECK: [[FSPLIT]]: +; CHECK: br label %[[FORCOND18:.+]] + +; CHECK: [[FORCOND18]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22:.+]], label %[[MLOOPEXIT:.+]] + +; CHECK: [[FORBODY22]]: +; CHECK: br label %[[FORCOND18]] + +; CHECK: [[FORCOND28]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY32:.+]], label %[[FOREND36:.+]] + +; CHECK: [[FORBODY32]]: +; CHECK: br label %[[FORCOND28]] + +; CHECK: [[FOREND36]]: +; CHECK: br label %[[FORCOND43PREHEADER]] + +; CHECK: [[FORCOND43PREHEADER]]: +; CHECK: br label %[[FORCOND43:.+]] + +; CHECK: [[MLOOPEXIT]]: +; CHECK: br label %[[M]] + +; CHECK: [[MLOOPEXIT2]]: +; CHECK: br label %[[FORCONDPREHEADERELSE]] + +; CHECK: [[M]]: +; CHECK: br label %[[O:.+]] + +; CHECK: [[FORCOND43]]: +; CHECK: %[[CMP14:.+]] = icmp +; CHECK: br i1 %[[CMP14]], label %[[FORBODY47:.+]], label %[[FOREND54:.+]] + +; CHECK: [[FORBODY47]]: +; CHECK: %[[CMP48:.+]] = icmp +; CHECK: br i1 %[[CMP48]], label %[[L:.+]], label %[[FORINC52:.+]] + +; CHECK: [[FORINC52]]: +; CHECK: br label %[[FORCOND43]] + +; CHECK: [[FOREND54]]: +; CHECK: br label %[[M]] + +; CHECK: [[L]]: +; CHECK: br label %[[M]] + +; CHECK: [[O]]: +; CHECK: br label %[[FORCOND60:.+]] + +; CHECK: [[FORCOND60]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY64:.+]], label %[[PLOOPEXIT:.+]] + +; CHECK: [[FORBODY64]]: +; CHECK: br label %[[FORCOND60]] + +; CHECK: [[PLOOPEXIT]]: +; CHECK: br label %[[P]] + +; CHECK: [[P]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll new file mode 100644 index 0000000000000..f9868c86a2d0b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll @@ -0,0 +1,357 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization18 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <--. +; / \ | +; c d -' +; / \ | +; e f | +; | \| +; | g +; | / +; | h +; \ / \ +; i j +; \ / +; k +; +; * where nodes b, and h are uniform branches, and nodes c and d are varying +; branches. +; * where nodes e, f, g, i and k are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <--. .-> b' <--. 
+; / \ | | / \ | +; c d -' | c' d' -' +; / \__|\___' | | +; e f |`---> f' | +; | \| | | +; | g e' | +; | / \ / +; | h g' +; \ / \ | +; i j h' +; \ / / \ +; k | j' +; | \ / +; | i' +; | | +; `--> & <-- k' +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization18(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (n > 5) { +; if (id + i % 2 == 0) { +; goto e; +; } else { +; goto f; +; } +; } +; if (++i + id > 3) { +; goto g; +; } +; } +; +; f: +; for (int i = 0; i < n + 5; i++) ret += 2; +; goto g; +; +; g: +; for (int i = 1; i < n * 2; i++) ret *= i; +; goto h; +; +; e: +; for (int i = 0; i < n + 5; i++) ret++; +; goto i; +; +; h: +; if (n > 3) { +; i: +; ret++; +; } else { +; ret *= 3; +; } +; +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization18(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ] + %cmp = icmp sgt i32 %n, 5 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %while.body + %rem = and i32 %i.0, 1 + %add = sub nsw i32 0, %rem + %cmp2 = icmp eq i32 %conv, %add + br i1 %cmp2, label %for.cond26, label %for.cond + +if.end: ; preds = %while.body + %inc = add nuw nsw i32 %i.0, 1 + %add5 = add nsw i32 %inc, %conv + %cmp6 = icmp sgt i32 %add5, 3 + br i1 %cmp6, label %g, label %while.body + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ %add14, %for.body ], [ 0, %if.then ] + %storemerge2 = phi i32 [ %inc15, %for.body ], [ 0, %if.then ] + %add11 = add nsw i32 %n, 5 + %cmp12 = icmp slt i32 %storemerge2, %add11 + br i1 %cmp12, label %for.body, label %g + +for.body: ; preds = %for.cond + %add14 = add nuw nsw i32 %ret.0, 2 + %inc15 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond + +g: ; preds = %for.cond, %if.end + %ret.1 = phi i32 [ 0, %if.end ], [ %ret.0, %for.cond ] + br label %for.cond17 + +for.cond17: ; preds = %for.body20, %g + %ret.2 = phi i32 [ %ret.1, %g ], [ %mul21, %for.body20 ] + %storemerge = phi i32 [ 1, %g ], [ %inc23, %for.body20 ] + %mul = shl nsw i32 %n, 1 + %cmp18 = icmp slt i32 %storemerge, %mul + br i1 %cmp18, label %for.body20, label %h + +for.body20: ; preds = %for.cond17 + %mul21 = mul nsw i32 %storemerge, %ret.2 + %inc23 = add nuw nsw i32 %storemerge, 1 + br label %for.cond17 + +for.cond26: ; preds = %for.body30, %if.then + %ret.3 = phi i32 [ %inc31, %for.body30 ], [ 0, %if.then ] + %storemerge3 = phi i32 [ %inc33, %for.body30 ], [ 0, %if.then ] + %add27 = add nsw i32 %n, 5 + %cmp28 = icmp slt i32 %storemerge3, %add27 + br i1 %cmp28, label %for.body30, label %i38 + +for.body30: ; preds = %for.cond26 + %inc31 = add nuw nsw i32 %ret.3, 1 + %inc33 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond26 + +h: ; preds = %for.cond17 + %cmp35 = icmp sgt i32 %n, 3 + br i1 %cmp35, label %i38, label %if.else40 + +i38: ; preds = %h, %for.cond26 + %ret.4 = phi i32 [ %ret.3, %for.cond26 ], [ %ret.2, %h ] + %inc39 = add nsw i32 %ret.4, 1 + br label %if.end42 + +if.else40: ; preds = %h + %mul41 = mul nsw i32 %ret.2, 3 + br label %if.end42 + +if.end42: ; preds = %if.else40, %i38 + %storemerge1 = phi i32 [ %mul41, 
%if.else40 ], [ %inc39, %i38 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge1, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization18, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization18 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[IFTHENELSE:.+]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[IFTHENSPLIT:.+]]: +; CHECK: br label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND26PREHEADER:.+]]: +; CHECK: br label %[[FORCOND26:.+]] + +; CHECK: [[IFEND]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[GLOOPEXIT2:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[IFTHENUNIFORM:.+]], label %[[IFENDUNIFORM:.+]] + +; CHECK: [[IFENDUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[GLOOPEXIT1UNIFORM:.+]], label %[[IFENDUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[GLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[GUNIFORM:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODYUNIFORM]], label %[[IFENDUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[WHILEBODY]] + +; CHECK: [[IFTHENUNIFORM]] +; CHECK: br i1 %{{.+}}, label %[[FORCOND26PREHEADERUNIFORM:.+]], label %[[IFTHENUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label 
%[[GLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[GLOOPEXITUNIFORM]]: +; CHECK: br label %[[GUNIFORM]] + +; CHECK: [[FORCOND26PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND26UNIFORM:.+]] + +; CHECK: [[IFTHENUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[FORCOND26UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30UNIFORM:.+]], label %[[I38LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY30UNIFORM]]: +; CHECK: br label %[[FORCOND26UNIFORM]] + +; CHECK: [[I38LOOPEXITUNIFORM]]: +; CHECK: br label %[[I38UNIFORM:.+]] + +; CHECK: [[GUNIFORM]]: +; CHECK: br label %[[FORCOND17UNIFORM:.+]] + +; CHECK: [[FORCOND17UNIFORM]]: +; CHECK: %[[CMP18UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP18UNIFORM]], label %[[FORBODY20UNIFORM:.+]], label %[[HUNIFORM:.+]] + +; CHECK: [[FORBODY20UNIFORM]]: +; CHECK: br label %[[FORCOND17UNIFORM]] + +; CHECK: [[HUNIFORM]]: +; CHECK: %[[CMP35UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP35UNIFORM]], label %[[I38UNIFORM]], label %[[IFELSE40UNIFORM:.+]] + +; CHECK: [[IFELSE40UNIFORM]]: +; CHECK: br label %[[IFEND42UNIFORM:.+]] + +; CHECK: [[I38UNIFORM]]: +; CHECK: br label %[[IFEND42:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[GLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[GLOOPEXIT]]: +; CHECK: br label %[[FORCOND26PREHEADER]] + +; CHECK: [[GLOOPEXIT2]]: +; CHECK: br label %[[GLOOPEXIT2ELSE:.+]] + +; CHECK: [[GLOOPEXIT2ELSE]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHENELSE]], label %[[IFTHENSPLIT]] + +; CHECK: [[G]]: +; CHECK: br label %[[FORCOND17:.+]] + +; CHECK: [[FORCOND17]]: +; CHECK: %[[CMP18:.+]] = icmp +; CHECK: br i1 %[[CMP18]], label %[[FORBODY20:.+]], label %[[H:.+]] + +; CHECK: [[FORBODY20]]: +; CHECK: br label %[[FORCOND17]] + +; CHECK: [[FORCOND26]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30:.+]], label %[[I38LOOPEXIT:.+]] + +; CHECK: [[FORBODY30]]: +; CHECK: br label %[[FORCOND26]] + +; CHECK: [[H]]: +; CHECK: %[[CMP35:.+]] = icmp +; CHECK: br i1 %[[CMP35]], label %[[I38:.+]], label %[[IFELSE40:.+]] + +; CHECK: [[I38LOOPEXIT]]: +; CHECK: br label %[[G]] + +; CHECK: [[I38]]: +; CHECK: br label %[[IFEND42]] + +; CHECK: [[IFELSE40]]: +; CHECK: br label %[[I38]] + +; CHECK: [[IFEND42]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll new file mode 100644 index 0000000000000..37ca06b926eca --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll @@ -0,0 +1,379 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization19 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <----. +; / \ | +; c \ | +; / \ \ | +; d e f -' +; | | | +; \ \ g +; \ \ / \ +; \ h i <, +; \ \ / / +; \ j / +; \ / +; `-' +; +; * where nodes b, c, and g are uniform branches, and node f is a varying +; branch. +; * where nodes g, h, i and j are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <----. .---> b' <----. +; / \ | | / \ | +; c \ | | c' \ | +; / \ \ | | / \ \ | +; d e f -' | d' e' f' -' +; | | |\___' | | | +; \ \ g \ | / +; \ \ / \ \ | / +; \ h i <, \|/ +; \ \ / / g' +; \ j / | +; \ | / i' +; `-' | +; | h' +; | | +; `--> & <- j' +; +; where '&' represents merge blocks of BOSCC regions. +; +; The uniform branch `g` has been linearized because both its successors are +; divergent. Not linearizing `g` would mean that only one of both +; successors could be executed in addition to the other, pending a uniform +; condition evaluates to true, whereas what we want is to possibly execute both +; no matter what the uniform condition evaluates to. +; +; __kernel void partial_linearization19(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (n > 5) { +; if (n == 6) { +; goto d; +; } else { +; goto e; +; } +; } +; if (++i + id > 3) { +; break; +; } +; } +; +; // g +; if (n == 3) { +; goto h; +; } else { +; goto i; +; } +; +; d: +; for (int i = 0; i < n + 5; i++) ret += 2; +; goto i; +; +; e: +; for (int i = 1; i < n * 2; i++) ret += i; +; goto h; +; +; i: +; for (int i = 0; i < n + 5; i++) ret++; +; goto j; +; +; h: +; for (int i = 0; i < n; i++) ret++; +; goto j; +; +; j: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization19(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ] + %cmp = icmp sgt i32 %n, 5 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %while.body + %cmp2 = icmp eq i32 %n, 6 + br i1 %cmp2, label %for.cond, label %for.cond20 + +if.end: ; preds = %while.body + %inc = add nuw nsw i32 %i.0, 1 + %add = add nsw i32 %inc, %conv + %cmp5 = icmp sgt i32 %add, 3 + br i1 %cmp5, label %while.end, label %while.body + +while.end: ; preds = %if.end + %cmp9 = icmp eq i32 %n, 3 + br i1 %cmp9, label %h, label %i28 + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ %add17, %for.body ], [ 0, %if.then ] + %storemerge3 = phi i32 [ %inc18, %for.body ], [ 0, %if.then ] + %add14 = add nsw i32 %n, 5 + %cmp15 = icmp slt i32 %storemerge3, %add14 + br i1 %cmp15, label %for.body, label %i28 + +for.body: ; preds = %for.cond + %add17 = add nuw nsw i32 %ret.0, 2 + %inc18 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond + +for.cond20: ; preds = %for.body23, %if.then + %ret.1 = phi i32 [ %add24, %for.body23 ], [ 0, %if.then ] + %storemerge2 = phi i32 [ %inc26, %for.body23 ], [ 1, %if.then ] + %mul = shl nsw i32 %n, 1 + %cmp21 = icmp slt i32 
%storemerge2, %mul + br i1 %cmp21, label %for.body23, label %h + +for.body23: ; preds = %for.cond20 + %add24 = add nuw nsw i32 %storemerge2, %ret.1 + %inc26 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond20 + +i28: ; preds = %for.cond, %while.end + %ret.2 = phi i32 [ 0, %while.end ], [ %ret.0, %for.cond ] + br label %for.cond30 + +for.cond30: ; preds = %for.body34, %i28 + %ret.3 = phi i32 [ %ret.2, %i28 ], [ %inc35, %for.body34 ] + %storemerge = phi i32 [ 0, %i28 ], [ %inc37, %for.body34 ] + %add31 = add nsw i32 %n, 5 + %cmp32 = icmp slt i32 %storemerge, %add31 + br i1 %cmp32, label %for.body34, label %j + +for.body34: ; preds = %for.cond30 + %inc35 = add nuw nsw i32 %ret.3, 1 + %inc37 = add nuw nsw i32 %storemerge, 1 + br label %for.cond30 + +h: ; preds = %for.cond20, %while.end + %ret.4 = phi i32 [ 0, %while.end ], [ %ret.1, %for.cond20 ] + br label %for.cond40 + +for.cond40: ; preds = %for.body43, %h + %ret.5 = phi i32 [ %ret.4, %h ], [ %inc44, %for.body43 ] + %storemerge1 = phi i32 [ 0, %h ], [ %inc46, %for.body43 ] + %cmp41 = icmp slt i32 %storemerge1, %n + br i1 %cmp41, label %for.body43, label %j + +for.body43: ; preds = %for.cond40 + %inc44 = add nsw i32 %ret.5, 1 + %inc46 = add nuw nsw i32 %storemerge1, 1 + br label %for.cond40 + +j: ; preds = %for.cond40, %for.cond30 + %ret.6 = phi i32 [ %ret.3, %for.cond30 ], [ %ret.5, %for.cond40 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization19, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization19 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: %[[CMP2:.+]] = icmp +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] 
+ +; CHECK: [[IFTHENELSE:.+]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[IFTHENSPLIT:.+]]: +; CHECK: br i1 %[[CMP2MERGE:.+]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND20PREHEADER:.+]] + +; CHECK: [[FORCOND20PREHEADER]]: +; CHECK: br label %[[FORCOND20:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[IFEND]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: %[[CMP2MERGE]] = phi i1 [ %[[CMP2]], %[[IFTHEN]] ], [ false, %[[IFEND]] ] +; CHECK: br label %[[WHILEEND:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[IFTHENUNIFORM:.+]], label %[[IFENDUNIFORM:.+]] + +; CHECK: [[IFENDUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEENDUNIFORM:.+]], label %[[IFENDUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[WHILEENDUNIFORM]]: +; CHECK: %[[CMP9UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP9UNIFORM]], label %[[HUNIFORM:.+]], label %[[I28UNIFORM:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODYUNIFORM]], label %[[IFENDUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[WHILEBODY]] + +; CHECK: [[IFTHENUNIFORM]]: +; CHECK: %[[CMP2UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP2UNIFORM]], label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[FORCOND20PREHEADERUNIFORM:.+]] + +; CHECK: [[FORCOND20PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND20UNIFORM:.+]] + +; CHECK: [[FORCOND20UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY23UNIFORM:.+]], label %[[HLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY23UNIFORM]]: +; CHECK: br label %[[FORCOND20UNIFORM]] + +; CHECK: [[HLOOPEXITUNIFORM]]: +; CHECK: br label %[[HUNIFORM]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[I28LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[I28LOOPEXITUNIFORM]]: +; CHECK: br label %[[I28UNIFORM]] + +; CHECK: [[HUNIFORM]]: +; CHECK: br label %[[FORCOND40UNIFORM:.+]] + +; CHECK: [[FORCOND40UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY43UNIFORM:.+]], label %[[JLOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODY43UNIFORM]]: +; CHECK: br label %[[FORCOND40UNIFORM]] + +; CHECK: [[JLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[JUNIFORM:.+]] + +; CHECK: [[I28UNIFORM]]: +; CHECK: br label %[[FORCOND30UNIFORM:.+]] + +; CHECK: [[FORCOND30UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY34UNIFORM:.+]], label %[[JLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY34UNIFORM]]: +; CHECK: br label %[[FORCOND30UNIFORM]] + +; CHECK: [[JLOOPEXITUNIFORM]]: +; CHECK: br label %[[J:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: br label %[[WHILEENDELSE:.+]] + +; CHECK: [[WHILEENDELSE]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHENELSE]], label %[[IFTHENSPLIT]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[I28LOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND20]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY23:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY23]]: +; CHECK: br label %[[FORCOND20]] + +; CHECK: [[I28LOOPEXIT]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[I28:.+]]: +; CHECK: br label %[[FORCOND30:.+]] + +; CHECK: [[FORCOND30]]: +; CHECK: br i1 
{{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY34:.+]], label %[[JLOOPEXIT:.+]] + +; CHECK: [[FORBODY34]]: +; CHECK: br label %[[FORCOND30]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[H]]: +; CHECK: br label %[[FORCOND40:.+]] + +; CHECK: [[FORCOND40]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY43:.+]], label %[[JLOOPEXIT2:.+]] + +; CHECK: [[FORBODY43]]: +; CHECK: br label %[[FORCOND40]] + +; CHECK: [[JLOOPEXIT]]: +; CHECK: br label %[[J]] + +; CHECK: [[JLOOPEXIT2]]: +; CHECK: br label %[[I28]] + +; CHECK: [[J]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll new file mode 100644 index 0000000000000..401dfd4781787 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll @@ -0,0 +1,340 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization2 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; / \ +; / \ +; b c +; / \ / \ +; d e f g +; \ \ / / +; \ X / +; \ / \ / +; h i +; \ / +; j +; +; * where node a is a uniform branch, and nodes b and c are varying branches. +; * where nodes d, e, f, g are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; / \ +; / \ +; / \ +; b__ c________ +; / \ \___/_\___ \ +; d e f g `e' g' +; \ \ / / | | +; \ X / d' f' +; \ / \ / \ / +; h i i' +; \ / | +; j h' +; \ | +; `--> & <- j' +; +; where '&' represents merge blocks of BOSCC regions. 
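+;
+; For reference, lit substitutes %s with this file's path, so the RUN
+; line above corresponds to roughly the following manual invocation
+; (assuming veczc and FileCheck are on PATH):
+;
+;   veczc -k partial_linearization2 \
+;     -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" \
+;     -vecz-choices=LinearizeBOSCC -S < partial_linearization2.ll \
+;     | FileCheck partial_linearization2.ll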
+; +; __kernel void partial_linearization2(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (n < 10) { // uniform +; if (id % 3 == 0) { // varying +; for (int i = 0; i < n - 1; i++) { ret /= 2; } goto h; +; } else { // varying +; for (int i = 0; i < n / 3; i++) { ret -= 2; } goto i; +; } +; } else { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n * 2; i++) { ret += 1; } goto h; +; } else { // varying +; for (int i = 0; i < n + 5; i++) { ret *= 2; } goto i; +; } +; } +; +; h: +; ret += 5; +; goto end; +; +; i: +; ret *= 10; +; goto end; +; +; end: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization2(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 10 + br i1 %cmp, label %if.then, label %if.else17 + +if.then: ; preds = %entry + %rem = srem i32 %conv, 3 + %cmp2 = icmp eq i32 %rem, 0 + br i1 %cmp2, label %if.then4, label %if.else + +if.then4: ; preds = %if.then + br label %for.cond + +for.cond: ; preds = %for.body, %if.then4 + %ret.0 = phi i32 [ 0, %if.then4 ], [ %div, %for.body ] + %storemerge5 = phi i32 [ 0, %if.then4 ], [ %inc, %for.body ] + %sub = add nsw i32 %n, -1 + %cmp5 = icmp slt i32 %storemerge5, %sub + br i1 %cmp5, label %for.body, label %h + +for.body: ; preds = %for.cond + %div = sdiv i32 %ret.0, 2 + %inc = add nsw i32 %storemerge5, 1 + br label %for.cond + +if.else: ; preds = %if.then + br label %for.cond8 + +for.cond8: ; preds = %for.body12, %if.else + %ret.1 = phi i32 [ 0, %if.else ], [ %sub13, %for.body12 ] + %storemerge4 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div9 = sdiv i32 %n, 3 + %cmp10 = icmp slt i32 %storemerge4, %div9 + br i1 %cmp10, label %for.body12, label %i42 + +for.body12: ; preds = %for.cond8 + %sub13 = add nsw i32 %ret.1, -2 + %inc15 = add nsw i32 %storemerge4, 1 + br label %for.cond8 + +if.else17: ; preds = %entry + %rem181 = and i32 %conv, 1 + %cmp19 = icmp eq i32 %rem181, 0 + br i1 %cmp19, label %if.then21, label %if.else30 + +if.then21: ; preds = %if.else17 + br label %for.cond23 + +for.cond23: ; preds = %for.body26, %if.then21 + %ret.2 = phi i32 [ 0, %if.then21 ], [ %add, %for.body26 ] + %storemerge3 = phi i32 [ 0, %if.then21 ], [ %inc28, %for.body26 ] + %mul = shl nsw i32 %n, 1 + %cmp24 = icmp slt i32 %storemerge3, %mul + br i1 %cmp24, label %for.body26, label %h + +for.body26: ; preds = %for.cond23 + %add = add nsw i32 %ret.2, 1 + %inc28 = add nsw i32 %storemerge3, 1 + br label %for.cond23 + +if.else30: ; preds = %if.else17 + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.else30 + %ret.3 = phi i32 [ 0, %if.else30 ], [ %mul37, %for.body36 ] + %storemerge = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ] + %add33 = add nsw i32 %n, 5 + %cmp34 = icmp slt i32 %storemerge, %add33 + br i1 %cmp34, label %for.body36, label %i42 + +for.body36: ; preds = %for.cond32 + %mul37 = shl nsw i32 %ret.3, 1 + %inc39 = add nsw i32 %storemerge, 1 + br label %for.cond32 + +h: ; preds = %for.cond23, %for.cond + %ret.4 = phi i32 [ %ret.0, %for.cond ], [ %ret.2, %for.cond23 ] + %add41 = add nsw i32 %ret.4, 5 + br label %end + +i42: ; preds = %for.cond32, %for.cond8 + %ret.5 = phi i32 [ %ret.1, %for.cond8 ], [ %ret.3, %for.cond32 ] + %mul43 = mul nsw i32 %ret.5, 10 + br label 
%end + +end: ; preds = %i42, %h + %storemerge2 = phi i32 [ %mul43, %i42 ], [ %add41, %h ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization2, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization2 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE17:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[IFTHENBOSCCINDIR:.+]] + +; CHECK: [[FORCOND8PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND8UNIFORM:.+]] + +; CHECK: [[FORCOND8UNIFORM]]: +; CHECK: %[[CMP10UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FORBODY12UNIFORM:.+]], label %[[I42LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY12UNIFORM]]: +; CHECK: br label %[[FORCOND8UNIFORM]] + +; CHECK: [[I42LOOPEXITUNIFORM]]: +; CHECK: br label %[[I42UNIFORM:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[IFTHENBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND8PREHEADERUNIFORM]], label %[[FORCOND8PREHEADER:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: %[[CMP5UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP5UNIFORM]], label %[[FORBODYUNIFORM:.+]], label %[[HLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[HLOOPEXITUNIFORM]]: +; CHECK: br label %[[HUNIFORM:.+]] + +; CHECK: [[FORCOND8PREHEADER]]: +; CHECK: br label %[[FORCOND8:.+]] + +; CHECK: [[FORCONDPREHEADER:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND8]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[I42LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND8]] + +; CHECK: [[IFELSE17]]: +; CHECK: br i1 %{{.+}}, label 
%[[FORCOND23PREHEADERUNIFORM:.+]], label %[[IFELSE17BOSCCINDIR:.+]] + +; CHECK: [[FORCOND32PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND32UNIFORM:.+]] + +; CHECK: [[FORCOND32UNIFORM]]: +; CHECK: br i1 false, label %[[FORBODY36UNIFORM:.+]], label %[[I42LOOPEXIT2UNIFORM:.+]] + +; CHECK: [[FORBODY36UNIFORM]]: +; CHECK: br label %[[FORCOND32UNIFORM]] + +; CHECK: [[I42LOOPEXIT2UNIFORM]]: +; CHECK: br label %[[I42UNIFORM]] + +; CHECK: [[FORCOND23PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND23UNIFORM:.+]] + +; CHECK: [[IFELSE17BOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND32PREHEADERUNIFORM]], label %[[FORCOND32PREHEADER:.+]] + +; CHECK: [[FORCOND23UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26UNIFORM:.+]], label %[[HLOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODY26UNIFORM]]: +; CHECK: br label %[[FORCOND23UNIFORM]] + +; CHECK: [[HLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[HUNIFORM]] + +; CHECK: [[I42UNIFORM]]: +; CHECK: br label %[[ENDUNIFORM:.+]] + +; CHECK: [[HUNIFORM]]: +; CHECK: br label %[[END:.+]] + +; CHECK: [[FORCOND32PREHEADER]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[FORCOND23PREHEADER:.+]]: +; CHECK: br label %[[FORCOND23:.+]] + +; CHECK: [[FORCOND23]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26:.+]], label %[[HLOOPEXIT1:.+]] + +; CHECK: [[FORBODY26]]: +; CHECK: br label %[[FORCOND23]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 false, label %[[FORBODY36:.+]], label %[[I42LOOPEXIT2:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[I42:.+]] + +; CHECK: [[HLOOPEXIT1]]: +; CHECK: br label %[[I42]] + +; CHECK: [[H:.+]]: +; CHECK: br label %[[END]] + +; CHECK: [[I42LOOPEXIT]]: +; CHECK: br label %[[FORCONDPREHEADER]] + +; CHECK: [[I42LOOPEXIT2]]: +; CHECK: br label %[[FORCOND23PREHEADER]] + +; CHECK: [[I42]]: +; CHECK: br label %[[H]] + +; CHECK: [[END]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll new file mode 100644 index 0000000000000..9e7184f5507ce --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll @@ -0,0 +1,288 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization20 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <--------. +; / \ | +; | c | +; | / \ | +; | f h <--. 
| +; | | / \ | | +; | | | d -' | +; | | | | | +; | | | e ---' +; | | | / +; | | | / +; | | |/ +; | | / +; \|/ +; g +; +; * where nodes b, d, and e are uniform branches, and node h is a varying +; branch. +; * where nodes b, d and g are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <--------. b' <--. +; / \ | | | +; | c | .-. c' | +; | / \ | | \/| | +; | f h <--. | | / h' <. | +; | | / \ | | | f' | | | +; | | | d -' | | | d' -' | +; | | | |\___|_' | | | +; | | | e ---' | e' ---' +; | | | / \ | +; | | | / \| +; | | |/ g' +; | | / | +; \|/ / +; g ----> & <-----' +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization20(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0 && n < 5) { +; goto g; +; } +; if (n == 6) { +; goto f; +; } +; while (1) { +; if (ret++ + id >= n) { +; goto d; +; } +; if (n & 1) { +; goto g; +; } +; +; d: +; if (n > 3) { +; goto e; +; } +; } +; e: +; if (n & 1) { +; goto g; +; } +; } +; +; f: +; for (int i = 0; i < n + 1; i++) ret++; +; g: +; out[id] = ret; +; } + +; ModuleID = 'kernel.opencl' +source_filename = "kernel.opencl" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization20(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %e, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %inc, %e ] + %n.off = add i32 %n, -1 + %0 = icmp ult i32 %n.off, 4 + br i1 %0, label %g, label %if.end + +if.end: ; preds = %while.body + %cmp4 = icmp eq i32 %n, 6 + br i1 %cmp4, label %for.cond, label %while.body9 + +while.body9: ; preds = %d, %if.end + %ret.1 = phi i32 [ %ret.0, %if.end ], [ %inc, %d ] + %inc = add nsw i32 %ret.1, 1 + %add = add nsw i32 %ret.1, %conv + %cmp10 = icmp sge i32 %add, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + %or.cond1 = or i1 %tobool, %cmp10 + br i1 %or.cond1, label %d, label %g + +d: ; preds = %while.body9 + %cmp16 = icmp sgt i32 %n, 3 + br i1 %cmp16, label %e, label %while.body9 + +e: ; preds = %d + %and20 = and i32 %n, 1 + %tobool21 = icmp eq i32 %and20, 0 + br i1 %tobool21, label %while.body, label %g + +for.cond: ; preds = %for.body, %if.end + %ret.2 = phi i32 [ %inc27, %for.body ], [ %ret.0, %if.end ] + %storemerge = phi i32 [ %inc28, %for.body ], [ 0, %if.end ] + %cmp25 = icmp sgt i32 %storemerge, %n + br i1 %cmp25, label %g, label %for.body + +for.body: ; preds = %for.cond + %inc27 = add nsw i32 %ret.2, 1 + %inc28 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +g: ; preds = %for.cond, %e, %while.body9, %while.body + %ret.3 = phi i32 [ %ret.0, %while.body ], [ %inc, %e ], [ %ret.2, %for.cond ], [ %inc, %while.body9 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" 
"no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization20, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization20 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[IFEND:.+]] + +; CHECK: [[IFEND]]: +; CHECK: %[[CMP4:.+]] = icmp +; CHECK: br i1 %[[CMP4]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODY9PREHEADER:.+]] + +; CHECK: [[WHILEBODY9PREHEADER]]: +; CHECK: br label %[[WHILEBODY9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[FORCONDPREHEADERELSE:.+]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[FORCONDPREHEADERSPLIT:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[WHILEBODY9]]: +; CHECK: br label %[[D:.+]] + +; CHECK: [[WHILEBODYUNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[GLOOPEXIT2UNIFORM:.+]], label %[[IFENDUNIFORM:.+]] + +; CHECK: [[IFENDUNIFORM]]: +; CHECK: %[[CMP4UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP4UNIFORM]], label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[WHILEBODY9PREHEADERUNIFORM:.+]] + +; CHECK: [[WHILEBODY9PREHEADERUNIFORM]]: +; CHECK: br label %[[WHILEBODY8UNIFORM:.+]] + +; CHECK: [[WHILEBODY9UNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[DUNIFORM:.+]], label %[[WHILEBODY9UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[DUNIFORM]]: +; CHECK: %[[CMP16UNIFORM:.+]] = icmp +; CHECK: br i1 %{{.+}}, label %[[EUNIFORM:.+]], label %[[WHILEBODY9UNIFORM]] + +; CHECK: [[WHILEBODY9UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[GLOOPEXIT1UNIFORM:.+]], label %[[WHILEBODY9UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[WHILEBODY9UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[D]] + +; CHECK: [[EUNIFORM]]: +; CHECK: %[[TOBOOL21UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOL21UNIFORM]], label %[[WHILEBODYUNIFORM]], label %[[GLOOPEXIT2UNIFORM]] + + +; CHECK: [[GLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[GUNIFORM:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[GLOOPEXITUNIFORM:.+]], label %[[FORBODYUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[GLOOPEXITUNIFORM]]: +; CHECK: br label %[[GUNIFORM]] + +; CHECK: [[GLOOPEXIT2UNIFORM]]: +; CHECK: br label %[[G]] + +; CHECK: [[D]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY9]], label 
%[[WHILEBODY9PUREEXIT:.+]] + +; CHECK: [[WHILEBODY9PUREEXIT]]: +; CHECK: br label %[[E:.+]] + +; CHECK: [[E]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[GLOOPEXIT1:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[GLOOPEXIT:.+]], label %[[FORBODY:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[GLOOPEXIT]]: +; CHECK: br label %[[G]] + +; CHECK: [[GLOOPEXIT1]]: +; CHECK: br label %[[GLOOPEXIT1ELSE:.+]] + +; CHECK: [[GLOOPEXIT1ELSE]]: +; CHECK: br label %[[GLOOPEXIT2:.+]] + +; CHECK: [[GLOOPEXIT2]]: +; CHECK: br label %[[GLOOPEXIT2ELSE:.+]] + +; CHECK: [[GLOOPEXIT2ELSE]]: +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERELSE]], label %[[FORCONDPREHEADERSPLIT]] + +; CHECK: [[G]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll new file mode 100644 index 0000000000000..a91c3e08f752f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll @@ -0,0 +1,239 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization21 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <------. +; / \ | +; | c <--. | +; | / \ | | +; | | d -' | +; | | / \ | +; | | | e -' +; | | | / +; | | | / +; | | |/ +; | | / +; \|/ +; f +; +; * where nodes b, d, and e are uniform branches, and node c is a varying +; branch. +; * where nodes b, d, e and f are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <------. b' <--. +; / \ | | | +; | c <--. | c' <. | +; | / \___|_|__ | | | +; | | d -' | `d' -' | +; | | / \ | | | +; | | | e -' e' ---' +; | | | / | +; | | | / f' +; | | |/ | +; | | / | +; \|/ / +; f --> & <--' +; +; where '&' represents merge blocks of BOSCC regions. 
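+;
+; Reading the diagram: the primed blocks (b', c', ...) are clones taken while
+; every lane agrees on each varying branch, and the unprimed blocks form the
+; linearized fallback; the BOSCCINDIR branches checked below guard the
+; transition between the two. A minimal scalar sketch of that guard in C,
+; with a lane count of 4 to match the __vecz_v4 prefix (the helper name is
+; illustrative, not part of vecz's API):
+;
+;   #include <stdbool.h>
+;
+;   /* True when some, but not all, lanes take the branch: the uniform
+;      clone can no longer be used, so control transfers to the
+;      linearized region. */
+;   bool lanes_diverge(const bool lane_cond[4]) {
+;     bool any = lane_cond[0] || lane_cond[1] || lane_cond[2] || lane_cond[3];
+;     bool all = lane_cond[0] && lane_cond[1] && lane_cond[2] && lane_cond[3];
+;     return any && !all;
+;   }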
+; +; __kernel void partial_linearization21(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0 && n < 5) { +; goto f; +; } +; while (1) { +; if (n <= 2) { +; goto f; +; } else { +; if (ret + id >= n) { +; goto d; +; } +; } +; if (n & 1) { +; goto f; +; } +; +; d: +; if (n > 3) { +; goto e; +; } +; } +; +; e: +; if (n & 1) { +; goto f; +; } +; } +; +; f: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization21(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %e, %entry + %n.off = add i32 %n, -1 + %0 = icmp ult i32 %n.off, 4 + %cmp6 = icmp slt i32 %n, 3 + %or.cond1 = or i1 %cmp6, %0 + br i1 %or.cond1, label %f, label %if.else + +while.body5: ; preds = %d + %cmp6.old = icmp eq i32 %n, 3 + br i1 %cmp6.old, label %if.else, label %f + +if.else: ; preds = %while.body5, %while.body + %cmp9 = icmp sge i32 %conv, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + %or.cond2 = or i1 %tobool, %cmp9 + br i1 %or.cond2, label %d, label %f + +d: ; preds = %if.else + %cmp16 = icmp sgt i32 %n, 3 + br i1 %cmp16, label %e, label %while.body5 + +e: ; preds = %d + %and20 = and i32 %n, 1 + %tobool21 = icmp eq i32 %and20, 0 + br i1 %tobool21, label %while.body, label %f + +f: ; preds = %e, %if.else, %while.body5, %while.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 0, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization21, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization21 +; CHECK: br i1 true, label 
%[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[IFELSEPREHEADER:.+]] + +; CHECK: [[IFELSEPREHEADER]]: +; CHECK: br label %[[IFELSE:.+]] + +; CHECK: [[WHILEBODY5:.+]]: + +; CHECK: br i1 %{{.+}}, label %[[IFELSE]], label %[[IFELSEPUREEXIT:.+]] + +; CHECK: [[IFELSEPUREEXIT]]: +; CHECK: br label %[[E:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[D:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: %[[CMP6UNIFORM:cmp.+]] = icmp +; CHECK: %[[ORCOND1UNIFORM:.+]] = or i1 %[[CMP6UNIFORM]] +; CHECK: br i1 %[[ORCOND1UNIFORM]], label %[[FLOOPEXIT1UNIFORM:.+]], label %[[IFELSEPREHEADERUNIFORM:.+]] + +; CHECK: [[IFELSEPREHEADERUNIFORM]]: +; CHECK: br label %[[IFELSEUNIFORM:.+]] + +; CHECK: [[IFELSEUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[DUNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[DUNIFORM]]: +; CHECK: %[[CMP16UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP16UNIFORM]], label %[[EUNIFORM:.+]], label %[[WHILEBODY5UNIFORM:.+]] + +; CHECK: [[IFELSEUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FLOOPEXITUNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSEUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[D]] + +; CHECK: [[WHILEBODY5UNIFORM]]: +; CHECK: %[[CMP6OLDUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP6OLDUNIFORM]], label %[[IFELSEUNIFORM]], label %[[FLOOPEXITUNIFORM]] + +; CHECK: [[EUNIFORM]]: +; CHECK: %[[TOBOOL21UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOL21UNIFORM]], label %[[WHILEBODYUNIFORM]], label %[[FLOOPEXIT1UNIFORM]] + + +; CHECK: [[FLOOPEXITUNIFORM]]: +; CHECK: br label %[[FUNIFORM:.+]] + +; CHECK: [[FLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[D]]: +; CHECK: br label %[[WHILEBODY5]] + +; CHECK: [[E]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[FLOOPEXIT:.+]] + +; CHECK: [[FLOOPEXIT]]: +; CHECK: br label %[[FLOOPEXITELSE:.+]] + +; CHECK: [[FLOOPEXITELSE]]: +; CHECK: br label %[[FLOOPEXIT1:.+]] + +; CHECK: [[FLOOPEXIT1]]: +; CHECK: br label %[[F]] + +; CHECK: [[F]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll new file mode 100644 index 0000000000000..acd9dcba0bb7e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll @@ -0,0 +1,332 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; / \ +; / \ +; b c +; / \ / \ +; d e f g +; \ \ / / +; \ h / +; \ \ / +; \ i +; \ / +; j +; +; * where node a is a uniform branch, and nodes b and c are varying branches. +; * where nodes d, e, f, g, i and j are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; / \ +; / \ +; / \ +; b__ c________ +; / \ \___/_\___ \ +; d e f g `e' g' +; | \ / / | | +; j h / d' f' +; | \ / \ / +; | i h' +; | | | +; | `--> & <- i' +; | | +; `---> & <-- j' +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization3(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (n < 10) { // uniform +; if (id % 3 == 0) { // varying +; for (int i = 0; i < n - 1; i++) { ret /= 2; } goto end; +; } else { // varying +; for (int i = 0; i < n / 3; i++) { ret -= 2; } goto h; +; } +; } else { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n * 2; i++) { ret += 1; } goto h; +; } else { // varying +; for (int i = 0; i < n + 5; i++) { ret *= 2; } goto i; +; } +; } +; +; h: +; ret += 5; +; +; i: +; ret *= 10; +; +; end: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization3(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 10 + br i1 %cmp, label %if.then, label %if.else17 + +if.then: ; preds = %entry + %rem = srem i32 %conv, 3 + %cmp2 = icmp eq i32 %rem, 0 + br i1 %cmp2, label %if.then4, label %if.else + +if.then4: ; preds = %if.then + br label %for.cond + +for.cond: ; preds = %for.body, %if.then4 + %ret.0 = phi i32 [ 0, %if.then4 ], [ %div, %for.body ] + %storemerge4 = phi i32 [ 0, %if.then4 ], [ %inc, %for.body ] + %sub = add nsw i32 %n, -1 + %cmp5 = icmp slt i32 %storemerge4, %sub + br i1 %cmp5, label %for.body, label %end + +for.body: ; preds = %for.cond + %div = sdiv i32 %ret.0, 2 + %inc = add nsw i32 %storemerge4, 1 + br label %for.cond + +if.else: ; preds = %if.then + br label %for.cond8 + +for.cond8: ; preds = %for.body12, %if.else + %ret.1 = phi i32 [ 0, %if.else ], [ %sub13, %for.body12 ] + %storemerge3 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div9 = sdiv i32 %n, 3 + %cmp10 = icmp slt i32 %storemerge3, %div9 + br i1 %cmp10, label %for.body12, label %h + +for.body12: ; preds = %for.cond8 + %sub13 = add nsw i32 %ret.1, -2 + %inc15 = add nsw i32 %storemerge3, 1 + br label %for.cond8 + +if.else17: ; preds = %entry + %rem181 = and i32 %conv, 1 + %cmp19 = icmp eq i32 %rem181, 0 + br i1 %cmp19, label %if.then21, label %if.else30 + +if.then21: ; preds = %if.else17 + br label %for.cond23 + +for.cond23: ; preds = %for.body26, %if.then21 + %ret.2 = phi i32 [ 0, %if.then21 ], [ %add, %for.body26 ] + %storemerge2 = phi i32 [ 0, %if.then21 ], [ %inc28, %for.body26 ] + %mul = shl nsw i32 %n, 1 + %cmp24 = icmp slt i32 %storemerge2, %mul + br i1 %cmp24, label %for.body26, label %h + +for.body26: ; preds = %for.cond23 + %add = add nsw i32 %ret.2, 1 + %inc28 
= add nsw i32 %storemerge2, 1 + br label %for.cond23 + +if.else30: ; preds = %if.else17 + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.else30 + %ret.3 = phi i32 [ 0, %if.else30 ], [ %mul37, %for.body36 ] + %storemerge = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ] + %add33 = add nsw i32 %n, 5 + %cmp34 = icmp slt i32 %storemerge, %add33 + br i1 %cmp34, label %for.body36, label %i42 + +for.body36: ; preds = %for.cond32 + %mul37 = shl nsw i32 %ret.3, 1 + %inc39 = add nsw i32 %storemerge, 1 + br label %for.cond32 + +h: ; preds = %for.cond23, %for.cond8 + %ret.4 = phi i32 [ %ret.1, %for.cond8 ], [ %ret.2, %for.cond23 ] + %add41 = add nsw i32 %ret.4, 5 + br label %i42 + +i42: ; preds = %h, %for.cond32 + %ret.5 = phi i32 [ %add41, %h ], [ %ret.3, %for.cond32 ] + %mul43 = mul nsw i32 %ret.5, 10 + br label %end + +end: ; preds = %i42, %for.cond + %ret.6 = phi i32 [ %mul43, %i42 ], [ %ret.0, %for.cond ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization3, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization3 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE17:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[IFTHENBOSCCINDIR:.+]] + +; CHECK: [[FORCOND8PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND8UNIFORM:.+]] + +; CHECK: [[FORCOND8UNIFORM]]: +; CHECK: %[[CMP10UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FORBODY12UNIFORM:.+]], label %[[HLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY12UNIFORM]]: +; CHECK: br label %[[FORCOND8UNIFORM]] + +; CHECK: [[HLOOPEXITUNIFORM]]: +; CHECK: br label %[[HUNIFORM:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[IFTHENBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND8PREHEADERUNIFORM]], label %[[FORCOND8PREHEADER:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: %[[CMP5UNIFORM:.+]] = icmp +; 
CHECK: br i1 %[[CMP5UNIFORM]], label %[[FORBODYUNIFORM:.+]], label %[[ENDLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[ENDLOOPEXITUNIFORM]]: +; CHECK: br label %[[END:.+]] + +; CHECK: [[FORCOND8PREHEADER]]: +; CHECK: br label %[[FORCOND8:.+]] + +; CHECK: [[FORCONDPREHEADER:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[EXITCOND:.+]] = icmp +; CHECK: br i1 %[[EXITCOND]], label %[[FORBODY:.+]], label %[[ENDLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND8]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND8]] + +; CHECK: [[IFELSE17]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND23PREHEADERUNIFORM:.+]], label %[[IFELSE17BOSCCINDIR:.+]] + +; CHECK: [[FORCOND32PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND32UNIFORM:.+]] + +; CHECK: [[FORCOND32UNIFORM]]: +; CHECK: br i1 false, label %[[FORBODY36UNIFORM:.+]], label %[[ENDLOOPEXIT2UNIFORM:.+]] + +; CHECK: [[FORBODY36UNIFORM]]: +; CHECK: br label %[[FORCOND32UNIFORM]] + +; CHECK: [[ENDLOOPEXIT2UNIFORM]]: +; CHECK: br label %[[END]] + +; CHECK: [[FORCOND23PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND23UNIFORM:.+]] + +; CHECK: [[IFELSE17BOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND32PREHEADERUNIFORM:.+]], label %[[FORCOND32PREHEADER:.+]] + +; CHECK: [[FORCOND23UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26UNIFORM:.+]], label %[[HLOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODY26UNIFORM]]: +; CHECK: br label %[[FORCOND23UNIFORM]] + +; CHECK: [[HLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[HUNIFORM]] + +; CHECK: [[HUNIFORM]]: +; CHECK: br label %[[END]] + +; CHECK: [[FORCOND32PREHEADER]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[FORCOND23PREHEADER:.+]]: +; CHECK: br label %[[FORCOND23:.+]] + +; CHECK: [[FORCOND23]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26:.+]], label %[[HLOOPEXIT1:.+]] + +; CHECK: [[FORBODY26]]: +; CHECK: br label %[[FORCOND23]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 false, label %[[FORBODY36:.+]], label %[[ENDLOOPEXIT2:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[FORCONDPREHEADER]] + +; CHECK: [[HLOOPEXIT1]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[H]]: +; CHECK: br label %[[I42:.+]] + +; CHECK: [[ENDLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[ENDLOOPEXIT2]]: +; CHECK: br label %[[FORCOND23PREHEADER]] + +; CHECK: [[END]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll new file mode 100644 index 0000000000000..5c6f686043c6f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll @@ -0,0 +1,219 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization4 -vecz-passes=cfg-convert,cleanup-divergence -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-. +; / \ | +; e c | +; | / \| +; | f d +; |/ +; g +; +; * where node b is a uniform branch, and node c is a varying branch. +; * where nodes f, d and g are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <-. b' <--. +; / \ | / \ | +; e c_|_ e' c' | +; | / \| \_|__ | | +; | f d | `d' -' +; |/ \ / +; g f' +; | | +; `---> & <-- g' +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization4(__global int *out, int n) { +; int id = get_global_id(0); +; +; int x = id / n; +; int y = id % n; +; int i = 0; +; for (;;) { +; if (n > 20) goto e; +; if (x + y > n) goto f; +; y++; +; x++; +; i++; +; } +; +; goto g; +; +; e: +; i *= 2 + n; +; goto g; +; +; f: +; i /= i + n; +; +; g: +; out[id] = x + y + i; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization4(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %0 = icmp eq i32 %conv, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %conv, %5 + %6 = icmp eq i32 %conv, -2147483648 + %7 = icmp eq i32 %n, -1 + %8 = and i1 %7, %6 + %9 = icmp eq i32 %n, 0 + %10 = or i1 %9, %8 + %11 = select i1 %10, i32 1, i32 %n + %rem = srem i32 %conv, %11 + br label %for.cond + +for.cond: ; preds = %if.end5, %entry + %x.0 = phi i32 [ %div, %entry ], [ %inc6, %if.end5 ] + %y.0 = phi i32 [ %rem, %entry ], [ %inc, %if.end5 ] + %storemerge = phi i32 [ 0, %entry ], [ %inc7, %if.end5 ] + %cmp = icmp sgt i32 %n, 20 + br i1 %cmp, label %e, label %if.end + +if.end: ; preds = %for.cond + %add = add nsw i32 %y.0, %x.0 + %cmp2 = icmp sgt i32 %add, %n + br i1 %cmp2, label %f, label %if.end5 + +if.end5: ; preds = %if.end + %inc = add nsw i32 %y.0, 1 + %inc6 = add nsw i32 %x.0, 1 + %inc7 = add nsw i32 %storemerge, 1 + br label %for.cond + +e: ; preds = %for.cond + %add8 = add nsw i32 %n, 2 + %mul = mul nsw i32 %storemerge, %add8 + br label %g + +f: ; preds = %if.end + %add9 = add nsw i32 %storemerge, %n + %12 = icmp eq i32 %add9, 0 + %13 = select i1 %12, i32 1, i32 %add9 + %div10 = sdiv i32 %storemerge, %13 + br label %g + +g: ; preds = %f, %e + %storemerge1 = phi i32 [ %div10, %f ], [ %mul, %e ] + %add11 = add i32 %y.0, %x.0 + %add12 = add i32 %add11, %storemerge1 + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %add12, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + 
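+; __mux_get_global_id above is the ComputeMux builtin behind OpenCL's
+; get_global_id; because its result differs per work-item, %conv and any
+; comparison derived from it form the "varying" branches that BOSCC guards,
+; while values computed only from %n stay uniform.
+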
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization4, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization4 +; CHECK: br i1 true, label %[[FORCONDUNIFORM:.+]], label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[E:.+]], label %[[IFEND:.+]] + +; CHECK: [[IFEND]]: +; CHECK: br label %[[IFEND5:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[EUNIFORM:.+]], label %[[IFENDUNIFORM:.+]] + +; CHECK: [[IFENDUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[FUNIFORM:.+]], label %[[IFENDUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFEND5UNIFORM:.+]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[FUNIFORM]]: +; CHECK: br label %[[GUNIFORM:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFEND5UNIFORM:.+]], label %[[IFENDUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFEND5]] + +; CHECK: [[EUNIFORM]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[IFEND5]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND]], label %[[FORCONDPUREEXIT:.+]] + +; CHECK: [[FORCONDPUREEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[E]]: +; CHECK: br label %[[FORCONDPUREEXIT]] + +; CHECK: [[EELSE:.+]]: +; CHECK: br label %[[G]] + +; CHECK: [[ESPLIT:.+]]: +; CHECK: br label %[[G]] + +; CHECK: [[F]]: +; CHECK: br label %[[FELSE:.+]] + +; CHECK: [[FELSE]]: +; CHECK: br i1 %{{.+}}, label %[[EELSE]], label %[[ESPLIT]] + +; CHECK: [[G]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll new file mode 100644 index 0000000000000..f7536ca9ad196 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll @@ -0,0 +1,264 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization5 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; |\ / \ +; | d e +; | \ / +; | f +; \ / +; g +; +; * where node c is a uniform branch, and nodes a and b are varying branches. +; * where nodes b, c, d, f, g are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a________ +; / \ \ +; b c c' +; |\_/_\__ / \ +; | d e \ | e' +; | \ / \ \ / +; | f \ b' +; \ / \| +; g d' +; | | +; | f' +; | | +; `--> & <- g' +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization5(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (id % 2 == 0) { // a +; if (id == 4) { // b +; goto g; +; } else { +; goto d; +; } +; } else { // c +; if (n % 2 == 0) { +; goto d; +; } else { +; goto e; +; } +; } +; +; d: +; for (int i = 0; i < n / 4; i++) { ret += i - 2; } +; goto f; +; +; e: +; for (int i = 0; i < n + 5; i++) { ret += i + 5; } +; +; f: +; ret *= ret % n; +; ret *= ret + 4; +; +; g: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization5(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %rem1 = and i32 %conv, 1 + %cmp = icmp eq i32 %rem1, 0 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + %cmp2 = icmp eq i32 %conv, 4 + br i1 %cmp2, label %g, label %d + +if.else5: ; preds = %entry + %rem62 = and i32 %n, 1 + %cmp7 = icmp eq i32 %rem62, 0 + br i1 %cmp7, label %d, label %e + +d: ; preds = %if.else5, %if.then + br label %for.cond + +for.cond: ; preds = %for.body, %d + %ret.0 = phi i32 [ 0, %d ], [ %add, %for.body ] + %storemerge3 = phi i32 [ 0, %d ], [ %inc, %for.body ] + %div = sdiv i32 %n, 4 + %cmp11 = icmp slt i32 %storemerge3, %div + br i1 %cmp11, label %for.body, label %f + +for.body: ; preds = %for.cond + %sub = add i32 %ret.0, -2 + %add = add i32 %sub, %storemerge3 + %inc = add nsw i32 %storemerge3, 1 + br label %for.cond + +e: ; preds = %if.else5 + br label %for.cond14 + +for.cond14: ; preds = %for.body18, %e + %ret.1 = phi i32 [ 0, %e ], [ %add20, %for.body18 ] + %storemerge = phi i32 [ 0, %e ], [ %inc22, %for.body18 ] + %add15 = add nsw i32 %n, 5 + %cmp16 = icmp slt i32 %storemerge, %add15 + br i1 %cmp16, label %for.body18, label %f + +for.body18: ; preds = %for.cond14 + %add19 = add i32 %ret.1, 5 + %add20 = add i32 %add19, %storemerge + %inc22 = add nsw i32 %storemerge, 1 + br label %for.cond14 + +f: ; preds = %for.cond14, %for.cond + %ret.2 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond14 ] + %0 = icmp eq i32 %ret.2, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 
+ %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %rem24 = srem i32 %ret.2, %5 + %mul = mul nsw i32 %rem24, %ret.2 + %add25 = add nsw i32 %mul, 4 + %mul26 = mul nsw i32 %add25, %mul + br label %g + +g: ; preds = %f, %if.then + %ret.3 = phi i32 [ %mul26, %f ], [ 0, %if.then ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization5, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization5 +; CHECK: br i1 %{{.+}}, label %[[IFTHENUNIFORM:.+]], label %[[ENTRYBOSCCINDIR:.+]] + +; CHECK: [[IFELSE5UNIFORM:.+]]: +; CHECK: %[[CMP7UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP7UNIFORM]], label %[[DUNIFORM:.+]], label %[[FORCOND14PREHEADERUNIFORM:.+]] + +; CHECK: [[FORCOND14PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND14UNIFORM:.+]] + +; CHECK: [[FORCOND14UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY18UNIFORM:.+]], label %[[FLOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODY18UNIFORM]]: +; CHECK: br label %[[FORCOND14UNIFORM]] + +; CHECK: [[FLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[FUNIFORM:.+]] + +; CHECK: [[IFTHENUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[GUNIFORM:.+]], label %[[IFTHENUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[ENTRYBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE5UNIFORM]], label %[[IFELSE5:.+]] + +; CHECK: [[DUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: %[[CMP11UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP11UNIFORM]], label %[[FORBODYUNIFORM:.+]], label %[[FLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[FLOOPEXITUNIFORM]]: +; CHECK: br label %[[FUNIFORM]] + +; CHECK: [[FUNIFORM]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[IFTHEN:.+]]: +; CHECK: br label %[[D:.+]] + +; CHECK: [[IFELSE5]]: +; CHECK: %[[CMP7:.+]] = icmp +; CHECK: br i1 %[[CMP7]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]] + +; CHECK: [[FORCOND14PREHEADER]]: +; CHECK: br label 
%[[FORCOND14:.+]] + +; CHECK: [[D]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[CMP11:.+]] = icmp +; CHECK: br i1 %[[CMP11]], label %[[FORBODY:.+]], label %[[FLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND14]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY18:.+]], label %[[FLOOPEXIT1:.+]] + +; CHECK: [[FORBODY18]]: +; CHECK: br label %[[FORCOND14]] + +; CHECK: [[FLOOPEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[FLOOPEXIT1]]: +; CHECK: br label %[[IFTHEN]] + +; CHECK: [[F]]: +; CHECK: br label %[[G]] + +; CHECK: [[G]]: +; CHECK: ret void + +; CHECK: [[IFTHENUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[DUNIFORM]], label %[[D]] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll new file mode 100644 index 0000000000000..f1b5f3582dd7a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll @@ -0,0 +1,228 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization6 -vecz-passes="function(simplifycfg),vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-. +; / \ | +; c d | +; / \ / | +; e f --' +; \ | +; \ g +; \| +; h +; +; * where nodes b and c are uniform branches, and node f is a varying +; branch. +; * where nodes g and h are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <-. .---> b' <-. +; / \ | | / \ | +; c d | | c' d' | +; / \ / | | / \ / | +; e f --' | e' f' --' +; \ |\____' \ | +; \ g \ | +; \| \| +; h g' +; | | +; `---> & <-- h' +; +; where '&' represents merge blocks of BOSCC regions. 
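+;
+; The merge blocks '&' are where live values from the uniform clone rejoin
+; those from the linearized path; conceptually each merged value behaves
+; like a select on which region actually executed. A scalar sketch in C
+; (the helper and its parameters are assumptions for illustration):
+;
+;   /* Value flowing out of a BOSCC merge block. */
+;   int boscc_merge(_Bool ran_uniform, int uniform_val, int linearized_val) {
+;     return ran_uniform ? uniform_val : linearized_val;
+;   }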
+; +; __kernel void partial_linearization6(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n % 2 == 0) { +; if (n > 2) { +; goto e; +; } +; } else { +; ret += n + 1; +; } +; if (id == n) break; +; } +; +; ret += n * 2; +; ret /= n; +; goto early; +; +; e: +; ret += n * 4; +; ret -= n; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization6(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end10, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %ret.1, %if.end10 ] + %rem1 = and i32 %n, 1 + %cmp = icmp eq i32 %rem1, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + %cmp2 = icmp sgt i32 %n, 2 + br i1 %cmp2, label %e, label %if.end6 + +if.else: ; preds = %while.body + %add = add nsw i32 %n, 1 + %add5 = add nsw i32 %add, %ret.0 + br label %if.end6 + +if.end6: ; preds = %if.else, %if.then + %ret.1 = phi i32 [ %add5, %if.else ], [ %ret.0, %if.then ] + %cmp7 = icmp eq i32 %conv, %n + br i1 %cmp7, label %while.end, label %if.end10 + +if.end10: ; preds = %if.end6 + br label %while.body + +while.end: ; preds = %if.end6 + %mul = shl nsw i32 %n, 1 + %add11 = add nsw i32 %ret.1, %mul + %0 = icmp eq i32 %add11, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %add11, %5 + br label %early + +e: ; preds = %if.then + %mul12 = mul i32 %n, 4 + %n.neg = sub i32 0, %n + %add13 = add i32 %mul12, %n.neg + %sub = add i32 %add13, %ret.0 + br label %early + +early: ; preds = %e, %while.end + %storemerge = phi i32 [ %div, %while.end ], [ %sub, %e ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization6, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = 
!{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization6 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: %[[CMP2:.+]] = icmp +; CHECK: br i1 %[[CMP2]], label %[[E:.+]], label %[[IFEND6:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[IFEND6]] + +; CHECK: [[IFEND6]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[WHILEEND:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[IFTHENUNIFORM:.+]], label %[[IFELSEUNIFORM:.+]] + +; CHECK: [[IFELSEUNIFORM]]: +; CHECK: br label %[[IFEND6UNIFORM:.+]] + +; CHECK: [[IFTHENUNIFORM]]: +; CHECK: %[[CMP2UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP2UNIFORM]], label %[[EUNIFORM:.+]], label %[[IFEND6EUNIFORM:.+]] + +; CHECK: [[IFEND6UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEENDUNIFORM:.+]], label %[[IFEND6UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[WHILEENDUNIFORM]]: +; CHECK: br label %[[EARLYUNIFORM:.+]] + +; CHECK: [[IFEND6UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODYUNIFORM]], label %[[IFEND6UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFEND6UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[WHILEBODY]] + +; CHECK: [[EUNIFORM]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: br label %[[WHILEENDELSE:.+]] + +; CHECK: [[WHILEENDELSE]]: +; CHECK: br i1 %{{.+}}, label %[[EELSE:.+]], label %[[ESPLIT:.+]] + +; CHECK: [[E]]: +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[EELSE]]: +; CHECK: br label %[[EARLY]] + +; CHECK: [[ESPLIT]]: +; CHECK: br label %[[EARLY]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll new file mode 100644 index 0000000000000..ab42eddff1897 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll @@ -0,0 +1,262 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization7 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; / \ / \ +; d e f +; \ / \ / +; g h +; \ / +; i +; +; * where nodes a, c and e are uniform branches, and node b is a varying +; branch. +; * where nodes d, e, g and i are divergent. 
+; +; With BOSCC, it will be transformed as follows: +; +; a +; / \ +; / \ +; / \ +; / \ +; b____ c +; / \ \ / \ +; d e d'| | +; \ / \ \| | +; g h e' f +; \ / \ / +; i h' +; | | +; | g' +; | | +; | i' +; \ / +; \ / +; \ / +; & +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization7(__global int *out, int n) { +; int id = get_global_id(0); +; int i = 0; +; +; if (n > 10) { // a +; if (n + id > 10) { // b +; i = n * 10; // d +; goto g; +; } else { +; goto e; +; } +; } else { +; if (n < 5) { // c +; goto e; +; } else { +; for (int j = 0; j < n; j++) { i++; } +; goto h; +; } +; } +; +; e: +; if (n > 5) { +; goto g; +; } else { +; i = n * 3 / 5; +; goto h; +; } +; +; g: +; for (int j = 0; j < n; j++) { i++; } +; goto i; +; +; h: +; i = n + id / 3; +; +; i: +; out[id] = i; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization7(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %n, 10 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + %add = add nsw i32 %conv, %n + %cmp2 = icmp sgt i32 %add, 10 + br i1 %cmp2, label %if.then4, label %e + +if.then4: ; preds = %if.then + %mul = mul nsw i32 %n, 10 + br label %g + +if.else5: ; preds = %entry + %cmp6 = icmp slt i32 %n, 5 + br i1 %cmp6, label %e, label %if.else9 + +if.else9: ; preds = %if.else5 + br label %for.cond + +for.cond: ; preds = %for.body, %if.else9 + %storemerge = phi i32 [ 0, %if.else9 ], [ %inc12, %for.body ] + %cmp10 = icmp slt i32 %storemerge, %n + br i1 %cmp10, label %for.body, label %h + +for.body: ; preds = %for.cond + %inc12 = add nsw i32 %storemerge, 1 + br label %for.cond + +e: ; preds = %if.else5, %if.then + %cmp13 = icmp sgt i32 %n, 5 + br i1 %cmp13, label %g, label %h + +g: ; preds = %e, %if.then4 + %i.1 = phi i32 [ %mul, %if.then4 ], [ 0, %e ] + br label %for.cond19 + +for.cond19: ; preds = %for.body22, %g + %i.2 = phi i32 [ %i.1, %g ], [ %inc23, %for.body22 ] + %storemerge1 = phi i32 [ 0, %g ], [ %inc25, %for.body22 ] + %cmp20 = icmp slt i32 %storemerge1, %n + br i1 %cmp20, label %for.body22, label %i29 + +for.body22: ; preds = %for.cond19 + %inc23 = add nsw i32 %i.2, 1 + %inc25 = add nsw i32 %storemerge1, 1 + br label %for.cond19 + +h: ; preds = %e, %for.cond + %div27 = sdiv i32 %conv, 3 + %add28 = add nsw i32 %div27, %n + br label %i29 + +i29: ; preds = %h, %for.cond19 + %i.3 = phi i32 [ %add28, %h ], [ %i.2, %for.cond19 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %i.3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" 
"no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization7, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization7 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE5:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN4UNIFORM:.+]], label %[[IFTHENBOSCCINDIR:.+]] + +; CHECK: [[IFTHEN4UNIFORM]]: +; CHECK: br label %[[GUNIFORM:.+]] + +; CHECK: [[IFTHENBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[EUNIFORM:.+]], label %[[IFTHEN4:.+]] + +; CHECK: [[EUNIFORM]]: +; CHECK: %[[CMP13UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP13UNIFORM]], label %[[GUNIFORM]], label %[[HUNIFORM:.+]] + +; CHECK: [[HUNIFORM]]: +; CHECK: br label %[[I29UNIFORM:.+]] + +; CHECK: [[GUNIFORM]]: +; CHECK: br label %[[FORCOND19UNIFORM:.+]] + +; CHECK: [[FORCOND19UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22UNIFORM:.+]], label %[[I29LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY22UNIFORM]]: +; CHECK: br label %[[FORCOND19UNIFORM]] + +; CHECK: [[I29LOOPEXITUNIFORM]]: +; CHECK: br label %[[I29:.+]] + +; CHECK: [[IFTHEN4]]: +; CHECK: br label %[[E:.+]] + +; CHECK: [[IFELSE5]]: +; CHECK: %[[CMP6:.+]] = icmp +; CHECK: br i1 %[[CMP6]], label %[[E]], label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[E]]: +; CHECK: %[[CMP13:.+]] = icmp +; CHECK: br i1 %[[CMP13]], label %[[G:.+]], label %[[H:.+]] + +; CHECK: [[G]]: +; CHECK: br label %[[FORCOND19:.+]] + +; CHECK: [[FORCOND19]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22:.+]], label %[[I29LOOPEXIT:.+]] + +; CHECK: [[FORBODY22]]: +; CHECK: br label %[[FORCOND19]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[H]]: +; CHECK: br label %[[G]] + +; CHECK: [[I29LOOPEXIT]]: +; CHECK: br label %[[I29]] + +; CHECK: [[I29]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll new file mode 100644 index 0000000000000..1245dc2ca0c0e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll @@ -0,0 +1,220 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k partial_linearization8 -vecz-passes=cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s

+; The CFG of the following kernel is:
+;
+;    a
+;    |
+;    b <-.
+;   / \  |
+;  e   c |
+;  |  / \|
+;  | f   d
+;  |/
+;  g
+;
+; * where nodes b and c are varying branches.
+; * where nodes e, f, d and g are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;      a
+;      |
+;      b <-.      b' <.
+;     / \__|__    |    |
+;    e   c_|__`c' |    |
+;    |  / \|     \|    |
+;    | f   d      d' -'
+;    |/           |
+;    g            f'
+;    |            |
+;    |            e'
+;    |            |
+;    `--> & <- g'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization8(__global int *out, int n) {
+;   int id = get_global_id(0);
+;
+;   int x = id / n;
+;   int y = id % n;
+;   int i = 0;
+;   for (;;) {
+;     if (i + id > n) goto e;
+;     if (x + y > n) goto f;
+;     y++;
+;     x++;
+;     i++;
+;   }
+;
+;   goto g;
+;
+; e:
+;   i *= 2 + n;
+;   goto g;
+;
+; f:
+;   i /= i + n;
+;
+; g:
+;   out[id] = x + y + i;
+; }

+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization8(i32 addrspace(1)* %out, i32 %n) #0 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0) #2
+  %conv = trunc i64 %call to i32
+  %0 = icmp eq i32 %conv, -2147483648
+  %1 = icmp eq i32 %n, -1
+  %2 = and i1 %1, %0
+  %3 = icmp eq i32 %n, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %n
+  %div = sdiv i32 %conv, %5
+  %6 = icmp eq i32 %conv, -2147483648
+  %7 = icmp eq i32 %n, -1
+  %8 = and i1 %7, %6
+  %9 = icmp eq i32 %n, 0
+  %10 = or i1 %9, %8
+  %11 = select i1 %10, i32 1, i32 %n
+  %rem = srem i32 %conv, %11
+  br label %for.cond

+for.cond:                                         ; preds = %if.end6, %entry
+  %x.0 = phi i32 [ %div, %entry ], [ %inc7, %if.end6 ]
+  %y.0 = phi i32 [ %rem, %entry ], [ %inc, %if.end6 ]
+  %storemerge = phi i32 [ 0, %entry ], [ %inc8, %if.end6 ]
+  %add = add nsw i32 %storemerge, %conv
+  %cmp = icmp sgt i32 %add, %n
+  br i1 %cmp, label %e, label %if.end

+if.end:                                           ; preds = %for.cond
+  %add2 = add nsw i32 %y.0, %x.0
+  %cmp3 = icmp sgt i32 %add2, %n
+  br i1 %cmp3, label %f, label %if.end6

+if.end6:                                          ; preds = %if.end
+  %inc = add nsw i32 %y.0, 1
+  %inc7 = add nsw i32 %x.0, 1
+  %inc8 = add nsw i32 %storemerge, 1
+  br label %for.cond

+e:                                                ; preds = %for.cond
+  %add9 = add nsw i32 %n, 2
+  %mul = mul nsw i32 %storemerge, %add9
+  br label %g

+f:                                                ; preds = %if.end
+  %add10 = add nsw i32 %storemerge, %n
+  %12 = icmp eq i32 %add10, 0
+  %13 = select i1 %12, i32 1, i32 %add10
+  %div11 = sdiv i32 %storemerge, %13
+  br label %g

+g:                                                ; preds = %f, %e
+  %storemerge1 = phi i32 [ %div11, %f ], [ %mul, %e ]
+  %add12 = add i32 %y.0, %x.0
+  %add13 = add i32 %add12, %storemerge1
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %add13, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}

+; Function Attrs: nounwind readonly
+declare i64 @__mux_get_global_id(i32) #1
+
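+; The icmp/and/or/select chains in the entry block above guard the divisor of
+; the sdiv and srem against the two trapping cases of signed division. A rough
+; C sketch of the guard (variable names are illustrative only):
+;
+;   /* divisor becomes 1 when n == 0, or when id == INT_MIN and n == -1 */
+;   int d = (n == 0 || (id == INT_MIN && n == -1)) ? 1 : n;
+;   int x = id / d;
+;   int y = id % d;
+;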
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization8, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization8 +; CHECK: br i1 true, label %[[FORCONDUNIFORM:.+]], label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br label %[[IFEND:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[EUNIFORM:.+]], label %[[FORCONDUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFENDUNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[FUNIFORM:.+]], label %[[IFENDUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFEND6UNIFORM:.+]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[FUNIFORM]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFEND6UNIFORM]], label %[[IFENDUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFEND6:.+]] + +; CHECK: [[EUNIFORM]]: +; CHECK: br label %[[G]] + +; CHECK: [[FORCONDUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFENDUNIFORM]], label %[[FORCONDUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[FORCONDUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFEND]] + +; CHECK: [[IFEND]]: +; CHECK: br label %[[IFEND6]] + +; CHECK: [[IFEND6]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND]], label %[[FORCONDPUREEXIT:.+]] + +; CHECK: [[FORCONDPUREEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[E:.+]]: +; CHECK: br label %[[G]] + +; CHECK: [[F]]: +; CHECK: br label %[[FELSE:.+]] + +; CHECK: [[FELSE]]: +; CHECK: br label %[[E]] + +; CHECK: [[G]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll new file mode 100644 index 0000000000000..43bb9c44eb492 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll @@ -0,0 +1,173 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization9 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <--. +; | | +; c <. | +; | | | +; d -' | +; | | +; e ---' +; | +; f +; +; * where node e is a varying branch. +; * where node f is divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <--. .> b' <--. +; | | | | | +; c <. | | c' <. | +; | | | | | | | +; d -' | | d' -' | +; | | | | | +; e ---' | e' ---' +; |\_____' | +; f f' +; \ / +; \ / +; \ / +; \ / +; & +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization9(__global int *out, int n) { +; int id = get_global_id(0); +; int i = 0; +; +; while (1) { +; int j = 0; +; for (; ; i++) { +; if (j++ > n) break; +; } +; if (i++ + id > n) break; +; } +; +; out[id] = i; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization9(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end7, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc3, %if.end7 ] + br label %for.cond + +for.cond: ; preds = %for.inc, %while.body + %i.1 = phi i32 [ %i.0, %while.body ], [ %inc3, %for.inc ] + %j.0 = phi i32 [ 0, %while.body ], [ %inc, %for.inc ] + %cmp = icmp sgt i32 %j.0, %n + %inc3 = add nsw i32 %i.1, 1 + br i1 %cmp, label %for.end, label %for.inc + +for.inc: ; preds = %for.cond + %inc = add nsw i32 %j.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %add = add nsw i32 %i.1, %conv + %cmp4 = icmp sgt i32 %add, %n + br i1 %cmp4, label %while.end, label %if.end7 + +if.end7: ; preds = %for.end + br label %while.body + +while.end: ; preds = %for.end + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %inc3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" 
"stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization9, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization9 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[FOREND:.+]], label %[[FORINC:.+]] + +; CHECK: [[FORINC]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[WHILEEND:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[FORENDUNIFORM:.+]], label %[[FORINCUNIFORM:.+]] + +; CHECK: [[FORINCUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[FORENDUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEEND]], label %[[FORENDUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: ret void + +; CHECK: [[FORENDUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODYUNIFORM]], label %[[FORENDUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[FORENDUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[WHILEBODY]] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll new file mode 100644 index 0000000000000..83976b565214c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll @@ -0,0 +1,25 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Check that we don't crash when costing a scalable reduction +; RUN: veczc -vecz-scalable -vecz-passes="pre-linearize" -vecz-choices=LinearizeBOSCC -S < %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @boscc_merge() { + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll new file mode 100644 index 0000000000000..a7d72cd259d0d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll @@ -0,0 +1,82 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* 
%arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"} +!4 = !{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"} +!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""} +!6 = !{!"clang version 3.8.1 "} + +; Test if the interleaved store is defined correctly +; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}}) +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer +; CHECK: %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> +; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) #[[ATTRS:[0-9]+]] +; CHECK: ret void + +; CHECK: attributes #[[ATTRS]] = { diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll new file mode 100644 index 0000000000000..1f70bde790233 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll @@ -0,0 +1,82 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x 
double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"} +!4 = !{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"} +!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""} +!6 = !{!"clang version 3.8.1 "} + +; Test if the interleaved store is defined correctly +; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}}) +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer +; CHECK: %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> +; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) #[[ATTRS:[0-9]+]] +; CHECK: ret void + +; CHECK: attributes #[[ATTRS]] = { diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll new file mode 100644 index 0000000000000..7d9b0385dbb90 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll @@ -0,0 +1,87 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k vector_loop -vecz-simd-width=4 -S < %s | FileCheck %s

+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+; Function Attrs: nounwind
+define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %cmp = icmp eq i64 %call, 0
+  br i1 %cmp, label %for.end, label %for.cond

+for.cond:                                         ; preds = %entry, %for.body
+  %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ]
+  %call1 = call i64 @__mux_get_global_size(i32 0)
+  %conv = trunc i64 %call1 to i32
+  %splat.splatinsert = insertelement <4 x i32> poison, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
+  %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat
+  %0 = extractelement <4 x i1> %cmp2, i64 0
+  br i1 %0, label %for.body, label %for.end

+for.body:                                         ; preds = %for.cond
+  %1 = extractelement <4 x i32> %storemerge, i64 0
+  %idxprom = sext i32 %1 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %3 = extractelement <4 x i32> %storemerge, i64 0
+  %idxprom3 = sext i32 %3 to i64
+  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom3
+  store i32 %2, i32 addrspace(1)* %arrayidx4, align 4
+  %4 = extractelement <4 x i32> %storemerge, i64 1
+  %idxprom5 = sext i32 %4 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom5
+  %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4
+  %6 = extractelement <4 x i32> %storemerge, i64 1
+  %idxprom7 = sext i32 %6 to i64
+  %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom7
+  store i32 %5, i32 addrspace(1)* %arrayidx8, align 4
+  %7 = extractelement <4 x i32> %storemerge, i64 2
+  %idxprom9 = sext i32 %7 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9
+  %8 = load i32, i32 addrspace(1)* %arrayidx10, align 4
+  %9 = extractelement <4 x i32> %storemerge, i64 2
+  %idxprom11 = sext i32 %9 to i64
+  %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11
+  store i32 %8, i32 addrspace(1)* %arrayidx12, align 4
+  %10 = extractelement <4 x i32> %storemerge, i64 3
+  %idxprom13 = sext i32 %10 to i64
+  %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom13
+  %11 = load i32, i32 addrspace(1)* %arrayidx14, align 4
+  %12 = extractelement <4 x i32> %storemerge, i64 3
+  %idxprom15 = sext i32 %12 to i64
+  %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom15
+  store i32 %11, i32 addrspace(1)* %arrayidx16, align 4
+  %inc = add <4 x i32> %storemerge, <i32 1, i32 1, i32 1, i32 1>
+  br label %for.cond

+for.end:                                          ; preds = %entry, %for.cond
+  ret void
+}

+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_global_size(i32)

+; This test checks that a uniform <4 x i32> phi is not scalarized
+; CHECK: define spir_kernel void @__vecz_v4_vector_loop
+; CHECK: %[[STOREMERGE:.+]] = phi <4 x i32> [ %[[INC:.+]], %for.body ], [ zeroinitializer, %entry.ROSCC ]
+; CHECK: %[[INC]] = add <4 x i32> %storemerge, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll new file mode 100644 index 0000000000000..692a8cc7ecc5a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll @@ -0,0 +1,94 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k vector_loop -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @vector_loop(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %in2, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %initaddr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in2, i64 %call + %init = load <4 x i32>, <4 x i32> addrspace(1)* %initaddr + %cmp = icmp eq i64 %call, 0 + br i1 %cmp, label %for.end, label %for.cond + +for.cond: ; preds = %entry, %for.body + %storemerge = phi <4 x i32> [ %inc, %for.body ], [ %init, %entry ] + %call1 = call i64 @__mux_get_global_size(i32 0) + %conv = trunc i64 %call1 to i32 + %0 = extractelement <4 x i32> %storemerge, i64 0 + %cmp2 = icmp slt i32 %0, %conv + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %3 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom3 = sext i32 %3 to i64 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom3 + store i32 %2, i32 addrspace(1)* %arrayidx4, align 4 + %4 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom5 = sext i32 %4 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom5 + %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %6 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom7 = sext i32 %6 to i64 + %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom7 + store i32 %5, i32 addrspace(1)* %arrayidx8, align 4 + %7 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom9 = sext i32 %7 to i64 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9 + %8 = load i32, i32 addrspace(1)* %arrayidx10, align 4 + %9 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom11 = sext i32 %9 to i64 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 %8, i32 addrspace(1)* %arrayidx12, align 4 + %10 = extractelement <4 x i32> %storemerge, i64 3 + %idxprom13 = sext i32 %10 to i64 + %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom13 + %11 = load i32, i32 addrspace(1)* %arrayidx14, align 
4
+  %12 = extractelement <4 x i32> %storemerge, i64 3
+  %idxprom15 = sext i32 %12 to i64
+  %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom15
+  store i32 %11, i32 addrspace(1)* %arrayidx16, align 4
+  %inc = add <4 x i32> %storemerge, <i32 1, i32 1, i32 1, i32 1>
+  br label %for.cond

+for.end:                                          ; preds = %entry, %for.cond
+  ret void
+}

+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_global_size(i32)

+; This test checks that a varying <4 x i32> phi gets scalarized
+; when it is only accessed through individually extracted elements.
+; CHECK: define spir_kernel void @__vecz_v4_vector_loop
+; CHECK: %storemerge{{[0-9]+}} = phi <4 x i32> [ %{{[0-9]+}}, %entry.ROSCC ], [ %inc{{[0-9]+}}, %for.cond ]
+; CHECK: %storemerge{{[0-9]+}} = phi <4 x i32> [ %{{[0-9]+}}, %entry.ROSCC ], [ %inc{{[0-9]+}}, %for.cond ]
+; CHECK: %storemerge{{[0-9]+}} = phi <4 x i32> [ %{{[0-9]+}}, %entry.ROSCC ], [ %inc{{[0-9]+}}, %for.cond ]
+; CHECK: %storemerge{{[0-9]+}} = phi <4 x i32> [ %{{[0-9]+}}, %entry.ROSCC ], [ %inc{{[0-9]+}}, %for.cond ]
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
new file mode 100644
index 0000000000000..36b06c64063d8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s

+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

+declare i64 @__mux_get_global_id(i32)

+define dso_local spir_kernel void @vector_broadcast_const(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fadd <4 x float> %1, <float 0x7FF8000020000000, float 0x7FF8000020000000, float 0x7FF8000020000000, float 0x7FF8000020000000>
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}

+define dso_local spir_kernel void @vector_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fadd <4 x float> %1, %addend
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}

+define dso_local spir_kernel void @vector_broadcast_illegal(<32 x float> addrspace(1)* nocapture readonly %in, <32 x float> %addend, <32 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <32 x float> addrspace(1)* %arrayidx to <32 x float> addrspace(1)*
+  %1 = load <32 x float>, <32 x float> addrspace(1)* %0, align 64
+  %2 = fadd <32 x float> %1, %addend
+  %arrayidx3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %out, i64 %call
+  store <32 x float> %2, <32 x float> addrspace(1)* %arrayidx3, align 64
+  ret void
+}

+define dso_local spir_kernel void @vector_broadcast_regression(<4 x float> addrspace(1)* nocapture readonly %in, i32 %nancode, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x i32> addrspace(1)*
+  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %0, align 16
+  %and1.i.i.i1.i = and <4 x i32> %1, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+  %cmp.i.i.i2.i = icmp ne <4 x i32> %and1.i.i.i1.i, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+  %and2.i.i.i3.i = and <4 x i32> %1, <i32 8388607, i32 8388607, i32 8388607, i32 8388607>
+  %cmp3.i.i.i4.i = icmp eq <4 x i32> %and2.i.i.i3.i, zeroinitializer
+  %2 = or <4 x i1> %cmp.i.i.i2.i, %cmp3.i.i.i4.i
+  %3 = bitcast <4 x i32> %1 to <4 x float>
+  %4 = select <4 x i1> %2, <4 x float> %3, <4 x float> <float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000>
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %4, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}

+; Check that new instructions aren't inserted before pre-existing allocas
+define dso_local spir_kernel 
void @vector_broadcast_insertpt(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, i32 %nancode, <4 x float> addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) local_unnamed_addr #0 { +entry: + %existing.alloc = alloca <4 x i32> + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + store <4 x i32> zeroinitializer, <4 x i32>* %existing.alloc + %scalar = bitcast <4 x i32>* %existing.alloc to i32* + store i32 1, i32* %scalar + %v = load <4 x i32>, <4 x i32>* %existing.alloc + %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out2, i64 %call + store <4 x i32> %v, <4 x i32> addrspace(1)* %arrayidx4, align 16 + + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %op = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %v4 = fadd <4 x float> %op, %addend + %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call + store <4 x float> %v4, <4 x float> addrspace(1)* %arrayidx3, align 16 + ret void +} + +define dso_local spir_kernel void @vector_mask_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x i1> %input, <4 x float> %woof, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)* + %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16 + %2 = fcmp oeq <4 x float> %1, + %3 = and <4 x i1> %2, %input + %4 = select <4 x i1> %3, <4 x float> %1, <4 x float> %woof + %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call + store <4 x float> %4, <4 x float> addrspace(1)* %arrayidx3, align 16 + ret void +} +; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_const( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]] +; CHECK-NEXT: store splat (float 0x7FF8000020000000), ptr addrspace(1) [[ARRAYIDX3]], align 16 +; CHECK-NEXT: ret void +; +; CHECK-LABEL: @__vecz_nxv4_vector_broadcast( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[VS2:%.*]] = call @llvm.vector.insert.nxv16f32.v4f32( poison, <4 x float> [[ADDEND:%.*]], i64 0) +; CHECK-NEXT: [[IDX0:%.*]] = call @llvm.stepvector.nxv16i32() +; CHECK-NEXT: [[VS1:%.*]] = and [[IDX0]], splat (i32 3) +; CHECK-NEXT: [[XLEN:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP0:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.riscv.vrgather.vv.nxv16f32.i64( poison, [[VS2]], [[VS1]], i64 [[TMP0]]) +; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]] +; CHECK-NEXT: [[TMP3:%.*]] = load , ptr addrspace(1) [[ARRAYIDX]], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = fadd [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]] +; CHECK-NEXT: store [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16 +; CHECK-NEXT: ret void +; +; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_illegal( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[FIXLEN_ALLOC:%.*]] = alloca <32 x float>, align 128 +; CHECK-NEXT: store <32 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC]], align 128 +; CHECK-NEXT: [[IDX0:%.*]] = call 
@llvm.stepvector.nxv128i32() +; CHECK-NEXT: [[IDX1:%.*]] = and [[IDX0]], splat (i32 31) +; CHECK-NEXT: [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} [[IDX1]] to +; CHECK-NEXT: [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.masked.gather.nxv128f32.nxv128p0( [[VEC_ALLOC]], i32 4, splat (i1 true), poison) +; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <32 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]] +; CHECK-NEXT: [[TMP3:%.*]] = load , ptr addrspace(1) [[ARRAYIDX]], align 64 +; CHECK-NEXT: [[TMP4:%.*]] = fadd [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <32 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]] +; CHECK-NEXT: store [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 64 +; CHECK-NEXT: ret void +; +; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_regression( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]] +; CHECK-NEXT: [[TMP1:%.*]] = load , ptr addrspace(1) [[ARRAYIDX]], align 16 +; CHECK-NEXT: [[AND1_I_I_I1_I1:%.*]] = and [[TMP1]], splat (i32 2139095040) +; CHECK-NEXT: [[CMP_I_I_I2_I2:%.*]] = icmp ne [[AND1_I_I_I1_I1]], splat (i32 2139095040) +; CHECK-NEXT: [[AND2_I_I_I3_I3:%.*]] = and [[TMP1]], splat (i32 8388607) +; CHECK-NEXT: [[CMP3_I_I_I4_I4:%.*]] = icmp eq [[AND2_I_I_I3_I3]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = or [[CMP_I_I_I2_I2]], [[CMP3_I_I_I4_I4]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP1]] to +; CHECK-NEXT: [[TMP4:%.*]] = select [[TMP2]], [[TMP3]], splat (float 0x7FF0000020000000) +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]] +; CHECK-NEXT: store [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16 +; CHECK-NEXT: ret void +; +; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_insertpt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[VS22:%.*]] = call @llvm.vector.insert.nxv16f32.v4f32( poison, <4 x float> [[ADDEND]], i64 0) +; CHECK-NEXT: [[IDX03:%.*]] = call @llvm.stepvector.nxv16i32() +; CHECK-NEXT: [[VS14:%.*]] = and [[IDX03]], splat (i32 3) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl {{(nuw )?}}i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.riscv.vrgather.vv.nxv16f32.i64( poison, [[VS22]], [[VS14]], i64 [[TMP1]]) +; CHECK-NEXT: [[VS2:%.*]] = call @llvm.vector.insert.nxv16i32.v4i32( poison, <4 x i32> , i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.riscv.vrgather.vv.nxv16i32.i64( poison, [[VS2]], [[VS14]], i64 [[TMP1]]) +; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]] +; CHECK-NEXT: store [[TMP3]], ptr addrspace(1) [[ARRAYIDX4]], align 16 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN]], i64 [[CALL]] +; CHECK-NEXT: [[TMP4:%.*]] = load , ptr addrspace(1) [[ARRAYIDX]], align 16 +; CHECK-NEXT: [[V45:%.*]] = fadd [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT]], i64 [[CALL]] +; CHECK-NEXT: store [[V45]], ptr addrspace(1) [[ARRAYIDX3]], align 16 +; CHECK-NEXT: ret void + +; CHECK-LABEL: @__vecz_nxv4_vector_mask_broadcast( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[VS21:%.*]] = call @llvm.vector.insert.nxv16f32.v4f32( poison, <4 x float> 
[[WOOF:%.*]], i64 0) +; CHECK-NEXT: [[IDX02:%.*]] = call @llvm.stepvector.nxv16i32() +; CHECK-NEXT: [[VS13:%.*]] = and [[IDX02]], splat (i32 3) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl {{(nuw )?}}i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.riscv.vrgather.vv.nxv16f32.i64( poison, [[VS21]], [[VS13]], i64 [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[INPUT:%.*]] to <4 x i8> +; CHECK-NEXT: [[VS2:%.*]] = call @llvm.vector.insert.nxv16i8.v4i8( poison, <4 x i8> [[TMP3]], i64 0) +; CHECK-NEXT: [[IDX0:%.*]] = call @llvm.stepvector.nxv16i16() +; CHECK-NEXT: [[VS1:%.*]] = and [[IDX0]], splat (i16 3) +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.riscv.vrgatherei16.vv.nxv16i8.i64( poison, [[VS2]], [[VS1]], i64 [[TMP1]]) +; CHECK-NEXT: [[TMP5:%.*]] = trunc [[TMP4]] to +; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN]], i64 [[CALL]] +; CHECK-NEXT: [[TMP6:%.*]] = load , ptr addrspace(1) [[ARRAYIDX]], align 16 +; CHECK-NEXT: [[TMP7:%.*]] = fcmp oeq [[TMP6]], splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP8:%.*]] = and [[TMP7]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = select [[TMP8]], [[TMP6]], [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT]], i64 [[CALL]] +; CHECK-NEXT: store [[TMP9]], ptr addrspace(1) [[ARRAYIDX3]], align 16 +; CHECK-NEXT: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll new file mode 100644 index 0000000000000..2895d1848afea --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll @@ -0,0 +1,182 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dummy -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "riscv64-unknown-unknown" + +define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { + ; Dummy uses of the builtins, as we don't define any with zero uses. 
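+  ; The u5nxv4j / u5nxv4f suffixes mangle the scalable operand type,
+  ; <vscale x 4 x i32> and <vscale x 4 x float> respectively. As a reference
+  ; for the checks below, over lanes [a, b, c, d] an add scan computes:
+  ;   inclusive: [a, a+b, a+b+c, a+b+c+d]
+  ;   exclusive: [0, a,   a+b,   a+b+c]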
+  %a = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %b = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %c = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %d = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %e = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %f = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %g = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %h = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %i = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %j = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %k = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %l = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %m = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  ret void
+}

+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j({{.*}}) {
+; CHECK: entry:
+; CHECK: %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK: %[[SCALE:.+]] = call i32 @llvm.vscale.i32()
+; CHECK: %[[SIZE:.+]] = mul {{(nuw )?}}i32 %[[SCALE]], 4
+; CHECK: br label %loop
+; CHECK: loop:
+; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK: %[[MASKPHI:.+]] = phi <vscale x 4 x i32> [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK: %[[N_INS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK: %[[N_SPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[N_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: %[[MASK:.+]] = xor <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]

+;------- target-dependent dynamic shuffle code:
+; CHECK: %[[VLSCALE:.+]] = call i64 @llvm.vscale.i64()
+; CHECK: %[[VL:.+]] = mul {{(nuw )?}}i64 %[[VLSCALE]], 4
+; CHECK: %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> poison, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])

+; CHECK: %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
+; CHECK: %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK: %[[WHICH:.+]] = icmp ne <vscale x 4 x i32> %[[BIT]], zeroinitializer
+; CHECK: %[[NEWVEC]] = select <vscale x 4 x i1> %[[WHICH]], <vscale x 4 x i32> %[[ACCUM]], <vscale x 4 x i32> %[[VEC]]
+; CHECK: %[[NEWMASK]] = or <vscale x 4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %[[SIZE]]
+; CHECK: br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK: %[[RESULT:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]
+; CHECK: ret <vscale x 4 x i32> %[[RESULT]]
+; CHECK: }

+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j({{.*}}) {
+; CHECK: entry:
+; CHECK: %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK: %[[SCALE:.+]] = call i32 @llvm.vscale.i32()
+; CHECK: %[[SIZE:.+]] = mul {{(nuw )?}}i32 %[[SCALE]], 4
+; CHECK: br label %loop
+; CHECK: loop:
+; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK: %[[MASKPHI:.+]] = phi <vscale x 4 x i32> [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK: %[[N_INS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK: %[[N_SPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[N_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: %[[MASK:.+]] = xor <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]

+;------- target-dependent dynamic shuffle code:
+; CHECK: %[[VLSCALE:.+]] = call i64 @llvm.vscale.i64()
+; CHECK: %[[VL:.+]] = mul {{(nuw )?}}i64 %[[VLSCALE]], 4
+; CHECK: %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> poison, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])

+; CHECK: %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
+; CHECK: %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK: %[[WHICH:.+]] = icmp ne <vscale x 4 x i32> %[[BIT]], zeroinitializer
+; CHECK: %[[NEWVEC]] = select <vscale x 4 x i1> %[[WHICH]], <vscale x 4 x i32> %[[ACCUM]], <vscale x 4 x i32> %[[VEC]]
+; CHECK: %[[NEWMASK]] = or <vscale x 4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %[[SIZE]]
+; CHECK: br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK: %[[SCAN:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]

+;------- target-dependent slide-up code:
+; CHECK: %[[VLSCALE2:.+]] = call i64 @llvm.vscale.i64()
+; CHECK: %[[VL2:.+]] = mul {{(nuw )?}}i64 %[[VLSCALE2]], 4
+; CHECK: %[[RESULT:.+]] = call <vscale x 4 x i32> @llvm.riscv.vslide1up.nxv4i32.i32.i64({{(<vscale x 4 x i32> poison, )?}}<vscale x 4 x i32> %[[SCAN]], i32 0, i64 %[[VL2]])

+; CHECK: ret <vscale x 4 x i32> %[[RESULT]]
+; CHECK: }

+; We know the generated code is correct for one scan type,
+; now verify that all the others use the correct binary operations.

+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}

+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}

+; Make sure the floating point version of the slide1up intrinsic is created
+; CHECK: call <vscale x 4 x float> @llvm.riscv.vfslide1up.nxv4f32.f32.i64({{(<vscale x 4 x float> poison, )?}}<vscale x 4 x float> %{{.+}}, float 0.000000e+00, i64 %{{.+}})

+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})

+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})

+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})

+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})

+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})

+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})

+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})

+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})

+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
new file mode 100644
index 0000000000000..fbd4bcf657f63
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
@@ -0,0 +1,183 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k dummy -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s

+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "riscv64-unknown-unknown"

+define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) {
+  ; Dummy uses of the builtins, as we don't define any with zero uses. 
+ %a = call @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj( zeroinitializer, i32 0) + %b = call @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj( zeroinitializer, i32 0) + %c = call @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj( zeroinitializer, i32 0) + %d = call @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj( zeroinitializer, i32 0) + %e = call @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj( zeroinitializer, i32 0) + %f = call @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj( zeroinitializer, i32 0) + %g = call @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj( zeroinitializer, i32 0) + %h = call @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj( zeroinitializer, i32 0) + %i = call @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj( zeroinitializer, i32 0) + %j = call @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj( zeroinitializer, i32 0) + %k = call @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj( zeroinitializer, i32 0) + %l = call @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj( zeroinitializer, i32 0) + %m = call @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj( zeroinitializer, i32 0) + %n = call @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4dj( zeroinitializer, i32 0) + ret void +} + +declare @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj({{.*}}, i32{{.*}}) { +; CHECK: entry: +; CHECK: %[[STEP:.+]] = call @llvm.stepvector.nxv4i32() +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ] +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ] +; CHECK: %[[MASKPHI:.+]] = phi [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ] +; CHECK: %[[N_INS:.+]] = insertelement poison, i32 %[[IV]], {{i32|i64}} 0 +; CHECK: %[[N_SPLAT:.+]] = shufflevector %[[N_INS]], poison, zeroinitializer +; CHECK: %[[MASK:.+]] = xor %[[MASKPHI]], %[[N_SPLAT]] + +;------- target-dependent dynamic shuffle code: +; CHECK: %[[VL:.+]] = zext i32 %1 to i64 +; CHECK: %[[SHUFFLE:.+]] = call @llvm.riscv.vrgather.vv.nxv4i32.i64({{( poison, )?}} %[[VEC]], %[[MASK]], i64 %[[VL]]) + +; CHECK: %[[ACCUM:.+]] = add %[[VEC]], %[[SHUFFLE]] +; CHECK: %[[BIT:.+]] = and %[[MASKPHI]], %[[N_SPLAT]] +; CHECK: %[[WHICH:.+]] = icmp ne %[[BIT]], zeroinitializer +; CHECK: %[[NEWVEC]] = select %[[WHICH]], %[[ACCUM]], %[[VEC]] +; CHECK: %[[NEWMASK]] = or %[[MASK]], %[[N_SPLAT]] +; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1 +; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %1 +; CHECK: br i1 %[[CMP]], label %loop, label %exit +; CHECK: exit: +; CHECK: %[[RESULT:.+]] = phi [ %[[NEWVEC]], %loop ] +; CHECK: ret %[[RESULT]] +; CHECK: } + +declare @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj({{.*}}, i32{{.*}}) { +; CHECK: entry: +; CHECK: %[[STEP:.+]] = call @llvm.stepvector.nxv4i32() +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ] +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ] +; CHECK: %[[MASKPHI:.+]] = phi [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ] +; CHECK: %[[N_INS:.+]] = insertelement poison, i32 %[[IV]], {{i32|i64}} 0 +; CHECK: %[[N_SPLAT:.+]] = shufflevector %[[N_INS]], poison, zeroinitializer +; CHECK: %[[MASK:.+]] = xor %[[MASKPHI]], %[[N_SPLAT]] + +;------- target-dependent dynamic shuffle code: +; CHECK: %[[VL:.+]] = zext i32 %1 to i64 +; CHECK: %[[SHUFFLE:.+]] = call 
@llvm.riscv.vrgather.vv.nxv4i32.i64({{( poison, )?}} %[[VEC]], %[[MASK]], i64 %[[VL]]) + +; CHECK: %[[ACCUM:.+]] = add %[[VEC]], %{{.+}} +; CHECK: %[[BIT:.+]] = and %[[MASKPHI]], %[[N_SPLAT]] +; CHECK: %[[WHICH:.+]] = icmp ne %[[BIT]], zeroinitializer +; CHECK: %[[NEWVEC]] = select %[[WHICH]], %[[ACCUM]], %[[VEC]] +; CHECK: %[[NEWMASK]] = or %[[MASK]], %[[N_SPLAT]] +; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1 +; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %1 +; CHECK: br i1 %[[CMP]], label %loop, label %exit +; CHECK: exit: +; CHECK: %[[SCAN:.+]] = phi [ %[[NEWVEC]], %loop ] + +;------- target-dependent slide-up code: +; CHECK: %[[VL2:.+]] = zext i32 %1 to i64 +; CHECK: %[[RESULT:.+]] = call @llvm.riscv.vslide1up.nxv4i32.i32.i64({{( poison, )?}} %[[SCAN]], i32 0, i64 %[[VL2]]) + +; CHECK: ret %[[RESULT]] +; CHECK: } + +; We know the generated code is correct for one scan type, +; now verify that all the others use the correct binary operations. + +declare @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = fadd %[[VEC]], %{{.+}} + +declare @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = fadd %[[VEC]], %{{.+}} + +; Make sure the floating point version of the slide1up intrinsic is created +; CHECK: call @llvm.riscv.vfslide1up.nxv4f32.f32.i64({{( poison, )?}} %{{.+}}, float 0.000000e+00, i64 %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.smin.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.smin.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.smax.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.umin.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.umax.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.minnum.nxv4f32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj(, i32) +; CHECK-LABEL: define 
@__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.maxnum.nxv4f32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.minnum.nxv4f32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.maxnum.nxv4f32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4dj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4dj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.minnum.nxv4f64( %[[VEC]], %{{.+}}) +; CHECK: call @llvm.riscv.vfslide1up.nxv4f64.f64.i64({{( poison, )?}} %{{.+}}, double 0x7FF0000000000000, i64 %{{.+}}) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll new file mode 100644 index 0000000000000..8c98f98249bf4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll @@ -0,0 +1,167 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k extract_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE
+; RUN: not veczc -k extract_element_ilegal -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s
+; RUN: veczc -k extract_element_uniform -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-UNI
+; RUN: veczc -k extract_element_uniform_vec -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-UNI-VEC
+; RUN: veczc -k extract_element_varying_indices -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-INDICES
+; RUN: veczc -k extract_element_bool -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-BOOL
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @extract_element(<4 x float> addrspace(1)* nocapture readonly %in, i32 %idx, float addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+ %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+ %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+ %2 = extractelement <4 x float> %1, i32 %idx
+ %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %2, float addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+; NOTE: Base packetization fails for this case.
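As an editorial aside (not part of the patch): the `extract_element` kernel above is the simplest case the EE checks further down verify. With a scalable vectorization factor, vecz concatenates each lane's `<4 x float>` subvector into one wide register group and lowers the varying `extractelement` to a single `riscv.vrgather.vv`, where lane `i` reads element `i*4 + idx`. A minimal scalar model of that index arithmetic, with illustrative names:

    // Scalar model of the gather-based lowering of extractelement.
    // 'data' holds vl contiguous 4-float subvectors (the packetized operand);
    // 'idx' holds one extract index per lane, assumed reduced into [0, 4).
    #include <cstddef>
    #include <vector>

    std::vector<float> gatherExtract(const std::vector<float> &data,
                                     const std::vector<std::size_t> &idx) {
      std::vector<float> out(idx.size());
      for (std::size_t lane = 0; lane < idx.size(); ++lane)
        out[lane] = data[lane * 4 + idx[lane]]; // vrgather index = lane*4 + idx
      return out;
    }

This is the computation the EE checks encode as a `stepvector` shifted left by 2 (the `lane*4` term) combined with the splatted or per-lane extract indices before the gather.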
+ +define spir_kernel void @extract_element_ilegal(<32 x float> addrspace(1)* nocapture readonly %in, i32 %idx, float addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %arrayidx = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %in, i64 %call + %0 = bitcast <32 x float> addrspace(1)* %arrayidx to <32 x float> addrspace(1)* + %1 = load <32 x float>, <32 x float> addrspace(1)* %0, align 64 + %2 = extractelement <32 x float> %1, i32 %idx + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %2, float addrspace(1)* %arrayidx3, align 4 + ret void +} + +define spir_kernel void @extract_element_uniform(<4 x float> %in, i32 %idx, float addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %0 = extractelement <4 x float> %in, i32 %idx + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %0, float addrspace(1)* %arrayidx3, align 4 + ret void +} + +define spir_kernel void @extract_element_uniform_vec(<4 x float> %in, float addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %i = urem i64 %call, 4 + %0 = extractelement <4 x float> %in, i64 %i + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %0, float addrspace(1)* %arrayidx3, align 4 + ret void +} + +define spir_kernel void @extract_element_varying_indices(<4 x float> addrspace(1)* %in, i32 addrspace(1)* %idxs, float addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call + %idx = load i32, i32 addrspace(1)* %arrayidxidx + %i = urem i32 %idx, 4 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx + %1 = extractelement <4 x float> %0, i32 %i + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %1, float addrspace(1)* %arrayidx3, align 4 + ret void +} + +define spir_kernel void @extract_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %idx, i32 addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call + %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4 + %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxb, align 4 + %2 = icmp slt <4 x i32> %0, %1 + %i = urem i64 %call, 4 + %3 = extractelement <4 x i1> %2, i64 %i + %4 = sext i1 %3 to i32 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %4, i32 addrspace(1)* %arrayidx3, align 4 + %5 = sext <4 x i1> %2 to <4 x i32> + %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out2, i64 %call + store <4 x i32> %5, <4 x i32> addrspace(1)* %arrayidx4, align 4 + ret void +} + +; EE-LABEL: @__vecz_nxv4_extract_element( +; EE: [[XLEN:%.*]] = call i64 @llvm.vscale.i64() +; EE-NEXT: [[TMP2:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 2 +; EE-NEXT: [[SPLATINSERT:%.*]] = insertelement poison, i32 [[IDX:%.*]], {{(i32|i64)}} 0 +; EE-NEXT: [[SPLAT:%.*]] = shufflevector [[SPLATINSERT]], poison, zeroinitializer +; EE-NEXT: [[IDX0:%.*]] = call 
@llvm.stepvector.nxv4i32() +; EE-NEXT: [[IDXSCALE:%.*]] = shl [[IDX0]], splat (i32 2) +; EE-NEXT: [[VS1:%.*]] = add [[IDXSCALE]], [[SPLAT]] +; EE-NEXT: [[T3:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[VS1]], i64 0) +; EE-NEXT: [[T4:%.*]] = call @llvm.riscv.vrgather.vv.nxv16f32.i64( poison, [[T1:%.*]], [[T3]], i64 [[TMP2]]) +; EE-NEXT: [[T5:%.*]] = call @llvm.vector.extract.nxv4f32.nxv16f32( [[T4]], i64 0) + +; Both the vector and index are uniform, so check we're not unnecessarily packetizing + +; EE-UNI-LABEL: @__vecz_nxv4_extract_element_uniform( +; EE-UNI: [[T0:%.*]] = extractelement <4 x float> %in, i32 %idx +; EE-UNI: [[T1:%.*]] = insertelement poison, float [[T0]], {{(i32|i64)}} 0 +; EE-UNI: [[T2:%.*]] = shufflevector [[T1]], poison, zeroinitializer +; EE-UNI: store [[T2]], ptr addrspace(1) {{%.*}}, align 4 + +; The vector is uniform and the index is varying, so we must broadcast the vector +; FIXME: Do we really need to broadcast? Can we mod the indices with the original vector length? + +; EE-UNI-VEC-LABEL: @__vecz_nxv4_extract_element_uniform_vec( +; EE-UNI-VEC: [[XLEN:%.*]] = call i64 @llvm.vscale.i64() +; EE-UNI-VEC: [[T3:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 2 +; EE-UNI-VEC-NEXT: [[T:%.*]] = trunc [[T2:%.*]] to +; EE-UNI-VEC-NEXT: [[I1:%.*]] = and [[T]], {{splat \(i32 3\)|trunc \( splat \(i64 3\) to \)}} +; EE-UNI-VEC-NEXT: [[IDX02:%.*]] = call @llvm.stepvector.nxv4i32() +; EE-UNI-VEC-NEXT: [[IDXSCALE:%.*]] = shl [[IDX02]], splat (i32 2) + +; EE-UNI-VEC-NEXT: [[VS1:%.*]] = or disjoint [[IDXSCALE]], [[I1]] + +; EE-UNI-VEC-NEXT: [[T4:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[VS1]], i64 0) +; EE-UNI-VEC-NEXT: [[T5:%.*]] = call @llvm.riscv.vrgather.vv.nxv16f32.i64( poison, [[T1:%.*]], [[T4]], i64 [[T3]]) +; EE-UNI-VEC-NEXT: [[T6:%.*]] = call @llvm.vector.extract.nxv4f32.nxv16f32( [[T5]], i64 0) + +; EE-INDICES-LABEL: @__vecz_nxv4_extract_element_varying_indices( +; EE-INDICES: [[XLEN:%.*]] = call i64 @llvm.vscale.i64() +; EE-INDICES-NEXT: [[T4:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 2 +; EE-INDICES-NEXT: [[IDX0:%.*]] = call @llvm.stepvector.nxv4i32() +; EE-INDICES-NEXT: [[IDXSCALE:%.*]] = shl [[IDX0]], splat (i32 2) +; EE-INDICES-NEXT: [[VS1:%.*]] = or disjoint [[IDXSCALE]], [[I1:%.*]] +; EE-INDICES-NEXT: [[T5:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[VS1]], i64 0) +; EE-INDICES-NEXT: [[T6:%.*]] = call @llvm.riscv.vrgather.vv.nxv16f32.i64( poison, [[T3:%.*]], [[T5]], i64 [[T4]]) +; EE-INDICES-NEXT: [[T7:%.*]] = call @llvm.vector.extract.nxv4f32.nxv16f32( [[T6]], i64 0) + +; Check we promote from i1 to i8 before doing our memops and use vrgatherei16. 
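To make the i1 handling above concrete, a small scalar model (an editorial sketch, not part of the patch): RVV gathers shuffle data registers rather than mask registers, so the packetizer sign-extends the booleans to bytes, shuffles them with `vrgatherei16` (16-bit indices keep the gather indices representable where i8 indices would overflow), and truncates the gathered bytes back to i1.

    // Scalar model of shuffling an i1 vector: sign-extend to i8, gather
    // using 16-bit indices, then truncate the result back to i1.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<bool> gatherBools(const std::vector<bool> &mask,
                                  const std::vector<std::uint16_t> &idx) {
      std::vector<std::int8_t> bytes(mask.size());
      for (std::size_t i = 0; i < mask.size(); ++i)
        bytes[i] = mask[i] ? -1 : 0;   // sext i1 -> i8
      std::vector<bool> out(idx.size());
      for (std::size_t i = 0; i < idx.size(); ++i)
        out[i] = bytes[idx[i]] != 0;   // vrgatherei16.vv, then trunc i8 -> i1
      return out;
    }

The EE-BOOL checks that follow pin down exactly this sequence: a `sext` of the comparison result, a `vrgatherei16` over the widened bytes, and a final `trunc`.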
+; EE-BOOL-LABEL: @__vecz_nxv4_extract_element_bool( +; EE-BOOL: [[T6:%.*]] = sext [[T5:%.*]] to +; EE-BOOL-NEXT: [[XLEN:%.*]] = call i64 @llvm.vscale.i64() +; EE-BOOL-NEXT: [[T7:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 2 +; EE-BOOL-NEXT: [[T8:%.*]] = trunc [[T0:%.*]] to +; EE-BOOL-NEXT: [[T9:%.*]] = and [[T8]], {{splat \(i16 3\)|trunc \( splat \(i64 3\) to \)}} +; EE-BOOL-NEXT: [[T10:%.*]] = call @llvm.stepvector.nxv4i16() +; EE-BOOL-NEXT: [[T11:%.*]] = shl [[T10]], splat (i16 2) +; EE-BOOL-NEXT: [[VS1:%.*]] = or disjoint [[T11]], [[T9]] +; EE-BOOL-NEXT: [[T12:%.*]] = call @llvm.vector.insert.nxv16i16.nxv4i16( poison, [[VS1]], i64 0) +; EE-BOOL-NEXT: [[T13:%.*]] = call @llvm.riscv.vrgatherei16.vv.nxv16i8.i64( poison, [[T6]], [[T12]], i64 [[T7]]) +; EE-BOOL-NEXT: [[T14:%.*]] = call @llvm.vector.extract.nxv4i8.nxv16i8( [[T13]], i64 0) +; EE-BOOL-NEXT: [[T15:%.*]] = trunc [[T14]] to diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll new file mode 100644 index 0000000000000..782982d0447ee --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll @@ -0,0 +1,136 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k insert_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE +; RUN: veczc -k insert_element_uniform -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-UNI +; RUN: veczc -k insert_element_varying_indices -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-INDICES +; RUN: not veczc -k insert_element_illegal -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s +; RUN: veczc -k insert_element_bool -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-BOOL + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @insert_element(<4 x float> addrspace(1)* nocapture readonly %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)* + %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16 + %2 = insertelement <4 x float> %1, float %val, i32 %idx + %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call + store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 4 + ret void +} + +define spir_kernel void @insert_element_uniform(<4 x float> %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %0 = insertelement <4 x float> %in, float %val, i32 %idx + %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call + store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx3, align 4 + ret void +} + +define spir_kernel void @insert_element_varying_indices(<4 x float> addrspace(1)* nocapture readonly %in, i32 addrspace(1)* %idxs, <4 x float> addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call + %idx = load i32, i32 addrspace(1)* %arrayidxidx + %i = urem i32 %idx, 4 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)* + %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16 + %fidx = uitofp i64 %call to float + %2 = insertelement <4 x float> %1, float %fidx, i32 %i + %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call + store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 4 + ret void +} + +define spir_kernel void @insert_element_illegal(<32 x float> addrspace(1)* nocapture readonly %in, i32 addrspace(1)* %idxs, <32 x float> addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call + %idx = load i32, i32 addrspace(1)* %arrayidxidx, align 4 + %i = urem i32 %idx, 32 + %arrayidx = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %in, i64 %call + %0 = bitcast <32 x float> 
addrspace(1)* %arrayidx to <32 x float> addrspace(1)* + %1 = load <32 x float>, <32 x float> addrspace(1)* %0, align 64 + %fidx = uitofp i64 %call to float + %2 = insertelement <32 x float> %1, float %fidx, i32 %i + %arrayidx3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %out, i64 %call + store <32 x float> %2, <32 x float> addrspace(1)* %arrayidx3, align 64 + ret void +} + +define spir_kernel void @insert_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %val, i32 %idx, <4 x i32> addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call + %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4 + %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxb, align 4 + %2 = icmp slt <4 x i32> %0, %1 + %i = urem i64 %call, 4 + %v = trunc i32 %val to i1 + %3 = insertelement <4 x i1> %2, i1 %v, i64 %i + %4 = sext <4 x i1> %3 to <4 x i32> + %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %4, <4 x i32> addrspace(1)* %arrayidx4, align 4 + ret void +} + +; IE-LABEL: @__vecz_nxv4_insert_element( +; IE: [[SPLATINSERT:%.*]] = insertelement poison, float [[VAL:%.*]], {{(i32|i64)}} 0 +; IE: [[SPLAT:%.*]] = shufflevector [[SPLATINSERT]], poison, zeroinitializer +; IE: [[XLEN:%.*]] = call i64 @llvm.vscale.i64() +; IE-NEXT: [[TMP2:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 4 +; IE-NEXT: [[SPLATINSERT1:%.*]] = insertelement poison, i32 [[IDX:%.*]], {{(i32|i64)}} 0 +; IE-NEXT: [[SPLAT2:%.*]] = shufflevector [[SPLATINSERT1]], poison, zeroinitializer +; IE-NEXT: [[ELTS:%.*]] = call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[SPLAT]], i64 0) +; IE-NEXT: [[STEP:%.*]] = call @llvm.stepvector.nxv16i32() +; IE-NEXT: [[INNER:%.*]] = and [[STEP]], splat (i32 3) +; IE-NEXT: [[OUTER:%.*]] = lshr [[STEP]], splat (i32 2) +; IE-NEXT: [[VM:%.*]] = icmp eq [[SPLAT2]], [[INNER]] +; IE-NEXT: [[TMP8:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv16f32.i64( [[TMP1:%.*]], [[ELTS]], [[OUTER]], [[VM]], i64 [[TMP2]]{{(, i64 1)?}}) + +; Both the vector and index are uniform, so check we're not unnecessarily packetizing + +; IE-UNI-LABEL: @__vecz_nxv4_insert_element_uniform( +; IE-UNI: {{%.*}} = insertelement <4 x float> %in, float %val, {{(i32|i64)}} %idx + +; IE-INDICES-LABEL: @__vecz_nxv4_insert_element_varying_indices( +; IE-INDICES: [[FIDX2:%.*]] = uitofp [[TMP0:%.*]] to +; IE-INDICES-NEXT: [[XLEN:%.*]] = call i64 @llvm.vscale.i64() +; IE-INDICES-NEXT: [[TMP5:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 4 +; IE-INDICES-NEXT: [[VS2:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, {{%.*}}, i64 0) +; IE-INDICES: [[IDX0:%.*]] = call @llvm.stepvector.nxv16i32() +; IE-INDICES-NEXT: [[IDX1:%.*]] = lshr [[IDX0]], splat (i32 2) +; IE-INDICES-NEXT: [[TMP9:%.*]] = call @llvm.riscv.vrgather.vv.nxv16i32.i64( poison, [[VS2:%.*]], [[IDX1]], i64 [[TMP5]]) +; IE-INDICES-NEXT: [[VS25:%.*]] = call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[FIDX2]], i64 0) +; IE-INDICES-NEXT: [[INNER:%.*]] = and [[IDX0]], splat (i32 3) +; IE-INDICES-NEXT: [[VM:%.*]] = icmp eq [[TMP9]], [[INNER]] +; IE-INDICES-NEXT: [[TMP11:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv16f32.i64( [[TMP4:%.*]], [[VS25]], [[IDX1]], [[VM]], i64 [[TMP5]]{{(, i64 1)?}}) + +; Check we promote from i1 to i8 before doing our memops +; IE-BOOL-LABEL: 
@__vecz_nxv4_insert_element_bool(
+; IE-BOOL-DAG: [[T1:%.*]] = sext {{%.*}} to
+; IE-BOOL-DAG: [[T0:%.*]] = sext {{%.*}} to
+; IE-BOOL: [[TMP18:%.*]] = call @llvm.riscv.vrgatherei16.vv.mask.nxv16i8.i64( [[TMP7:%.*]], {{%.*}}, [[TMP16:%.*]], [[VM:%.*]], i64 [[TMP8:%.*]])
+; IE-BOOL-NEXT: [[TMP19:%.*]] = trunc [[TMP18]] to
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg
new file mode 100644
index 0000000000000..8b3578af8d21e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg
@@ -0,0 +1,20 @@
+# Copyright (C) Codeplay Software Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+# Exceptions; you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+if 'RISCV' not in config.root.targets:
+    config.unsupported = True
+
+config.substitutions.append(('%vattr', '+v'))
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
new file mode 100644
index 0000000000000..c80338ff7de9c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
@@ -0,0 +1,42 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @f(<4 x i32> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %in.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %gid + %in.data = load <4 x i32>, <4 x i32> addrspace(1)* %in.ptr + %out.data = shufflevector <4 x i32> %in.data, <4 x i32> poison, <4 x i32> + %out.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid + store <4 x i32> %out.data, <4 x i32> addrspace(1)* %out.ptr, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +; It checks that a single-operand shuffle that doesn't change the length is packetized to a gather intrinsic. +; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} { +; CHECK: entry: +; CHECK: %[[DATA:.+]] = load , {{( addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}} +; CHECK: %[[GATHER:.+]] = call @llvm.riscv.vrgather.vv.nxv16i32.i64( poison, %[[DATA]], %{{.+}}, i64 %{{.+}}) +; CHECK: store %[[GATHER]] +; CHECK: ret void +; CHECK: } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll new file mode 100644 index 0000000000000..d0ec0c5e6ce07 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @f(<4 x i32> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %in.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %gid + %in.data = load <4 x i32>, <4 x i32> addrspace(1)* %in.ptr + %in.bool = icmp ne <4 x i32> %in.data, zeroinitializer + %out.data = shufflevector <4 x i1> %in.bool, <4 x i1> poison, <4 x i32> + %out.sext = sext <4 x i1> %out.data to <4 x i32> + %out.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid + store <4 x i32> %out.sext, <4 x i32> addrspace(1)* %out.ptr, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +; It checks that a single-operand shuffle that doesn't change the length is packetized to a gather intrinsic, +; and that it works with a vector of i1 type by temporarily extending to i8. +; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} { +; CHECK: entry: +; CHECK: %[[DATA:.+]] = load , {{( addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}} +; CHECK: %[[DATA_i1:.+]] = icmp ne %[[DATA]], zeroinitializer +; CHECK: %[[DATA_i8:.+]] = zext %[[DATA_i1]] to +; CHECK: %[[GATHER:.+]] = call @llvm.riscv.vrgatherei16.vv.nxv16i8.i64( poison, %[[DATA_i8]], %{{.+}}, i64 %{{.+}}) +; CHECK: %[[GATHER_i1:.+]] = trunc %[[GATHER]] to +; CHECK: %[[RESULT:.+]] = sext %[[GATHER_i1]] to +; CHECK: store %[[RESULT]] +; CHECK: ret void +; CHECK: } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll new file mode 100644 index 0000000000000..cf0b5b3ac8d5a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @f(<2 x i32> addrspace(1)* %a, <2 x i32> addrspace(1)* %b, <4 x i32> addrspace(1)* %out) {
+entry:
+ %gid = call i64 @__mux_get_global_id(i32 0)
+ %a.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 %gid
+ %a.data = load <2 x i32>, <2 x i32> addrspace(1)* %a.ptr
+ %b.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 %gid
+ %b.data = load <2 x i32>, <2 x i32> addrspace(1)* %b.ptr
+ %out.data = shufflevector <2 x i32> %a.data, <2 x i32> %b.data, <4 x i32>
+ %out.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+ store <4 x i32> %out.data, <4 x i32> addrspace(1)* %out.ptr, align 32
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32) #1
+
+; It checks that a two-operand shuffle is packetized to two gather intrinsics and a select.
+; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} {
+; CHECK: entry:
+; CHECK: %[[DATA:.+]] = load , {{( addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
+; CHECK: %[[DATB:.+]] = load , {{( addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
+; CHECK: %[[WIDENA:.+]] = call @llvm.vector.insert.nxv16i32.nxv8i32( poison, %[[DATA]], i64 0)
+; CHECK: %[[GATHERA:.+]] = call @llvm.riscv.vrgather.vv.nxv16i32.i64( poison, %[[WIDENA]], %{{.+}}, i64 %{{.+}})
+; CHECK: %[[WIDENB:.+]] = call @llvm.vector.insert.nxv16i32.nxv8i32( poison, %[[DATB]], i64 0)
+; CHECK: %[[GATHERB:.+]] = call @llvm.riscv.vrgather.vv.nxv16i32.i64( poison, %[[WIDENB]], %{{.+}}, i64 %{{.+}})
+; CHECK: %[[SELECT:.+]] = select %{{.+}}, %[[GATHERB]], %[[GATHERA]]
+; CHECK: store %[[SELECT]]
+; CHECK: ret void
+; CHECK: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
new file mode 100644
index 0000000000000..8c28d3762451d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
@@ -0,0 +1,43 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @f(<4 x i32> addrspace(1)* %in, <2 x i32> addrspace(1)* %out) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %in.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %gid + %in.data = load <4 x i32>, <4 x i32> addrspace(1)* %in.ptr + %out.data = shufflevector <4 x i32> %in.data, <4 x i32> poison, <2 x i32> + %out.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i64 %gid + store <2 x i32> %out.data, <2 x i32> addrspace(1)* %out.ptr, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +; It checks that a single-operand shuffle that narrows the vector is packetized to a gather intrinsic. +; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} { +; CHECK: entry: +; CHECK: %[[DATA:.+]] = load , {{( addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}} +; CHECK: %[[GATHER:.+]] = call @llvm.riscv.vrgather.vv.nxv16i32.i64( poison, %[[DATA]], %{{.+}}, i64 %{{.+}}) +; CHECK: %[[EXTRACT:.+]] = call @llvm.vector.extract.nxv8i32.nxv16i32( %[[GATHER]], i64 0) +; CHECK: store %[[EXTRACT]] +; CHECK: ret void +; CHECK: } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll new file mode 100644 index 0000000000000..789e091a7e7b3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll @@ -0,0 +1,43 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @f(<2 x i32> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %in.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %in, i64 %gid + %in.data = load <2 x i32>, <2 x i32> addrspace(1)* %in.ptr + %out.data = shufflevector <2 x i32> %in.data, <2 x i32> poison, <4 x i32> + %out.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid + store <4 x i32> %out.data, <4 x i32> addrspace(1)* %out.ptr, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +; It checks that a single-operand shuffle that widens the vector is packetized to a gather intrinsic. 
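For reference, an editorial sketch (not part of the patch) of the index arithmetic behind packetizing a length-changing single-operand shuffle: the fixed shuffle mask is replicated once per lane, each copy offset by `lane * input_width`, and the whole shuffle becomes one gather over the packetized input. The mask constant is elided in this rendering of the test, so the `{0, 1, 0, 1}` below is purely illustrative.

    // Scalar model of a packetized single-operand shufflevector; for the
    // widening test above, inWidth would be 2 and the mask something
    // like {0, 1, 0, 1}.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<std::int32_t>
    packetizedShuffle(const std::vector<std::int32_t> &in, // vl * inWidth elements
                      const std::vector<int> &mask,
                      std::size_t inWidth, std::size_t vl) {
      std::vector<std::int32_t> out;
      out.reserve(mask.size() * vl);
      for (std::size_t lane = 0; lane < vl; ++lane)
        for (int m : mask)                       // replicate the mask per lane
          out.push_back(in[lane * inWidth + m]); // gather index = lane*inWidth + m
      return out;
    }

The CHECK lines that follow verify the scalable form of this: the narrowed or widened fixed vectors are inserted into (or extracted from) a wide register group around a single `vrgather` intrinsic.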
+; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} { +; CHECK: entry: +; CHECK: %[[DATA:.+]] = load , {{( addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}} +; CHECK: %[[WIDEN:.+]] = call @llvm.vector.insert.nxv16i32.nxv8i32( poison, %[[DATA]], i64 0) +; CHECK: %[[GATHER:.+]] = call @llvm.riscv.vrgather.vv.nxv16i32.i64( poison, %[[WIDEN]], %{{.+}}, i64 %{{.+}}) +; CHECK: store %[[GATHER]] +; CHECK: ret void +; CHECK: } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll new file mode 100644 index 0000000000000..8af9cb06320bf --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k select_scalar_vector -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @select_scalar_vector(i32* %aptr, i32* %bptr, <2 x i32>* %cptr, <2 x i32>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx + %arrayidxc = getelementptr inbounds <2 x i32>, <2 x i32>* %cptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i32>, <2 x i32>* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load i32, i32* %arrayidxb, align 4 + %c = load <2 x i32>, <2 x i32>* %arrayidxc, align 4 + %cmp = icmp slt i32 %a, %b + %sel = select i1 %cmp, <2 x i32> %c, <2 x i32> + store <2 x i32> %sel, <2 x i32>* %arrayidxz, align 4 + ret void +} + +; CHECK: define spir_kernel void @__vecz_nxv4_select_scalar_vector +; CHECK: [[rhs:%.*]] = load , ptr +; CHECK: [[cmp1:%.*]] = icmp slt +; CHECK: [[sext:%.*]] = sext [[cmp1]] to +; CHECK: [[idx0:%.*]] = call @llvm.stepvector.nxv8i16() +; CHECK: [[idx1:%.*]] = lshr [[idx0]], splat (i16 1) +; CHECK: [[gather:%.*]] = call @llvm.riscv.vrgatherei16.vv.nxv8i8.i64( poison, [[vs2:%.*]], [[vs1:%.*]], i64 [[xlen:%.*]]) +; CHECK: [[cmp:%.*]] = trunc [[gather]] to +; CHECK: [[sel:%.*]] = select [[cmp]], [[rhs]], splat (i32 4) +; CHECK: store [[sel]], diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll new file mode 100644 index 0000000000000..7ab9888ec9b91 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll @@ -0,0 +1,99 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 
(the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=4 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-STORE-4 +; RUN: veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=8 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-STORE-8 +; RUN: veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=16 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-STORE-16 +; RUN: veczc -k load_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=4 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-LOAD-4 +; RUN: veczc -k load_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=8 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-LOAD-8 +; RUN: veczc -k load_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=16 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-LOAD-16 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @store_element(i32 %0, i32 addrspace(1)* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %cond = icmp ne i64 %call, 0 + br i1 %cond, label %do, label %ret + +do: + %dest = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + store i32 %0, i32 addrspace(1)* %dest, align 4 + br label %ret + +ret: + ret void +} + +; CHECK-STORE-4: define void @__vecz_b_masked_store4_vp_u5nxv4ju3ptrU3AS1u5nxv4bj( [[TMP0:%.*]], ptr addrspace(1) [[TMP1:%.*]], [[TMP2:%.*]], i32 [[TMP3:%.*]]) +; CHECK-STORE-4-NEXT: entry: +; CHECK-STORE-4-NEXT: call void @llvm.vp.store.nxv4i32.p1( [[TMP0]], ptr addrspace(1) [[TMP1]], [[TMP2]], i32 [[TMP3]]) +; CHECK-STORE-4-NEXT: ret void + +; CHECK-STORE-8: define void @__vecz_b_masked_store4_vp_u5nxv8ju3ptrU3AS1u5nxv8bj( [[TMP0:%.*]], ptr addrspace(1) [[TMP1:%.*]], [[TMP2:%.*]], i32 [[TMP3:%.*]]) +; CHECK-STORE-8-NEXT: entry: +; CHECK-STORE-8-NEXT: call void @llvm.vp.store.nxv8i32.p1( [[TMP0]], ptr addrspace(1) [[TMP1]], [[TMP2]], i32 [[TMP3]]) +; CHECK-STORE-8-NEXT: ret void + +; CHECK-STORE-16: define void @__vecz_b_masked_store4_vp_u6nxv16ju3ptrU3AS1u6nxv16bj( [[TMP0:%.*]], ptr addrspace(1) [[TMP1:%.*]], [[TMP2:%.*]], i32 [[TMP3:%.*]]) +; CHECK-STORE-16-NEXT: entry: +; CHECK-STORE-16-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; CHECK-STORE-16-NEXT: [[SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP3]], {{i32|i64}} 0 +; CHECK-STORE-16-NEXT: 
[[SPLAT:%.*]] = shufflevector [[SPLATINSERT]], poison, zeroinitializer +; CHECK-STORE-16-NEXT: [[TMP6:%.*]] = icmp ult [[TMP5]], [[SPLAT]] +; CHECK-STORE-16-NEXT: [[TMP7:%.*]] = select [[TMP2]], [[TMP6]], zeroinitializer +; CHECK-STORE-16-NEXT: call void @llvm.masked.store.nxv16i32.p1( [[TMP0]], ptr addrspace(1) [[TMP1]], i32 4, [[TMP7]]) +; CHECK-STORE-16-NEXT: ret void + +define spir_kernel void @load_element(i32 addrspace(1)* %a, i32 addrspace(1)* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %cond = icmp ne i64 %call, 0 + br i1 %cond, label %do, label %ret + +do: + %src = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call + %dest = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + %do.res = load i32, i32 addrspace(1)* %src, align 4 + store i32 %do.res, i32 addrspace(1)* %dest, align 4 + br label %ret + +ret: + ret void +} + +; CHECK-LOAD-4: define @__vecz_b_masked_load4_vp_u5nxv4ju3ptrU3AS1u5nxv4bj(ptr addrspace(1) [[TMP0:%.*]], [[TMP1:%.*]], i32 [[TMP2:%.*]]) +; CHECK-LOAD-4-NEXT: entry: +; CHECK-LOAD-4-NEXT: [[TMP4:%.*]] = call @llvm.vp.load.nxv4i32.p1(ptr addrspace(1) [[TMP0]], [[TMP1]], i32 [[TMP2]]) +; CHECK-LOAD-4-NEXT: ret [[TMP4]] + +; CHECK-LOAD-8: define @__vecz_b_masked_load4_vp_u5nxv8ju3ptrU3AS1u5nxv8bj(ptr addrspace(1) [[TMP0:%.*]], [[TMP1:%.*]], i32 [[TMP2:%.*]]) +; CHECK-LOAD-8-NEXT: entry: +; CHECK-LOAD-8-NEXT: [[TMP4:%.*]] = call @llvm.vp.load.nxv8i32.p1(ptr addrspace(1) [[TMP0]], [[TMP1]], i32 [[TMP2]]) +; CHECK-LOAD-8-NEXT: ret [[TMP4]] + +; CHECK-LOAD-16: define @__vecz_b_masked_load4_vp_u6nxv16ju3ptrU3AS1u6nxv16bj(ptr addrspace(1) [[TMP0:%.*]], [[TMP1:%.*]], i32 [[TMP2:%.*]]) +; CHECK-LOAD-16-NEXT: entry: +; CHECK-LOAD-16-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() +; CHECK-LOAD-16-NEXT: [[TMPSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP2]], {{i32|i64}} 0 +; CHECK-LOAD-16-NEXT: [[TMPSPLAT:%.*]] = shufflevector [[TMPSPLATINSERT]], poison, zeroinitializer +; CHECK-LOAD-16-NEXT: [[TMP5:%.*]] = icmp ult [[TMP4]], [[TMPSPLAT]] +; CHECK-LOAD-16-NEXT: [[TMP6:%.*]] = select [[TMP1]], [[TMP5]], zeroinitializer +; CHECK-LOAD-16-NEXT: [[TMP7:%.*]] = call @llvm.masked.load.nxv16i32.p1(ptr addrspace(1) [[TMP0]], i32 4, [[TMP6]], poison) +; CHECK-LOAD-16-NEXT: ret [[TMP7]] + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll new file mode 100644 index 0000000000000..7823d56291ac5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll @@ -0,0 +1,46 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+v -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @load_add_store(i32* %aptr, i32* %bptr, i32* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load i32, i32* %arrayidxb, align 4 + %sum = add i32 %a, %b + store i32 %sum, i32* %arrayidxz, align 4 + ret void +} + +; CHECK: define spir_kernel void @__vecz_nxv4_vp_load_add_store +; CHECK: %local.id = call i64 @__mux_get_local_id(i32 0) +; CHECK: %local.size = call i64 @__mux_get_local_size(i32 0) +; CHECK: %work.remaining = sub nuw nsw i64 %local.size, %local.id +; CHECK: %[[vli64:.+]] = call i64 @llvm.riscv.vsetvli.i64(i64 %work.remaining, i64 2, i64 1) +; CHECK: %[[vl:.+]] = trunc nuw i64 %[[vli64]] to i32 +; CHECK: %[[lhs:.+]] = call @llvm.vp.load.nxv4i32.p0({{.*}}, i32 %[[vl]]) +; CHECK: %[[rhs:.+]] = call @llvm.vp.load.nxv4i32.p0({{.*}}, i32 %[[vl]]) +; CHECK: %[[sum:.+]] = call @llvm.vp.add.nxv4i32( %[[lhs]], %[[rhs]], {{.*}}, i32 %[[vl]]) +; CHECK: call void @llvm.vp.store.nxv4i32.p0( %[[sum]], {{.*}}, i32 %[[vl]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll new file mode 100644 index 0000000000000..aa4559aad057e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll @@ -0,0 +1,179 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -vecz-passes="function(instcombine),packetizer,gvn,function(instcombine)" -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @__mux_get_global_id(i32)
+
+define dso_local spir_kernel void @vector_broadcast_const(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fadd <4 x float> %1, <float 0x7FF8000020000000, float 0x7FF8000020000000, float 0x7FF8000020000000, float 0x7FF8000020000000>
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+define dso_local spir_kernel void @vector_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fadd <4 x float> %1, %addend
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+define dso_local spir_kernel void @vector_broadcast_regression(<4 x float> addrspace(1)* nocapture readonly %in, i32 %nancode, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x i32> addrspace(1)*
+  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %0, align 16
+  %and1.i.i.i1.i = and <4 x i32> %1, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+  %cmp.i.i.i2.i = icmp ne <4 x i32> %and1.i.i.i1.i, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+  %and2.i.i.i3.i = and <4 x i32> %1, <i32 8388607, i32 8388607, i32 8388607, i32 8388607>
+  %cmp3.i.i.i4.i = icmp eq <4 x i32> %and2.i.i.i3.i, zeroinitializer
+  %2 = or <4 x i1> %cmp.i.i.i2.i, %cmp3.i.i.i4.i
+  %3 = bitcast <4 x i32> %1 to <4 x float>
+  %4 = select <4 x i1> %2, <4 x float> %3, <4 x float> <float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000>
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %4, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+; Check that new instructions aren't inserted before pre-existing allocas
+define dso_local spir_kernel void @vector_broadcast_insertpt(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, i32 %nancode, <4 x float> addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) local_unnamed_addr #0 {
+entry:
+  %existing.alloc = alloca <4 x i32>
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  store <4 x i32> zeroinitializer, <4 x i32>* %existing.alloc
+  %scalar = bitcast <4 x i32>* %existing.alloc to i32*
+  store i32 1, i32* %scalar
+  %v = load <4 x i32>, <4 x i32>* %existing.alloc
+  %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out2, i64 %call
+  store <4 x i32> %v, <4 x i32> addrspace(1)* %arrayidx4, align 16
+
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %op = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %v4 = fadd <4 x float> %op, %addend
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %v4, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+define dso_local spir_kernel void @vector_mask_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x i1> %input, <4 x float> %woof, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fcmp oeq <4 x float> %1, 
+  %3 = and <4 x i1> %2, %input
+  %4 = select <4 x i1> %3, <4 x float> %1, <4 x float> %woof
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %4, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_const(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT: store <vscale x 16 x float> {{shufflevector \(<vscale x 16 x float> insertelement \(<vscale x 16 x float> poison, float 0x7FF8000020000000, (i32|i64) 0\), <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(float 0x7FF8000020000000\)}}, ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT: ret void
+
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[FIXLEN_ALLOC:%.*]] = alloca <4 x float>, align 16
+; CHECK-NEXT: store <4 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC]], align 16
+; CHECK-NEXT: [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
+; CHECK-NEXT: [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 3, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
+; CHECK-NEXT: [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
+; CHECK-NEXT: [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x float> poison)
+; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT: [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <vscale x 16 x float> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT: store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT: ret void
+
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_regression(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 16
+; CHECK-NEXT: [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 2139095040, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2139095040\)}}
+; CHECK-NEXT: [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 2139095040, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2139095040\)}}
+; CHECK-NEXT: [[AND2_I_I_I3_I3:%.*]] = and <vscale x 16 x i32> [[TMP1]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 8388607, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 8388607\)}}
+; CHECK-NEXT: [[CMP3_I_I_I4_I4:%.*]] = icmp eq <vscale x 16 x i32> [[AND2_I_I_I3_I3]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = or <vscale x 16 x i1> [[CMP_I_I_I2_I2]], [[CMP3_I_I_I4_I4]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i32> [[TMP1]] to <vscale x 16 x float>
+; CHECK-NEXT: [[TMP4:%.*]] = select <vscale x 16 x i1> [[TMP2]], <vscale x 16 x float> [[TMP3]], <vscale x 16 x float> {{shufflevector \(<vscale x 16 x float> insertelement \(<vscale x 16 x float> poison, float 0x7FF0000020000000, (i32|i64) 0\), <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(float 0x7FF0000020000000\)}}
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT: store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_insertpt(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[EXISTING_ALLOC:%.*]] = alloca <4 x i32>, align 16
+; CHECK-NEXT: [[FIXLEN_ALLOC:%.*]] = alloca <4 x i32>, align 16
+; CHECK-NEXT: [[FIXLEN_ALLOC1:%.*]] = alloca <4 x float>, align 16
+; CHECK-NEXT: store <4 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC1]], align 16
+; CHECK-NEXT: [[IDX03:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
+; CHECK-NEXT: [[IDX14:%.*]] = and <vscale x 16 x i32> [[IDX03]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 3, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
+; CHECK-NEXT: [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
+; CHECK-NEXT: [[VEC_ALLOC5:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC1]], <vscale x 16 x i64> [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC5]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x float> poison)
+; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[EXISTING_ALLOC]], align 16
+; CHECK-NEXT: store i32 1, ptr [[EXISTING_ALLOC]], align
+; CHECK-NEXT: [[V:%.*]] = load <4 x i32>, ptr [[EXISTING_ALLOC]], align 16
+; CHECK-NEXT: store <4 x i32> [[V]], ptr [[FIXLEN_ALLOC]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
+; CHECK-NEXT: [[VEC_ALLOC:%.*]] = getelementptr inbounds i32, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x i32> poison)
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]]
+; CHECK-NEXT: store <vscale x 16 x i32> [[TMP3]], ptr addrspace(1) [[ARRAYIDX4]], align 16
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT: [[TMP6:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
+; CHECK-NEXT: [[V46:%.*]] = fadd <vscale x 16 x float> [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT: store <vscale x 16 x float> [[V46]], ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT: ret void
+;
+; CHECK-LABEL: @__vecz_nxv4_vector_mask_broadcast(
+; CHECK-NEXT: entry:
+; CHECK: [[FIXLEN_MASK_ALLOC:%.*]] = alloca <4 x i8>, align 4
+; CHECK: [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
+; CHECK: [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 3, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
+; CHECK: [[SEXT:%.*]] = sext <4 x i1> [[INPUT:%.*]] to <4 x i8>
+; CHECK: store <4 x i8> [[SEXT]], ptr [[FIXLEN_MASK_ALLOC]], align 4
+; CHECK: [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
+; CHECK: [[VEC_ALLOC:%.*]] = getelementptr inbounds i8, ptr [[FIXLEN_MASK_ALLOC]], <vscale x 16 x i64> [[TMP0]]
+; CHECK: [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 1, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x i8> poison)
+; CHECK: [[BMASK:%.*]] = trunc <vscale x 16 x i8> [[TMP1]] to <vscale x 16 x i1>
+; CHECK: {{.*}} = and <vscale x 16 x i1> {{.*}}, [[BMASK]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
new file mode 100644
index 0000000000000..f58b2bd62f539
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k builtins -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @builtins(float* %aptr, float* %bptr, i32* %zptr) {
+entry:
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds float, float* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
+  %a = load float, float* %arrayidxa, align 4
+  %b = load float, float* %arrayidxb, align 4
+  %cmp = call spir_func i32 @_Z9isgreaterff(float %a, float %b)
+  store i32 %cmp, i32* %arrayidxz, align 4
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+declare spir_func i32 @_Z9isgreaterff(float, float)
+
+; CHECK: void @__vecz_nxv4_builtins
+; CHECK: = fcmp ogt <vscale x 4 x float> %{{.*}}, %{{.*}}
+; CHECK: = zext <vscale x 4 x i1> %relational2 to <vscale x 4 x i32>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
new file mode 100644
index 0000000000000..484415bb395db
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
@@ -0,0 +1,35 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k cast -vecz-scalable -vecz-simd-width=8 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @cast(i32* %aptr, float* %zptr) {
+entry:
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %c = sitofp i32 %a to float
+  store float %c, float* %arrayidxz, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv8_cast
+; CHECK: sitofp <vscale x 8 x i32> {{%[0-9]+}} to <vscale x 8 x float>
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
new file mode 100644
index 0000000000000..5d29c785dab6a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
@@ -0,0 +1,93 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -w 4 -vecz-scalable -vecz-passes=packetizer,verify -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK: define spir_kernel void @__vecz_nxv4_test_fn(ptr %p, ptr %q, ptr %r)
+define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) {
+entry:
+; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr %p, i64 0
+; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <vscale x 4 x ptr> [[SPLAT_PTR_INS]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+  %call = call i64 @__mux_get_global_id(i32 0)
+
+; Test that this cmpxchg is packetized by generating a call to an all-true masked version.
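+; A rough sketch of the call being checked for below (the signature is
+; inferred from the mangled name and the CHECK-SAME operands, so treat it as
+; an assumed shape rather than part of the test):
+;   %res = call { <vscale x 4 x i32>, <vscale x 4 x i1> }
+;       @__vecz_b_nxv4_masked_cmpxchg_align4_acquire_monotonic_1_u9nxv4u3ptru5nxv4ju5nxv4ju5nxv4b(
+;       <vscale x 4 x ptr> %ptrs, <vscale x 4 x i32> %cmp, <vscale x 4 x i32> %new, <vscale x 4 x i1> %mask)
+; The mask is all-true because this cmpxchg executes unconditionally.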
+; CHECK: [[A0:%.*]] = call { , } @__vecz_b_nxv4_masked_cmpxchg_align4_acquire_monotonic_1_u9nxv4u3ptru5nxv4ju5nxv4ju5nxv4b( +; CHECK-SAME: [[SPLAT_PTR]], +; CHECK-SAME: {{shufflevector \( insertelement \( poison, i32 1, i64 0\), poison, zeroinitializer\)|splat \(i32 1\)}} +; CHECK-SAME: {{shufflevector \( insertelement \( poison, i32 2, i64 0\), poison, zeroinitializer\)|splat \(i32 2\)}} +; CHECK-SAME: {{shufflevector \( insertelement \( poison, i1 true, i64 0\), poison, zeroinitializer\)|splat \(i1 true\)}} + %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic +; CHECK: [[EXT0:%.*]] = extractvalue { , } [[A0]], 0 + %val0 = extractvalue { i32, i1 } %old0, 0 +; CHECK: [[EXT1:%.*]] = extractvalue { , } [[A0]], 1 + %success0 = extractvalue { i32, i1 } %old0, 1 + + %out = getelementptr i32, ptr %q, i64 %call +; Stored as a vector +; CHECK: store [[EXT0]], ptr + store i32 %val0, ptr %out, align 4 + +; CHECK: [[PTR:%.*]] = getelementptr i8, ptr %r, i64 %call + %outsuccess = getelementptr i8, ptr %r, i64 %call +; CHECK: [[ZEXT0:%.*]] = zext [[EXT1]] to + %outbyte = zext i1 %success0 to i8 +; Stored as a vector +; CHECK: store [[ZEXT0]], ptr [[PTR]], align 1 + store i8 %outbyte, ptr %outsuccess, align 1 + + ; Test a couple of insert/extract patterns + + ; Test inserting a uniform value into a varying literal struct +; CHECK: [[INS0:%.*]] = insertvalue { , } [[A0]], zeroinitializer, 1 +; CHECK: [[EXT2:%.*]] = extractvalue { , } [[INS0]], 1 +; CHECK: [[ZEXT1:%.*]] = zext [[EXT2]] to +; CHECK: store [[ZEXT1]], ptr [[PTR]], align 1 + %testinsertconst = insertvalue { i32, i1 } %old0, i1 false, 1 + %testextract0 = extractvalue { i32, i1 } %testinsertconst, 1 + %outbyte0 = zext i1 %testextract0 to i8 + store i8 %outbyte0, ptr %outsuccess, align 1 + + ; Test inserting a varying value into a varying literal struct +; CHECK: [[LD:%.*]] = load , ptr +; CHECK: [[VBOOL:%.*]] = trunc [[LD]] to +; CHECK: [[INS1:%.*]] = insertvalue { , } [[A0]], [[VBOOL]], 1 +; CHECK: [[EXT3:%.*]] = extractvalue { , } [[INS1]], 1 +; CHECK: [[ZEXT2:%.*]] = zext [[EXT3]] to +; CHECK: store [[ZEXT2]], ptr [[PTR]], align 1 + %byte1 = load i8, ptr %outsuccess, align 1 + %bool1 = trunc i8 %byte1 to i1 + %testinsertvarying0 = insertvalue { i32, i1 } %old0, i1 %bool1, 1 + %testextract1 = extractvalue { i32, i1 } %testinsertvarying0, 1 + %outbyte1 = zext i1 %testextract1 to i8 + store i8 %outbyte1, ptr %outsuccess, align 1 + + ; Test inserting a varying value into a uniform literal struct +; CHECK: [[INS2:%.*]] = insertvalue { , } poison, [[VBOOL]], 1 +; CHECK: [[EXT4:%.*]] = extractvalue { , } [[INS2]], 1 +; CHECK: [[ZEXT3:%.*]] = zext [[EXT4]] to +; CHECK: store [[ZEXT3]], ptr [[PTR]], align 1 + %testinsertvarying1 = insertvalue { i32, i1 } poison, i1 %bool1, 1 + %testextract2 = extractvalue { i32, i1 } %testinsertvarying1, 1 + %outbyte2 = zext i1 %testextract2 to i8 + store i8 %outbyte2, ptr %outsuccess, align 1 + + ret void +} + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll new file mode 100644 index 0000000000000..315e721aea82d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll @@ -0,0 +1,64 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may 
not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +; Test if the interleaved store is defined correctly +; CHECK: define void @__vecz_b_interleaved_store8_4_u5nxv4du3ptrU3AS1({{( %0)?}}, ptr addrspace(1){{( %1)?}}) +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector %BroadcastAddr.splatinsert, poison, zeroinitializer +; CHECK: %2 = call @llvm.{{(experimental\.)?}}stepvector.nxv4i64() +; CHECK: %3 = mul {{shufflevector \( insertelement \( poison, i64 4, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i64 4\)}}, %2 +; CHECK: %4 = getelementptr double, %BroadcastAddr.splat, %3 +; CHECK: call void @llvm.masked.scatter.nxv4f64.nxv4p1( %0, %4, i32{{( immarg)?}} 8, {{shufflevector \( insertelement \( 
poison, i1 true, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i1 true\)}}) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll new file mode 100644 index 0000000000000..314ee922d7be6 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll @@ -0,0 +1,66 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare void @__mux_work_group_barrier(i32, i32, i32) + +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) + +; Test if the interleaved store is defined correctly +; CHECK: define void 
@__vecz_b_interleaved_store8_4_u5nxv4du3ptrU3AS1( %0, ptr addrspace(1) %1) [[ATTRS:#[0-9]+]] { +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector %BroadcastAddr.splatinsert, poison, zeroinitializer +; CHECK: %2 = call @llvm.{{(experimental\.)?}}stepvector.nxv4i64() +; CHECK: %3 = mul {{shufflevector \( insertelement \( poison, i64 4, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i64 4\)}}, %2 +; CHECK: %4 = getelementptr double, %BroadcastAddr.splat, %3 +; CHECK: call void @llvm.masked.scatter.nxv4f64.nxv4p1( %0, %4, i32 immarg 8, {{shufflevector \( insertelement \( poison, i1 true, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i1 true\)}}) +; CHECK: ret void +; CHECK: } + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll new file mode 100644 index 0000000000000..bec291abbf638 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll @@ -0,0 +1,69 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dont_mask_workitem_builtins -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %conv, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %call2 = call i64 @__mux_get_global_id(i32 0) + %conv3 = trunc i64 %call2 to i32 + %idxprom = sext i32 %conv3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 + %idxprom4 = sext i32 %conv3 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom4 + store i32 %0, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.else: ; preds = %entry + %call8 = call i64 @__mux_get_local_size(i32 0) + %call9 = call i64 @__mux_get_group_id(i32 0) + %mul = mul i64 %call9, %call8 + %add = add i64 %mul, %call + %sext = shl i64 %add, 32 + %idxprom11 = ashr exact i64 %sext, 32 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 42, i32 addrspace(1)* %arrayidx12, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare void @__mux_work_group_barrier(i32, i32, i32) + +declare i64 @__mux_get_local_id(i32) + +declare i64 @__mux_get_global_id(i32) + +declare i64 @__mux_get_local_size(i32) + +declare i64 @__mux_get_group_id(i32) + +; Test if the masked load is defined correctly +; CHECK: define @__vecz_b_masked_load4_u5nxv4ju3ptrU3AS2u5nxv4b(ptr addrspace(2){{( %0)?}}, {{( %1)?}}) +; CHECK: entry: +; CHECK: %2 = call @llvm.masked.load.nxv4i32.p2(ptr addrspace(2) %0, i32{{( immarg)?}} 4, %1, poison) +; CHECK: ret %2 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll new file mode 100644 index 0000000000000..24815c1ae56d1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll @@ -0,0 +1,89 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %rem = urem i64 %call, 3 + %cmp = icmp eq i64 %rem, 0 + br i1 %cmp, label %if.else, label %if.then + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %idxprom = sext i32 %1 to i64 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call + %2 = load i32, i32 addrspace(1)* %arrayidx3, align 4 + %idxprom4 = sext i32 %2 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom4 + store i32 42, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %rem = urem i64 %call, 3 + %cmp = icmp eq i64 %rem, 0 + br i1 %cmp, label %if.else, label %if.then + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a_index, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + store i32 42, i32 addrspace(1)* %arrayidx3, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; Test if the masked scatter store is defined correctly +; CHECK: define void @__vecz_b_masked_scatter_store4_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4b({{( %0)?}}, {{( %1)?}}, {{( %2)?}}) +; CHECK: entry: +; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p1( %0, %1, i32{{( immarg)?}} 4, %2) +; CHECK: ret void + +; Test if the masked gather load is defined correctly +; CHECK: define @__vecz_b_masked_gather_load4_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4b({{( %0)?}}, {{( %1)?}}) +; CHECK: entry: +; CHECK: %2 = call @llvm.masked.gather.nxv4i32.nxv4p1( %0, i32{{( immarg)?}} 4, %1, poison) +; CHECK: ret %2 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll new file mode 100644 index 0000000000000..e151d82fa7981 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll @@ -0,0 +1,182 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; 
Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dummy -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { + ; Dummy uses of the builtins, as we don't define any with zero uses. + %a = call @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j( zeroinitializer) + %b = call @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j( zeroinitializer) + %c = call @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f( zeroinitializer) + %d = call @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f( zeroinitializer) + %e = call @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j( zeroinitializer) + %f = call @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j( zeroinitializer) + %g = call @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j( zeroinitializer) + %h = call @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j( zeroinitializer) + %i = call @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j( zeroinitializer) + %j = call @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f( zeroinitializer) + %k = call @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f( zeroinitializer) + %l = call @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f( zeroinitializer) + %m = call @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f( zeroinitializer) + ret void +} + +declare @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j({{.*}}) { +; CHECK: entry: +; CHECK: %[[SHUFFLE_ALLOC:.+]] = alloca +; CHECK: %[[STEP:.+]] = call @llvm.{{(experimental\.)?}}stepvector.nxv4i32() +; CHECK: %[[SCALE:.+]] = call i32 @llvm.vscale.i32() +; CHECK: %[[SIZE:.+]] = mul {{(nuw )?}}i32 %[[SCALE]], 4 +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ] +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ] +; CHECK: %[[MASKPHI:.+]] = phi [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ] +; CHECK: %[[N_INS:.+]] = insertelement poison, i32 %[[IV]], {{i32|i64}} 0 +; CHECK: %[[N_SPLAT:.+]] = shufflevector %[[N_INS]], poison, zeroinitializer +; CHECK: %[[MASK:.+]] = xor %[[MASKPHI]], %[[N_SPLAT]] + +;------- target-dependent dynamic shuffle code: +; CHECK: store %[[VEC]], {{(\*)|(ptr)}} %[[SHUFFLE_ALLOC]] +;------- there will be a bitcast here if pointers are typed +; CHECK: %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, %[[MASK]] +; CHECK: %[[SHUFFLE:.+]] = call @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}( %[[INDEX]], i32 4, {{shufflevector \( insertelement \( poison, i1 true, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i1 true\)}}, poison) + +; CHECK: %[[ACCUM:.+]] = add %[[VEC]], %{{.+}} +; CHECK: %[[BIT:.+]] = and %[[MASKPHI]], %[[N_SPLAT]] +; CHECK: %[[WHICH:.+]] = icmp ne %[[BIT]], zeroinitializer +; CHECK: %[[NEWVEC]] = select %[[WHICH]], %[[ACCUM]], 
%[[VEC]] +; CHECK: %[[NEWMASK]] = or %[[MASK]], %[[N_SPLAT]] +; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1 +; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %[[SIZE]] +; CHECK: br i1 %[[CMP]], label %loop, label %exit +; CHECK: exit: +; CHECK: %[[RESULT:.+]] = phi [ %[[NEWVEC]], %loop ] +; CHECK: ret %[[RESULT]] +; CHECK: } + +declare @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j({{.*}}) { +; CHECK: entry: +; CHECK: %[[SHUFFLE_ALLOC:.+]] = alloca +; CHECK: %[[STEP:.+]] = call @llvm.{{(experimental\.)?}}stepvector.nxv4i32() +; CHECK: %[[SCALE:.+]] = call i32 @llvm.vscale.i32() +; CHECK: %[[SIZE:.+]] = mul {{(nuw )?}}i32 %[[SCALE]], 4 +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ] +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ] +; CHECK: %[[MASKPHI:.+]] = phi [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ] +; CHECK: %[[N_INS:.+]] = insertelement poison, i32 %[[IV]], {{i32|i64}} 0 +; CHECK: %[[N_SPLAT:.+]] = shufflevector %[[N_INS]], poison, zeroinitializer +; CHECK: %[[MASK:.+]] = xor %[[MASKPHI]], %[[N_SPLAT]] + +;------- target-dependent dynamic shuffle code: +; CHECK: store %[[VEC]], {{(\*)|(ptr)}} %[[SHUFFLE_ALLOC]] +;------- there will be a bitcast here if pointers are typed +; CHECK: %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, %[[MASK]] +; CHECK: %[[SHUFFLE:.+]] = call @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}( %[[INDEX]], i32 4, {{shufflevector \( insertelement \( poison, i1 true, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i1 true\)}}, poison) + +; CHECK: %[[ACCUM:.+]] = add %[[VEC]], %{{.+}} +; CHECK: %[[BIT:.+]] = and %[[MASKPHI]], %[[N_SPLAT]] +; CHECK: %[[WHICH:.+]] = icmp ne %[[BIT]], zeroinitializer +; CHECK: %[[NEWVEC]] = select %[[WHICH]], %[[ACCUM]], %[[VEC]] +; CHECK: %[[NEWMASK]] = or %[[MASK]], %[[N_SPLAT]] +; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1 +; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %[[SIZE]] +; CHECK: br i1 %[[CMP]], label %loop, label %exit +; CHECK: exit: +; CHECK: %[[SCAN:.+]] = phi [ %[[NEWVEC]], %loop ] + +;------- target-dependent slide-up code: +; CHECK: %[[SLIDE:.+]] = call @llvm{{(\.experimental)?}}.vector.splice.nxv4i32( poison, %[[SCAN]], i32 -1) +; CHECK: %[[RESULT:.+]] = insertelement %[[SLIDE]], i32 0, {{i32|i64}} 0 + +; CHECK: ret %[[RESULT]] +; CHECK: } + +; We know the generated code is correct for one scan type, +; now verify that all the others use the correct binary operations. 
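+; In outline, each scan builtin above lowers to a log2(VL)-step loop (here
+; VL = vscale * 4): at step n = 1, 2, 4, ... every lane whose index has bit n
+; set reads the last lane of the preceding block through the stack slot and
+; accumulates it. A rough scalar sketch of the checked structure (a sketch
+; only, not extra test input; `op` stands for the scan operation):
+;   for (n = 1; n < VL; n <<= 1)
+;     if (lane & n) vec[lane] = op(vec[lane], vec[(lane | (n - 1)) ^ n]);
+; The exclusive variants then splice the result up by one lane and seed lane 0
+; with the identity, as the vector.splice/insertelement checks above show.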
+ +declare @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = fadd %[[VEC]], %{{.+}} + +declare @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = fadd %[[VEC]], %{{.+}} + +declare @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.smin.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.smin.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.smax.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.umin.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.umax.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.minnum.nxv4f32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.maxnum.nxv4f32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.minnum.nxv4f32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.maxnum.nxv4f32( %[[VEC]], %{{.+}}) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll new file mode 100644 index 0000000000000..cdd9ef6de8e02 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll @@ -0,0 +1,186 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dummy -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { + ; Dummy uses of the builtins, as we don't define any with zero uses. + %a = call @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj( zeroinitializer, i32 0) + %b = call @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj( zeroinitializer, i32 0) + %c = call @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj( zeroinitializer, i32 0) + %d = call @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj( zeroinitializer, i32 0) + %e = call @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj( zeroinitializer, i32 0) + %f = call @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj( zeroinitializer, i32 0) + %g = call @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj( zeroinitializer, i32 0) + %h = call @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj( zeroinitializer, i32 0) + %i = call @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj( zeroinitializer, i32 0) + %j = call @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj( zeroinitializer, i32 0) + %k = call @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj( zeroinitializer, i32 0) + %l = call @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj( zeroinitializer, i32 0) + %m = call @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj( zeroinitializer, i32 0) + ret void +} + +declare @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj({{.*}}, i32{{.*}}) { +; CHECK: entry: +; CHECK: %[[SHUFFLE_ALLOC:.+]] = alloca +; CHECK: %[[STEP:.+]] = call @llvm.{{(experimental\.)?}}stepvector.nxv4i32() +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ] +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ] +; CHECK: %[[MASKPHI:.+]] = phi [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ] +; CHECK: %[[N_INS:.+]] = insertelement poison, i32 %[[IV]], {{i32|i64}} 0 +; CHECK: %[[N_SPLAT:.+]] = shufflevector %[[N_INS]], poison, zeroinitializer +; CHECK: %[[MASK:.+]] = xor %[[MASKPHI]], %[[N_SPLAT]] + +;------- target-dependent dynamic shuffle code: +; CHECK: store %[[VEC]], {{(\*)|(ptr)}} %[[SHUFFLE_ALLOC]] +;------- there will be a bitcast here if pointers are typed +; CHECK: %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, %[[MASK]] +; CHECK: %[[VLSTEP:.+]] = call @llvm.{{(experimental\.)?}}stepvector.nxv4i32() +; CHECK: %[[VLINS:.+]] = insertelement poison, i32 %1, {{i32|i64}} 0 +; CHECK: %[[VLSPLAT:.+]] = shufflevector %[[VLINS]], poison, zeroinitializer +; CHECK: %[[VLMASK:.+]] = icmp ult %3, %[[VLSPLAT]] +; CHECK: %[[SHUFFLE:.+]] = call @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}( %[[INDEX]], i32 4, %[[VLMASK]], poison) + +; CHECK: %[[ACCUM:.+]] = add %[[VEC]], %[[SHUFFLE]] +; CHECK: 
%[[BIT:.+]] = and %[[MASKPHI]], %[[N_SPLAT]] +; CHECK: %[[WHICH:.+]] = icmp ne %[[BIT]], zeroinitializer +; CHECK: %[[NEWVEC]] = select %[[WHICH]], %[[ACCUM]], %[[VEC]] +; CHECK: %[[NEWMASK]] = or %[[MASK]], %[[N_SPLAT]] +; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1 +; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %1 +; CHECK: br i1 %[[CMP]], label %loop, label %exit +; CHECK: exit: +; CHECK: %[[RESULT:.+]] = phi [ %[[NEWVEC]], %loop ] +; CHECK: ret %[[RESULT]] +; CHECK: } + +declare @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj({{.*}}, i32{{.*}}) { +; CHECK: entry: +; CHECK: %[[SHUFFLE_ALLOC:.+]] = alloca +; CHECK: %[[STEP:.+]] = call @llvm.{{(experimental\.)?}}stepvector.nxv4i32() +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ] +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ] +; CHECK: %[[MASKPHI:.+]] = phi [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ] +; CHECK: %[[N_INS:.+]] = insertelement poison, i32 %[[IV]], {{i32|i64}} 0 +; CHECK: %[[N_SPLAT:.+]] = shufflevector %[[N_INS]], poison, zeroinitializer +; CHECK: %[[MASK:.+]] = xor %[[MASKPHI]], %[[N_SPLAT]] + +;------- target-dependent dynamic shuffle code: +; CHECK: store %[[VEC]], {{(\*)|(ptr)}} %[[SHUFFLE_ALLOC]] +;------- there will be a bitcast here if pointers are typed +; CHECK: %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, %[[MASK]] +; CHECK: %[[VLSTEP:.+]] = call @llvm.{{(experimental\.)?}}stepvector.nxv4i32() +; CHECK: %[[VLINS:.+]] = insertelement poison, i32 %1, {{i32|i64}} 0 +; CHECK: %[[VLSPLAT:.+]] = shufflevector %[[VLINS]], poison, zeroinitializer +; CHECK: %[[VLMASK:.+]] = icmp ult %3, %[[VLSPLAT]] +; CHECK: %[[SHUFFLE:.+]] = call @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}( %[[INDEX]], i32 4, %[[VLMASK]], poison) + +; CHECK: %[[ACCUM:.+]] = add %[[VEC]], %{{.+}} +; CHECK: %[[BIT:.+]] = and %[[MASKPHI]], %[[N_SPLAT]] +; CHECK: %[[WHICH:.+]] = icmp ne %[[BIT]], zeroinitializer +; CHECK: %[[NEWVEC]] = select %[[WHICH]], %[[ACCUM]], %[[VEC]] +; CHECK: %[[NEWMASK]] = or %[[MASK]], %[[N_SPLAT]] +; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1 +; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %1 +; CHECK: br i1 %[[CMP]], label %loop, label %exit +; CHECK: exit: +; CHECK: %[[SCAN:.+]] = phi [ %[[NEWVEC]], %loop ] + +;------- target-dependent slide-up code: +; CHECK: %[[SLIDE:.+]] = call @llvm{{(\.experimental)?}}.vector.splice.nxv4i32( poison, %[[SCAN]], i32 -1) +; CHECK: %[[RESULT:.+]] = insertelement %[[SLIDE]], i32 0, {{i32|i64}} 0 + +; CHECK: ret %[[RESULT]] +; CHECK: } + +; We know the generated code is correct for one scan type, +; now verify that all the others use the correct binary operations. 
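+; The VP scans check the same loop structure as the non-VP versions; the
+; difference verified here is that the trip count and the gather mask are
+; derived from the i32 vector-length argument %1 rather than from vscale * 4,
+; roughly (a sketch, not extra test input):
+;   for (n = 1; n < %1; n <<= 1) ... gather masked by (step < %1) ...
+; so the in-loop shuffles never read lanes at or beyond the active length.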
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
new file mode 100644
index 0000000000000..28785e54202a0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
@@ -0,0 +1,144 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k extract_element -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE
+; RUN: veczc -k extract_element_uniform -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-UNI
+; RUN: veczc -k extract_element_uniform_vec -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-UNI-VEC
+; RUN: veczc -k extract_element_varying_indices -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-INDICES
+; RUN: veczc -k extract_element_bool -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-BOOL
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @extract_element(<4 x float> addrspace(1)* nocapture readonly %in, i32 %idx, float addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+ %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+ %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+ %2 = extractelement <4 x float> %1, i32 %idx
+ %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %2, float addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+define spir_kernel void @extract_element_uniform(<4 x float> %in, i32 %idx, float addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %0 = extractelement <4 x float> %in, i32 %idx
+ %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %0, float addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+define spir_kernel void @extract_element_uniform_vec(<4 x float> %in, float addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %i = urem i64 %call, 4
+ %0 = extractelement <4 x float> %in, i64 %i
+ %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %0, float addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+define spir_kernel void @extract_element_varying_indices(<4 x float> addrspace(1)* %in, i32 addrspace(1)* %idxs, float addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0)
+ %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call
+ %idx = load i32, i32 addrspace(1)* %arrayidxidx
+ %i = urem i32 %idx, 4
+ %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+ %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx
+ %1 = extractelement <4 x float> %0, i32 %i
+ %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %1, float addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+define spir_kernel void @extract_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %idx, i32 addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call
+ %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call
+ %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4
+ %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxb, align 4
+ %2 = icmp slt <4 x i32> %0, %1
+ %i = urem i64 %call, 4
+ %3 = extractelement <4 x i1> %2, i64 %i
+ %4 = sext i1 %3 to i32
+ %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+ store i32 %4, i32 addrspace(1)* %arrayidx3, align 4
+ %5 = sext <4 x i1> %2 to <4 x i32>
+ %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out2, i64 %call
+ store <4 x i32> %5, <4 x i32> addrspace(1)* %arrayidx4, align 4
+ ret void
+}
+
+
+; EE-LABEL: @__vecz_nxv4_extract_element(
+; EE: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
+; EE: store {{.*}}, ptr [[ALLOC]], align 64
+; EE: [[IDX:%.*]] = sext i32 %idx to i64
+; EE: [[ADDR:%.*]] = getelementptr float, ptr [[ALLOC]], i64 [[IDX]]
+; EE: [[GATHER:%.*]] = call <vscale x 4 x float> @__vecz_b_interleaved_load4_4_u5nxv4fu3ptr(ptr [[ADDR]])
+
+; Both the vector and index are uniform, so check we're not unnecessarily packetizing
+
+; EE-UNI-LABEL: @__vecz_nxv4_extract_element_uniform(
+; EE-UNI: [[T0:%.*]] = extractelement <4 x float> %in, i32 %idx
+; EE-UNI: [[T1:%.*]] = insertelement <vscale x 4 x float> poison, float [[T0]], {{(i32|i64)}} 0
+; EE-UNI: [[T2:%.*]] = shufflevector <vscale x 4 x float> [[T1]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; EE-UNI: store <vscale x 4 x float> [[T2]], ptr addrspace(1) {{%.*}}, align 4
+
+; The vector is uniform and the index is varying, so we must broadcast the vector
+; FIXME: Do we really need to broadcast? Can we mod the indices with the original vector length?
+
+; EE-UNI-VEC-LABEL: @__vecz_nxv4_extract_element_uniform_vec(
+; EE-UNI-VEC: [[T3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 %call, {{(i32|i64)}} 0
+; EE-UNI-VEC: [[T4:%.*]] = shufflevector <vscale x 4 x i64> [[T3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EE-UNI-VEC: [[STEP:%.*]] = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
+; EE-UNI-VEC: [[T5:%.*]] = add <vscale x 4 x i64> [[T4]], [[STEP]]
+; EE-UNI-VEC: [[MOD:%.*]] = and <vscale x 4 x i64> [[T5]], {{shufflevector \(<vscale x 4 x i64> insertelement \(<vscale x 4 x i64> poison, i64 3, (i32|i64) 0\), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i64 3\)}}
+; EE-UNI-VEC: [[T6:%.*]] = shl <vscale x 4 x i64> [[STEP]], {{shufflevector \(<vscale x 4 x i64> insertelement \(<vscale x 4 x i64> poison, i64 2, (i32|i64) 0\), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i64 2\)}}
+
+; LLVM 16 deduces add/or equivalence and uses `or` instead.
+; EE-UNI-VEC: [[T7:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i64> [[T6]], [[MOD]]
+
+; EE-UNI-VEC: [[T8:%.*]] = getelementptr float, ptr {{%.*}}, <vscale x 4 x i64> [[T7]]
+; EE-UNI-VEC: [[T9:%.*]] = call <vscale x 4 x float> @__vecz_b_gather_load4_u5nxv4fu9nxv4u3ptr(<vscale x 4 x ptr> [[T8]])
+; EE-UNI-VEC: store <vscale x 4 x float> [[T9]], ptr addrspace(1) {{%.*}}, align 4
+
+; EE-INDICES-LABEL: @__vecz_nxv4_extract_element_varying_indices(
+; EE-INDICES: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
+; EE-INDICES: [[T0:%.*]] = getelementptr i32, ptr addrspace(1) %idxs, i64 %call
+; EE-INDICES: [[T2:%.*]] = load <vscale x 4 x i32>, ptr addrspace(1) [[T0]], align 4
+; EE-INDICES: [[T3:%.*]] = and <vscale x 4 x i32> [[T2]], {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> poison, i32 3, (i32|i64) 0\), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i32 3\)}}
+; EE-INDICES: store {{.*}}, ptr [[ALLOC]], align 64
+; EE-INDICES: [[STEP:%.*]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
+; EE-INDICES: [[T4:%.*]] = shl <vscale x 4 x i32> [[STEP]], {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> poison, i32 2, (i32|i64) 0\), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i32 2\)}}
+; EE-INDICES: [[T5:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i32> [[T4]], [[T3]]
+; EE-INDICES: [[IDX:%.*]] = sext <vscale x 4 x i32> [[T5]] to <vscale x 4 x i64>
+; EE-INDICES: [[ADDR:%.*]] = getelementptr float, ptr [[ALLOC]], <vscale x 4 x i64> [[IDX]]
+; EE-INDICES: [[GATHER:%.*]] = call <vscale x 4 x float> @__vecz_b_gather_load4_u5nxv4fu9nxv4u3ptr(<vscale x 4 x ptr> [[ADDR]])
+
+; Check we promote from i1 to i8 before doing our memops
+; EE-BOOL-LABEL: @__vecz_nxv4_extract_element_bool(
+; EE-BOOL: [[T0:%.*]] = sext <vscale x 16 x i1> {{%.*}} to <vscale x 16 x i8>
+; EE-BOOL: store {{.*}}
+; EE-BOOL: [[T1:%.*]] = call <vscale x 4 x i8> @__vecz_b_gather_load1_u5nxv4hu9nxv4u3ptr(<vscale x 4 x ptr> {{%.*}}
+; EE-BOOL: [[T2:%.*]] = trunc <vscale x 4 x i8> [[T1]] to <vscale x 4 x i1>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
new file mode 100644
index 0000000000000..023a617b6e2bc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k fadd -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @fadd(float* %aptr, float* %bptr, float* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds float, float* %bptr, i64 %idx
+ %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
+ %a = load float, float* %arrayidxa, align 4
+ %b = load float, float* %arrayidxb, align 4
+ %sum = fadd float %a, %b
+ store float %sum, float* %arrayidxz, align 4
+ ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_fadd
+; CHECK: load <vscale x 4 x float>, ptr
+; CHECK: load <vscale x 4 x float>, ptr
+; CHECK: fadd <vscale x 4 x float>
+; CHECK: store <vscale x 4 x float>
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
new file mode 100644
index 0000000000000..9528dd86c8a77
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
@@ -0,0 +1,37 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: not veczc -k fail_builtins -vecz-scalable -vecz-simd-width=4 -S < %s 2>&1 | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @fail_builtins(float* %aptr, float* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
+ %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
+ %a = load float, float* %arrayidxa, align 4
+ %math = call spir_func float @_Z4tanff(float %a)
+ store float %math, float* %arrayidxz, align 4
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+declare spir_func float @_Z4tanff(float)
+
+; We can't scalarize this builtin call
+; CHECK: Error: Failed to vectorize function 'fail_builtins'
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
new file mode 100644
index 0000000000000..107603f898c7e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
@@ -0,0 +1,120 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k insert_element -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE
+; RUN: veczc -k insert_element_uniform -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-UNI
+; RUN: veczc -k insert_element_varying_indices -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-INDICES
+; RUN: veczc -k insert_element_bool -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-BOOL
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @insert_element(<4 x float> addrspace(1)* nocapture readonly %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+ %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+ %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+ %2 = insertelement <4 x float> %1, float %val, i32 %idx
+ %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+ store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+define spir_kernel void @insert_element_uniform(<4 x float> %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %0 = insertelement <4 x float> %in, float %val, i32 %idx
+ %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+ store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+define spir_kernel void @insert_element_varying_indices(<4 x float> addrspace(1)* nocapture readonly %in, i32 addrspace(1)* %idxs, <4 x float> addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call
+ %idx = load i32, i32 addrspace(1)* %arrayidxidx
+ %i = urem i32 %idx, 4
+ %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+ %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+ %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+ %fidx = uitofp i64 %call to float
+ %2 = insertelement <4 x float> %1, float %fidx, i32 %i
+ %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+ store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+define spir_kernel void @insert_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %val, i32 %idx, <4 x i32> addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call
+ %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call
+ %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4
+ %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxb, align 4
+ %2 = icmp slt <4 x i32> %0, %1
+ %i = urem i64 %call, 4
+ %v = trunc i32 %val to i1
+ %3 = insertelement <4 x i1> %2, i1 %v, i64 %i
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+ store <4 x i32> %4, <4 x i32> addrspace(1)* %arrayidx4, align 4
+ ret void
+}
+
+; IE-LABEL: @__vecz_nxv4_insert_element(
+; IE: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
+; IE: [[VAL0:%.*]] = insertelement <vscale x 4 x float> poison, float %val, {{(i32|i64)}} 0
+; IE: [[VAL1:%.*]] = shufflevector <vscale x 4 x float> [[VAL0]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; IE: store {{.*}}, ptr [[ALLOC]], align 64
+; IE: [[IDX:%.*]] = sext i32 %idx to i64
+; IE: [[ADDR:%.*]] = getelementptr float, ptr [[ALLOC]], i64 [[IDX]]
+; IE: call void @__vecz_b_interleaved_store4_4_u5nxv4fu3ptr(<vscale x 4 x float> [[VAL1]], ptr [[ADDR]])
+; IE: = load <vscale x 16 x float>, ptr [[ALLOC]], align 64
+
+; Both the vector and index are uniform, so check we're not unnecessarily packetizing
+
+; IE-UNI-LABEL: @__vecz_nxv4_insert_element_uniform(
+; IE-UNI: {{%.*}} = insertelement <4 x float> %in, float %val, {{(i32|i64)}} %idx
+
+; IE-INDICES-LABEL: @__vecz_nxv4_insert_element_varying_indices(
+; IE-INDICES: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
+; IE-INDICES: [[VAL:%.*]] = uitofp <vscale x 4 x i64> {{%.*}} to <vscale x 4 x float>
+; IE-INDICES: store <vscale x 16 x float> {{%.*}}, ptr [[ALLOC]], align 64
+; IE-INDICES: [[T1:%.*]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
+; IE-INDICES: [[T2:%.*]] = shl <vscale x 4 x i32> [[T1]], {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> poison, i32 2, (i32|i64) 0\), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i32 2\)}}
+
+; LLVM 16 deduces add/or equivalence and uses `or` instead.
+; IE-INDICES: [[T3:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i32> [[T2]], {{%.*}}
+
+; IE-INDICES: [[T4:%.*]] = sext <vscale x 4 x i32> [[T3]] to <vscale x 4 x i64>
+; IE-INDICES: [[ADDR:%.*]] = getelementptr float, ptr %0, <vscale x 4 x i64> [[T4]]
+; IE-INDICES: call void @__vecz_b_scatter_store4_u5nxv4fu9nxv4u3ptr(<vscale x 4 x float> [[VAL]], <vscale x 4 x ptr> [[ADDR]])
+; IE-INDICES: = load <vscale x 16 x float>, ptr [[ALLOC]], align 64
+
+; Check we promote from i1 to i8 before doing our memops
+; IE-BOOL-LABEL: @__vecz_nxv4_insert_element_bool(
+; IE-BOOL: [[ALLOC:%.*]] = alloca <vscale x 16 x i8>, align 16
+; IE-BOOL-DAG: [[T0:%.*]] = sext <vscale x 4 x i1> {{%.*}} to <vscale x 4 x i8>
+; IE-BOOL-DAG: [[T1:%.*]] = sext <vscale x 16 x i1> {{%.*}} to <vscale x 16 x i8>
+; IE-BOOL: store <vscale x 16 x i8> [[T1]], ptr [[ALLOC]], align 16
+; IE-BOOL: call void @__vecz_b_scatter_store1_u5nxv4hu9nxv4u3ptr(<vscale x 4 x i8> [[T0]], <vscale x 4 x ptr> {{%.*}})
+; IE-BOOL: [[T2:%.*]] = load <vscale x 16 x i8>, ptr [[ALLOC]], align 16
+; IE-BOOL: [[T3:%.*]] = trunc <vscale x 16 x i8> [[T2]] to <vscale x 16 x i1>
+
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
new file mode 100644
index 0000000000000..212adee0fff9f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
@@ -0,0 +1,60 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k load_interleaved -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @load_interleaved(i32 addrspace(1)* nocapture readonly %input, i32 addrspace(1)* nocapture %output, i32 %stride) local_unnamed_addr {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #2
+ %0 = trunc i64 %call to i32
+ %conv1 = mul i32 %0, %stride
+ %idxprom = sext i32 %conv1 to i64
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %idxprom
+ %1 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %idxprom
+ store i32 %1, i32 addrspace(1)* %arrayidx3, align 4
+ %add = add nsw i32 %conv1, 1
+ %idxprom4 = sext i32 %add to i64
+ %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %idxprom4
+ store i32 1, i32 addrspace(1)* %arrayidx5, align 4
+ %add6 = add nsw i32 %conv1, 2
+ %idxprom7 = sext i32 %add6 to i64
+ %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %idxprom7
+ store i32 1, i32 addrspace(1)* %arrayidx8, align 4
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK: define void @__vecz_b_interleaved_store4_V_u5nxv4ju3ptrU3AS1(<vscale x 4 x i32> [[ARG0:%.*]], ptr addrspace(1) [[ARG1:%.*]], i64 [[ARG2:%.*]]) [[ATTRS:#[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) [[ARG1]], {{i32|i64}} 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <vscale x 4 x ptr addrspace(1)> [[TMP0]], <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[ARG2]], {{i32|i64}} 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <vscale x 4 x i64> [[TMP2]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i64> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <vscale x 4 x ptr addrspace(1)> [[TMP1]], <vscale x 4 x i64> [[TMP5]]
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> [[ARG0]], <vscale x 4 x ptr addrspace(1)> [[TMP6]], i32 immarg 4, <vscale x 4 x i1> {{shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\)}}) [[MASKED_ATTRS:#[0-9]+]]
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+
+; CHECK-DAG: attributes [[ATTRS]] = { norecurse nounwind }
+; CHECK-DAG: attributes [[MASKED_ATTRS]] = {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
new file mode 100644
index 0000000000000..021b103ac4e56
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
@@ -0,0 +1,195 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k ctpop -vecz-scalable -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix CTPOP
+; RUN: veczc -k ctlz -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix CTLZ
+; RUN: veczc -k cttz -vecz-scalable -vecz-simd-width=8 -S < %s | FileCheck %s --check-prefix CTTZ
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @ctpop(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+ %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+ %ctpopi32 = call i32 @llvm.ctpop.i32(i32 %a)
+ %ctpopv2i8 = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %b)
+ store i32 %ctpopi32, i32* %arrayidxy, align 4
+ store <2 x i8> %ctpopv2i8, <2 x i8>* %arrayidxz, align 2
+ ret void
+}
+
+define spir_kernel void @ctlz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+ %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+ %ctlzi32 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+ %ctlzv2i8 = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %b, i1 false)
+ store i32 %ctlzi32, i32* %arrayidxy, align 4
+ store <2 x i8> %ctlzv2i8, <2 x i8>* %arrayidxz, align 2
+ ret void
+}
+
+define spir_kernel void @cttz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+ %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+ %cttzi32 = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+ %cttzv2i8 = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %b, i1 false)
+ store i32 %cttzi32, i32* %arrayidxy, align 4
+ store <2 x i8> %cttzv2i8, <2 x i8>* %arrayidxz, align 2
+ ret void
+}
+
+define spir_kernel void @sadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %y = load i32, i32* %arrayidxy, align 4
+ %v_i32 = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %y)
+ %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+ %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+ %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+ %v_v2i8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+ store i32 %v_i32, i32* %arrayidxy, align 4
+ store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+ ret void
+}
+
+define spir_kernel void @uadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %y = load i32, i32* %arrayidxy, align 4
+ %v_i32 = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %y)
+ %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+ %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+ %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+ %v_v2i8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+ store i32 %v_i32, i32* %arrayidxy, align 4
+ store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+ ret void
+}
+
+define spir_kernel void @ssub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %y = load i32, i32* %arrayidxy, align 4
+ %v_i32 = call i32 @llvm.ssub.sat.i32(i32 %a, i32 %y)
+ %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+ %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+ %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+ %v_v2i8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+ store i32 %v_i32, i32* %arrayidxy, align 4
+ store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+ ret void
+}
+
+define spir_kernel void @usub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %y = load i32, i32* %arrayidxy, align 4
+ %v_i32 = call i32 @llvm.usub.sat.i32(i32 %a, i32 %y)
+ %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+ %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+ %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+ %v_v2i8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+ store i32 %v_i32, i32* %arrayidxy, align 4
+ store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+ ret void
+}
+
+declare i32 @llvm.ctpop.i32(i32)
+declare <2 x i8> @llvm.ctpop.v2i8(<2 x i8>)
+
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1)
+
+declare i32 @llvm.cttz.i32(i32, i1)
+declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1)
+
+declare i32 @llvm.sadd.sat.i32(i32, i32)
+declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i32 @llvm.uadd.sat.i32(i32, i32)
+declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i32 @llvm.ssub.sat.i32(i32, i32)
+declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i32 @llvm.usub.sat.i32(i32, i32)
+declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i64 @__mux_get_global_id(i32)
+
+; CTPOP: void @__vecz_nxv2_ctpop
+; CTPOP: = call {{.*}} @llvm.ctpop.nxv2i32(<vscale x 2 x i32> %{{.*}})
+; CTPOP: = call {{.*}} @llvm.ctpop.nxv4i8(<vscale x 4 x i8> %{{.*}})
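+; Note on the widths in these checks: at a scalable factor of nxv2 a scalar
+; i32 operand packetizes to <vscale x 2 x i32>, while a <2 x i8> operand is
+; widened by the same factor to <vscale x 4 x i8> (two i8 elements per
+; work-item); the nxv4 and nxv8 runs below scale the element counts likewise.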
+
+; CTLZ: void @__vecz_nxv4_ctlz
+; ... but it does widen ctlz
+; CTLZ: = call {{.*}} @llvm.ctlz.nxv4i32(<vscale x 4 x i32> %{{.*}}, i1 false)
+; CTLZ: = call {{.*}} @llvm.ctlz.nxv8i8(<vscale x 8 x i8> %{{.*}}, i1 false)
+
+; CTTZ: void @__vecz_nxv8_cttz
+; ... and cttz
+; CTTZ: = call {{.*}} @llvm.cttz.nxv8i32(<vscale x 8 x i32> %{{.*}}, i1 false)
+; CTTZ: = call {{.*}} @llvm.cttz.nxv16i8(<vscale x 16 x i8> %{{.*}}, i1 false)
+
+; SADD_SAT: void @__vecz_nxv2_sadd_sat
+; SADD_SAT: = call <vscale x 2 x i32> @llvm.sadd.sat.nxv2i32(
+; SADD_SAT: = call <vscale x 4 x i8> @llvm.sadd.sat.nxv4i8(
+
+; UADD_SAT: void @__vecz_nxv2_uadd_sat
+; UADD_SAT: = call <vscale x 2 x i32> @llvm.uadd.sat.nxv2i32(
+; UADD_SAT: = call <vscale x 4 x i8> @llvm.uadd.sat.nxv4i8(
+
+; SSUB_SAT: void @__vecz_nxv2_ssub_sat
+; SSUB_SAT: = call <vscale x 2 x i32> @llvm.ssub.sat.nxv2i32(
+; SSUB_SAT: = call <vscale x 4 x i8> @llvm.ssub.sat.nxv4i8(
+
+; USUB_SAT: void @__vecz_nxv2_usub_sat
+; USUB_SAT: = call <vscale x 2 x i32> @llvm.usub.sat.nxv2i32(
+; USUB_SAT: = call <vscale x 4 x i8> @llvm.usub.sat.nxv4i8(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
new file mode 100644
index 0000000000000..d44cbf1bf4a12
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k load_add_store -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load_add_store(i32* %aptr, i32* %bptr, i32* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
+ %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load i32, i32* %arrayidxb, align 4
+ %sum = add i32 %a, %b
+ store i32 %sum, i32* %arrayidxz, align 4
+ ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_load_add_store
+; CHECK: [[lhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
+; CHECK: [[rhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
+; CHECK: [[sum:%[0-9a-z]+]] = add <vscale x 4 x i32> [[lhs]], [[rhs]]
+; CHECK: store <vscale x 4 x i32> [[sum]],
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
new file mode 100644
index 0000000000000..a3026450fd767
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
@@ -0,0 +1,47 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k load_binops_store -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load_binops_store(i32* %aptr, i32* %bptr, i32* %cptr, i32* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
+ %arrayidxc = getelementptr inbounds i32, i32* %cptr, i64 %idx
+ %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load i32, i32* %arrayidxb, align 4
+ %c = load i32, i32* %arrayidxc, align 4
+ %sum = add i32 %a, %b
+ %mpy = mul i32 %sum, %c
+ %shf = ashr i32 %mpy, 3
+ %dvu = udiv i32 %shf, %sum
+ store i32 %dvu, i32* %arrayidxz, align 4
+ ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_load_binops_store
+; CHECK: load <vscale x 4 x i32>, ptr
+; CHECK: load <vscale x 4 x i32>, ptr
+; CHECK: add <vscale x 4 x i32>
+; CHECK: mul <vscale x 4 x i32>
+; CHECK: ashr <vscale x 4 x i32>
+; CHECK: store <vscale x 4 x i32>
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
new file mode 100644
index 0000000000000..e97fd6da75ffe
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
@@ -0,0 +1,41 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test -vecz-scalable -vecz-simd-width=8 -S < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %load = load i32, i32 addrspace(1)* %in
+ %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idx
+ store i32 %load, i32 addrspace(1)* %slot
+ ret void
+}
+
+; CHECK: define spir_kernel void @test(ptr addrspace(1) %in) !codeplay_ca_vecz.base !0
+; CHECK: define spir_kernel void @__vecz_nxv8_test(ptr addrspace(1) %in) #0 !codeplay_ca_vecz.derived !2
+
+; CHECK: attributes #0 = { "mux-base-fn-name"="__vecz_nxv8_test" }
+
+; CHECK: !0 = !{!1, ptr @__vecz_nxv8_test}
+
+; CHECK: !1 = !{i32 8, i32 1, i32 0, i32 0}
+; CHECK: !2 = !{!1, ptr @test}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
new file mode 100644
index 0000000000000..fbadfebf05d4f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
@@ -0,0 +1,55 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k mask_varying -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; A kernel which should produce a uniform masked vector load where the mask is
+; a single varying splatted bit.
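+; Since an i1 mask cannot be loaded or stored directly, the CHECK lines below
+; expect the packetizer to extend it to i8 around the memory operation and
+; truncate back to i1 before feeding the masked store.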
+define spir_kernel void @mask_varying(<4 x i32>* %aptr, <4 x i32>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %mod_idx = urem i64 %idx, 2
+ %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx
+ %ins = insertelement <4 x i1> poison, i1 true, i32 0
+ %cmp = icmp slt i64 %idx, 64
+ br i1 %cmp, label %if.then, label %if.end
+if.then:
+ %v = load <4 x i32>, <4 x i32>* %aptr
+ %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+ store <4 x i32> %v, <4 x i32>* %arrayidxz, align 16
+ br label %if.end
+if.end:
+ ret void
+; CHECK: define spir_kernel void @__vecz_nxv4_mask_varying
+; CHECK: [[idx0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
+; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 2, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2\)}}
+
+; Note that since we just did a lshr 2 on the input of the extend, it doesn't
+; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
+; CHECK: [[idx2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[idx1]] to <vscale x 16 x i64>
+
+; CHECK: [[t1:%.*]] = getelementptr i8, ptr {{.*}}, <vscale x 16 x i64> [[idx2]]
+; CHECK: [[t2:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[t1]],
+; CHECK: [[splat:%.*]] = trunc <vscale x 16 x i8> [[t2]] to <vscale x 16 x i1>
+; CHECK: call void @__vecz_b_masked_store16_u6nxv16ju3ptru6nxv16b(<vscale x 16 x i32> {{.*}}, ptr %arrayidxz, <vscale x 16 x i1> [[splat]])
+
+}
+
+declare i64 @__mux_get_global_id(i32)
+declare <4 x i32> @__vecz_b_masked_load4_Dv4_jPDv4_jDv4_b(<4 x i32>*, <4 x i1>)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
new file mode 100644
index 0000000000000..c6e25c5f327e1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
@@ -0,0 +1,37 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k cast -vecz-scalable -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @cast(i32* %aptr, float* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %c = sitofp i32 %a to float
+ store float %c, float* %arrayidxz, align 4
+ ret void
+}
+
+; Check that passing -vecz-scalable with no width automatically chooses an
+; appropriate scalable vectorization factor.
+; CHECK: define spir_kernel void @__vecz_nxv[[VF:[0-9]+]]_cast
+; CHECK: sitofp <vscale x [[VF]] x i32> {{%[0-9]+}} to <vscale x [[VF]] x float>
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
new file mode 100644
index 0000000000000..55d888bf1b4b8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
@@ -0,0 +1,67 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @select_scalar_scalar(i32* %aptr, i32* %bptr, i32* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
+ %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load i32, i32* %arrayidxb, align 4
+ %cmp = icmp slt i32 %a, %b
+ %sel = select i1 %cmp, i32 %b, i32 4
+ store i32 %sel, i32* %arrayidxz, align 4
+ ret void
+}
+
+define spir_kernel void @select_vector_vector(<2 x i32>* %aptr, <2 x i32>* %bptr, <2 x i32>* %cptr, <2 x i32>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds <2 x i32>, <2 x i32>* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds <2 x i32>, <2 x i32>* %bptr, i64 %idx
+ %arrayidxc = getelementptr inbounds <2 x i32>, <2 x i32>* %cptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i32>, <2 x i32>* %zptr, i64 %idx
+ %a = load <2 x i32>, <2 x i32>* %arrayidxa, align 4
+ %b = load <2 x i32>, <2 x i32>* %arrayidxb, align 4
+ %c = load <2 x i32>, <2 x i32>* %arrayidxc, align 4
+ %cmp = icmp slt <2 x i32> %a, %b
+ %sel = select <2 x i1> %cmp, <2 x i32> %c, <2 x i32> <i32 4, i32 4>
+ store <2 x i32> %sel, <2 x i32>* %arrayidxz, align 4
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK: define spir_kernel void @__vecz_nxv4_select_scalar_scalar
+; CHECK: [[lhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
+; CHECK: [[rhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
+; CHECK: [[cmp:%[0-9a-z]+]] = icmp slt <vscale x 4 x i32> [[lhs]], [[rhs]]
+; CHECK: [[sel:%[0-9a-z]+]] = select <vscale x 4 x i1> [[cmp]], <vscale x 4 x i32> [[rhs]], <vscale x 4 x i32> {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> poison, i32 4, (i32|i64) 0\), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i32 4\)}}
+; CHECK: store <vscale x 4 x i32> [[sel]],
+
+; CHECK: define spir_kernel void @__vecz_nxv4_select_vector_vector
+; CHECK: [[x:%[0-9a-z]+]] = load <vscale x 8 x i32>, ptr
+; CHECK: [[y:%[0-9a-z]+]] = load <vscale x 8 x i32>, ptr
+; CHECK: [[z:%[0-9a-z]+]] = load <vscale x 8 x i32>, ptr
+; CHECK: [[cmp:%[0-9a-z]+]] = icmp slt <vscale x 8 x i32> [[x]], [[y]]
+; CHECK: [[sel:%[0-9a-z]+]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[z]], <vscale x 8 x i32> {{shufflevector \(<vscale x 8 x i32> insertelement \(<vscale x 8 x i32> poison, i32 4, (i32|i64) 0\), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer\)|splat \(i32 4\)}}
+; CHECK: store <vscale x 8 x i32> [[sel]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
new file mode 100644
index 0000000000000..501f4245ec090
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
@@ -0,0 +1,56 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k select_scalar_vector -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @select_scalar_vector(i32* %aptr, i32* %bptr, <2 x i32>* %cptr, <2 x i32>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
+ %arrayidxc = getelementptr inbounds <2 x i32>, <2 x i32>* %cptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i32>, <2 x i32>* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load i32, i32* %arrayidxb, align 4
+ %c = load <2 x i32>, <2 x i32>* %arrayidxc, align 4
+ %cmp = icmp slt i32 %a, %b
+ %sel = select i1 %cmp, <2 x i32> %c, <2 x i32> <i32 4, i32 4>
+ store <2 x i32> %sel, <2 x i32>* %arrayidxz, align 4
+ ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_select_scalar_vector
+; CHECK: [[rhs:%.*]] = load <vscale x 8 x i32>, ptr
+; CHECK: [[cmp1:%.*]] = icmp slt <vscale x 4 x i32>
+; CHECK: [[sext:%.*]] = sext <vscale x 4 x i1> [[cmp1]] to <vscale x 4 x i8>
+; CHECK: store <vscale x 4 x i8> [[sext]], ptr [[alloc:%.*]], align 4
+; CHECK: [[idx0:%.*]] = call <vscale x 8 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv8i32()
+; CHECK: [[idx1:%.*]] = lshr <vscale x 8 x i32> [[idx0]], {{shufflevector \(<vscale x 8 x i32> insertelement \(<vscale x 8 x i32> poison, i32 1, (i32|i64) 0\), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer\)|splat \(i32 1\)}}
+
+; Note that since we just did a lshr 1 on the input of the extend, it doesn't
+; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
+; CHECK: [[sext2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 8 x i32> [[idx1]] to <vscale x 8 x i64>
+
+; CHECK: [[addrs:%.*]] = getelementptr i8, ptr [[alloc]], <vscale x 8 x i64> [[sext2]]
+; CHECK: [[gather:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[addrs]],
+; CHECK: [[cmp:%.*]] = trunc <vscale x 8 x i8> [[gather]] to <vscale x 8 x i1>
+; CHECK: [[sel:%.*]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[rhs]], <vscale x 8 x i32> {{shufflevector \(<vscale x 8 x i32> insertelement \(<vscale x 8 x i32> poison, i32 4, (i32|i64) 0\), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer\)|splat \(i32 4\)}}
+; CHECK: store <vscale x 8 x i32> [[sel]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
new file mode 100644
index 0000000000000..9d9f141cf12ff
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
@@ -0,0 +1,61 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @do_shuffle_splat(i32* %aptr, <4 x i32>* %bptr, <4 x i32>* %zptr) {
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load <4 x i32>, <4 x i32>* %arrayidxb, align 16
+ %insert = insertelement <4 x i32> poison, i32 %a, i32 0
+ %splat = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer
+ %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+ store <4 x i32> %splat, <4 x i32>* %arrayidxz
+ ret void
+; CHECK: define spir_kernel void @__vecz_nxv4_do_shuffle_splat
+; CHECK: [[idx0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
+; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 2, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2\)}}
+
+; Note that since we just did a lshr 2 on the input of the extend, it doesn't
+; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
+; CHECK: [[idx2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[idx1]] to <vscale x 16 x i64>
+
+; CHECK: [[alloc:%.*]] = getelementptr i32, ptr %{{.*}}, <vscale x 16 x i64> [[idx2]]
+; CHECK: [[splat:%.*]] = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> [[alloc]],
+; CHECK: store <vscale x 16 x i32> [[splat]], ptr
+}
+
+define spir_kernel void @do_shuffle_splat_uniform(i32 %a, <4 x i32>* %bptr, <4 x i32>* %zptr) {
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
+ %b = load <4 x i32>, <4 x i32>* %arrayidxb, align 16
+ %insert = insertelement <4 x i32> poison, i32 %a, i32 0
+ %splat = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer
+ %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+ store <4 x i32> %splat, <4 x i32>* %arrayidxz
+ ret void
+; CHECK: define spir_kernel void @__vecz_nxv4_do_shuffle_splat_uniform
+; CHECK: [[ins:%.*]] = insertelement <vscale x 16 x i32> poison, i32 %a, {{(i32|i64)}} 0
+; CHECK: [[splat:%.*]] = shufflevector <vscale x 16 x i32> [[ins]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK: store <vscale x 16 x i32> [[splat]], ptr
+}
+
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll
new file mode 100644
index 0000000000000..28f4e99f7fb28
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll
@@ -0,0 +1,38 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that we do something correct when scalably packetizing struct literals.
+; Right now we fail to packetize, but if we could packetize this we'd have to
+; be careful as storing a struct literal containing scalable vectors is invalid
+; IR.
+; RUN: veczc -w 4 -vecz-scalable -vecz-passes=verify,packetizer,verify \
+; RUN: --pass-remarks-missed=vecz -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK: Vecz: Could not packetize %v = load { i32, i32 }, ptr %arrayidx.p, align 4
+define spir_kernel void @test_fn(ptr %p, ptr %q) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidx.p = getelementptr { i32, i32 }, ptr %p, i64 %idx
+ %v = load { i32, i32 }, ptr %arrayidx.p, align 4
+ %arrayidx.q = getelementptr { i32, i32 }, ptr %q, i64 %idx
+ store { i32, i32 } %v, ptr %arrayidx.q, align 4
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
new file mode 100644
index 0000000000000..994c87fce14f5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
@@ -0,0 +1,79 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i32 @__mux_get_sub_group_id()
+declare spir_func i32 @__mux_get_sub_group_size()
+declare spir_func i32 @__mux_get_sub_group_local_id()
+declare spir_func i32 @__mux_sub_group_broadcast_i32(i32, i32)
+
+define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+ %call.i = tail call spir_func i32 @__mux_get_sub_group_id()
+ %conv = zext i32 %call.i to i64
+ %call2 = tail call spir_func i32 @__mux_get_sub_group_size()
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
+ store i32 %call2, i32 addrspace(1)* %arrayidx, align 4
+ ret void
+; CHECK-LABEL: define spir_kernel void @__vecz_nxv4_get_sub_group_size(
+; CHECK: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK: [[W:%.*]] = shl {{(nuw )?}}i32 [[VSCALE]], 2
+; CHECK: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 [[W]])
+; CHECK: store i32 [[RED]], ptr addrspace(1) {{.*}}
+}
+
+define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+ %call = tail call spir_func i32 @__mux_get_sub_group_local_id()
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call
+ store i32 %call, i32 addrspace(1)* %arrayidx, align 4
+ ret void
+; CHECK-LABEL: define spir_kernel void @__vecz_nxv4_get_sub_group_local_id(
+; CHECK: %call = tail call spir_func i32 @__mux_get_sub_group_local_id()
+; CHECK: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK: [[SHL:%.*]] = shl {{(nuw )?}}i32 [[VSCALE]], 2
+; CHECK: [[MUL:%.*]] = mul i32 %call, [[SHL]]
+; CHECK: [[SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[MUL]], i64 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[STEPVEC:%.*]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
+; CHECK: [[LID:%.*]] = add <vscale x 4 x i32> [[SPLAT]], [[STEPVEC]]
+; CHECK: [[EXT:%.*]] = sext i32 %call to i64
+; CHECK: %arrayidx = getelementptr i32, ptr addrspace(1) %out, i64 [[EXT]]
+; CHECK: store <vscale x 4 x i32> [[LID]], ptr addrspace(1) %arrayidx
+}
+
+define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+ %call = tail call spir_func i32 @__mux_get_sub_group_local_id()
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call
+ %v = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %broadcast = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 %v, i32 0)
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call
+ store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: define spir_kernel void @__vecz_nxv4_sub_group_broadcast(
+; CHECK: [[LD:%.*]] = load <vscale x 4 x i32>, ptr addrspace(1) {{%.*}}, align 4
+; CHECK: [[EXT:%.*]] = extractelement <vscale x 4 x i32> [[LD]], {{(i32|i64)}} 0
+; CHECK: [[BDCAST:%.*]] = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 [[EXT]], i32 0)
+; CHECK: [[INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[BDCAST]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: store <vscale x 4 x i32> [[SPLAT]], ptr addrspace(1)
+}
+
+!opencl.ocl.version = !{!0}
+
+!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
new file mode 100644
index 0000000000000..612a67f496406
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
@@ -0,0 +1,204 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S -vecz-passes=packetizer < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @__mux_get_global_id(i32)
+
+declare spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32)
+declare spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64)
+declare spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float)
+
+declare spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float)
+
+define spir_kernel void @reduce_scan_incl_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32 %0)
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+ store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_add_i32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_add_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[HEAD]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = add <vscale x 4 x i32> [[SCAN]], [[SPLAT]]
+; CHECK: store <vscale x 4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call
+ %0 = load i64, i64 addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64 %0)
+ %arrayidx2 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call
+ store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_add_i64(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i64> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4m(<vscale x 4 x i64> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i64 @__mux_sub_group_scan_exclusive_add_i64(i64 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[HEAD]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = add <vscale x 4 x i64> [[SCAN]], [[SPLAT]]
+; CHECK: store <vscale x 4 x i64> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+ %0 = load float, float addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float %0)
+ %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %call1, float addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_add_f32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f(<vscale x 4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.0{{.*}}, <vscale x 4 x float> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fadd_f32(float [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[HEAD]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = fadd <vscale x 4 x float> [[SCAN]], [[SPLAT]]
+; CHECK: store <vscale x 4 x float> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32 %0)
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+ store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_smin_i32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4i(<vscale x 4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_smin_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[HEAD]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> [[SCAN]], <vscale x 4 x i32> [[SPLAT]])
+; CHECK: store <vscale x 4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32 %0)
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+ store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_umin_i32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j(<vscale x 4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_umin_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[HEAD]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32> [[SCAN]], <vscale x 4 x i32> [[SPLAT]])
+; CHECK: store <vscale x 4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32 %0)
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+ store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_smax_i32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4i(<vscale x 4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_smax_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[HEAD]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> [[SCAN]], <vscale x 4 x i32> [[SPLAT]])
+; CHECK: store <vscale x 4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32 %0)
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+ store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_umax_i32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j(<vscale x 4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_umax_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[HEAD]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> [[SCAN]], <vscale x 4 x i32> [[SPLAT]])
+; CHECK: store <vscale x 4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+ %0 = load float, float addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float %0)
+ %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %call1, float addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_fmin_f32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f(<vscale x 4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fmin_f32(float [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[HEAD]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> [[SCAN]], <vscale x 4 x float> [[SPLAT]])
+; CHECK: store <vscale x 4 x float> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+ %0 = load float, float addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float %0)
+ %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %call1, float addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_fmax_f32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> [[INPUT]])
+; CHECK:
[[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fmax_f32(float [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector [[HEAD]], poison, zeroinitializer +; CHECK: [[FINAL:%.*]] = call @llvm.maxnum.nxv4f32( [[SCAN]], [[SPLAT]]) +; CHECK: store [[FINAL]], +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll new file mode 100644 index 0000000000000..06d079f2128ac --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll @@ -0,0 +1,171 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-scalable -w 4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1) + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_mul_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_mul_u5nxv4j( %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_excl_mul_i32( +; CHECK: call @__vecz_b_sub_group_scan_exclusive_mul_u5nxv4j( %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) 
%arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_mul_f32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_mul_u5nxv4f( %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float %0) + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_excl_mul_f32( +; CHECK: call @__vecz_b_sub_group_scan_exclusive_mul_u5nxv4f( %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float %0) + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_and_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_and_u5nxv4j( %{{.*}}) +define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_or_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_or_u5nxv4j( %{{.*}}) +define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_xor_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_xor_u5nxv4j( %{{.*}}) +define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; 
CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_logical_and( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_and_u5nxv4b( %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_logical_or( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_or_u5nxv4b( %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_logical_xor( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_xor_u5nxv4b( %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll new file mode 100644 index 0000000000000..7ad386dedb3e4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll @@ -0,0 +1,171 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-scalable -w 4 -S -vecz-choices=VectorPredication < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1) + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_mul_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_mul_vp_u5nxv4jj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_excl_mul_i32( +; CHECK: call @__vecz_b_sub_group_scan_exclusive_mul_vp_u5nxv4jj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_mul_f32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_mul_vp_u5nxv4fj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float %0) + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_excl_mul_f32( +; CHECK: call @__vecz_b_sub_group_scan_exclusive_mul_vp_u5nxv4fj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float 
@__mux_sub_group_scan_exclusive_fmul_f32(float %0) + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_and_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_and_vp_u5nxv4jj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_or_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_or_vp_u5nxv4jj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_xor_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_xor_vp_u5nxv4jj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_logical_and( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_and_vp_u5nxv4bj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_logical_or( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_or_vp_u5nxv4bj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1 %1) + %arrayidx2 = getelementptr 
inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_logical_xor( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_xor_vp_u5nxv4bj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll new file mode 100644 index 0000000000000..14bee4967bfbc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll @@ -0,0 +1,150 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S -vecz-passes=packetizer < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32) +declare spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64) +declare spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32) +declare spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float) +declare spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float) + +define spir_kernel void @reduce_scan_incl_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_add_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call + %0 = load i64, i64 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64 %0) + %arrayidx2 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call + store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_add_i64( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4mj( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_add_f32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* 
%arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_smin_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4ij( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_umin_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_smax_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4ij( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_umax_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_fmin_f32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: 
@__vecz_nxv4_vp_reduce_scan_incl_fmax_f32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj( %{{.*}}, i32 %{{.+}}) +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll new file mode 100644 index 0000000000000..f8ed17cf10c67 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll @@ -0,0 +1,41 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k load_add_store -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load_add_store(<4 x i32>* %aptr, <4 x i32>* %bptr, <4 x i32>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx + %a = load <4 x i32>, <4 x i32>* %arrayidxa, align 4 + %b = load <4 x i32>, <4 x i32>* %arrayidxb, align 4 + %sum = add <4 x i32> %a, %b + store <4 x i32> %sum, <4 x i32>* %arrayidxz, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: define spir_kernel void @__vecz_nxv4_load_add_store +; CHECK: [[lhs:%[0-9a-z]+]] = load , ptr +; CHECK: [[rhs:%[0-9a-z]+]] = load , ptr +; CHECK: [[sum:%[0-9a-z]+]] = add [[lhs]], [[rhs]] +; CHECK: store [[sum]], diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll new file mode 100644 index 0000000000000..a4ff1d7c228f4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll @@ -0,0 +1,48 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Check that we fail to vectorize but don't leave behind an invalid function. 
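+;
+; As an illustrative sketch (hypothetical IR, not part of the original test):
+; a PHI must carry exactly one incoming value per predecessor of its block,
+; so bailing out of packetization part-way through PHI fixup could leave
+; behind something like
+;
+;   exit:                              ; preds = %entry, %if.then
+;     %retval = phi i64 [ %x, %entry ] ; no incoming value for %if.then
+;
+; which the LLVM verifier rejects ("PHINode should have one entry for each
+; predecessor of its parent basic block!"). The RUN line below therefore
+; runs the verify pass after the packetizer and expects veczc to fail.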
+; RUN: not veczc -k regression_phis -vecz-scalable -w 1 -vecz-passes=packetizer,verify -S < %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @regression_phis(i64 addrspace(1)* %xs, i64 addrspace(1)* %ys, i32 addrspace(1)* %out, i64 %lim) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.x = getelementptr inbounds i64, i64 addrspace(1)* %xs, i64 %call
+  %x = load i64, i64 addrspace(1)* %arrayidx.x, align 4
+  %cond = icmp eq i64 %call, 0
+  br i1 %cond, label %if.then, label %exit
+
+if.then:
+  %arrayidx.y = getelementptr inbounds i64, i64 addrspace(1)* %ys, i64 %call
+  %y = load i64, i64 addrspace(1)* %arrayidx.y, align 4
+  br label %exit
+
+exit:
+  ; We previously left behind an invalid PHI with too few operands, owing to
+  ; us bailing out while PHIs were still pending post-vectorization fixup.
+  %retval = phi i64 [ %x, %entry ], [ %y, %if.then ]
+  %0 = icmp eq i64 %lim, 0
+  %1 = select i1 %0, i64 1, i64 %lim
+  %rem = urem i64 %retval, %1
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %rem
+  %2 = atomicrmw add i32 addrspace(1)* %arrayidx, i32 1 monotonic
+  ret void
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
new file mode 100644
index 0000000000000..43f40444837b3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
@@ -0,0 +1,34 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k widen_vload -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @widen_vload(<4 x i32>* %aptr, <4 x i32>* %zptr) {
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %mod_idx = urem i64 %idx, 2
+  %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %mod_idx
+  %v = load <4 x i32>, <4 x i32>* %arrayidxa, align 16
+  %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+  store <4 x i32> %v, <4 x i32>* %arrayidxz, align 16
+  ret void
+; CHECK: define spir_kernel void @__vecz_nxv4_widen_vload(
+; CHECK: %v4 = call <vscale x 16 x i32> @__vecz_b_gather_load16_u6nxv16ju10nxv16u3ptr(<vscale x 16 x ptr> %{{.*}})
+}
+
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
new file mode 100644
index 0000000000000..c8e7e27514fa9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k store_ult -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; Check that we can scalably-vectorize a call to get_global_id by using the
+; stepvector intrinsic
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @store_ult(i32* %out, i64* %N) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #2
+  %0 = load i64, i64* %N, align 8
+  %cmp = icmp ult i64 %call, %0
+  %conv = zext i1 %cmp to i32
+  %arrayidx = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %conv, i32* %arrayidx, align 4
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK: define spir_kernel void @__vecz_nxv4_store_ult
+; CHECK: [[step:%[0-9.a-z]+]] = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
+; CHECK: %{{.*}} = add <vscale x 4 x i64> %{{.*}}, [[step]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
new file mode 100644
index 0000000000000..05f7c9483f2f8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
@@ -0,0 +1,45 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k foo -vecz-scalable -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @foo(float addrspace(1)* readonly %a, i32 addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #2
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %cmp = fcmp oeq float %0, 0.000000e+00
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %add = add nsw i32 %1, 42
+  store i32 %add, i32 addrspace(1)* %arrayidx1, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv2_vp_foo(ptr addrspace(1) readonly %a, ptr addrspace(1) %out)
+; CHECK: [[CMP:%.*]] = fcmp oeq <vscale x 2 x float> %{{.*}}, zeroinitializer
+; CHECK: %{{.*}} = call i1 @llvm.vp.reduce.or.nxv2i1(i1 false, <vscale x 2 x i1> [[CMP]], <vscale x 2 x i1> {{.*}}, i32 {{.*}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
new file mode 100644
index 0000000000000..9835c56732a32
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
@@ -0,0 +1,33 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Just check that the VectorPredication choice is valid
+; RUN: veczc -k foo -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @foo(float* %aptr, float* %zptr) {
+entry:
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
+  %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
+  %a = load float, float* %arrayidxa, align 4
+  store float %a, float* %arrayidxz, align 4
+  ret void
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
new file mode 100644
index 0000000000000..38abafeb2cb77
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
@@ -0,0 +1,55 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k get_sub_group_size -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-F2
+; RUN: veczc -k get_sub_group_size -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-S4
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i32 @__mux_get_sub_group_id()
+declare spir_func i32 @__mux_get_sub_group_size()
+
+define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+  %call.i = tail call spir_func i32 @__mux_get_sub_group_id()
+  %conv = zext i32 %call.i to i64
+  %call2 = tail call spir_func i32 @__mux_get_sub_group_size()
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
+  store i32 %call2, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Makes sure the vector length is properly computed and substituted for get_sub_group_size()
+
+; CHECK-F2-LABEL: define spir_kernel void @__vecz_v2_vp_get_sub_group_size(
+; CHECK-F2: [[ID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK-F2: [[SZ:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK-F2: [[WL:%.*]] = sub {{.*}} i64 [[SZ]], [[ID]]
+; CHECK-F2: [[VL0:%.*]] = call i64 @llvm.umin.i64(i64 [[WL]], i64 2)
+; CHECK-F2: [[VL1:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[VL0]] to i32
+; CHECK-F2: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 [[VL1]])
+; CHECK-F2: store i32 [[RED]], ptr addrspace(1) {{.*}}
+
+; CHECK-S4-LABEL: define spir_kernel void @__vecz_nxv4_vp_get_sub_group_size(
+; CHECK-S4: [[ID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK-S4: [[SZ:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK-S4: [[WL:%.*]] = sub {{.*}} i64 [[SZ]], [[ID]]
+; CHECK-S4: [[VF0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-S4: [[VF1:%.*]] = shl {{(nuw )?}}i64 [[VF0]], 2
+; CHECK-S4: [[VL0:%.*]] = call i64 @llvm.umin.i64(i64 [[WL]], i64 [[VF1]])
+; CHECK-S4: [[VL1:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[VL0]] to i32
+; CHECK-S4: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 [[VL1]])
+; CHECK-S4: store i32 [[RED]], ptr addrspace(1) {{.*}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
new file mode 100644
index 0000000000000..a913198ca3f2b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
@@ -0,0 +1,81 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication:FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +; Test if the interleaved load is defined correctly +; Vector-predicated interleaved loads are always masked +; CHECK: define @__vecz_b_masked_interleaved_load8_vp_4_u5nxv4du3ptrU3AS1u5nxv4bj(ptr addrspace(1){{( %0)?}}, {{( %1)?}}, i32{{( %2)?}}) [[ATTRS:#[0-9]+]] { +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement poison, ptr addrspace(1) %0, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector %BroadcastAddr.splatinsert, poison, zeroinitializer +; CHECK: %3 = call @llvm.{{(experimental\.)?}}stepvector.nxv4i64() +; CHECK: %4 = mul {{shufflevector \( insertelement \( poison, i64 4, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i64 4\)}}, %3 +; CHECK: %5 = getelementptr double, %BroadcastAddr.splat, %4 +; CHECK: %6 = call @llvm.vp.gather.nxv4f64.nxv4p1( %5, %1, i32 %2) +; CHECK: ret %6 +; CHECK: } + + +; Test if the interleaved store is defined correctly +; Vector-predicated interleaved stores are always masked +; CHECK: define void @__vecz_b_masked_interleaved_store8_vp_4_u5nxv4du3ptrU3AS1u5nxv4bj({{( %0)?}}, ptr addrspace(1){{( %1)?}}, {{( %2)?}}, i32{{( %3)?}}) [[ATTRS]] +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector 
%BroadcastAddr.splatinsert, poison, zeroinitializer +; CHECK: %4 = call @llvm.{{(experimental\.)?}}stepvector.nxv4i64() +; CHECK: %5 = mul {{shufflevector \( insertelement \( poison, i64 4, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i64 4\)}}, %4 +; CHECK: %6 = getelementptr double, %BroadcastAddr.splat, %5 +; CHECK: call void @llvm.vp.scatter.nxv4f64.nxv4p1( %0, %6, %2, i32 %3) +; CHECK: ret void +; CHECK: } + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll new file mode 100644 index 0000000000000..7ef8742f87f58 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll @@ -0,0 +1,77 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %conv, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %call2 = call i64 @__mux_get_global_id(i32 0) + %conv3 = trunc i64 %call2 to i32 + %idxprom = sext i32 %conv3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 + %idxprom4 = sext i32 %conv3 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom4 + store i32 %0, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.else: ; preds = %entry + %call8 = call i64 @__mux_get_local_size(i32 0) + %call9 = call i64 @__mux_get_group_id(i32 0) + %mul = mul i64 %call9, %call8 + %add = add i64 %mul, %call + %sext = shl i64 %add, 32 + %idxprom11 = ashr exact i64 %sext, 32 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 42, i32 addrspace(1)* %arrayidx12, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_local_id(i32) + +declare i64 @__mux_get_global_id(i32) + +declare i64 @__mux_get_local_size(i32) + +declare i64 @__mux_get_group_id(i32) + +; Test if the masked store is defined correctly +; CHECK: define void @__vecz_b_masked_store4_vp_Dv4_ju3ptrU3AS1Dv4_bj(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}, i32{{( %3)?}}) [[ATTRS:#[0-9]+]] { +; CHECK: entry: +; CHECK: call void 
+
+; Test if the masked load is defined correctly
+; CHECK: define <4 x i32> @__vecz_b_masked_load4_vp_Dv4_ju3ptrU3AS2Dv4_bj(ptr addrspace(2){{( %0)?}}, <4 x i1>{{( %1)?}}, i32{{( %2)?}}) [[ATTRS]] {
+; CHECK: entry:
+; CHECK: %3 = call <4 x i32> @llvm.vp.load.v4i32.p2(ptr addrspace(2) %0, <4 x i1> %1, i32 %2)
+; CHECK: ret <4 x i32> %3
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
new file mode 100644
index 0000000000000..5353ab9a90aae
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
@@ -0,0 +1,89 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %rem = urem i64 %call, 3
+  %cmp = icmp eq i64 %rem, 0
+  br i1 %cmp, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %idxprom = sext i32 %1 to i64
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom
+  store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call
+  %2 = load i32, i32 addrspace(1)* %arrayidx3, align 4
+  %idxprom4 = sext i32 %2 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom4
+  store i32 42, i32 addrspace(1)* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+; Test if the vector-predicated scatter store is defined correctly
+; CHECK: define void @__vecz_b_masked_scatter_store4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x i32>{{( %0)?}}, <vscale x 4 x ptr addrspace(1)>{{( %1)?}}, <vscale x 4 x i1>{{( %2)?}}, i32{{( %3)?}})
+; CHECK: entry:
+; CHECK: call void @llvm.vp.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> %0, <vscale x 4 x ptr addrspace(1)> %1, <vscale x 4 x i1> %2, i32 %3)
+; CHECK: ret void
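+
+; The stores above go through an index loaded from %b_index, so they cannot
+; be widened into a contiguous store and become a (VL-masked) scatter; the
+; gather case below is analogous, with the data-dependent index on the load
+; side instead.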
+
+define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %rem = urem i64 %call, 3
+  %cmp = icmp eq i64 %rem, 0
+  br i1 %cmp, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a_index, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 42, i32 addrspace(1)* %arrayidx3, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; Test if the vector-predicated gather load is defined correctly
+; CHECK: define <vscale x 4 x i32> @__vecz_b_masked_gather_load4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x ptr addrspace(1)>{{( %0)?}}, <vscale x 4 x i1>{{( %1)?}}, i32{{( %2)?}})
+; CHECK: entry:
+; CHECK: %3 = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p1(<vscale x 4 x ptr addrspace(1)> %0, <vscale x 4 x i1> %1, i32 %2)
+; CHECK: ret <vscale x 4 x i32> %3
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
new file mode 100644
index 0000000000000..0d1b86390d6d1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
@@ -0,0 +1,183 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) {
+  ; Dummy uses of the builtins, as we don't define any with zero uses.
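+  ; (The define-builtins pass only materializes bodies for __vecz_b_* builtins
+  ; that are actually referenced, hence one call per builtin under test here.)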
+  %a = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %b = call <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %c = call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  %d = call <4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  %e = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %f = call <4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %g = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %h = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %i = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %j = call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  %k = call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  %l = call <4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  %m = call <4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  ret void
+}
+
+declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_jj(<4 x i32>, i32)
+; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) {
+; CHECK: entry:
+; CHECK: %[[SHUFFLE_ALLOC:.+]] = alloca <4 x i32>
+; CHECK: br label %loop
+; CHECK: loop:
+; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK: %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK: %[[MASKPHI:.+]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK: %[[N_INS:.+]] = insertelement <4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK: %[[N_SPLAT:.+]] = shufflevector <4 x i32> %[[N_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: %[[MASK:.+]] = xor <4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+
+;------- target-dependent dynamic shuffle code:
+; CHECK: store <4 x i32> %[[VEC]], {{(<4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
+;------- there will be a bitcast here if pointers are typed
+; CHECK: %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <4 x i32> %[[MASK]]
+; CHECK: %[[VLINS:.+]] = insertelement <4 x i32> poison, i32 %1, {{i32|i64}} 0
+; CHECK: %[[VLSPLAT:.+]] = shufflevector <4 x i32> %[[VLINS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: %[[VLMASK:.+]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %[[VLSPLAT]]
+; CHECK: %[[SHUFFLE:.+]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0{{(i32)?}}(<4 x [[PTRTY]]> %[[INDEX]], i32 4, <4 x i1> %[[VLMASK]], <4 x i32> poison)

+; CHECK: %[[ACCUM:.+]] = add <4 x i32> %[[VEC]], %{{.+}}
+; CHECK: %[[BIT:.+]] = and <4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK: %[[WHICH:.+]] = icmp ne <4 x i32> %[[BIT]], zeroinitializer
+; CHECK: %[[NEWVEC]] = select <4 x i1> %[[WHICH]], <4 x i32> %[[ACCUM]], <4 x i32> %[[VEC]]
+; CHECK: %[[NEWMASK]] = or <4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %1
+; CHECK: br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK: %[[RESULT:.+]] = phi <4 x i32> [ %[[NEWVEC]], %loop ]
+; CHECK: ret <4 x i32> %[[RESULT]]
+; CHECK: }
+
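+; For reference, the loop checked above is a log-step inclusive scan: starting
+; from lanes <a, b, c, d>, the first iteration updates the odd lanes to give
+; <a, a+b, c, c+d>, and the second updates the upper lanes to give
+; <a, a+b, a+b+c, a+b+c+d>. The masked gather acts as a dynamic lane shuffle,
+; and the select keeps lanes whose partial sum is already complete.
+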
+declare <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_jj(<4 x i32>, i32)
+; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) {
+; CHECK: entry:
+; CHECK: %[[SHUFFLE_ALLOC:.+]] = alloca <4 x i32>
+; CHECK: br label %loop
+; CHECK: loop:
+; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK: %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK: %[[MASKPHI:.+]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK: %[[N_INS:.+]] = insertelement <4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK: %[[N_SPLAT:.+]] = shufflevector <4 x i32> %[[N_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: %[[MASK:.+]] = xor <4 x i32> %[[MASKPHI]], %[[N_SPLAT]]

+;------- target-dependent dynamic shuffle code:
+; CHECK: store <4 x i32> %[[VEC]], {{(<4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
+;------- there will be a bitcast here if pointers are typed
+; CHECK: %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <4 x i32> %[[MASK]]
+; CHECK: %[[VLINS:.+]] = insertelement <4 x i32> poison, i32 %1, {{i32|i64}} 0
+; CHECK: %[[VLSPLAT:.+]] = shufflevector <4 x i32> %[[VLINS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: %[[VLMASK:.+]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %[[VLSPLAT]]
+; CHECK: %[[SHUFFLE:.+]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0{{(i32)?}}(<4 x [[PTRTY]]> %[[INDEX]], i32 4, <4 x i1> %[[VLMASK]], <4 x i32> poison)

+; CHECK: %[[ACCUM:.+]] = add <4 x i32> %[[VEC]], %{{.+}}
+; CHECK: %[[BIT:.+]] = and <4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK: %[[WHICH:.+]] = icmp ne <4 x i32> %[[BIT]], zeroinitializer
+; CHECK: %[[NEWVEC]] = select <4 x i1> %[[WHICH]], <4 x i32> %[[ACCUM]], <4 x i32> %[[VEC]]
+; CHECK: %[[NEWMASK]] = or <4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %1
+; CHECK: br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK: %[[SCAN:.+]] = phi <4 x i32> [ %[[NEWVEC]], %loop ]

+;------- target-dependent slide-up goes here
+; CHECK: %[[SLIDE:.+]] = shufflevector <4 x i32> %[[SCAN]], <4 x i32> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
+; CHECK: %[[RESULT:.+]] = insertelement <4 x i32> %[[SLIDE]], i32 0, {{i32|i64}} 0

+; CHECK: ret <4 x i32> %[[RESULT]]
+; CHECK: }
+
+
+; We know the generated code is correct for one scan type,
+; now verify that all the others use the correct binary operations.
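+; Each scan below should differ from the add scans above only in its
+; combining operation: fadd for the float add scans, the llvm.smin/umin/
+; smax/umax intrinsics for the integer min/max scans, and llvm.minnum/
+; llvm.maxnum for the float min/max scans.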
+ +declare <4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_fj(<4 x float>, i32) +; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x float> [ %0, %entry ], +; CHECK: %{{.+}} = fadd <4 x float> %[[VEC]], %{{.+}} + +declare <4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_fj(<4 x float>, i32) +; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x float> [ %0, %entry ], +; CHECK: %{{.+}} = fadd <4 x float> %[[VEC]], %{{.+}} + +declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_Dv4_jj(<4 x i32>, i32) +; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %[[VEC]], <4 x i32> %{{.+}}) + +declare <4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_Dv4_jj(<4 x i32>, i32) +; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %[[VEC]], <4 x i32> %{{.+}}) + +declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_Dv4_jj(<4 x i32>, i32) +; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %[[VEC]], <4 x i32> %{{.+}}) + +declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_Dv4_jj(<4 x i32>, i32) +; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %[[VEC]], <4 x i32> %{{.+}}) + +declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_Dv4_jj(<4 x i32>, i32) +; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %[[VEC]], <4 x i32> %{{.+}}) + +declare <4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_Dv4_fj(<4 x float>, i32) +; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x float> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x float> @llvm.minnum.v4f32(<4 x float> %[[VEC]], <4 x float> %{{.+}}) + +declare <4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_Dv4_fj(<4 x float>, i32) +; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x float> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %[[VEC]], <4 x float> %{{.+}}) + +declare <4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_Dv4_fj(<4 x float>, i32) +; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x float> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x float> @llvm.minnum.v4f32(<4 x float> 
%[[VEC]], <4 x float> %{{.+}}) + +declare <4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_Dv4_fj(<4 x float>, i32) +; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x float> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %[[VEC]], <4 x float> %{{.+}}) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll new file mode 100644 index 0000000000000..92ac9161e8c89 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll @@ -0,0 +1,103 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k load_add_store_i32 -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK_4F +; RUN: veczc -k load_add_store_i32 -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK_1S +; RUN: veczc -k load_add_store_v4i32 -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK_V4_2F +; RUN: veczc -k load_add_store_v4i32 -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK_V4_1S + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @load_add_store_i32(i32* %aptr, i32* %bptr, i32* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load i32, i32* %arrayidxb, align 4 + %sum = add i32 %a, %b + store i32 %sum, i32* %arrayidxz, align 4 + ret void +} + +; CHECK_4F: define spir_kernel void @__vecz_v4_vp_load_add_store_i32( +; CHECK_4F: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0) +; CHECK_4F: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0) +; CHECK_4F: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]] +; CHECK_4F: [[T0:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 4) +; CHECK_4F: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T0]] to i32 +; CHECK_4F: [[LHS:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0(ptr {{%.*}}, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[VL]]) +; CHECK_4F: [[RHS:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0(ptr {{%.*}}, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[VL]]) +; CHECK_4F: [[ADD:%.*]] = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> [[LHS]], <4 x i32> [[RHS]], <4 x i1> {{<(i1 true(, 
)?)+>|splat \(i1 true\)}}, i32 [[VL]])
+; CHECK_4F: call void @llvm.vp.store.v4i32.p0(<4 x i32> [[ADD]], ptr {{%.*}}, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[VL]])
+
+; CHECK_1S: define spir_kernel void @__vecz_nxv4_vp_load_add_store_i32(
+; CHECK_1S: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK_1S: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK_1S: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
+; CHECK_1S: [[T0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK_1S: [[T1:%.*]] = shl {{(nuw )?}}i64 [[T0]], 2
+; CHECK_1S: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
+; CHECK_1S: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T2]] to i32
+; CHECK_1S: [[LHS:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr {{%.*}}, <vscale x 4 x i1> [[TRUEMASK: (shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[VL]])
+; CHECK_1S: [[RHS:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr {{%.*}}, <vscale x 4 x i1> [[TRUEMASK]], i32 [[VL]])
+; CHECK_1S: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[LHS]], <vscale x 4 x i32> [[RHS]], <vscale x 4 x i1> [[TRUEMASK]], i32 [[VL]])
+; CHECK_1S: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[ADD]], ptr {{%.*}}, <vscale x 4 x i1> [[TRUEMASK]], i32 [[VL]])
+
+define spir_kernel void @load_add_store_v4i32(<4 x i32>* %aptr, <4 x i32>* %bptr, <4 x i32>* %zptr) {
+entry:
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+  %a = load <4 x i32>, <4 x i32>* %arrayidxa, align 16
+  %b = load <4 x i32>, <4 x i32>* %arrayidxb, align 16
+  %sum = add <4 x i32> %a, %b
+  store <4 x i32> %sum, <4 x i32>* %arrayidxz, align 16
+  ret void
+}
+
+; CHECK_V4_2F: define spir_kernel void @__vecz_v2_vp_load_add_store_v4i32(
+; CHECK_V4_2F: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK_V4_2F: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK_V4_2F: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
+; CHECK_V4_2F: [[T0:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 2)
+; CHECK_V4_2F: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T0]] to i32
+; Each WI performs 4 elements, so multiply the VL by 4
+; CHECK_V4_2F: [[SVL:%.*]] = shl nuw nsw i32 [[VL]], 2
+; CHECK_V4_2F: [[LHS:%.*]] = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr {{%.*}}, <8 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[SVL]])
+; CHECK_V4_2F: [[RHS:%.*]] = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr {{%.*}}, <8 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[SVL]])
+; CHECK_V4_2F: [[ADD:%.*]] = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> [[LHS]], <8 x i32> [[RHS]], <8 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[SVL]])
+; CHECK_V4_2F: call void @llvm.vp.store.v8i32.p0(<8 x i32> [[ADD]], ptr {{%.*}}, <8 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[SVL]])
+
+; CHECK_V4_1S: define spir_kernel void @__vecz_nxv4_vp_load_add_store_v4i32(
+; CHECK_V4_1S: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK_V4_1S: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK_V4_1S: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
+; CHECK_V4_1S: [[T0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK_V4_1S: [[T1:%.*]] = shl {{(nuw )?}}i64 [[T0]], 2
+; CHECK_V4_1S: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
+; CHECK_V4_1S: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T2]] to i32
+; Each WI performs 4 elements, so multiply the VL by 4
+; CHECK_V4_1S: [[SVL:%.*]] = shl i32 [[VL]], 2
+; CHECK_V4_1S: [[LHS:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr {{%.*}}, <vscale x 16 x i1> [[TRUEMASK: (shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[SVL]])
+; CHECK_V4_1S: [[RHS:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr {{%.*}}, <vscale x 16 x i1> [[TRUEMASK]], i32 [[SVL]])
+; CHECK_V4_1S: [[ADD:%.*]] = call <vscale x 16 x i32> @llvm.vp.add.nxv16i32(<vscale x 16 x i32> [[LHS]], <vscale x 16 x i32> [[RHS]], <vscale x 16 x i1> [[TRUEMASK]], i32 [[SVL]])
+; CHECK_V4_1S: call void @llvm.vp.store.nxv16i32.p0(<vscale x 16 x i32> [[ADD]], ptr {{%.*}}, <vscale x 16 x i1> [[TRUEMASK]], i32 [[SVL]])
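+
+; In every variant above the vector length is min(local_size - local_id, VF),
+; where VF is the vectorization factor (4, 4 x vscale, 2, or 4 x vscale); for
+; the <4 x i32> kernels it is then shifted left by 2, since each work-item
+; handles four elements.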
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll
new file mode 100644
index 0000000000000..03492705536f4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll
@@ -0,0 +1,106 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes=define-builtins,verify -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_fn(<vscale x 1 x ptr> %p) {
+  %ret0 = call <vscale x 1 x i32> @__vecz_b_nxv1_vp_masked_atomicrmw_add_align4_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 4)
+  %ret1 = call { <vscale x 1 x i32>, <vscale x 1 x i1> } @__vecz_b_nxv1_vp_masked_cmpxchg_align4_acquire_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 4)
+  ret void
+}
+
+declare <vscale x 1 x i32> @__vecz_b_nxv1_vp_masked_atomicrmw_add_align4_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> %val, <vscale x 1 x i1> %mask, i32 %vl)
+
+declare { <vscale x 1 x i32>, <vscale x 1 x i1> } @__vecz_b_nxv1_vp_masked_cmpxchg_align4_acquire_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> %cmp, <vscale x 1 x i32> %newval, <vscale x 1 x i1> %mask, i32 %vl)
+
+; CHECK: define <vscale x 1 x i32> @__vecz_b_nxv1_vp_masked_atomicrmw_add_align4_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> %val, <vscale x 1 x i1> %mask, i32 %vl) {
+; CHECK: entry:
+; CHECK: [[VLZERO:%.*]] = icmp eq i32 %vl, 0
+; CHECK: br i1 [[VLZERO]], label %earlyexit, label %loopentry

+; CHECK: earlyexit:
+; CHECK: ret <vscale x 1 x i32> poison

+; CHECK: loopentry:
+; CHECK: br label %loopIR

+; CHECK: loopIR:
+; CHECK: [[IDX:%.*]] = phi i32 [ 0, %loopentry ], [ [[INC:%.*]], %if.else ]
+; CHECK: [[RET_PREV:%.*]] = phi <vscale x 1 x i32> [ poison, %loopentry ], [ [[MERGE:%.*]], %if.else ]
+; CHECK: [[MASKELT:%.*]] = extractelement <vscale x 1 x i1> %mask, i32 [[IDX]]
+; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false
+; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else

+; CHECK: if.then:
+; CHECK: [[PTR:%.*]] = extractelement <vscale x 1 x ptr> %p, i32 [[IDX]]
+; CHECK: [[VAL:%.*]] = extractelement <vscale x 1 x i32> %val, i32 [[IDX]]
+; CHECK: [[ATOM:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VAL]] acquire, align 4
+; CHECK: [[RET_NEXT:%.*]] = insertelement <vscale x 1 x i32> [[RET_PREV]], i32 [[ATOM]], i32 [[IDX]]
+; CHECK: br label %if.else

+; CHECK: if.else:
+; CHECK: [[MERGE:%.*]] = phi <vscale x 1 x i32> [ [[RET_PREV]], %loopIR ], [ [[RET_NEXT]], %if.then ]
+; CHECK: [[INC]] = add i32 [[IDX]], 1
+; CHECK: [[CMP:%.*]] = icmp ult i32 [[INC]], %vl
+; CHECK: br i1 [[CMP]], label %loopIR, label %exit

+; CHECK: exit:
+; CHECK: ret <vscale x 1 x i32> [[MERGE]]

+; CHECK: define { <vscale x 1 x i32>, <vscale x 1 x i1> } @__vecz_b_nxv1_vp_masked_cmpxchg_align4_acquire_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> %cmp, <vscale x 1 x i32> %newval, <vscale x 1 x i1> %mask, i32 %vl) {
+; CHECK: entry:
+; CHECK: [[VLZERO:%.*]] = icmp eq i32 %vl, 0
+; CHECK: br i1 [[VLZERO]], label %earlyexit, label %loopentry

+; CHECK: earlyexit:
+; CHECK: ret { <vscale x 1 x i32>, <vscale x 1 x i1> } poison

+; CHECK: loopentry:
+; CHECK: br label %loopIR

+; CHECK: loopIR:
+; CHECK: [[IDX:%.*]] = phi i32 [ 0, %loopentry ], [ [[INC:%.*]], %if.else ]
+; CHECK: [[RET_PREV:%.*]] = phi <vscale x 1 x i32> [ poison, %loopentry ], [ [[MERGE:%.*]], %if.else ]
+; CHECK: [[SUCCESS_PREV:%.*]] = phi <vscale x 1 x i1> [ poison, %loopentry ], [ [[MERGE_SUCCESS:%.*]], %if.else ]
+; CHECK: [[MASKELT:%.*]] = extractelement <vscale x 1 x i1> %mask, i32 [[IDX]]
+; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false
+; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else

+; CHECK: if.then:
+; CHECK: [[PTR:%.*]] = extractelement <vscale x 1 x ptr> %p, i32 [[IDX]]
+; CHECK: [[CMP:%.*]] = extractelement <vscale x 1 x i32> %cmp, i32 [[IDX]]
+; CHECK: [[NEWVAL:%.*]] = extractelement <vscale x 1 x i32> %newval, i32 [[IDX]]
+; CHECK: [[ATOM:%.*]] = cmpxchg ptr [[PTR]], i32 [[CMP]], i32 [[NEWVAL]] acquire acquire, align 4
+; CHECK: [[EXT0:%.*]] = extractvalue { i32, i1 } [[ATOM]], 0
+; CHECK: [[RET:%.*]] = insertelement <vscale x 1 x i32> [[RET_PREV]], i32 [[EXT0]], i32 [[IDX]]
+; CHECK: [[EXT1:%.*]] = extractvalue { i32, i1 } [[ATOM]], 1
+; CHECK: [[SUCCESS:%.*]] = insertelement <vscale x 1 x i1> [[SUCCESS_PREV]], i1 [[EXT1]], i32 [[IDX]]
+; CHECK: br label %if.else

+; CHECK: if.else:
+; CHECK: [[MERGE:%.*]] = phi <vscale x 1 x i32> [ [[RET_PREV]], %loopIR ], [ [[RET]], %if.then ]
+; CHECK: [[MERGE_SUCCESS:%.*]] = phi <vscale x 1 x i1> [ [[SUCCESS_PREV]], %loopIR ], [ [[SUCCESS]], %if.then ]
+; CHECK: [[INC]] = add i32 [[IDX]], 1
+; CHECK: [[CMP:%.*]] = icmp ult i32 [[INC]], %vl
+; CHECK: br i1 [[CMP]], label %loopIR, label %exit

+; CHECK: exit:
+; CHECK: [[RETTMP:%.*]] = insertvalue { <vscale x 1 x i32>, <vscale x 1 x i1> } poison, <vscale x 1 x i32> [[MERGE]], 0
+; CHECK: [[RETVAL:%.*]] = insertvalue { <vscale x 1 x i32>, <vscale x 1 x i1> } [[RETTMP]], <vscale x 1 x i1> [[MERGE_SUCCESS]], 1
+; CHECK: ret { <vscale x 1 x i32>, <vscale x 1 x i1> } [[RETVAL]]
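+
+; Both builtins above are defined as a scalarization loop: lanes 0 to %vl-1
+; are visited in turn, the atomic operation runs only where the mask bit is
+; set, and a zero %vl returns poison via the early exit. The cmpxchg variant
+; additionally accumulates the success bits and returns both vectors as a
+; two-element struct.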
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
new file mode 100644
index 0000000000000..0ce65b9f4ca00
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
@@ -0,0 +1,46 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k mask_varying -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; A kernel which should produce a uniform masked vector load where the mask is
+; a single varying splatted bit.
+define spir_kernel void @mask_varying(<4 x i32>* %aptr, <4 x i32>* %zptr) {
+entry:
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %mod_idx = urem i64 %idx, 2
+  %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx
+  %ins = insertelement <4 x i1> poison, i1 true, i32 0
+  %cmp = icmp slt i64 %idx, 64
+  br i1 %cmp, label %if.then, label %if.end
+if.then:
+  %v = load <4 x i32>, <4 x i32>* %aptr
+  %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+  store <4 x i32> %v, <4 x i32>* %arrayidxz, align 16
+  br label %if.end
+if.end:
+  ret void
+; CHECK: define spir_kernel void @__vecz_nxv4_vp_mask_varying
+; CHECK: [[CMP:%.*]] = icmp slt <vscale x 4 x i64> %{{.*}},
+; CHECK: [[RED:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[CMP]], <vscale x 4 x i1> {{.*}}, i32 {{.*}})
+; CHECK: [[VAL:%.*]] = load <4 x i32>, ptr %aptr
+}
+
+declare i64 @__mux_get_global_id(i32)
+declare <4 x i32> @__vecz_b_masked_load4_Dv4_jPDv4_jDv4_b(<4 x i32>*, <4 x i1>)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
new file mode 100644
index 0000000000000..9660f9a601365
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
@@ -0,0 +1,64 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare i64 @__mux_get_global_id(i32)
+
+; With VP all gathers become masked ones.
+define spir_kernel void @unmasked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %rem = urem i64 %call, 3
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %rem
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_vp_unmasked_gather(
+; CHECK: [[v:%.*]] = call <vscale x 4 x i32> @__vecz_b_masked_gather_load4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x ptr addrspace(1)> %{{.*}})
+; CHECK: call void @llvm.vp.store.nxv4i32.p1(<vscale x 4 x i32> [[v]],
+
+
+; With VP all scatters become masked ones.
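+; As with the gather above, the %rem index is not contiguous in the work-item
+; id, so the packetizer falls back to the masked scatter builtin whose
+; definition is checked at the end of this file.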
+define spir_kernel void @unmasked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %rem = urem i64 %call, 3
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %rem
+  store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_vp_unmasked_scatter(
+; CHECK: [[v:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p1(
+; CHECK: call void @__vecz_b_masked_scatter_store4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x i32> [[v]],
+
+; CHECK: define <vscale x 4 x i32> @__vecz_b_masked_gather_load4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x ptr addrspace(1)> %0, <vscale x 4 x i1> %1, i32 %2) [[ATTRS:#[0-9]+]] {
+; CHECK: %3 = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p1(<vscale x 4 x ptr addrspace(1)> %0, <vscale x 4 x i1> %1, i32 %2)
+; CHECK: ret <vscale x 4 x i32> %3
+
+; CHECK: define void @__vecz_b_masked_scatter_store4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x i32> %0, <vscale x 4 x ptr addrspace(1)> %1, <vscale x 4 x i1> %2, i32 %3) [[ATTRS]] {
+; CHECK: entry:
+; CHECK: call void @llvm.vp.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> %0, <vscale x 4 x ptr addrspace(1)> %1, <vscale x 4 x i1> %2, i32 %3)
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
new file mode 100644
index 0000000000000..c5f015913aead
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
@@ -0,0 +1,240 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-choices=VectorPredication -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) +declare spir_func i32 @__mux_get_sub_group_id() + +declare spir_func i1 @__mux_sub_group_all_i1(i1) +declare spir_func i1 @__mux_sub_group_any_i1(i1) + +declare spir_func i32 @__mux_sub_group_reduce_add_i32(i32) +declare spir_func i64 @__mux_sub_group_reduce_add_i64(i64) +declare spir_func float @__mux_sub_group_reduce_fadd_f32(float) +declare spir_func i32 @__mux_sub_group_reduce_smin_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_umin_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_smax_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_umax_i32(i32) +declare spir_func float @__mux_sub_group_reduce_fmin_f32(float) +declare spir_func float @__mux_sub_group_reduce_fmax_f32(float) + +define spir_kernel void @reduce_all_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %1 = icmp ne i32 %0, 0 + %call2 = tail call spir_func i1 @__mux_sub_group_all_i1(i1 %1) + %2 = sext i1 %call2 to i32 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_all_i32( +; CHECK: [[C:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer +; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.and.v4i1(i1 true, <4 x i1> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_all_i1(i1 [[R]]) +; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32 +; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_any_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %1 = icmp ne i32 %0, 0 + %call2 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 %1) + %2 = sext i1 %call2 to i32 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_any_i32( +; CHECK: [[C:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer +; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.or.v4i1(i1 false, <4 x i1> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 [[R]]) +; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32 +; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 %0) + %arrayidx3 
= getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_add_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.add.v4i32(i32 0, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call + %0 = load i64, i64 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i64 @__mux_sub_group_reduce_add_i64(i64 %0) + %arrayidx3 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %conv + store i64 %call2, i64 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_add_i64( +; CHECK: [[C:%.*]] = call <4 x i64> @llvm.vp.load.v4i64.p1( +; CHECK: [[R:%.*]] = call i64 @llvm.vp.reduce.add.v4i64(i64 0, <4 x i64> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_add_i64(i64 [[R]]) +; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fadd_f32(float %0) + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv + store float %call2, float addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_add_f32( +; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1( +; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fadd_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smin_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_smin_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.smin.v4i32(i32 2147483647, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smin_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 
@__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umin_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_umin_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.umin.v4i32(i32 -1, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umin_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smax_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_smax_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.smax.v4i32(i32 -2147483648, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smax_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umax_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_umax_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.umax.v4i32(i32 0, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umax_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float %0) + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv + store float %call2, float addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_fmin_f32( +; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1( +; CHECK: [[R:%.*]] = call float 
@llvm.vp.reduce.fmin.v4f32(float 0x7FF8000000000000, <4 x float> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fmax_f32(float %0) + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv + store float %call2, float addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_fmax_f32( +; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1( +; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fmax.v4f32(float 0xFFF8000000000000, <4 x float> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmax_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll new file mode 100644 index 0000000000000..c632bbefc304d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll @@ -0,0 +1,203 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-choices=VectorPredication -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) +declare spir_func i32 @__mux_get_sub_group_id() + +declare spir_func i32 @__mux_sub_group_reduce_mul_i32(i32) +declare spir_func i64 @__mux_sub_group_reduce_mul_i64(i64) +declare spir_func float @__mux_sub_group_reduce_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_reduce_and_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_or_i32(i32) +declare spir_func i64 @__mux_sub_group_reduce_xor_i64(i64) + +declare spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1) +declare spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1) +declare spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1) + +; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.mul.v4i32(i32 1, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_mul_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_mul_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + store i32 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_i64( +; CHECK: [[C:%.*]] = call <4 x i64> @llvm.vp.load.v4i64.p1( +; CHECK: [[R:%.*]] = call i64 @llvm.vp.reduce.mul.v4i64(i64 1, <4 x i64> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_mul_i64(i64 [[R]]) +; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_mul_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call + %0 = load i64, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i64 @__mux_sub_group_reduce_mul_i64(i64 %0) + %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv + store i64 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_f32( +; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1( +; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmul_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) 
%arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fmul_f32(float %0) + %arrayidx3 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %conv + store float %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_and_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.and.v4i32(i32 -1, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_and_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_and_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + store i32 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_or_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.or.v4i32(i32 0, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_or_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_or_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + store i32 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_xor_i64( +; CHECK: [[C:%.*]] = call <4 x i64> @llvm.vp.load.v4i64.p1( +; CHECK: [[R:%.*]] = call i64 @llvm.vp.reduce.xor.v4i64(i64 0, <4 x i64> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 [[R]]) +; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 8 +define spir_kernel void @reduce_xor_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call + %0 = load i64, ptr addrspace(1) %arrayidx, align 8 + %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 %0) + %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv + store i64 %call2, ptr addrspace(1) %arrayidx3, align 8 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_and( +; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.and.v4i1(i1 true, <4 x i1> [[T:%.*]], {{.*}}) +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 [[R]]) +; CHECK: [[R:%.*]] = zext i1 %call2 to i32 +; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail 
call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 %1) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + %zext = zext i1 %call2 to i32 + store i32 %zext, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_or( +; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.or.v4i1(i1 false, <4 x i1> [[T:%.*]], {{.*}}) +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 [[R]]) +; CHECK: [[R:%.*]] = zext i1 %call2 to i32 +; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 %1) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + %zext = zext i1 %call2 to i32 + store i32 %zext, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_xor( +; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.xor.v4i1(i1 false, <4 x i1> [[T:%.*]], {{.*}}) +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[R]]) +; CHECK: [[R:%.*]] = zext i1 %call2 to i32 +; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 %1) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + %zext = zext i1 %call2 to i32 + store i32 %zext, ptr addrspace(1) %arrayidx3, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll new file mode 100644 index 0000000000000..a2da3addcbccf --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll @@ -0,0 +1,153 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S -vecz-passes=packetizer -vecz-choices=VectorPredication < %s | FileCheck %s + +; Tests the use of the VectorPredication choice. However, note that this option +; currently makes no difference on fixed length vectors. + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32) +declare spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64) +declare spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32) +declare spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float) +declare spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float) + +define spir_kernel void @reduce_scan_incl_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_add_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call + %0 = load i64, i64 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64 %0) + %arrayidx2 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call + store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_add_i64( +; CHECK: call <4 x i64> @__vecz_b_sub_group_scan_inclusive_add_Dv4_m(<4 x i64> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_add_f32( +; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_Dv4_f(<4 x float> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* 
%arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_smin_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_Dv4_i(<4 x i32> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_umin_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_Dv4_j(<4 x i32> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_smax_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_Dv4_i(<4 x i32> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_umax_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_Dv4_j(<4 x i32> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_fmin_f32( +; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_Dv4_f(<4 x float> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float 
%0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_fmax_f32( +; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_Dv4_f(<4 x float> %{{.*}}) +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll new file mode 100644 index 0000000000000..3ec97bda6fb12 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll @@ -0,0 +1,174 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S -vecz-passes=packetizer -vecz-choices=VectorPredication < %s | FileCheck %s + +; Tests the use of the VectorPredication choice. However, note that this option +; currently makes no difference on fixed length vectors. 
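+;
+; Illustrative, hedged sketch (not part of the imported test and not checked
+; by FileCheck): with a scalable factor, where VectorPredication does change
+; the generated code, the packetizer emits VP intrinsics that take a mask and
+; an explicit effective-vector-length operand, as the scalable udiv.ll test
+; later in this patch demonstrates. The operands %a, %b, %mask and %evl below
+; are hypothetical:
+;
+;   %prod = call <vscale x 4 x i32> @llvm.vp.mul.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, i32 %evl)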
+ +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1) + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_mul_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_excl_mul_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_mul_f32( +; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_f(<4 x float> %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float %0) + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_excl_mul_f32( +; CHECK: call <4 x float> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_f(<4 x float> %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float %0) + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) 
%arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_and_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_and_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_or_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_or_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_xor_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_logical_and( +; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_and_Dv4_b(<4 x i1> %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_logical_or( +; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_or_Dv4_b(<4 x i1> %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: 
@__vecz_v4_vp_reduce_scan_incl_logical_xor( +; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_b(<4 x i1> %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll new file mode 100644 index 0000000000000..e28025d5bccfc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll @@ -0,0 +1,48 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k udiv -vecz-scalable -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @udiv(i32* %aptr, i32* %bptr, i32* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load i32, i32* %arrayidxb, align 4 + %sum = udiv i32 %a, %b + store i32 %sum, i32* %arrayidxz, align 4 + ret void +} + +; CHECK: define spir_kernel void @__vecz_nxv2_vp_udiv( +; CHECK: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0) +; CHECK: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0) +; CHECK: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]] +; CHECK: [[T0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK: [[T1:%.*]] = shl {{(nuw )?}}i64 [[T0]], 1 +; CHECK: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]]) +; CHECK: [[VL:%.*]] = trunc i64 [[T2]] to i32 +; CHECK: [[LHS:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr {{%.*}}, <vscale x 2 x i1> [[TRUEMASK: (shufflevector \(<vscale x 2 x i1> insertelement \(<vscale x 2 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[VL]]) +; CHECK: [[RHS:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr {{%.*}}, <vscale x 2 x i1> [[TRUEMASK]], i32 [[VL]]) +; CHECK: [[ADD:%.*]] = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> [[LHS]], <vscale x 2 x i32> [[RHS]], <vscale x 2 x i1> [[TRUEMASK]], i32 [[VL]]) +; CHECK: call void @llvm.vp.store.nxv2i32.p0(<vscale x 2 x i32> [[ADD]], ptr {{%.*}}, <vscale x 2 x i1> [[TRUEMASK]], i32 [[VL]]) diff --git
a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll new file mode 100644 index 0000000000000..7c71b06f530c3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll @@ -0,0 +1,66 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" 
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { nobuiltin nounwind } + +; Test if the interleaved load is NOT defined +; CHECK-NOT: define <4 x double> @__vecz_b_interleaved_load4_Dv4_du3ptrU3AS1(ptr addrspace(1){{( %0)?}}) + +; Wide load instead +; CHECK: load <16 x double> diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll new file mode 100644 index 0000000000000..7c71b06f530c3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll @@ -0,0 +1,66 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { nobuiltin nounwind } + +; Test if the interleaved load is NOT defined +; CHECK-NOT: define <4 x double> @__vecz_b_interleaved_load4_Dv4_du3ptrU3AS1(ptr addrspace(1){{( %0)?}}) + +; Wide load instead +; CHECK: load <16 x double> diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll new file mode 100644 index 0000000000000..c2c7b68912910 --- /dev/null +++ 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll @@ -0,0 +1,70 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k memop_loop_dep -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @memop_loop_dep(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %i, i32 %e) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.addr.0 = phi i32 [ %i, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.addr.0, %e + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %call1 = call spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64 %call, i32 addrspace(1)* %in) + call spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32> %call1, i64 %call, i32 addrspace(1)* %out) + %0 = extractelement <4 x i32> %call1, i64 0 + %tobool = icmp ne i32 %0, 0 + %tobool2 = icmp eq i64 %call, 0 + %or.cond = and i1 %tobool2, %tobool + br i1 %or.cond, label %while.cond, label %for.inc + +while.cond: ; preds = %while.cond, %for.body + %tobool3 = icmp eq i64 %call, 0 + br i1 %tobool3, label %for.inc, label %while.cond + +for.inc: ; preds = %for.body, %while.cond + %inc = add nsw i32 %i.addr.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64, i32 addrspace(1)*) + +declare spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32>, i64, i32 addrspace(1)*) + +; CHECK: define spir_kernel void @__vecz_v4_memop_loop_dep + +; Vector widening results in a single load +; CHECK: load <16 x i32> +; CHECK-NOT: call {{.*}}i32 @__vecz_b_interleaved_load4_ju3ptrU3AS1 + +; CHECK: ret void + +; Check that the declaration is missing as well +; CHECK-NOT: @__vecz_b_interleaved_load4_ju3ptrU3AS1 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll new file mode 100644 index 0000000000000..2d28ba251b055 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll @@ -0,0 +1,40 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k extract_constant_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @extract_constant_index(<4 x i64> addrspace(1)* %in, i32 %x, i64 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %in, i64 %call + %0 = load <4 x i64>, <4 x i64> addrspace(1)* %arrayidx, align 4 + %vecext = extractelement <4 x i64> %0, i32 0 + %arrayidx1 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call + store i64 %vecext, i64 addrspace(1)* %arrayidx1, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +; CHECK: define spir_kernel void @__vecz_v4_extract_constant_index +; CHECK: %[[LD:.+]] = load <16 x i64> +; CHECK: %[[EXT:.+]] = shufflevector <16 x i64> %[[LD]], <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> +; CHECK: store <4 x i64> %[[EXT]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll new file mode 100644 index 0000000000000..7739407d482f8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll @@ -0,0 +1,55 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: nounwind +define spir_kernel void @extract_runtime_index(<4 x float> addrspace(1)* %in, i32 %x, float addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 4 + %vecext = extractelement <4 x float> %0, i32 %x + %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %vecext, float addrspace(1)* %arrayidx1, align 4 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_extract_runtime_index +; CHECK: %[[LD:.+]] = load <16 x float>, ptr addrspace(1) % + +; No splitting of the widened source vector +; CHECK-NOT: shufflevector + +; Extract directly from the widened source and insert directly into result +; CHECK: %[[EXT0:.+]] = extractelement <16 x float> %[[LD]], i32 %x +; CHECK: %[[INS0:.+]] = insertelement <4 x float> poison, float %[[EXT0]], i32 0 +; CHECK: %[[IDX1:.+]] = add i32 %x, 4 +; CHECK: %[[EXT1:.+]] = extractelement <16 x float> %[[LD]], i32 %[[IDX1]] +; CHECK: %[[INS1:.+]] = insertelement <4 x float> %[[INS0]], float %[[EXT1]], i32 1 +; CHECK: %[[IDX2:.+]] = add i32 %x, 8 +; CHECK: %[[EXT2:.+]] = extractelement <16 x float> %[[LD]], i32 %[[IDX2]] +; CHECK: %[[INS2:.+]] = insertelement <4 x float> %[[INS1]], float %[[EXT2]], i32 2 +; CHECK: %[[IDX3:.+]] = add i32 %x, 12 +; CHECK: %[[EXT3:.+]] = extractelement <16 x float> %[[LD]], i32 %[[IDX3]] +; CHECK: %[[INS3:.+]] = insertelement <4 x float> %[[INS2]], float %[[EXT3]], i32 3 +; CHECK: store <4 x float> %[[INS3]], ptr addrspace(1) %{{.+}} +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll new file mode 100644 index 0000000000000..405cff01c2e34 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: nounwind +define spir_kernel void @extract_runtime_index(i32 addrspace(1)* %in, <4 x i8> %x, i8 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %vecext = extractelement <4 x i8> %x, i32 %0 + %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %call + store i8 %vecext, i8 addrspace(1)* %arrayidx1, align 1 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_extract_runtime_index +; CHECK: %[[LD:.+]] = load <4 x i32>, ptr addrspace(1) % + +; No splitting of the widened source vector +; CHECK-NOT: shufflevector + +; Extract directly from the uniform source with vectorized indices and insert directly into result +; CHECK: %[[IND0:.+]] = extractelement <4 x i32> %[[LD]], i32 0 +; CHECK: %[[EXT0:.+]] = extractelement <4 x i8> %x, i32 %[[IND0]] +; CHECK: %[[INS0:.+]] = insertelement <4 x i8> poison, i8 %[[EXT0]], i32 0 +; CHECK: %[[IND1:.+]] = extractelement <4 x i32> %[[LD]], i32 1 +; CHECK: %[[EXT1:.+]] = extractelement <4 x i8> %x, i32 %[[IND1]] +; CHECK: %[[INS1:.+]] = insertelement <4 x i8> %[[INS0]], i8 %[[EXT1]], i32 1 +; CHECK: %[[IND2:.+]] = extractelement <4 x i32> %[[LD]], i32 2 +; CHECK: %[[EXT2:.+]] = extractelement <4 x i8> %x, i32 %[[IND2]] +; CHECK: %[[INS2:.+]] = insertelement <4 x i8> %[[INS1]], i8 %[[EXT2]], i32 2 +; CHECK: %[[IND3:.+]] = extractelement <4 x i32> %[[LD]], i32 3 +; CHECK: %[[EXT3:.+]] = extractelement <4 x i8> %x, i32 %[[IND3]] +; CHECK: %[[INS3:.+]] = insertelement <4 x i8> %[[INS2]], i8 %[[EXT3]], i32 3 +; CHECK: store <4 x i8> %[[INS3]], ptr addrspace(1) %{{.+}}, align 1 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll new file mode 100644 index 0000000000000..70d1908c8a9ab --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll @@ -0,0 +1,62 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: nounwind +define spir_kernel void @extract_runtime_index(<4 x float> addrspace(1)* %in, i32 addrspace(1)* %x, float addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %x, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 4 + %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 + %vecext = extractelement <4 x float> %0, i32 %1 + %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %vecext, float addrspace(1)* %arrayidx1, align 4 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_extract_runtime_index +; CHECK: %[[SRC:.+]] = load <16 x float>, ptr addrspace(1) % +; CHECK: %[[IDX:.+]] = load <4 x i32>, ptr addrspace(1) % + +; No splitting of the widened source vector +; CHECK-NOT: shufflevector + +; Offset the indices +; CHECK: %[[ADD:.+]] = add <4 x i32> %[[IDX]], <i32 0, i32 4, i32 8, i32 12> + +; Extract directly from the widened source with vectorized indices and insert directly into result +; CHECK: %[[IND0:.+]] = extractelement <4 x i32> %[[ADD]], i32 0 +; CHECK: %[[EXT0:.+]] = extractelement <16 x float> %[[SRC]], i32 %[[IND0]] +; CHECK: %[[INS0:.+]] = insertelement <4 x float> poison, float %[[EXT0]], i32 0 +; CHECK: %[[IND1:.+]] = extractelement <4 x i32> %[[ADD]], i32 1 +; CHECK: %[[EXT1:.+]] = extractelement <16 x float> %[[SRC]], i32 %[[IND1]] +; CHECK: %[[INS1:.+]] = insertelement <4 x float> %[[INS0]], float %[[EXT1]], i32 1 +; CHECK: %[[IND2:.+]] = extractelement <4 x i32> %[[ADD]], i32 2 +; CHECK: %[[EXT2:.+]] = extractelement <16 x float> %[[SRC]], i32 %[[IND2]] +; CHECK: %[[INS2:.+]] = insertelement <4 x float> %[[INS1]], float %[[EXT2]], i32 2 +; CHECK: %[[IND3:.+]] = extractelement <4 x i32> %[[ADD]], i32 3 +; CHECK: %[[EXT3:.+]] = extractelement <16 x float> %[[SRC]], i32 %[[IND3]] +; CHECK: %[[INS3:.+]] = insertelement <4 x float> %[[INS2]], float %[[EXT3]], i32 3 +; CHECK: store <4 x float> %[[INS3]], ptr addrspace(1) %{{.+}} +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll new file mode 100644 index 0000000000000..2d767313e0ddc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k constant_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @constant_index(<4 x i32>* %in, i32* %inval, <4 x i32>* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call + %0 = load <4 x i32>, <4 x i32>* %arrayidx + %arrayidx2 = getelementptr inbounds i32, i32* %inval, i64 %call + %ldval = load i32, i32* %arrayidx2 + %arrayidx3 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call + %vecins = insertelement <4 x i32> %0, i32 %ldval, i32 2 + store <4 x i32> %vecins, <4 x i32>* %arrayidx3 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_constant_index + +; A single wide load +; CHECK: %[[INTO:.+]] = load <16 x i32>, ptr % + +; The vectorized element load: +; CHECK: %[[ELTS:.+]] = load <4 x i32>, ptr % + +; No interleaved loads +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load4_Dv4_ju3ptr + +; Insert elements turned into shufflevectors +; CHECK: %[[WIDE:.+]] = shufflevector <4 x i32> %[[ELTS]], <4 x i32> poison, <16 x i32> +; CHECK: %[[INS:.+]] = shufflevector <16 x i32> %[[WIDE]], <16 x i32> %[[INTO]], <16 x i32> + +; No more shuffles.. +; CHECK-NOT: shufflevector + +; We should have one widened store +; CHECK: store <16 x i32> %[[INS]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll new file mode 100644 index 0000000000000..9c023a64e57ed --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll @@ -0,0 +1,54 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k constant_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @constant_index(<4 x i32>* %in, <4 x i32>* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call + %0 = load <4 x i32>, <4 x i32>* %arrayidx + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call + %vecins = insertelement <4 x i32> %0, i32 42, i32 2 + store <4 x i32> %vecins, <4 x i32>* %arrayidx2 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_constant_index + +; A single wide load +; CHECK: %[[INTO:.+]] = load <16 x i32>, ptr % + +; No interleaved loads +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load4_Dv4_ju3ptr + +; Insert constant elements into the widened vector: +; CHECK: %[[INS0:.+]] = insertelement <16 x i32> %[[INTO]], i32 42, i32 2 +; CHECK: %[[INS1:.+]] = insertelement <16 x i32> %[[INS0]], i32 42, i32 6 +; CHECK: %[[INS2:.+]] = insertelement <16 x i32> %[[INS1]], i32 42, i32 10 +; CHECK: %[[INS3:.+]] = insertelement <16 x i32> %[[INS2]], i32 42, i32 14 + +; No shuffles.. +; CHECK-NOT: shufflevector + +; We should have one widened store +; CHECK: store <16 x i32> %[[INS3]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll new file mode 100644 index 0000000000000..05ccf997a7d0a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll @@ -0,0 +1,60 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @runtime_index(<4 x i32>* %in, <4 x i32>* %out, i32* %index) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call + %0 = load <4 x i32>, <4 x i32>* %arrayidx + %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call + store <4 x i32> %0, <4 x i32>* %arrayidx1 + %arrayidx2 = getelementptr inbounds i32, i32* %index, i64 %call + %1 = load i32, i32* %arrayidx2 + %arrayidx3 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call + %vecins = insertelement <4 x i32> %0, i32 42, i32 %1 + store <4 x i32> %vecins, <4 x i32>* %arrayidx3 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_runtime_index + +; CHECK: %[[INTO:.+]] = load <16 x i32>, ptr %arrayidx, align 16 +; CHECK: %[[LD:.+]] = load <4 x i32>, ptr +; CHECK: %[[ADD:.+]] = add <4 x i32> %[[LD]], <i32 0, i32 4, i32 8, i32 12> + +; The inserts got widened +; CHECK: %[[ELT0:.+]] = extractelement <4 x i32> %[[ADD]], i32 0 +; CHECK: %[[INS0:.+1]] = insertelement <16 x i32> %[[INTO]], i32 42, i32 %[[ELT0]] +; CHECK: %[[ELT1:.+]] = extractelement <4 x i32> %[[ADD]], i32 1 +; CHECK: %[[INS1:.+]] = insertelement <16 x i32> %[[INS0]], i32 42, i32 %[[ELT1]] +; CHECK: %[[ELT2:.+]] = extractelement <4 x i32> %[[ADD]], i32 2 +; CHECK: %[[INS2:.+]] = insertelement <16 x i32> %[[INS1]], i32 42, i32 %[[ELT2]] +; CHECK: %[[ELT3:.+]] = extractelement <4 x i32> %[[ADD]], i32 3 +; CHECK: %[[INS3:.+]] = insertelement <16 x i32> %[[INS2]], i32 42, i32 %[[ELT3]] + +; No shuffles.. +; CHECK-NOT: shufflevector + +; One widened store directly storing the result +; CHECK: store <16 x i32> %[[INS3]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll new file mode 100644 index 0000000000000..00853a9b28b94 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll @@ -0,0 +1,98 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-simd-width 4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"} +!4 = 
!{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"} +!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""} +!6 = !{!"clang version 3.8.1 "} + +; Function start +; CHECK: define spir_kernel void @__vecz_v4_f +; CHECK: call i64 @__mux_get_global_id(i32 0) + +; There should be exactly six vector loads and one store in the code +; CHECK: load <16 x double> + +; And in between them there should be a barrier call +; CHECK: call void @__mux_work_group_barrier +; CHECK: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> {{<(double 1.600000e\+01(, )?)+>|splat \(double 1.600000e\+01\)}} +; CHECK: load <16 x double> +; CHECK: load <16 x double> +; CHECK: load <16 x double> +; CHECK: load <16 x double> + +; The fmuladd instrinsic will be widened.. +; CHECK: call <16 x double> @llvm.fmuladd.v16f64 +; CHECK: load <16 x double> +; CHECK: store <16 x double> + +; There shouldn't be any interleaved loads or stores left +; CHECK-NOT: call <4 x double> @__vecz_b_interleaved_load +; CHECK-NOT: call void @__vecz_b_interleaved_store + +; Function end +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll new file mode 100644 index 0000000000000..3aae73704e7c9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll @@ -0,0 +1,41 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isfiniteDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isfiniteDv4_d +; CHECK: call i64 @__mux_get_global_id(i32 0) +; CHECK: and <16 x i64> +; CHECK: icmp slt <16 x i64> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll new file mode 100644 index 0000000000000..08a97d76842e7 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll @@ -0,0 +1,40 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isfiniteDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isfiniteDv4_f +; CHECK: and <16 x i32> +; CHECK: icmp slt <16 x i32> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll new file mode 100644 index 0000000000000..1431fa1c19573 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll @@ -0,0 +1,40 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isinfDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isinfDv4_d +; CHECK: and <16 x i64> +; CHECK: icmp eq <16 x i64> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll new file mode 100644 index 0000000000000..83054e694801a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll @@ -0,0 +1,40 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isinfDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isinfDv4_f +; CHECK: and <16 x i32> +; CHECK: icmp eq <16 x i32> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll new file mode 100644 index 0000000000000..945ac791355c0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll @@ -0,0 +1,43 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnanDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnanDv4_d +; CHECK: and <16 x i64> +; CHECK: icmp eq <16 x i64> +; CHECK: and <16 x i64> +; CHECK: icmp sgt <16 x i64> +; CHECK: and <16 x i1> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll new file mode 100644 index 0000000000000..86139d572338b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll @@ -0,0 +1,43 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnanDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnanDv4_f +; CHECK: and <16 x i32> +; CHECK: icmp eq <16 x i32> +; CHECK: and <16 x i32> +; CHECK: icmp sgt <16 x i32> +; CHECK: and <16 x i1> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll new file mode 100644 index 0000000000000..05117b1b691dc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll @@ -0,0 +1,42 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnormalDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnormalDv4_d +; CHECK: and <16 x i64> +; CHECK: icmp slt <16 x i64> +; CHECK: icmp sgt <16 x i64> +; CHECK: and <16 x i1> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll new file mode 100644 index 0000000000000..d33853b4e8d32 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll @@ -0,0 +1,42 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnormalDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnormalDv4_f +; CHECK: and <16 x i32> +; CHECK: icmp slt <16 x i32> +; CHECK: icmp sgt <16 x i32> +; CHECK: and <16 x i1> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll new file mode 100644 index 0000000000000..4c887d9c66be5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll @@ -0,0 +1,79 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k scalar_vector_user -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind readnone +declare i64 @__mux_get_local_id(i32) #0 + +; Function Attrs: nounwind readnone +declare spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float>, <4 x float>, <4 x float>) #0 + +declare spir_func void @_Z7vstore4Dv4_fmPU3AS1f(<4 x float>, i64, float addrspace(1)*) + +declare spir_func <4 x float> @_Z6vload4mPU3AS3Kf(i64, float addrspace(1)*) +; Function Attrs: inlinehint norecurse nounwind readnone +declare spir_func float @_Z3madfff(float, float, float) local_unnamed_addr #2 + +define spir_kernel void @scalar_vector_user(float addrspace(1)* %inout, i64 %n) { +entry: + %lid = tail call i64 @__mux_get_local_id(i32 0) #0 + %inout.address = getelementptr inbounds float, float addrspace(1)* %inout, i64 %lid + br label %loop + +loop: ; preds = %entry, %loop + %madv4.prev = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4, %loop ] + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ] + %i.inc = add nuw nsw i64 %i, 1 + %cmp = icmp slt i64 %i.inc, %n + %inout.vload = tail call spir_func <4 x float> @_Z6vload4mPU3AS3Kf(i64 0, float addrspace(1)* %inout.address) + %inout.vec0 = shufflevector <4 x float> %inout.vload, <4 x float> poison, <4 x i32> zeroinitializer + %madv4 = tail call spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float> %inout.vload, <4 x float> %inout.vec0, <4 x float> %madv4.prev) #0 + br i1 %cmp, label %loop, label %end + +end: ; preds = %loop + %mad.vec0 = extractelement <4 x float> %madv4, i32 0 + store float %mad.vec0, float addrspace(1)* %inout.address, align 4 + tail call spir_func void @_Z7vstore4Dv4_fmPU3AS1f(<4 x float> %madv4, i64 0, float addrspace(1)* %inout.address) + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { noduplicate } +attributes #2 = { inlinehint norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } + +; The purpose of this test is to make sure we correctly scalarize an instruction +; used by both a scalar and vector instruction. We would previously try to +; scalarize its users twice, resulting in invalid IR.
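+; With -vecz-simd-width=4 the <4 x float> accumulator should therefore become a single <16 x float> phi, built by concatenating the four per-work-item mad results; that shuffle chain is what the checks trace.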
+ +; CHECK: define spir_kernel void @__vecz_v4_scalar_vector_user +; CHECK: loop: +; CHECK: %madv4.prev{{.*}} = phi <16 x float> [ zeroinitializer, %entry ], [ %[[CONCAT:.+]], %loop ]{{$}} + +; make sure the above PHI incomings are unique by looking for their definitions +; one day we might be able to super-vectorize this call, but for now we instantiate and concatenate it +; CHECK: %madv4[[S0:[0-9]+]] = +; CHECK: %madv4[[S1:[0-9]+]] = +; CHECK: %madv4[[S2:[0-9]+]] = +; CHECK: %madv4[[S3:[0-9]+]] = +; CHECK: %[[C0:.+]] = shufflevector <4 x float> %madv4[[S0]], <4 x float> %madv4[[S1]], <8 x i32> +; CHECK: %[[C1:.+]] = shufflevector <4 x float> %madv4[[S2]], <4 x float> %madv4[[S3]], <8 x i32> +; CHECK: %[[CONCAT]] = shufflevector <8 x float> %[[C0]], <8 x float> %[[C1]], <16 x i32> diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll new file mode 100644 index 0000000000000..6fab62e9ca4a8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll @@ -0,0 +1,47 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k vector_copy -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @vector_copy(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %call + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, align 16 + %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %0, <4 x i32> addrspace(1)* %arrayidx1, align 16 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +; It makes sure the vector load and store are preserved right through to packetization +; and then widened, instead of being scalarized across work-items first +; and then getting de-interleaved by the Interleaved Group Combine Pass. +; We expect a single vector load feeding directly into a single vector store.
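+; In other words, after packetization at width 4 the kernel should contain exactly one <16 x i32> load feeding one <16 x i32> store, with no de-interleaving or re-interleaving shuffles in between.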
+ +; CHECK: define spir_kernel void @__vecz_v4_vector_copy +; CHECK: load <16 x i32> +; CHECK-NOT: load +; CHECK-NOT: %deinterleave{{[0-9]*}} = shufflevector +; CHECK-NOT: %interleave{{[0-9]*}} = shufflevector +; CHECK: store <16 x i32> +; CHECK-NOT: store diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll new file mode 100644 index 0000000000000..e5054ae1201e7 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll @@ -0,0 +1,91 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k vector_loop -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %call.trunc = trunc i64 %call to i32 + %call.splatinsert = insertelement <4 x i32> poison, i32 %call.trunc, i32 0 + %call.splat = shufflevector <4 x i32> %call.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + %cmp = icmp eq i64 %call, 0 + br i1 %cmp, label %for.end, label %for.cond + +for.cond: ; preds = %entry, %for.body + %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ] + %call1 = call i64 @__mux_get_global_size(i32 0) + %conv = trunc i64 %call1 to i32 + %splat.splatinsert = insertelement <4 x i32> poison, i32 %conv, i32 0 + %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat + %0 = extractelement <4 x i1> %cmp2, i64 0 + br i1 %0, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %3 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom3 = sext i32 %3 to i64 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom3 + store i32 %2, i32 addrspace(1)* %arrayidx4, align 4 + %4 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom5 = sext i32 %4 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom5 + %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %6 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom7 = sext i32 %6 to i64 + %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom7 + store i32 %5, i32 addrspace(1)* %arrayidx8, align 4 
+ %7 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom9 = sext i32 %7 to i64 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9 + %8 = load i32, i32 addrspace(1)* %arrayidx10, align 4 + %9 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom11 = sext i32 %9 to i64 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 %8, i32 addrspace(1)* %arrayidx12, align 4 + %10 = extractelement <4 x i32> %storemerge, i64 3 + %idxprom13 = sext i32 %10 to i64 + %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom13 + %11 = load i32, i32 addrspace(1)* %arrayidx14, align 4 + %12 = extractelement <4 x i32> %storemerge, i64 3 + %idxprom15 = sext i32 %12 to i64 + %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom15 + store i32 %11, i32 addrspace(1)* %arrayidx16, align 4 + %inc = add <4 x i32> %storemerge, %call.splat + br label %for.cond + +for.end: ; preds = %entry, %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_global_size(i32) + +; This test checks if a varying <4 x i32> phi is scalarized into 4 i32 phis +; and then re-packetized +; CHECK: define spir_kernel void @__vecz_v4_vector_loop +; CHECK: %[[STOREMERGE1:.+]] = phi <16 x i32> [ %[[INC2:.+]], %for.body ], [ zeroinitializer, %entry ] +; CHECK: %[[INC2]] = add <16 x i32> %[[STOREMERGE1]], [[CALL:.+]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll new file mode 100644 index 0000000000000..67965785ba932 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll @@ -0,0 +1,67 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +declare i32 @llvm.abs.i32(i32, i1) +declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1) + +define spir_kernel void @absff(i32* %pa, i32* %pb) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr i32, i32* %pa, i64 %idx + %b = getelementptr i32, i32* %pb, i64 %idx + %la = load i32, i32* %a, align 16 + %res = call spir_func i32 @llvm.abs.i32(i32 %la, i1 true) + store i32 %res, i32* %b, align 16 + ret void +} + +define spir_kernel void @absvf(<2 x i32>* %pa, <2 x i32>* %pb) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <2 x i32>, <2 x i32>* %pa, i64 %idx + %b = getelementptr <2 x i32>, <2 x i32>* %pb, i64 %idx + %la = load <2 x i32>, <2 x i32>* %a, align 16 + %res = call spir_func <2 x i32> @llvm.abs.v2i32(<2 x i32> %la, i1 true) + store <2 x i32> %res, <2 x i32>* %b, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_absff(ptr %pa, ptr %pb) +; CHECK: entry: +; CHECK: %idx = call i64 @__mux_get_global_id(i32 0) +; CHECK: %a = getelementptr i32, ptr %pa, i64 %idx +; CHECK: %b = getelementptr i32, ptr %pb, i64 %idx +; CHECK: %[[T0:.*]] = load <4 x i32>, ptr %a, align 4 +; CHECK: %[[RES1:.+]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %[[T0]], i1 true) +; CHECK: store <4 x i32> %[[RES1]], ptr %b, align 4 +; CHECK: ret void + +; CHECK: define spir_kernel void @__vecz_v4_absvf(ptr %pa, ptr %pb) +; CHECK: entry: +; CHECK: %idx = call i64 @__mux_get_global_id(i32 0) +; CHECK: %a = getelementptr <2 x i32>, ptr %pa, i64 %idx +; CHECK: %b = getelementptr <2 x i32>, ptr %pb, i64 %idx +; CHECK: %[[T0:.*]] = load <8 x i32>, ptr %a, align 8 +; CHECK: %[[RES2:.+]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %[[T0]], i1 true) +; CHECK: store <8 x i32> %[[RES2]], ptr %b, align 8 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll new file mode 100644 index 0000000000000..96fbeb3ad959c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k widen_binops -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @widen_binops(<4 x i32>* %pa, <4 x i32>* %pb, <4 x i64>* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <4 x i32>, <4 x i32>* %pa, i64 %idx + %b = getelementptr <4 x i32>, <4 x i32>* %pb, i64 %idx + %d = getelementptr <4 x i64>, <4 x i64>* %pd, i64 %idx + %la = load <4 x i32>, <4 x i32>* %a, align 16 + %lb = load <4 x i32>, <4 x i32>* %b, align 16 + %xa = zext <4 x i32> %la to <4 x i64> + %xb = zext <4 x i32> %lb to <4 x i64> + %add = add nuw nsw <4 x i64> %xa, %xb + store <4 x i64> %add, <4 x i64>* %d, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v8_widen_binops(ptr %pa, ptr %pb, ptr %pd) +; CHECK: entry: + +; It checks that the zexts and add of <4 x i32> get widened by a factor of 8, +; to produce PAIRs of <16 x i32>s. +; CHECK: %[[LDA0:.+]] = load <16 x i32>, ptr %{{.+}}, align 16 +; CHECK: %[[LDA1:.+]] = load <16 x i32>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB0:.+]] = load <16 x i32>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB1:.+]] = load <16 x i32>, ptr %{{.+}}, align 16 +; CHECK: %[[XA0:.+]] = zext <16 x i32> %[[LDA0]] to <16 x i64> +; CHECK: %[[XA1:.+]] = zext <16 x i32> %[[LDA1]] to <16 x i64> +; CHECK: %[[XB0:.+]] = zext <16 x i32> %[[LDB0]] to <16 x i64> +; CHECK: %[[XB1:.+]] = zext <16 x i32> %[[LDB1]] to <16 x i64> +; CHECK: %[[ADD0:.+]] = add nuw nsw <16 x i64> %[[XA0]], %[[XB0]] +; CHECK: %[[ADD1:.+]] = add nuw nsw <16 x i64> %[[XA1]], %[[XB1]] +; CHECK: store <16 x i64> %[[ADD0]], ptr %{{.+}} +; CHECK: store <16 x i64> %[[ADD1]], ptr %{{.+}} + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll new file mode 100644 index 0000000000000..4d07f4a90a961 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll @@ -0,0 +1,75 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +declare float @llvm.copysign.f32(float, float) +declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>) + +define spir_kernel void @copysignff(float* %pa, float* %pb, float* %pc) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr float, float* %pa, i64 %idx + %b = getelementptr float, float* %pb, i64 %idx + %c = getelementptr float, float* %pc, i64 %idx + %la = load float, float* %a, align 16 + %lb = load float, float* %b, align 16 + %res = call float @llvm.copysign.f32(float %la, float %lb) + store float %res, float* %c, align 16 + ret void +} + +define spir_kernel void @copysignvf(<2 x float>* %pa, <2 x float>* %pb, <2 x float>* %pc) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <2 x float>, <2 x float>* %pa, i64 %idx + %b = getelementptr <2 x float>, <2 x float>* %pb, i64 %idx + %c = getelementptr <2 x float>, <2 x float>* %pc, i64 %idx + %la = load <2 x float>, <2 x float>* %a, align 16 + %lb = load <2 x float>, <2 x float>* %b, align 16 + %res = call <2 x float> @llvm.copysign.v2f32(<2 x float> %la, <2 x float> %lb) + store <2 x float> %res, <2 x float>* %c, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_copysignff(ptr %pa, ptr %pb, ptr %pc) +; CHECK: entry: +; CHECK: %idx = call i64 @__mux_get_global_id(i32 0) +; CHECK: %a = getelementptr float, ptr %pa, i64 %idx +; CHECK: %b = getelementptr float, ptr %pb, i64 %idx +; CHECK: %c = getelementptr float, ptr %pc, i64 %idx +; CHECK: [[T0:%.*]] = load <4 x float>, ptr %a, align 4 +; CHECK: [[T1:%.*]] = load <4 x float>, ptr %b, align 4 +; CHECK: %res1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[T0]], <4 x float> [[T1]]) +; CHECK: store <4 x float> %res1, ptr %c, align 4 +; CHECK: ret void + +; CHECK: define spir_kernel void @__vecz_v4_copysignvf(ptr %pa, ptr %pb, ptr %pc) +; CHECK: entry: +; CHECK: %idx = call i64 @__mux_get_global_id(i32 0) +; CHECK: %a = getelementptr <2 x float>, ptr %pa, i64 %idx +; CHECK: %b = getelementptr <2 x float>, ptr %pb, i64 %idx +; CHECK: %c = getelementptr <2 x float>, ptr %pc, i64 %idx +; CHECK: [[T0:%.*]] = load <8 x float>, ptr %a, align 8 +; CHECK: [[T1:%.*]] = load <8 x float>, ptr %b, align 8 +; CHECK: %res1 = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[T0]], <8 x float> [[T1]]) +; CHECK: store <8 x float> %res1, ptr %c, align 8 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll new file mode 100644 index 0000000000000..7b11bc9e63808 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx + %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx + %c = getelementptr <4 x float>, <4 x float>* %pc, i64 %idx + %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx + %la = load <4 x float>, <4 x float>* %a, align 16 + %lb = load <4 x float>, <4 x float>* %b, align 16 + %lc = load <4 x float>, <4 x float>* %c, align 16 + %fma = call <4 x float> @llvm.fma.v4f32(<4 x float> %la, <4 x float> %lb, <4 x float> %lc) + store <4 x float> %fma, <4 x float>* %d, align 16 + ret void +} + +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; CHECK: define spir_kernel void @__vecz_v8_test_calls(ptr %pa, ptr %pb, ptr %pc, ptr %pd) +; CHECK: entry: + +; It checks that the fma intrinsic of <4 x float> gets widened by a factor of 8, +; to produce a PAIR of <16 x float>s. +; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[FMA0:.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %[[LDA0]], <16 x float> %[[LDB0]], <16 x float> %[[LDC0]]) +; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %[[LDA1]], <16 x float> %[[LDB1]], <16 x float> %[[LDC1]]) +; CHECK: store <16 x float> %[[FMA0]], ptr %{{.+}}, align 16 +; CHECK: store <16 x float> %[[FMA1]], ptr %{{.+}}, align 16 + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll new file mode 100644 index 0000000000000..1251115351205 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll @@ -0,0 +1,64 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k fmin_vector_scalar -vecz-simd-width=4 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +; Function Attrs: nounwind readnone +declare spir_func <4 x float> @_Z4fminDv4_ff(<4 x float>, float) + +; Note that we have to declare the scalar version, because when we vectorize +; an already-vector builtin, we have to scalarize it first. This is the case +; even for Vector Widening, where we don't actually create a call to the +; scalar version, but we retrieve the wide version via the scalar version, +; so the declaration still needs to exist. + +; Function Attrs: inlinehint nounwind readnone +declare spir_func float @_Z4fminff(float, float) + +; Function Attrs: inlinehint nounwind readnone +declare spir_func <16 x float> @_Z4fminDv16_fS_(<16 x float>, <16 x float>) + +define spir_kernel void @fmin_vector_scalar(<4 x float>* %pa, float* %pb, <4 x float>* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx + %b = getelementptr float, float* %pb, i64 %idx + %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx + %la = load <4 x float>, <4 x float>* %a, align 16 + %lb = load float, float* %b, align 4 + %res = tail call spir_func <4 x float> @_Z4fminDv4_ff(<4 x float> %la, float %lb) + store <4 x float> %res, <4 x float>* %d, align 16 + ret void +} + + +; CHECK: define spir_kernel void @__vecz_v4_fmin_vector_scalar(ptr %pa, ptr %pb, ptr %pd) +; CHECK: entry: + +; It checks that the fmin builtin gets widened by a factor of 4, while its +; scalar operand is sub-splatted to the required <16 x float>. +; CHECK: %[[LDA:.+]] = load <16 x float>, ptr %{{.+}} +; CHECK: %[[LDB:.+]] = load <4 x float>, ptr %{{.+}} +; CHECK: %[[SPL:.+]] = shufflevector <4 x float> %[[LDB]], <4 x float> poison, <16 x i32> +; CHECK: %[[RES:.+]] = call <16 x float> @llvm.minnum.v16f32(<16 x float> %[[LDA]], <16 x float> %[[SPL]]) +; CHECK: store <16 x float> %[[RES]], ptr %{{.+}} + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll new file mode 100644 index 0000000000000..2760239937542 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx + %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx + %c = getelementptr <4 x float>, <4 x float>* %pc, i64 %idx + %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx + %la = load <4 x float>, <4 x float>* %a, align 16 + %lb = load <4 x float>, <4 x float>* %b, align 16 + %lc = load <4 x float>, <4 x float>* %c, align 16 + %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %la, <4 x float> %lb, <4 x float> %lc) + store <4 x float> %fma, <4 x float>* %d, align 16 + ret void +} + +declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; CHECK: define spir_kernel void @__vecz_v8_test_calls(ptr %pa, ptr %pb, ptr %pc, ptr %pd) +; CHECK: entry: + +; It checks that the fmuladd intrinsic of <4 x float> gets widened by a factor of 8, +; to produce a PAIR of <16 x float>s. +; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[FMA0:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[LDA0]], <16 x float> %[[LDB0]], <16 x float> %[[LDC0]]) +; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[LDA1]], <16 x float> %[[LDB1]], <16 x float> %[[LDC1]]) +; CHECK: store <16 x float> %[[FMA0]], ptr %{{.+}}, align 16 +; CHECK: store <16 x float> %[[FMA1]], ptr %{{.+}}, align 16 + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll new file mode 100644 index 0000000000000..c092dbd97ca09 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll @@ -0,0 +1,91 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %idx2 = shl i64 %idx, 1 + %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx2 + %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx2 + %c = getelementptr <4 x float>, <4 x float>* %pc, i64 %idx2 + %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx2 + %la = load <4 x float>, <4 x float>* %a, align 16 + %lb = load <4 x float>, <4 x float>* %b, align 16 + %lc = load <4 x float>, <4 x float>* %c, align 16 + %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %la, <4 x float> %lb, <4 x float> %lc) + store <4 x float> %fma, <4 x float>* %d, align 16 + ret void +} + +declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; CHECK: define spir_kernel void @__vecz_v8_test_calls(ptr %pa, ptr %pb, ptr %pc, ptr %pd) +; CHECK: entry: + +; It checks that the fmuladd intrinsic of <4 x float> gets widened by a factor of 8, +; to produce a PAIR of <16 x float>s.
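+; Note that %idx2 doubles the index, so the eight work-items' <4 x float> values are not contiguous in memory and the memory accesses themselves cannot be widened; instead, the per-work-item loads are concatenated into <16 x float>s for the widened fmuladd and the results split back up for the stores, as the checks below spell out.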
+ +; It concatenates the 8 x <4 x float> inputs into 2 x <16 x float> values +; CHECK: %[[CA0:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CA1:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CA2:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CA3:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[SA0:.+]] = shufflevector <8 x float> %[[CA0]], <8 x float> %[[CA1]], <16 x i32> +; CHECK: %[[SA1:.+]] = shufflevector <8 x float> %[[CA2]], <8 x float> %[[CA3]], <16 x i32> + +; CHECK: %[[CB0:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CB1:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CB2:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CB3:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[SB0:.+]] = shufflevector <8 x float> %[[CB0]], <8 x float> %[[CB1]], <16 x i32> +; CHECK: %[[SB1:.+]] = shufflevector <8 x float> %[[CB2]], <8 x float> %[[CB3]], <16 x i32> + +; CHECK: %[[CC0:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CC1:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CC2:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CC3:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[SC0:.+]] = shufflevector <8 x float> %[[CC0]], <8 x float> %[[CC1]], <16 x i32> +; CHECK: %[[SC1:.+]] = shufflevector <8 x float> %[[CC2]], <8 x float> %[[CC3]], <16 x i32> + +; CHECK: %[[FMA0:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[SA0]], <16 x float> %[[SB0]], <16 x float> %[[SC0]]) +; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[SA1]], <16 x float> %[[SB1]], <16 x float> %[[SC1]]) + +; It splits the 2 x <16 x float> results into 8 <4 x float> values +; CHECK: %[[RES0:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> poison, <4 x i32> +; CHECK: %[[RES1:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> poison, <4 x i32> +; CHECK: %[[RES2:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> poison, <4 x i32> +; CHECK: %[[RES3:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> poison, <4 x i32> +; CHECK: %[[RES4:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> poison, <4 x i32> +; CHECK: %[[RES5:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> poison, <4 x i32> +; CHECK: %[[RES6:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> poison, <4 x i32> +; CHECK: %[[RES7:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> poison, <4 x i32> +; CHECK: store <4 x float> %[[RES0]], ptr %{{.+}}, align 16 +; CHECK: store <4 x float> %[[RES1]], ptr %{{.+}}, align 16 +; CHECK: store <4 x float> %[[RES2]], ptr %{{.+}}, align 16 +; CHECK: store <4 x float> %[[RES3]], ptr %{{.+}}, align 16 +; CHECK: store <4 x float> %[[RES4]], ptr %{{.+}}, align 16 +; CHECK: store <4 x float> %[[RES5]], ptr %{{.+}}, align 16 +; CHECK: store <4 x float> %[[RES6]], ptr %{{.+}}, align 16 +; CHECK: store <4 x float> %[[RES7]], ptr %{{.+}}, align 16 + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll new file mode 100644 
index 0000000000000..daf71de9b2446 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll @@ -0,0 +1,74 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx + %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx + %c = getelementptr <4 x float>, <4 x float>* %pc, i64 %idx + %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx + %la = load <4 x float>, <4 x float>* %a, align 16 + %lb = load <4 x float>, <4 x float>* %b, align 16 + %lc = load <4 x float>, <4 x float>* %c, align 16 + br label %loop + +loop: + %n = phi i32 [ %dec, %loop ], [ 10, %entry ] + %acc = phi <4 x float> [ %fma, %loop ], [ %la, %entry ] + %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %acc, <4 x float> %lb, <4 x float> %lc) + %dec = sub i32 %n, 1 + %cmp = icmp ne i32 %dec, 0 + br i1 %cmp, label %loop, label %end + +end: + store <4 x float> %fma, <4 x float>* %d, align 16 + ret void +} + +declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; CHECK: define spir_kernel void @__vecz_v8_test_calls(ptr %pa, ptr %pb, ptr %pc, ptr %pd) +; CHECK: entry: + +; It checks that the fmuladd intrinsic of <4 x float> gets widened by a factor of 8, +; to produce a PAIR of <16 x float>s.
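+; The loop-carried accumulator is widened as well: each of the two <16 x float> phis should take a widened load from the entry block and its own widened fmuladd from the loop back edge, which is what the %[[ACC{{.*}}]] and %[[FMA{{.*}}]] captures below tie together.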
+; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 + +; CHECK: loop: +; CHECK: %[[ACC0:.+]] = phi <16 x float> [ %[[FMA0:.+]], %loop ], [ %[[LDA0]], %entry ] +; CHECK: %[[ACC1:.+]] = phi <16 x float> [ %[[FMA1:.+]], %loop ], [ %[[LDA1]], %entry ] + +; CHECK: %[[FMA0]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[ACC0]], <16 x float> %[[LDB0]], <16 x float> %[[LDC0]]) +; CHECK: %[[FMA1]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[ACC1]], <16 x float> %[[LDB1]], <16 x float> %[[LDC1]]) + +; CHECK: end: +; CHECK: store <16 x float> %[[FMA0]], ptr %{{.+}}, align 16 +; CHECK: store <16 x float> %[[FMA1]], ptr %{{.+}}, align 16 + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll new file mode 100644 index 0000000000000..1974c22c15a81 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll @@ -0,0 +1,48 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=16 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_calls(i8* %pa, i8* %pb, i8* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr i8, i8* %pa, i64 %idx + %b = getelementptr i8, i8* %pb, i64 %idx + %d = getelementptr i8, i8* %pd, i64 %idx + %la = load i8, i8* %a, align 16 + %lb = load i8, i8* %b, align 16 + %res = tail call i8 @llvm.fshl.i8(i8 %la, i8 %lb, i8 4) + store i8 %res, i8* %d, align 16 + ret void +} + +declare i8 @llvm.fshl.i8(i8, i8, i8) + +; CHECK: define spir_kernel void @__vecz_v16_test_calls(ptr %pa, ptr %pb, ptr %pd) +; CHECK: entry: + +; It checks that the fshl intrinsic of i8 gets widened by a factor of 16 +; CHECK: %[[LDA:.+]] = load <16 x i8>, ptr %{{.+}} +; CHECK: %[[LDB:.+]] = load <16 x i8>, ptr %{{.+}} +; CHECK: %[[RES:.+]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[LDA]], <16 x i8> %[[LDB]], <16 x i8> {{<(i8 4(, )?)+>|splat \(i8 4\)}}) +; CHECK: store <16 x i8> %[[RES]], ptr %{{.+}} + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll new file mode 100644 index 0000000000000..6b6f41e066ae1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll @@ -0,0 +1,48 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=16 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_calls(i8* %pa, i8* %pb, i8* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr i8, i8* %pa, i64 %idx + %b = getelementptr i8, i8* %pb, i64 %idx + %d = getelementptr i8, i8* %pd, i64 %idx + %la = load i8, i8* %a, align 16 + %lb = load i8, i8* %b, align 16 + %res = tail call i8 @llvm.fshr.i8(i8 %la, i8 %lb, i8 2) + store i8 %res, i8* %d, align 16 + ret void +} + +declare i8 @llvm.fshr.i8(i8, i8, i8) + +; CHECK: define spir_kernel void @__vecz_v16_test_calls(ptr %pa, ptr %pb, ptr %pd) +; CHECK: entry: + +; It checks that the fshr intrinsic of i8 gets widened by a factor of 16 +; CHECK: %[[LDA:.+]] = load <16 x i8>, ptr %{{.+}} +; CHECK: %[[LDB:.+]] = load <16 x i8>, ptr %{{.+}} +; CHECK: %[[RES:.+]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %[[LDA]], <16 x i8> %[[LDB]], <16 x i8> {{<(i8 2(, )?)+>|splat \(i8 2\)}}) +; CHECK: store <16 x i8> %[[RES]], ptr %{{.+}} + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll new file mode 100644 index 0000000000000..38ea8eb57c60e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll @@ -0,0 +1,43 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k widen_shufflevector -vecz-simd-width=2 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare i64 @__mux_get_global_id(i32) #1
+
+; Function Attrs: nounwind
+define spir_kernel void @widen_shufflevector(<2 x float> addrspace(1)* %a, <2 x float> addrspace(1)* %b, <4 x float> addrspace(1)* %out) #0 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0) #2
+  %arrayidxa = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i64 %call
+  %arrayidxb = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %b, i64 %call
+  %la = load <2 x float>, <2 x float> addrspace(1)* %arrayidxa, align 4
+  %lb = load <2 x float>, <2 x float> addrspace(1)* %arrayidxb, align 4
+  ; NOTE: the mask constant below is an assumption (a plain concatenation);
+  ; any 4-element mask exercises the widening that the CHECKs verify.
+  %shuffle = shufflevector <2 x float> %la, <2 x float> %lb, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %arrayidx1 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %shuffle, <4 x float> addrspace(1)* %arrayidx1, align 1
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v2_widen_shufflevector
+; CHECK: %[[LDA:.+]] = load <4 x float>, ptr addrspace(1) %
+; CHECK: %[[LDB:.+]] = load <4 x float>, ptr addrspace(1) %
+; CHECK: %[[SHF:.+]] = shufflevector <4 x float> %[[LDA]], <4 x float> %[[LDB]], <8 x i32>
+; CHECK: store <8 x float> %[[SHF]], ptr addrspace(1) %
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
new file mode 100644
index 0000000000000..15ce1517417b2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
@@ -0,0 +1,53 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_sqrt -vecz-simd-width=4 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func float @_Z4sqrtf(float) +declare spir_func <2 x float> @_Z4sqrtDv2_f(<2 x float>) +declare spir_func <4 x float> @_Z4sqrtDv4_f(<4 x float>) +declare spir_func <8 x float> @_Z4sqrtDv8_f(<8 x float>) +declare spir_func <16 x float> @_Z4sqrtDv16_f(<16 x float>) + +define spir_kernel void @test_sqrt(<2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %out2, + <4 x float> addrspace(1)* %in4, <4 x float> addrspace(1)* %out4) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %arrayin2 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in2, i64 %gid + %arrayin4 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in4, i64 %gid + %arrayout2 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out2, i64 %gid + %arrayout4 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out4, i64 %gid + %ld2 = load <2 x float>, <2 x float> addrspace(1)* %arrayin2, align 16 + %ld4 = load <4 x float>, <4 x float> addrspace(1)* %arrayin4, align 16 + %sqrt2 = call spir_func <2 x float> @_Z4sqrtDv2_f(<2 x float> %ld2) + %sqrt4 = call spir_func <4 x float> @_Z4sqrtDv4_f(<4 x float> %ld4) + store <2 x float> %sqrt2, <2 x float> addrspace(1)* %arrayout2, align 16 + store <4 x float> %sqrt4, <4 x float> addrspace(1)* %arrayout4, align 16 + ret void +} + +; The purpose of this test is to check that the vector context is able to +; supply the packetizer with two versions of the builtin vectorized to two +; different widths. +; +; CHECK: define spir_kernel void @__vecz_v4_test_sqrt +; CHECK: call spir_func <8 x float> @_Z4sqrtDv8_f(<8 x float> %{{.*}}) +; CHECK: call spir_func <16 x float> @_Z4sqrtDv16_f(<16 x float> %{{.*}}) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll new file mode 100644 index 0000000000000..7a7d4428bdd4c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll @@ -0,0 +1,69 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%struct.testStruct = type { <3 x i32> } + +define spir_kernel void @alloca_alias(i32 addrspace(1)* %out, i32 %index) { +entry: + %myStructs = alloca [2 x %struct.testStruct], align 16 + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = bitcast [2 x %struct.testStruct]* %myStructs to i8* + call void @llvm.lifetime.start.p0i8(i64 32, i8* nonnull %0) + %1 = trunc i64 %call to i32 + %conv = add nuw nsw i32 %1, 2 + %2 = insertelement <4 x i32> poison, i32 %conv, i64 0 + %conv2 = add nuw nsw i32 %1, 3 + %3 = insertelement <4 x i32> %2, i32 %conv2, i64 1 + %4 = insertelement <4 x i32> %3, i32 %1, i64 2 + %i = getelementptr inbounds [2 x %struct.testStruct], [2 x %struct.testStruct]* %myStructs, i64 0, i64 1, i32 0 + %storetmp8 = bitcast <3 x i32>* %i to <4 x i32>* + store <4 x i32> %4, <4 x i32>* %storetmp8, align 16 + %idxprom = sext i32 %index to i64 + %i9 = getelementptr inbounds [2 x %struct.testStruct], [2 x %struct.testStruct]* %myStructs, i64 0, i64 %idxprom, i32 0 + %castToVec410 = bitcast <3 x i32>* %i9 to <4 x i32>* + %loadVec411 = load <4 x i32>, <4 x i32>* %castToVec410, align 16 + %extractVec12 = shufflevector <4 x i32> %loadVec411, <4 x i32> poison, <3 x i32> + %5 = mul i64 %call, 3 + %vstore_base = getelementptr i32, i32 addrspace(1)* %out, i64 %5 + %vstore_extract = extractelement <3 x i32> %extractVec12, i32 0 + %6 = getelementptr i32, i32 addrspace(1)* %vstore_base, i32 0 + store i32 %vstore_extract, i32 addrspace(1)* %6, align 4 + %vstore_extract1 = extractelement <3 x i32> %extractVec12, i32 1 + %7 = getelementptr i32, i32 addrspace(1)* %vstore_base, i32 1 + store i32 %vstore_extract1, i32 addrspace(1)* %7, align 4 + %vstore_extract2 = extractelement <3 x i32> %extractVec12, i32 2 + %8 = getelementptr i32, i32 addrspace(1)* %vstore_base, i32 2 + store i32 %vstore_extract2, i32 addrspace(1)* %8, align 4 + call void @llvm.lifetime.end.p0i8(i64 32, i8* nonnull %0) + ret void +} + +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8*) + +declare i64 @__mux_get_global_id(i32) + +declare spir_func void @_Z7vstore3Dv3_imPU3AS1i(<3 x i32>, i64, i32 addrspace(1)*) + +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8*) + +; CHECK: spir_kernel void @__vecz_v4_alloca_alias +; CHECK: alloca [4 x [2 x %struct.testStruct{{.*}}]] +; CHECK-NOT: = alloca .* diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll new file mode 100644 index 0000000000000..3d39ca518818f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll @@ -0,0 +1,65 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; REQUIRES: arm + +; RUN: veczc -k short3_char3_codegen -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv7-unknown-linux-gnueabihf" + +; Function Attrs: nounwind +define spir_kernel void @short3_char3_codegen(i8 addrspace(1)* %src, i16 addrspace(1)* %dest) #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 { +entry: + %call = call i32 @__mux_get_global_id(i32 0) #3 + %call1 = call spir_func <3 x i8> @_Z6vload3jPU3AS1Kc(i32 %call, i8 addrspace(1)* %src) #3 + %call3 = call spir_func <3 x i16> @_Z14convert_short3Dv3_c(<3 x i8> %call1) #3 + call spir_func void @_Z7vstore3Dv3_sjPU3AS1s(<3 x i16> %call3, i32 %call, i16 addrspace(1)* %dest) #3 + ret void +} + +declare i32 @__mux_get_global_id(i32) #1 + +declare spir_func <3 x i8> @_Z6vload3jPU3AS1Kc(i32, i8 addrspace(1)*) #1 + +declare spir_func <3 x i16> @_Z14convert_short3Dv3_c(<3 x i8>) #1 + +declare spir_func void @_Z7vstore3Dv3_sjPU3AS1s(<3 x i16>, i32, i16 addrspace(1)*) #1 + +; Function Attrs: inlinehint nounwind +declare spir_func signext i16 @_Z13convert_shortc(i8 signext) #2 + +; Function Attrs: inlinehint nounwind +declare spir_func <16 x i16> @_Z15convert_short16Dv16_c(<16 x i8>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { inlinehint nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nobuiltin nounwind } + +!opencl.spir.version = !{!0, !0, !0, !0, !0} +!opencl.ocl.version = !{!1, !1, !1, !1, !1} + +!0 = !{i32 2, i32 0} +!1 = !{i32 1, i32 2} +!2 = !{i32 1, i32 1} +!3 = !{!"none", !"none"} +!4 = !{!"char*", !"short*"} +!5 = !{!"", !""} + +; Assert call to neon intrinsic exists +; CHECK: call void @llvm.arm.neon.vst3.p1.v4i16 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll new file mode 100644 index 0000000000000..4a3f38ba7ad0c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll @@ -0,0 +1,60 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+%opencl.event_t = type opaque
+
+; Function Attrs: nounwind
+define spir_kernel void @test(i32 addrspace(1)* %input, i32 addrspace(3)* %output, i32 addrspace(1)* %elements) {
+  %ev = alloca %opencl.event_t*, align 8
+  %1 = call i64 @__mux_get_global_id(i32 0)
+  %2 = call i64 @__mux_get_group_id(i32 0)
+  %3 = call i64 @__mux_get_local_size(i32 0)
+  %4 = mul i64 %3, %2
+  %5 = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %4
+  %6 = mul i64 %3, %2
+  %7 = getelementptr inbounds i32, i32 addrspace(3)* %output, i64 %6
+  %8 = getelementptr inbounds i32, i32 addrspace(1)* %elements, i64 %2
+  %9 = load i32, i32 addrspace(1)* %8, align 4
+  %10 = sext i32 %9 to i64
+  %11 = load %opencl.event_t*, %opencl.event_t** %ev, align 8
+  %12 = call spir_func %opencl.event_t* @_Z21async_work_group_copyPU3AS1iPKU3AS3im9ocl_event(i32 addrspace(1)* %5, i32 addrspace(3)* %7, i64 %10, %opencl.event_t* %11)
+  %13 = trunc i64 %3 to i32
+  call spir_func void @_Z17wait_group_eventsiP9ocl_event(i32 %13, %opencl.event_t** nonnull %ev)
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_group_id(i32)
+declare i64 @__mux_get_local_size(i32)
+declare spir_func %opencl.event_t* @_Z21async_work_group_copyPU3AS1iPKU3AS3im9ocl_event(i32 addrspace(1)*, i32 addrspace(3)*, i64, %opencl.event_t*)
+declare spir_func void @_Z17wait_group_eventsiP9ocl_event(i32, %opencl.event_t**)
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+
+; Check that we have one and only one call to async_work_group_copy
+; CHECK: call spir_func ptr @_Z21async_work_group_copyPU3AS1iPKU3AS3im9ocl_event
+; CHECK-NOT: async_workgroup_copy
+
+; Check that we have one and only one call to wait_group_events
+; CHECK: call spir_func void @_Z17wait_group_eventsiP9ocl_event
+; CHECK-NOT: wait_group_events
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
new file mode 100644
index 0000000000000..786c7236e1585
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
@@ -0,0 +1,83 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k atomic_rmw -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @atomic_cmpxchg_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + br label %do.body + +do.body: ; preds = %do.body, %entry + %sub = add nsw i32 %conv, -1 + %0 = cmpxchg i32 addrspace(1)* %counter, i32 %sub, i32 %conv seq_cst acquire + %1 = extractvalue { i32, i1 } %0, 0 + %sub2 = add nsw i32 %conv, -1 + %cmp = icmp eq i32 %1, %sub2 + br i1 %cmp, label %do.end, label %do.body + +do.end: ; preds = %do.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store volatile i32 %1, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +define spir_kernel void @atomic_atomicrmw_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + br label %do.body + +do.body: ; preds = %do.body, %entry + %sub = add nsw i32 %conv, -1 + %0 = atomicrmw nand i32 addrspace(1)* %counter, i32 %sub acq_rel + %sub2 = add nsw i32 %conv, -1 + %cmp = icmp eq i32 %0, %sub2 + br i1 %cmp, label %do.end, label %do.body + +do.end: ; preds = %do.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store volatile i32 %0, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +define spir_kernel void @atomic_rmw(i32 addrspace(1)* %counter2, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = atomicrmw add i32 addrspace(1)* %counter2, i32 1 seq_cst + %idxprom = sext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %conv, i32 addrspace(1)* %arrayidx + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; We no longer support instantiating atomic instructions in diverged blocks, +; since they require masking. FileCheck does not support comments, so the CHECKs +; have been removed or reversed in the following lines +; CHECK-NOT: define spir_kernel void @__vecz_v4_atomic_cmpxchg_builtin +; cmpxchg i32 addrspace(1)* %counter, i32 %{{.+}}, i32 %{{.+}} seq_cst acquire +; cmpxchg i32 addrspace(1)* %counter, i32 %{{.+}}, i32 %{{.+}} seq_cst acquire +; cmpxchg i32 addrspace(1)* %counter, i32 %{{.+}}, i32 %{{.+}} seq_cst acquire +; cmpxchg i32 addrspace(1)* %counter, i32 %{{.+}}, i32 %{{.+}} seq_cst acquire diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll new file mode 100644 index 0000000000000..c403cf419d301 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll @@ -0,0 +1,83 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k atomic_rmw -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @atomic_cmpxchg_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + br label %do.body + +do.body: ; preds = %do.body, %entry + %sub = add nsw i32 %conv, -1 + %0 = cmpxchg i32 addrspace(1)* %counter, i32 %sub, i32 %conv seq_cst acquire + %1 = extractvalue { i32, i1 } %0, 0 + %sub2 = add nsw i32 %conv, -1 + %cmp = icmp eq i32 %1, %sub2 + br i1 %cmp, label %do.end, label %do.body + +do.end: ; preds = %do.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store volatile i32 %1, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +define spir_kernel void @atomic_atomicrmw_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + br label %do.body + +do.body: ; preds = %do.body, %entry + %sub = add nsw i32 %conv, -1 + %0 = atomicrmw nand i32 addrspace(1)* %counter, i32 %sub acq_rel + %sub2 = add nsw i32 %conv, -1 + %cmp = icmp eq i32 %0, %sub2 + br i1 %cmp, label %do.end, label %do.body + +do.end: ; preds = %do.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store volatile i32 %0, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +define spir_kernel void @atomic_rmw(i32 addrspace(1)* %counter2, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = atomicrmw add i32 addrspace(1)* %counter2, i32 1 seq_cst + %idxprom = sext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %conv, i32 addrspace(1)* %arrayidx + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; We no longer support instantiating atomic instructions in diverged blocks, +; since they require masking. 
FileCheck does not support comments, so the CHECKs +; have been removed or reversed in the following lines +; CHECK-NOT: define spir_kernel void @__vecz_v4_atomic_atomicrmw_builtin +; atomicrmw nand i32 addrspace(1)* %counter, i32 %{{.+}} acq_rel +; atomicrmw nand i32 addrspace(1)* %counter, i32 %{{.+}} acq_rel +; atomicrmw nand i32 addrspace(1)* %counter, i32 %{{.+}} acq_rel +; atomicrmw nand i32 addrspace(1)* %counter, i32 %{{.+}} acq_rel diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll new file mode 100644 index 0000000000000..e87ff74f7a6e3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll @@ -0,0 +1,81 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k atomic_rmw -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @atomic_cmpxchg_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + br label %do.body + +do.body: ; preds = %do.body, %entry + %sub = add nsw i32 %conv, -1 + %0 = cmpxchg i32 addrspace(1)* %counter, i32 %sub, i32 %conv seq_cst acquire + %1 = extractvalue { i32, i1 } %0, 0 + %sub2 = add nsw i32 %conv, -1 + %cmp = icmp eq i32 %1, %sub2 + br i1 %cmp, label %do.end, label %do.body + +do.end: ; preds = %do.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store volatile i32 %1, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +define spir_kernel void @atomic_atomicrmw_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + br label %do.body + +do.body: ; preds = %do.body, %entry + %sub = add nsw i32 %conv, -1 + %0 = atomicrmw nand i32 addrspace(1)* %counter, i32 %sub acq_rel + %sub2 = add nsw i32 %conv, -1 + %cmp = icmp eq i32 %0, %sub2 + br i1 %cmp, label %do.end, label %do.body + +do.end: ; preds = %do.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store volatile i32 %0, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +define spir_kernel void @atomic_rmw(i32 addrspace(1)* %counter2, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = atomicrmw add i32 addrspace(1)* %counter2, i32 1 seq_cst + %idxprom = sext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %conv, i32 addrspace(1)* %arrayidx + ret void +} + +declare i64 
@__mux_get_global_id(i32) + +; CHECK: define spir_kernel void @__vecz_v4_atomic_rmw +; CHECK: atomicrmw add ptr addrspace(1) %counter2, i32 1 seq_cst +; CHECK: atomicrmw add ptr addrspace(1) %counter2, i32 1 seq_cst +; CHECK: atomicrmw add ptr addrspace(1) %counter2, i32 1 seq_cst +; CHECK: atomicrmw add ptr addrspace(1) %counter2, i32 1 seq_cst +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll new file mode 100644 index 0000000000000..08fc176beee7f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll @@ -0,0 +1,64 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-passes="function(mem2reg),vecz-mem2reg" -vecz-simd-width=4 -vecz-handle-declaration-only-calls -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(i32 %a, i32 %b, i32* %c, float %rf) { +entry: + %d = alloca i32 + %e = alloca i32 + %f = alloca float + %gid = call i64 @__mux_get_global_id(i32 0) + %sum = add i32 %a, %b + store i32 %sum, i32* %d, align 4 + store i32 %sum, i32* %e, align 4 + %call = call spir_func i32 @foo(i32* %e) + %d.load = load i32, i32* %d, align 4 + %e.load = load i32, i32* %e, align 4 + %c0 = getelementptr i32, i32* %c, i64 %gid + store i32 %d.load, i32* %c0, align 4 + %c1 = getelementptr i32, i32* %c0, i64 1 + store i32 %e.load, i32* %c1, align 4 + store float %rf, float* %f + %ri = bitcast float* %f to i32* + %ri.load = load i32, i32* %ri, align 4 + %c2 = getelementptr i32, i32* %c1, i64 2 + store i32 %ri.load, i32* %c2, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @foo(i32*) + +; CHECK: define spir_kernel void @__vecz_v4_test(i32 %a, i32 %b, ptr %c, float %rf) +; CHECK: entry: +; CHECK: %e = alloca i32 +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %sum = add i32 %a, %b +; CHECK: store i32 %sum, ptr %e +; CHECK: %call = call spir_func i32 @foo(ptr{{.*}} %e) +; CHECK: %e.load = load i32, ptr %e +; CHECK: %c0 = getelementptr i32, ptr %c, i64 %gid +; CHECK: store i32 %sum, ptr %c0 +; CHECK: %c1 = getelementptr i32, ptr %c0, i64 1 +; CHECK: store i32 %e.load, ptr %c1 +; CHECK: %0 = bitcast float %rf to i32 +; CHECK: %c2 = getelementptr i32, ptr %c1, i64 2 +; CHECK: store i32 %0, ptr %c2, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll new file mode 100644 index 0000000000000..71035cb07e9e8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll @@ -0,0 +1,73 @@ +; 
Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Note: *not* running LLVM's mem2reg pass as before LLVM 15 it crashes for the
+; same reason we used to!
+; RUN: veczc -vecz-passes=vecz-mem2reg -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @load_store_type_mismatch_no_bitcast(ptr addrspace(1) %p) {
+  %data = alloca i32, align 4
+  %1 = tail call i64 @__mux_get_global_id(i32 0) #4
+  %2 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %1
+  %3 = load i32, ptr addrspace(1) %2, align 4
+  store i32 %3, ptr %data, align 4
+  %4 = load <2 x i16>, ptr %data, align 2
+  ret void
+}
+
+define spir_kernel void @load_type_size_mismatch_no_bitcast(ptr addrspace(1) %p) {
+  %data = alloca i32, align 4
+  %1 = tail call i64 @__mux_get_global_id(i32 0) #4
+  %2 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %1
+  %3 = load i32, ptr addrspace(1) %2, align 4
+  store i32 %3, ptr %data, align 4
+  %4 = load i16, ptr %data, align 2
+  ret void
+}
+
+define spir_kernel void @store_type_size_mismatch_no_bitcast(ptr addrspace(1) %p) {
+  %data = alloca i32, align 4
+  %1 = tail call i64 @__mux_get_global_id(i32 0) #4
+  %2 = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %1
+  %3 = load i16, ptr addrspace(1) %2, align 4
+  store i16 %3, ptr %data, align 2
+  %4 = load i32, ptr %data, align 4
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK: define spir_kernel void @__vecz_v4_load_store_type_mismatch_no_bitcast(ptr addrspace(1) %p)
+; CHECK-NOT: alloca i32
+; CHECK: %3 = load i32, ptr addrspace(1) %2, align 4
+; CHECK: %4 = bitcast i32 %3 to <2 x i16>

+; Note: we can't optimize this as the allocated type size and loaded type sizes
+; don't match. Maybe we could trunc %3 from i32 to i16?

+; CHECK: define spir_kernel void @__vecz_v4_load_type_size_mismatch_no_bitcast(ptr addrspace(1) %p)
+; CHECK: %data = alloca i32, align 4
+; CHECK: %4 = load i16, ptr %data, align 2

+; Note: we can't optimize this either, as the allocated and loaded type sizes
+; don't match. Here %3 is an i16, so an extend rather than a trunc would be
+; needed; see the sketch below.
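+;
+; Rough sketch of the size reasoning (illustrative only, not matched by
+; FileCheck): the pass can only fold a store/load pair through the alloca
+; when the value sizes agree, as in the first kernel:
+;
+;   store i32 %3, ptr %data          ; 32 bits in
+;   %4 = load <2 x i16>, ptr %data   ; 32 bits out
+;     ==> %4 = bitcast i32 %3 to <2 x i16>
+;
+; When the sizes differ, it would have to synthesize a resize such as
+; "%4 = trunc i32 %3 to i16" (load case) or an extend (store case), which
+; vecz-mem2reg does not currently do, so those allocas are left in place.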
+ +; CHECK: define spir_kernel void @__vecz_v4_store_type_size_mismatch_no_bitcast(ptr addrspace(1) %p) +; CHECK: %data = alloca i32, align 4 +; CHECK: %4 = load i32, ptr %data, align 4 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll new file mode 100644 index 0000000000000..4a2c09ca9ff69 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll @@ -0,0 +1,79 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -vecz-passes=cfg-convert,packetizer -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @test(i32* %in, i32* %out) { +entry: + %in.addr = alloca i32*, align 8 + %out.addr = alloca i32*, align 8 + %gid = alloca i64, align 8 + store i32* %in, i32** %in.addr, align 8 + store i32* %out, i32** %out.addr, align 8 + %call = call i64 @__mux_get_global_id(i32 0) + store i64 %call, i64* %gid, align 8 + %0 = load i64, i64* %gid, align 8 + %rem = urem i64 %0, 16 + %cmp = icmp eq i64 %rem, 1 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %1 = load i64, i64* %gid, align 8 + %2 = load i32*, i32** %in.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %1 + %3 = load i32, i32* %arrayidx, align 4 + %4 = load i64, i64* %gid, align 8 + %5 = load i32*, i32** %in.addr, align 8 + %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 %4 + %call2 = call spir_func i32 bitcast (i32 (i32, i32 addrspace(1)*)* @foo to i32 (i32, i32*)*)(i32 %3, i32* %arrayidx1) + %6 = load i64, i64* %gid, align 8 + %7 = load i32*, i32** %out.addr, align 8 + %arrayidx3 = getelementptr inbounds i32, i32* %7, i64 %6 + store i32 %call2, i32* %arrayidx3, align 4 + br label %if.end + +if.else: ; preds = %entry + %8 = load i64, i64* %gid, align 8 + %9 = load i32*, i32** %in.addr, align 8 + %arrayidx4 = getelementptr inbounds i32, i32* %9, i64 %8 + %10 = load i32, i32* %arrayidx4, align 4 + %11 = load i64, i64* %gid, align 8 + %12 = load i32*, i32** %out.addr, align 8 + %arrayidx5 = getelementptr inbounds i32, i32* %12, i64 %11 + store i32 %10, i32* %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @foo(i32, i32 addrspace(1)*) + +; CHECK: define spir_kernel void @__vecz_v4_test( +; CHECK: call spir_func i32 @__vecz_b_masked_foo( +; CHECK: call spir_func i32 @__vecz_b_masked_foo( +; CHECK: call spir_func i32 @__vecz_b_masked_foo( +; CHECK: call spir_func i32 @__vecz_b_masked_foo( +; CHECK: ret void + +; CHECK: define private spir_func i32 
@__vecz_b_masked_foo(i32{{( %0)?}}, ptr{{( %1)?}}, i1{{( %2)?}} +; CHECK: call spir_func i32 @foo(i32 %0, ptr %1) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll new file mode 100644 index 0000000000000..890f63e748592 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll @@ -0,0 +1,70 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k split_branch -vecz-simd-width=4 -vecz-passes=uniform-reassoc -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @split_branch(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 { +entry: + %x = call i64 @__mux_get_global_id(i32 0) #2 + %y = call i64 @__mux_get_global_id(i32 1) #2 + %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x + %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %y + %varying = load i32, i32 addrspace(1)* %a_gep + %uniform = load i32, i32 addrspace(1)* %b_gep + %cmp_v = icmp sgt i32 %varying, 0 + %cmp_u = icmp sgt i32 %uniform, 0 + %and_vu = and i1 %cmp_v, %cmp_u + br i1 %and_vu, label %if.then, label %if.end + +if.then: ; preds = %entry + %inc = add i32 %uniform, 1 + br label %if.end + +if.end: ; preds = %if.then, %entry + %result = phi i32 [ %inc, %if.then ], [ %varying, %entry ] + %d_gep = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %x + store i32 %result, i32 addrspace(1)* %d_gep + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This test checks that a conditional branch based on an AND of both +; a uniform and a varying value gets split into two separate branches +; CHECK: define spir_kernel void @__vecz_v4_split_branch + +; CHECK: %cmp_v = icmp sgt i32 %varying, 0 +; CHECK: %cmp_u = icmp sgt i32 %uniform, 0 + +; ensure the original binary operator got deleted +; CHECK-NOT: and i1 +; CHECK: br i1 %cmp_u, label %entry.cond_split, label %if.end + +; CHECK: entry.cond_split: +; CHECK: br i1 %cmp_v, label %if.then, label %if.end + +; CHECK: if.then: +; CHECK: %inc = add i32 %uniform, 1 +; CHECK: br label %if.end + +; CHECK: if.end: +; CHECK: %[[RESULT:.+]] = phi i32 [ %inc, %if.then ], [ %varying, %entry.cond_split ], [ %varying, %entry ] +; CHECK: store i32 %[[RESULT]], ptr addrspace(1) %{{.+}} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll new file mode 100644 index 0000000000000..37d1ff7cebffa --- /dev/null +++ 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll @@ -0,0 +1,70 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k split_branch -vecz-simd-width=4 -vecz-passes=uniform-reassoc -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @split_branch(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 { +entry: + %x = call i64 @__mux_get_global_id(i32 0) #2 + %y = call i64 @__mux_get_global_id(i32 1) #2 + %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x + %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %y + %varying = load i32, i32 addrspace(1)* %a_gep + %uniform = load i32, i32 addrspace(1)* %b_gep + %cmp_v = icmp sgt i32 %varying, 0 + %cmp_u = icmp sgt i32 %uniform, 0 + %or_vu = or i1 %cmp_v, %cmp_u + br i1 %or_vu, label %if.then, label %if.end + +if.then: ; preds = %entry + %inc = add i32 %uniform, 1 + br label %if.end + +if.end: ; preds = %if.then, %entry + %result = phi i32 [ %inc, %if.then ], [ %varying, %entry ] + %d_gep = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %x + store i32 %result, i32 addrspace(1)* %d_gep + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This test checks that a conditional branch based on an OR of both +; a uniform and a varying value gets split into two separate branches +; CHECK: define spir_kernel void @__vecz_v4_split_branch + +; CHECK: %cmp_v = icmp sgt i32 %varying, 0 +; CHECK: %cmp_u = icmp sgt i32 %uniform, 0 + +; ensure the original binary operator got deleted +; CHECK-NOT: or i1 +; CHECK: br i1 %cmp_u, label %if.then, label %entry.cond_split + +; CHECK: entry.cond_split: +; CHECK: br i1 %cmp_v, label %if.then, label %if.end + +; CHECK: if.then: +; CHECK: %inc = add i32 %uniform, 1 +; CHECK: br label %if.end + +; CHECK: if.end: +; CHECK: %[[RESULT:.+]] = phi i32 [ %inc, %if.then ], [ %varying, %entry.cond_split ] +; CHECK: store i32 %[[RESULT]], ptr addrspace(1) %{{.+}} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll new file mode 100644 index 0000000000000..141543d69b0fd --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll @@ -0,0 +1,118 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @saddsatc(i8 addrspace(1)* %lhs, i8 addrspace(1)* %rhs) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %lhs, i64 %call + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %rhs, i64 %call + %1 = load i8, i8 addrspace(1)* %arrayidx1, align 1 + %call2 = tail call spir_func i8 @_Z7add_satcc(i8 %0, i8 %1) + store i8 %call2, i8 addrspace(1)* %arrayidx1, align 1 + ret void +} + +define spir_kernel void @uaddsatc(i8 addrspace(1)* %lhs, i8 addrspace(1)* %rhs) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %lhs, i64 %call + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %rhs, i64 %call + %1 = load i8, i8 addrspace(1)* %arrayidx1, align 1 + %call2 = tail call spir_func i8 @_Z7add_sathh(i8 %0, i8 %1) + store i8 %call2, i8 addrspace(1)* %arrayidx1, align 1 + ret void +} + +define spir_kernel void @saddsati(i32 addrspace(1)* %lhs, i32 addrspace(1)* %rhs) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %lhs, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %rhs, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 1 + %call2 = tail call spir_func i32 @_Z7add_satii(i32 %0, i32 %1) + store i32 %call2, i32 addrspace(1)* %arrayidx1, align 1 + ret void +} + +define spir_kernel void @uaddsati(i32 addrspace(1)* %lhs, i32 addrspace(1)* %rhs) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %lhs, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %rhs, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 1 + %call2 = tail call spir_func i32 @_Z7add_satjj(i32 %0, i32 %1) + store i32 %call2, i32 addrspace(1)* %arrayidx1, align 1 + ret void +} + +define spir_kernel void @saddsati4(<4 x i32> addrspace(1)* %lhs, <4 x i32> addrspace(1)* %rhs) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %lhs, i64 %call + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %rhs, i64 %call + %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx1, align 1 + %call2 = tail call spir_func <4 x i32> @_Z7add_satDv2_iS_(<4 x i32> %0, <4 x i32> %1) + store <4 x i32> %call2, <4 x i32> addrspace(1)* %arrayidx1, align 1 + ret void +} + +define spir_kernel void @uaddsati4(<4 x i32> 
addrspace(1)* %lhs, <4 x i32> addrspace(1)* %rhs) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %lhs, i64 %call + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %rhs, i64 %call + %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx1, align 1 + %call2 = tail call spir_func <4 x i32> @_Z7add_satDv2_jS_(<4 x i32> %0, <4 x i32> %1) + store <4 x i32> %call2, <4 x i32> addrspace(1)* %arrayidx1, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare spir_func i8 @_Z7add_satcc(i8, i8) +declare spir_func i8 @_Z7add_sathh(i8, i8) +declare spir_func i32 @_Z7add_satii(i32, i32) +declare spir_func i32 @_Z7add_satjj(i32, i32) +declare spir_func <4 x i32> @_Z7add_satDv2_iS_(<4 x i32>, <4 x i32>) +declare spir_func <4 x i32> @_Z7add_satDv2_jS_(<4 x i32>, <4 x i32>) + +; CHECK: define spir_kernel void @__vecz_v4_saddsatc( +; CHECK: = call i8 @llvm.sadd.sat.i8(i8 %{{.*}}, i8 %{{.*}}) + +; CHECK: define spir_kernel void @__vecz_v4_uaddsatc( +; CHECK: = call i8 @llvm.uadd.sat.i8(i8 %{{.*}}, i8 %{{.*}}) + +; CHECK: define spir_kernel void @__vecz_v4_saddsati( +; CHECK: = call i32 @llvm.sadd.sat.i32(i32 %{{.*}}, i32 %{{.*}}) + +; CHECK: define spir_kernel void @__vecz_v4_uaddsati( +; CHECK: = call i32 @llvm.uadd.sat.i32(i32 %{{.*}}, i32 %{{.*}}) + +; CHECK: define spir_kernel void @__vecz_v4_saddsati4( +; CHECK: = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + +; CHECK: define spir_kernel void @__vecz_v4_uaddsati4( +; CHECK: = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll new file mode 100644 index 0000000000000..1bcc968885303 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll @@ -0,0 +1,41 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k clampkernel -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @clampkernel(float %a, float* %c) { +entry: + %clmp = call spir_func float @_Z5clampfff(float %a, float 0.0, float 1.0) + store float %clmp, float* %c, align 4 + ret void +} + +define spir_func float @_Z5clampfff(float %x, float %y, float %z) { +entry: + %call.i.i = tail call spir_func float @_Z13__abacus_fmaxff(float %x, float %y) + %call1.i.i = tail call spir_func float @_Z13__abacus_fminff(float %call.i.i, float %z) + ret float %call1.i.i +; CHECK-LABEL: float @_Z5clampfff( +; CHECK: [[TMP:%.*]] = call float @llvm.maxnum.f32(float %x, float %y) +; CHECK: = call float @llvm.minnum.f32(float [[TMP]], float %z) +} + +declare spir_func float @_Z13__abacus_fminff(float, float) +declare spir_func float @_Z13__abacus_fmaxff(float, float) + diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll new file mode 100644 index 0000000000000..e99d01d477e1f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll @@ -0,0 +1,53 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @fmaxff(float %a, float %b, float* %c) { +entry: + %max = call spir_func float @_Z4fmaxff(float %a, float %b) + store float %max, float* %c, align 4 + ret void +} + +define spir_kernel void @fmaxvf(<2 x float> %a, float %b, <2 x float>* %c) { +entry: + %max = call spir_func <2 x float> @_Z4fmaxDv2_ff(<2 x float> %a, float %b) + store <2 x float> %max, <2 x float>* %c, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare spir_func float @_Z4fmaxff(float, float) +declare spir_func <2 x float> @_Z4fmaxDv2_ff(<2 x float>, float) + +; CHECK: define spir_kernel void @__vecz_v4_fmaxff(float %a, float %b, ptr %c) +; CHECK: entry: +; CHECK: %0 = call float @llvm.maxnum.f32(float %a, float %b) +; CHECK: store float %0, ptr %c, align 4 +; CHECK: ret void + +; CHECK: define spir_kernel void @__vecz_v4_fmaxvf(<2 x float> %a, float %b, ptr %c) +; CHECK: entry: +; CHECK: %.splatinsert = insertelement <2 x float> {{.*}}, float %b, {{(i32|i64)}} 0 +; CHECK: %.splat = shufflevector <2 x float> %.splatinsert, <2 x float> {{.*}}, <2 x i32> zeroinitializer +; CHECK: %0 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %.splat) +; CHECK: store <2 x float> %0, ptr %c, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll new file mode 100644 index 0000000000000..65b7e5697a68b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll @@ -0,0 +1,53 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @fminff(float %a, float %b, float* %c) { +entry: + %min = call spir_func float @_Z4fminff(float %a, float %b) + store float %min, float* %c, align 4 + ret void +} + +define spir_kernel void @fminvf(<2 x float> %a, float %b, <2 x float>* %c) { +entry: + %min = call spir_func <2 x float> @_Z4fminDv2_ff(<2 x float> %a, float %b) + store <2 x float> %min, <2 x float>* %c, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare spir_func float @_Z4fminff(float, float) +declare spir_func <2 x float> @_Z4fminDv2_ff(<2 x float>, float) + +; CHECK: define spir_kernel void @__vecz_v4_fminff(float %a, float %b, ptr %c) +; CHECK: entry: +; CHECK: %0 = call float @llvm.minnum.f32(float %a, float %b) +; CHECK: store float %0, ptr %c, align 4 +; CHECK: ret void + +; CHECK: define spir_kernel void @__vecz_v4_fminvf(<2 x float> %a, float %b, ptr %c) +; CHECK: entry: +; CHECK: %.splatinsert = insertelement <2 x float> {{.*}}, float %b, {{(i32|i64)}} 0 +; CHECK: %.splat = shufflevector <2 x float> %.splatinsert, <2 x float> {{.*}}, <2 x i32> zeroinitializer +; CHECK: %0 = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %.splat) +; CHECK: store <2 x float> %0, ptr %c, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll new file mode 100644 index 0000000000000..86591570fbcab --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll @@ -0,0 +1,122 @@ + +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
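+; NOTE: In the memset tests below, an 18-byte llvm.memset of the byte 42
+; (0x2A) is expanded as 18 = 2*8 + 2*1: two i64 stores followed by two i8
+; stores. The i64 constant is eight copies of 0x2A, i.e.
+; 0x2A2A2A2A2A2A2A2A = 3038287259199220266, matching the CHECK lines. The
+; memcpy tests are expanded with the same 8-byte/1-byte split.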
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=builtin-inlining,verify -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_memset_i8(i64* %z) { + %dst = bitcast i64* %z to i8* + call void @llvm.memset.p0i8.i64(i8* %dst, i8 42, i64 18, i32 8, i1 false) + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memset_i8(ptr %z) +; CHECK: %dst = bitcast ptr %z to ptr +; CHECK: %1 = getelementptr inbounds i8, ptr %dst, i64 0 +; CHECK: store i64 3038287259199220266, ptr %1, align 8 +; CHECK: %2 = getelementptr inbounds i8, ptr %dst, i64 8 +; CHECK: store i64 3038287259199220266, ptr %2, align 8 +; CHECK: %dst1 = getelementptr inbounds i8, ptr %dst, i64 16 +; CHECK: store i8 42, ptr %dst1, align 1 +; CHECK: %dst2 = getelementptr inbounds i8, ptr %dst, i64 17 +; CHECK: store i8 42, ptr %dst2, align 1 +; CHECK: ret void +; CHECK: } + +define spir_kernel void @test_memset_i16(i64* %z) { + %dst = bitcast i64* %z to i16* + call void @llvm.memset.p0i16.i64(i16* %dst, i8 42, i64 18, i32 8, i1 false) + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memset_i16(ptr %z) +; CHECK: %dst = bitcast ptr %z to ptr +; CHECK: %1 = getelementptr inbounds i8, ptr %dst, i64 0 +; CHECK: store i64 3038287259199220266, ptr %1, align 8 +; CHECK: %2 = getelementptr inbounds i8, ptr %dst, i64 8 +; CHECK: store i64 3038287259199220266, ptr %2, align 8 +; CHECK: %dst1 = getelementptr inbounds i8, ptr %dst, i64 16 +; CHECK: store i8 42, ptr %dst1, align 1 +; CHECK: %dst2 = getelementptr inbounds i8, ptr %dst, i64 17 +; CHECK: store i8 42, ptr %dst2, align 1 +; CHECK: ret void +; CHECK: } + +define spir_kernel void @test_memcpy_i8(i64* %a, i64* %z) { + %src = bitcast i64* %a to i8* + %dst = bitcast i64* %z to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 18, i32 8, i1 false) + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memcpy_i8(ptr %a, ptr %z) +; CHECK: %src = bitcast ptr %a to ptr +; CHECK: %dst = bitcast ptr %z to ptr +; CHECK: %1 = getelementptr inbounds i8, ptr %src, i64 0 +; CHECK: %2 = getelementptr inbounds i8, ptr %dst, i64 0 +; CHECK: %src1 = load i64, ptr %1, align 8 +; CHECK: store i64 %src1, ptr %2, align 8 +; CHECK: %3 = getelementptr inbounds i8, ptr %src, i64 8 +; CHECK: %4 = getelementptr inbounds i8, ptr %dst, i64 8 +; CHECK: %src2 = load i64, ptr %3, align 8 +; CHECK: store i64 %src2, ptr %4, align 8 +; CHECK: %5 = getelementptr inbounds i8, ptr %src, i64 16 +; CHECK: %dst3 = getelementptr inbounds i8, ptr %dst, i64 16 +; CHECK: %src4 = load i8, ptr %5, align 1 +; CHECK: store i8 %src4, ptr %dst3, align 1 +; CHECK: %6 = getelementptr inbounds i8, ptr %src, i64 17 +; CHECK: %dst5 = getelementptr inbounds i8, ptr %dst, i64 17 +; CHECK: %src6 = load i8, ptr %6, align 1 +; CHECK: store i8 %src6, ptr %dst5, align 1 +; CHECK: ret void +; CHECK: } + +define spir_kernel void @test_memcpy_i16(i64* %a, i64* %z) { + %src = bitcast i64* %a to i16* + %dst = bitcast i64* %z to i16* + call void @llvm.memcpy.p0i16.p0i16.i64(i16* %dst, i16* %src, i64 18, i32 8, i1 false) + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memcpy_i16(ptr %a, ptr %z) +; CHECK: %src = bitcast ptr %a to ptr +; CHECK: %dst = bitcast ptr %z to ptr +; CHECK: %1 = getelementptr inbounds i8, ptr %src, i64 0 +; CHECK: %2 = getelementptr inbounds i8, ptr %dst, i64 0 +; CHECK: 
%src1 = load i64, ptr %1, align 8 +; CHECK: store i64 %src1, ptr %2, align 8 +; CHECK: %3 = getelementptr inbounds i8, ptr %src, i64 8 +; CHECK: %4 = getelementptr inbounds i8, ptr %dst, i64 8 +; CHECK: %src2 = load i64, ptr %3, align 8 +; CHECK: store i64 %src2, ptr %4, align 8 +; CHECK: %5 = getelementptr inbounds i8, ptr %src, i64 16 +; CHECK: %dst3 = getelementptr inbounds i8, ptr %dst, i64 16 +; CHECK: %src4 = load i8, ptr %5, align 1 +; CHECK: store i8 %src4, ptr %dst3, align 1 +; CHECK: %6 = getelementptr inbounds i8, ptr %src, i64 17 +; CHECK: %dst5 = getelementptr inbounds i8, ptr %dst, i64 17 +; CHECK: %src6 = load i8, ptr %6, align 1 +; CHECK: store i8 %src6, ptr %dst5, align 1 +; CHECK: ret void +; CHECK: } + +declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1) +declare void @llvm.memset.p0i16.i64(i16*, i8, i64, i32, i1) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1) +declare void @llvm.memcpy.p0i16.p0i16.i64(i16*, i16*, i64, i32, i1) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_memcpy.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_memcpy.ll new file mode 100644 index 0000000000000..7ad572a7cebed --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_memcpy.ll @@ -0,0 +1,37 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
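+; NOTE: The memcpy_align test below copies 16 bytes from an align(8) source
+; to an align(16) destination as two i64 load/store pairs. The first store
+; can use the full destination alignment (16), but the second is at byte
+; offset 8 from an align-16 base, so only align 8 is provable; the CHECK
+; lines assert exactly that split.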
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k memcpy_align -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @memcpy_align(ptr align(16) %out, ptr align(8) %in) { +entry: +; CHECK: %[[A:.*]] = getelementptr inbounds i8, ptr %in, i64 0 +; CHECK: %[[B:.*]] = getelementptr inbounds i8, ptr %out, i64 0 +; CHECK: %[[C:.*]] = load i64, ptr %[[A]], align 8 +; CHECK: store i64 %[[C]], ptr %[[B]], align 16 + +; CHECK: %[[D:.*]] = getelementptr inbounds i8, ptr %in, i64 8 +; CHECK: %[[E:.*]] = getelementptr inbounds i8, ptr %out, i64 8 +; CHECK: %[[F:.*]] = load i64, ptr %[[D]], align 8 +; CHECK: store i64 %[[F]], ptr %[[E]], align 8 + call void @llvm.memcpy.p0.p0.i32(ptr noundef align(16) %out, ptr noundef align(8) %in, i32 16, i1 false) + ret void +} + +declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll new file mode 100644 index 0000000000000..0a1c85af00cda --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll @@ -0,0 +1,53 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
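+; NOTE: The "negative" test below checks a builtin that should not be
+; inlined: rhadd has no generic LLVM intrinsic counterpart, so the call to
+; _Z5rhaddjj is expected to survive builtin-inlining unchanged. Since the
+; RUN line passes -k test_rhadd, only that kernel is vectorized, hence the
+; CHECK-NOT on __vecz_v4_test_normalize.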
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_rhadd -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_normalize(float %a, float %b, i32* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %norm = call spir_func float @_Z9normalizef(float %a) + %normi = fptosi float %norm to i32 + %c0 = getelementptr i32, i32* %c, i64 %gid + store i32 %normi, i32* %c0, align 4 + ret void +} + +define spir_kernel void @test_rhadd(i32 %a, i32 %b, i32* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %add = call spir_func i32 @_Z5rhaddjj(i32 %a, i32 %b) + %c0 = getelementptr i32, i32* %c, i64 %gid + store i32 %add, i32* %c0, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare spir_func float @_Z9normalizef(float) +declare spir_func i32 @_Z5rhaddjj(i32, i32) + +; CHECK-NOT: define spir_kernel void @__vecz_v4_test_normalize(float %a, float %b, ptr %c) + +; CHECK: define spir_kernel void @__vecz_v4_test_rhadd(i32 %a, i32 %b, ptr %c) +; CHECK: entry: +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %add = call spir_func i32 @_Z5rhaddjj(i32 %a, i32 %b) +; CHECK: %c0 = getelementptr i32, ptr %c, i64 %gid +; CHECK: store i32 %add, ptr %c0, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll new file mode 100644 index 0000000000000..379428725bb39 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll @@ -0,0 +1,67 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
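+; NOTE: The "positive" test below checks relational builtins that inline to
+; a bare compare plus zext: isgreater -> fcmp ogt, isless -> fcmp olt,
+; isequal -> fcmp oeq; the non-builtin opt_Z7isequalff is inlined outright.
+; An illustrative OpenCL C shape (an assumption, not the original source):
+;
+;   kernel void test(float a, float b, global int *c) {
+;     size_t gid = get_global_id(0);
+;     c[gid]     = isgreater(a, b); // fcmp ogt + zext
+;     c[gid + 1] = isless(a, b);    // fcmp olt + zext
+;     c[gid + 2] = isequal(a, b);   // fcmp oeq + zext
+;   }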
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(float %a, float %b, i32* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %cmp = call spir_func i32 @_Z9isgreaterff(float %a, float %b) + %c0 = getelementptr i32, i32* %c, i64 %gid + store i32 %cmp, i32* %c0, align 4 + %cmp1 = call spir_func i32 @_Z6islessff(float %a, float %b) + %c1 = getelementptr i32, i32* %c0, i32 1 + store i32 %cmp1, i32* %c1, align 4 + %cmp2 = call spir_func i32 @_Z7isequalff(float %a, float %b) + %c2 = getelementptr i32, i32* %c0, i32 2 + store i32 %cmp2, i32* %c2, align 4 + %cmp3 = call spir_func i32 @opt_Z7isequalff(float %a, float %b) + %c3 = getelementptr i32, i32* %c0, i32 3 + store i32 %cmp3, i32* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z9isgreaterff(float, float) +declare spir_func i32 @_Z6islessff(float, float) +declare spir_func i32 @_Z7isequalff(float, float) + +; Test that a non-builtin function is inlined. +define spir_func i32 @opt_Z7isequalff(float, float) { + ret i32 zeroinitializer +} + +; CHECK: define spir_kernel void @__vecz_v4_test(float %a, float %b, ptr %c) +; CHECK: entry: +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %relational = fcmp ogt float %a, %b +; CHECK: %relational[[R1:[0-9]+]] = zext i1 %relational to i32 +; CHECK: %c0 = getelementptr i32, ptr %c, i64 %gid +; CHECK: store i32 %relational[[R1]], ptr %c0, align 4 +; CHECK: %relational[[R2:[0-9]+]] = fcmp olt float %a, %b +; CHECK: %relational[[R3:[0-9]+]] = zext i1 %relational[[R2:[0-9]+]] to i32 +; CHECK: %c1 = getelementptr i32, ptr %c0, {{(i32|i64)}} 1 +; CHECK: store i32 %relational[[R3:[0-9]+]], ptr %c1, align 4 +; CHECK: %relational[[R4:[0-9]+]] = fcmp oeq float %a, %b +; CHECK: %relational[[R5:[0-9]+]] = zext i1 %relational[[R4:[0-9]+]] to i32 +; CHECK: %c2 = getelementptr i32, ptr %c0, {{(i32|i64)}} 2 +; CHECK: store i32 %relational[[R5:[0-9]+]], ptr %c2, align 4 +; CHECK: %c3 = getelementptr i32, ptr %c0, {{(i32|i64)}} 3 +; CHECK: store i32 0, ptr %c3, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll new file mode 100644 index 0000000000000..d6bc1e0d2c71d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll @@ -0,0 +1,65 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
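+; NOTE: The fract tests below exercise a builtin with a pointer result
+; parameter (fract writes the integral part through its second argument).
+; As the FIXME in the file records, the packetizer currently instantiates
+; one scalar (or <2 x float>) call per lane rather than using the wider
+; vector declarations that are present in the module.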
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +declare i64 @__mux_get_global_id(i32) + +declare spir_func float @_Z5fractfPf(float, float*) +declare spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float>, <2 x float>*) +declare spir_func <4 x float> @_Z5fractDv4_fPS_(<4 x float>, <4 x float>*) +declare spir_func <8 x float> @_Z5fractDv8_fPS_(<8 x float>, <8 x float>*) + +; FIXME: Both of these are instantiating when we have vector equivalents. + +define spir_kernel void @fract_v1(float* %xptr, float* %outptr, float* %ioutptr) { + %iouta = alloca float + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidx.x = getelementptr inbounds float, float* %xptr, i64 %idx + %x = load float, float* %arrayidx.x, align 4 + %out = call spir_func float @_Z5fractfPf(float %x, float* %iouta) + %arrayidx.out = getelementptr inbounds float, float* %outptr, i64 %idx + %arrayidx.iout = getelementptr inbounds float, float* %ioutptr, i64 %idx + store float %out, float* %arrayidx.out, align 4 + %iout = load float, float* %iouta, align 4 + store float %iout, float* %arrayidx.iout, align 4 + ret void +; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}}) +; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}}) +; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}}) +; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}}) +} + +define spir_kernel void @fract_v2(<2 x float>* %xptr, <2 x float>* %outptr, <2 x float>* %ioutptr) { + %iouta = alloca <2 x float> + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidx.x = getelementptr inbounds <2 x float>, <2 x float>* %xptr, i64 %idx + %x = load <2 x float>, <2 x float>* %arrayidx.x, align 8 + %out = call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> %x, <2 x float>* %iouta) + %arrayidx.out = getelementptr inbounds <2 x float>, <2 x float>* %outptr, i64 %idx + %arrayidx.iout = getelementptr inbounds <2 x float>, <2 x float>* %ioutptr, i64 %idx + store <2 x float> %out, <2 x float>* %arrayidx.out, align 8 + %iout = load <2 x float>, <2 x float>* %iouta, align 8 + store <2 x float> %iout, <2 x float>* %arrayidx.iout, align 8 + ret void +; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}}) +; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}}) +; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}}) +; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}}) +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation.ll new file mode 100644 index 0000000000000..6ee06a5479108 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation.ll @@ -0,0 +1,160 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
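+; NOTE: The call_instantiation tests below cover the packetizer's main call
+; handling strategies: known intrinsics (llvm.fmuladd) are widened to their
+; vector forms, builtins with a vector declaration in the module (_Z3absi)
+; are redirected to it, and opaque or noinline user functions are
+; instantiated once per lane (four scalar calls at width 4). A noduplicate
+; callee blocks packetization of its kernel entirely, while optnone is
+; deliberately ignored at this pass level, as the in-file comments explain.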
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Kernels + +; We should be able to handle intrinsics +; CHECK-LABEL: define spir_kernel void @__vecz_v4_instrinsic(ptr %in1, ptr %in2, ptr %in3, ptr %out) +; CHECK: call <4 x float> @llvm.fmuladd.v4f32(<4 x float> {{%.*}}, <4 x float> {{%.*}}, <4 x float> {{%.*}}) +define spir_kernel void @instrinsic(ptr %in1, ptr %in2, ptr %in3, ptr %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr %in1, i64 %call + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %in2, i64 %call + %1 = load float, ptr %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds float, ptr %in3, i64 %call + %2 = load float, ptr %arrayidx2, align 4 + %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2) + %arrayidx3 = getelementptr inbounds float, ptr %out, i64 %call + store float %3, ptr %arrayidx3, align 4 + ret void +} + +; We should be able to handle builtins for which we have a vector declaration +; in the module. +; CHECK-LABEL: define spir_kernel void @__vecz_v4_builtin(ptr %in, ptr %out) +; CHECK: = call spir_func <4 x i32> @_Z3absDv4_i(<4 x i32> {{%.*}}) +define spir_kernel void @builtin(ptr %in, ptr %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr %in, i64 %call + %0 = load i32, ptr %arrayidx, align 4 + %call1 = tail call spir_func i32 @_Z3absi(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr %out, i64 %call + store i32 %call1, ptr %arrayidx2, align 4 + ret void +} + +; We should be able to handle user functions for which we have a definition +; CHECK-LABEL: define spir_kernel void @__vecz_v4_user_defined(ptr %in, ptr %out) +; CHECK: call spir_func void @defined(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @defined(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @defined(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @defined(ptr {{%.*}}, ptr {{%.*}}) +define spir_kernel void @user_defined(ptr %in, ptr %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %add.ptr = getelementptr inbounds i32, ptr %in, i64 %call + %add.ptr1 = getelementptr inbounds i32, ptr %out, i64 %call + call spir_func void @defined(ptr %add.ptr, ptr %add.ptr1) + ret void +} + +; We should be able to handle user functions (or builtins) for which we have no +; definition +; CHECK-LABEL: define spir_kernel void @__vecz_v4_user_undefined(ptr %in, ptr %out) +; CHECK: call spir_func void @undefined(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @undefined(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @undefined(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @undefined(ptr {{%.*}}, ptr {{%.*}}) +define spir_kernel void @user_undefined(ptr %in, ptr %out) { +entry: + %call = tail call i64 
@__mux_get_global_id(i32 0) + %add.ptr = getelementptr inbounds i32, ptr %in, i64 %call + %add.ptr1 = getelementptr inbounds i32, ptr %out, i64 %call + call spir_func void @undefined(ptr %add.ptr, ptr %add.ptr1) + ret void +} + +; We should be able to handle user functions (or builtins) which we can't +; inline +; CHECK-LABEL: define spir_kernel void @__vecz_v4_cantinline(ptr %in, ptr %out) +; CHECK: call spir_func void @dontinline(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @dontinline(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @dontinline(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @dontinline(ptr {{%.*}}, ptr {{%.*}}) +define spir_kernel void @cantinline(ptr %in, ptr %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %add.ptr = getelementptr inbounds i32, ptr %in, i64 %call + %add.ptr1 = getelementptr inbounds i32, ptr %out, i64 %call + call spir_func void @dontinline(ptr %add.ptr, ptr %add.ptr1) + ret void +} + +; If we can't duplicate a function, we can't packetize it. +; CHECK-NOT: @__vecz_v4_cantduplicate +define spir_kernel void @cantduplicate(ptr %in, ptr %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr %in, i64 %call + %0 = load i32, ptr %arrayidx, align 4 + %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1 + %arrayidx2 = getelementptr inbounds i32, ptr %out, i64 %call + store i32 %call1, ptr %arrayidx2, align 4 + ret void +} + +; The optnone attribute has no impact when directly running the packetizer +; pass. The higher-level vectorization factor decisions must take this into +; account instead. +; CHECK-LABEL: define spir_kernel void @__vecz_v4_optnone(ptr %in, ptr %out) +define spir_kernel void @optnone(ptr %in, ptr %out) #2 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr %in, i64 %call + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, ptr %out, i64 %call + store i32 %0, ptr %arrayidx1, align 4 + ret void +} + +; Declaration only functions + +declare float @llvm.fmuladd.f32(float, float, float) +declare spir_func i32 @_Z3absi(i32) +declare spir_func <4 x i32> @_Z3absDv4_i(<4 x i32>) +declare spir_func i32 @_Z3clzi(i32) #1 +declare i64 @__mux_get_global_id(i32) +declare spir_func void @undefined(ptr, ptr) + +; Functions with definitions + +define spir_func void @defined(ptr %in, ptr %out) { +entry: + %0 = load i32, ptr %in, align 4 + store i32 %0, ptr %out, align 4 + ret void +} + +define spir_func void @dontinline(ptr %in, ptr %out) #0 { +entry: + %0 = load i32, ptr %in, align 4 + store i32 %0, ptr %out, align 4 + ret void +} + +; Attributes + +attributes #0 = { noinline } +attributes #1 = { noduplicate } +attributes #2 = { optnone noinline } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll new file mode 100644 index 0000000000000..2df00a15e33cf --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll @@ -0,0 +1,92 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
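+; NOTE: In the cmpxchg test below, a scalar cmpxchg returning { i32, i1 } is
+; packetized into a call to a masked builtin returning { <4 x i32>, <4 x i1> },
+; with an all-true <4 x i1> mask because the original operation is
+; unconditional. The uniform pointer %p is first splatted to <4 x ptr>, and
+; the insertvalue/extractvalue patterns check that literal structs mixing
+; uniform and varying members are rebuilt correctly.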
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p, ptr %q, ptr %r) +define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) { +entry: +; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0 +; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer + %call = call i64 @__mux_get_global_id(i32 0) + +; Test that this cmpxchg is packetized by generating a call to an all-true masked version. +; CHECK: [[A0:%.*]] = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> {{<(i32 1(, )?)+>|splat \(i32 1\)}}, +; CHECK-SAME: <4 x i32> {{<(i32 2(, )?)+>|splat \(i32 2\)}}, +; CHECK-SAME: <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}} + %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic +; CHECK: [[EXT0:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[A0]], 0 + %val0 = extractvalue { i32, i1 } %old0, 0 +; CHECK: [[EXT1:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[A0]], 1 + %success0 = extractvalue { i32, i1 } %old0, 1 + + %out = getelementptr i32, ptr %q, i64 %call +; Stored as a vector +; CHECK: store <4 x i32> [[EXT0]], ptr + store i32 %val0, ptr %out, align 4 + +; CHECK: [[PTR:%.*]] = getelementptr i8, ptr %r, i64 %call + %outsuccess = getelementptr i8, ptr %r, i64 %call +; CHECK: [[ZEXT0:%.*]] = zext <4 x i1> [[EXT1]] to <4 x i8> + %outbyte = zext i1 %success0 to i8 +; Stored as a vector +; CHECK: store <4 x i8> [[ZEXT0]], ptr [[PTR]], align 1 + store i8 %outbyte, ptr %outsuccess, align 1 + + ; Test a couple of insert/extract patterns + + ; Test inserting a uniform value into a varying literal struct +; CHECK: [[INS0:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[A0]], <4 x i1> zeroinitializer, 1 +; CHECK: [[EXT2:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS0]], 1 +; CHECK: [[ZEXT1:%.*]] = zext <4 x i1> [[EXT2]] to <4 x i8> +; CHECK: store <4 x i8> [[ZEXT1]], ptr [[PTR]], align 1 + %testinsertconst = insertvalue { i32, i1 } %old0, i1 false, 1 + %testextract0 = extractvalue { i32, i1 } %testinsertconst, 1 + %outbyte0 = zext i1 %testextract0 to i8 + store i8 %outbyte0, ptr %outsuccess, align 1 + + ; Test inserting a varying value into a varying literal struct +; CHECK: [[LD:%.*]] = load <4 x i8>, ptr +; CHECK: [[VBOOL:%.*]] = trunc <4 x i8> [[LD]] to <4 x i1> +; CHECK: [[INS1:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[A0]], <4 x i1> [[VBOOL]], 1 +; CHECK: [[EXT3:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS1]], 1 +; CHECK: [[ZEXT2:%.*]] = zext <4 x i1> [[EXT3]] to <4 x i8> +; CHECK: store <4 x i8> [[ZEXT2]], ptr [[PTR]], align 1 + %byte1 = load i8, ptr %outsuccess, align 1 + %bool1 = trunc i8 %byte1 to i1 + %testinsertvarying0 = insertvalue { i32, i1 } %old0, i1 %bool1, 1 + %testextract1 = extractvalue { 
i32, i1 } %testinsertvarying0, 1 + %outbyte1 = zext i1 %testextract1 to i8 + store i8 %outbyte1, ptr %outsuccess, align 1 + + ; Test inserting a varying value into a uniform literal struct +; CHECK: [[INS2:%.*]] = insertvalue { <4 x i32>, <4 x i1> } poison, <4 x i1> [[VBOOL]], 1 +; CHECK: [[EXT4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS2]], 1 +; CHECK: [[ZEXT3:%.*]] = zext <4 x i1> [[EXT4]] to <4 x i8> +; CHECK: store <4 x i8> [[ZEXT3]], ptr [[PTR]], align 1 + %testinsertvarying1 = insertvalue { i32, i1 } poison, i1 %bool1, 1 + %testextract2 = extractvalue { i32, i1 } %testinsertvarying1, 1 + %outbyte2 = zext i1 %testextract2 to i8 + store i8 %outbyte2, ptr %outsuccess, align 1 + + ret void +} + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll new file mode 100644 index 0000000000000..0894b60d9fc7a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll @@ -0,0 +1,58 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(1)* %out) #0 { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) #1 + %conv = trunc i64 %gid to i32 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 + store i32 %conv, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!opencl.kernels = !{!0} +!opencl.spir.version = !{!7} +!opencl.ocl.version = !{!7} +!opencl.used.extensions = !{!8} +!opencl.used.optional.core.features = !{!8} +!opencl.compiler.options = !{!8} + +!0 = !{void (i32 addrspace(1)*)* @test, !1, !2, !3, !4, !5, !6} +!1 = !{!"kernel_arg_addr_space", i32 1} +!2 = !{!"kernel_arg_access_qual", !"none"} +!3 = !{!"kernel_arg_type", !"int*"} +!4 = !{!"kernel_arg_base_type", !"int*"} +!5 = !{!"kernel_arg_type_qual", !""} +!6 = !{!"kernel_arg_name", !"out"} +!7 = !{i32 1, i32 2} +!8 = !{} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK-NEXT: entry: +; CHECK-NEXT: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: %conv = trunc i64 %gid to i32 +; CHECK-NEXT: %arrayidx = getelementptr inbounds {{(nuw )?}}{{i32|i8}}, ptr addrspace(1) %out, i64 {{3|12}} +; CHECK-NEXT: store i32 %conv, ptr addrspace(1) %arrayidx, align 4 diff --git 
a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll new file mode 100644 index 0000000000000..e2d1ef91aec8e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll @@ -0,0 +1,41 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare spir_func i32 @__mux_get_global_id(i32); + +define spir_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %out2) { +entry: + %gid = call i32 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 3 + store i32 %gid, i32 addrspace(1)* %arrayidx, align 4 + + %arrayidx2 = getelementptr inbounds i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %out2, i32 %gid + store i32 addrspace(1)* %arrayidx, i32 addrspace(1)* addrspace(1)* %arrayidx2, align 4 + + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK-NEXT: entry: +; CHECK-NEXT: %gid = call i32 @__mux_get_global_id(i32 0) +; CHECK-NEXT: %arrayidx = getelementptr inbounds {{(nuw )?}}{{i32|i8}}, ptr addrspace(1) %out, i32 {{3|12}} +; CHECK: store i32 %gid, ptr addrspace(1) %arrayidx, align 4 +; CHECK: store <4 x ptr addrspace(1)> %{{.+}}, ptr addrspace(1) %{{.+}} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll new file mode 100644 index 0000000000000..bcc6bfd84b57a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll @@ -0,0 +1,74 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
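+; NOTE: The contiguous_allocas test below checks that, under the
+; FullScalarization choice at width 4, each per-work-item <2 x float> alloca
+; is widened into a single <8 x float> alloca (2 elements x 4 lanes), and
+; that all allocas are grouped at the top of the entry block, as the
+; CHECK-NEXT lines require.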
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -vecz-auto -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@entry_test_alloca.lm = external unnamed_addr addrspace(3) constant [16 x <2 x float>], align 8 + +define spir_kernel void @test(<2 x float> addrspace(1)* nocapture readonly %in, <2 x float> addrspace(1)* nocapture %out, i32 %offset) local_unnamed_addr { +entry: + %a.sroa.0 = alloca <2 x float>, align 8 + %b.sroa.2 = alloca <2 x float>, align 8 + %call = tail call i64 @__mux_get_global_id(i32 0) + %call1 = tail call i64 @__mux_get_local_id(i32 0) + %a.sroa.0.0..sroa_cast = bitcast <2 x float>* %a.sroa.0 to i8* + %b.sroa.2.0..sroa_cast = bitcast <2 x float>* %b.sroa.2 to i8* + %arrayidx2 = getelementptr inbounds [16 x <2 x float>], [16 x <2 x float>] addrspace(3)* @entry_test_alloca.lm, i64 0, i64 %call1 + %0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx2, align 8 + %conv = sext i32 %offset to i64 + %add = add i64 %call1, %conv + %arrayidx4 = getelementptr inbounds [16 x <2 x float>], [16 x <2 x float>] addrspace(3)* @entry_test_alloca.lm, i64 0, i64 %add + %1 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx4, align 8 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup10 + %mul.le.le = fmul <2 x float> %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0., %b.sroa.2.0.b.sroa.2.0.b.sroa.2.8. + %arrayidx17 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i64 %call + store <2 x float> %mul.le.le, <2 x float> addrspace(1)* %arrayidx17, align 8 + ret void + +for.body: ; preds = %for.cond.cleanup10, %entry + %i.038 = phi i32 [ 0, %entry ], [ %inc15, %for.cond.cleanup10 ] + store volatile <2 x float> %0, <2 x float>* %a.sroa.0, align 8 + store volatile <2 x float> %1, <2 x float>* %b.sroa.2, align 8 + br label %for.body11 + +for.cond.cleanup10: ; preds = %for.body11 + %inc15 = add nuw nsw i32 %i.038, 1 + %cmp = icmp ult i32 %inc15, 16 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body11: ; preds = %for.body11, %for.body + %i6.037 = phi i32 [ 0, %for.body ], [ %inc, %for.body11 ] + %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0. = load volatile <2 x float>, <2 x float>* %a.sroa.0, align 8 + %b.sroa.2.0.b.sroa.2.0.b.sroa.2.8. 
= load volatile <2 x float>, <2 x float>* %b.sroa.2, align 8 + %inc = add nuw nsw i32 %i6.037, 1 + %cmp8 = icmp ult i32 %inc, 16 + br i1 %cmp8, label %for.body11, label %for.cond.cleanup10 +} + +declare i64 @__mux_get_global_id(i32) local_unnamed_addr +declare i64 @__mux_get_local_id(i32) local_unnamed_addr + +; Check that all the allocas come before anything else +; CHECK: define spir_kernel void @__vecz_v4_test( +; CHECK-NEXT: entry: +; CHECK-NEXT: %a.sroa.{{[0-9]+}} = alloca <8 x float>, align 16 +; CHECK-NEXT: %b.sroa.{{[0-9]+}} = alloca <8 x float>, align 16 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll new file mode 100644 index 0000000000000..7108df3732999 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll @@ -0,0 +1,208 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_nested_loops -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_uniform_if(i32 %a, i32* %b) { +entry: + %cmp = icmp eq i32 %a, 1 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42 + store i32 13, i32* %arrayidx1, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +define spir_kernel void @test_varying_if(i32 %a, i32* %b) { +entry: + %conv = sext i32 %a to i64 + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %conv, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42 + store i32 13, i32* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +define spir_kernel void @test_uniform_loop(i32 %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %storemerge, 16 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %add = add nsw i32 %storemerge, %a + %add2 = add nsw i32 %storemerge, %conv + %idxprom = sext i32 %add2 to 
i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 %add, i32* %arrayidx, align 4 + %inc = add nsw i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define spir_kernel void @test_varying_loop(i32 %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %sub = sub nsw i32 16, %conv + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %storemerge, 16 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %add = add nsw i32 %storemerge, %a + %add2 = add nsw i32 %storemerge, %conv + %idxprom = sext i32 %add2 to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 %add, i32* %arrayidx, align 4 + %inc = add nsw i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define spir_kernel void @test_nested_loops(i32* %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %sub = sub nsw i32 16, %conv + br label %for.cond + +for.cond: ; preds = %for.inc12, %entry + %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ] + %cmp = icmp slt i32 %storemerge, 16 + br i1 %cmp, label %for.body, label %for.end14 + +for.body: ; preds = %for.cond + %sub2 = sub nsw i32 24, %conv + br label %for.cond3 + +for.cond3: ; preds = %for.body6, %for.body + %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ] + %cmp4 = icmp slt i32 %storemerge, 24 + br i1 %cmp4, label %for.body6, label %for.inc12 + +for.body6: ; preds = %for.cond3 + %add = add nsw i32 %storemerge1, %conv + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %add7 = add i32 %storemerge1, %storemerge + %add8 = add i32 %add7, %0 + %add9 = add nsw i32 %storemerge, %conv + %idxprom10 = sext i32 %add9 to i64 + %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10 + store i32 %add8, i32* %arrayidx11, align 4 + %inc = add nsw i32 %storemerge1, 1 + br label %for.cond3 + +for.inc12: ; preds = %for.cond3 + %inc13 = add nsw i32 %storemerge, 1 + br label %for.cond + +for.end14: ; preds = %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; A nested loop, in the form of +; +; int gid = get_global_id(0); +; for (int i = 16 - gid; i < 16; ++i) { +; for (int j = 24 - gid; i < 24; ++j) { +; b[i + gid] = a[j + gid] + i + j; +; } +; } +; +; The important bit is that both of the loops have their iterations dependent on +; the global ID +; CHECK: define spir_kernel void @__vecz_v4_test_nested_loops(ptr %a, ptr %b) +; CHECK: entry: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[ENTRYMASK_FORCOND:.+]] = phi i1 [ true, %entry ], [ %[[FORINC12EXITMASK3:.+]], %[[FORINC12:.+]] ] +; CHECK: %[[EXITMASK1:.+]] = phi i1 [ false, %entry ], [ %[[LOOPEXITMASK2:.+]], %[[FORINC12]] ] +; CHECK: %[[CMP:.+]] = icmp slt i32 %[[STOREMERGE:.+]], 16 +; CHECK: %[[EDGEMASK_FORBODY:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[CMP]], i1 false +; CHECK: %[[NOT_CMP:.+]] = xor i1 %[[CMP]], true +; CHECK: %[[EDGEMASK_FOREND14:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[NOT_CMP]], i1 false +; CHECK: %[[LOOPEXITMASK2]] = or i1 %[[EXITMASK1]], %[[EDGEMASK_FOREND14]] +; CHECK: br label %[[FORBODY:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND3:.+]] + +; CHECK: [[FORCOND3]]: +; CHECK: 
%[[ENTRYMASK_FORCOND3:.+]] = phi i1 [ %[[EDGEMASK_FORBODY:.+]], %[[FORBODY]] ], [ %[[FORBODY6EXITMASK:.+]], %[[FORBODY6:.+]] ] +; CHECK: %[[PREVEXITMASK:.+]] = phi i1 [ false, %[[FORBODY]] ], [ %[[FORINC12LOOPEXITMASKUPDATE:.+]], %[[FORBODY6]] ] +; CHECK: %[[CMP4:.+]] = icmp slt i32 %[[STOREMERGE]], 24 +; CHECK: %[[EDGEMASK_FORBODY6:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[CMP4]], i1 false +; CHECK: %[[NOT_CMP4:.+]] = xor i1 %[[CMP4]], true +; CHECK: %[[EDGEMASK_FORINC12:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[NOT_CMP4]], i1 false +; CHECK: %[[FORINC12LOOPEXITMASKUPDATE]] = or i1 %[[PREVEXITMASK]], %[[EDGEMASK_FORINC12]] +; CHECK: br label %[[FORBODY6:.+]] + +; CHECK: [[FORBODY6]]: +; CHECK: %[[MGL:.+]] = call i32 @__vecz_b_masked_load4_ju3ptrb(ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]]) +; CHECK: %[[ADD8:.+]] = add i32 %{{.+}}, %[[MGL]] +; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 %[[ADD8]], ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]]) +; CHECK: %[[FORBODY6EXITMASK_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORBODY6EXITMASK]]) +; CHECK: br i1 %[[FORBODY6EXITMASK_ANY]], label %[[FORCOND3:.+]], label %[[FORINC12:.+]] + +; CHECK: [[FORINC12]]: +; CHECK: %[[FORINC12LOOPEXITMASKUPDATE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORINC12LOOPEXITMASKUPDATE]]) +; CHECK: br i1 %[[FORINC12LOOPEXITMASKUPDATE_ANY]], label %[[FORCOND:.+]], label %[[FOREND14:.+]] + +; CHECK: [[FOREND14]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll new file mode 100644 index 0000000000000..0384d9959e24a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll @@ -0,0 +1,208 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
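+; NOTE: The file below repeats the nested-loop CFG-conversion test, but the
+; kernels take their global ID from dimension 1 and the RUN line passes
+; -d 1, which appears to select the work-item dimension to vectorize along.
+; The expected entry/exit mask and divergence structure is otherwise
+; identical to the dimension-0 variant above.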
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_nested_loops -vecz-passes=cfg-convert -vecz-simd-width=4 -d 1 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_uniform_if(i32 %a, i32* %b) { +entry: + %cmp = icmp eq i32 %a, 1 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42 + store i32 13, i32* %arrayidx1, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +define spir_kernel void @test_varying_if(i32 %a, i32* %b) { +entry: + %conv = sext i32 %a to i64 + %call = call i64 @__mux_get_global_id(i32 1) + %cmp = icmp eq i64 %conv, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42 + store i32 13, i32* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +define spir_kernel void @test_uniform_loop(i32 %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 1) + %conv = trunc i64 %call to i32 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %storemerge, 16 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %add = add nsw i32 %storemerge, %a + %add2 = add nsw i32 %storemerge, %conv + %idxprom = sext i32 %add2 to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 %add, i32* %arrayidx, align 4 + %inc = add nsw i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define spir_kernel void @test_varying_loop(i32 %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 1) + %conv = trunc i64 %call to i32 + %sub = sub nsw i32 16, %conv + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %storemerge, 16 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %add = add nsw i32 %storemerge, %a + %add2 = add nsw i32 %storemerge, %conv + %idxprom = sext i32 %add2 to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 %add, i32* %arrayidx, align 4 + %inc = add nsw i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define spir_kernel void @test_nested_loops(i32* %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 1) + %conv = trunc i64 %call to i32 + %sub = sub nsw i32 16, %conv + br label %for.cond + +for.cond: ; preds = %for.inc12, %entry + %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ] + %cmp = icmp slt i32 %storemerge, 16 + br i1 %cmp, label %for.body, label %for.end14 + +for.body: ; preds = %for.cond + %sub2 = sub nsw i32 24, %conv + br label %for.cond3 + +for.cond3: ; preds = %for.body6, %for.body + %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ] + %cmp4 = icmp slt i32 %storemerge, 24 + br i1 %cmp4, label %for.body6, label 
%for.inc12 + +for.body6: ; preds = %for.cond3 + %add = add nsw i32 %storemerge1, %conv + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %add7 = add i32 %storemerge1, %storemerge + %add8 = add i32 %add7, %0 + %add9 = add nsw i32 %storemerge, %conv + %idxprom10 = sext i32 %add9 to i64 + %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10 + store i32 %add8, i32* %arrayidx11, align 4 + %inc = add nsw i32 %storemerge1, 1 + br label %for.cond3 + +for.inc12: ; preds = %for.cond3 + %inc13 = add nsw i32 %storemerge, 1 + br label %for.cond + +for.end14: ; preds = %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; A nested loop, in the form of +; +; int gid = get_global_id(1); +; for (int i = 16 - gid; i < 16; ++i) { +; for (int j = 24 - gid; i < 24; ++j) { +; b[i + gid] = a[j + gid] + i + j; +; } +; } +; +; The important bit is that both of the loops have their iterations dependent on +; the global ID +; CHECK: define spir_kernel void @__vecz_v4_test_nested_loops(ptr %a, ptr %b) +; CHECK: entry: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[ENTRYMASK_FORCOND:.+]] = phi i1 [ true, %entry ], [ %[[FORINC12EXITMASK3:.+]], %[[FORINC12:.+]] ] +; CHECK: %[[EXITMASK1:.+]] = phi i1 [ false, %entry ], [ %[[LOOPEXITMASK2:.+]], %[[FORINC12]] ] +; CHECK: %[[CMP:.+]] = icmp slt i32 %[[STOREMERGE:.+]], 16 +; CHECK: %[[EDGEMASK_FORBODY:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[CMP]], i1 false +; CHECK: %[[NOT_CMP:.+]] = xor i1 %[[CMP]], true +; CHECK: %[[EDGEMASK_FOREND14:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[NOT_CMP]], i1 false +; CHECK: %[[LOOPEXITMASK2]] = or i1 %[[EXITMASK1]], %[[EDGEMASK_FOREND14]] +; CHECK: br label %[[FORBODY:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND3:.+]] + +; CHECK: [[FORCOND3]]: +; CHECK: %[[ENTRYMASK_FORCOND3:.+]] = phi i1 [ %[[EDGEMASK_FORBODY:.+]], %[[FORBODY]] ], [ %[[FORBODY6EXITMASK:.+]], %[[FORBODY6:.+]] ] +; CHECK: %[[PREVEXITMASK:.+]] = phi i1 [ false, %[[FORBODY]] ], [ %[[FORINC12LOOPEXITMASKUPDATE:.+]], %[[FORBODY6]] ] +; CHECK: %[[CMP4:.+]] = icmp slt i32 %[[STOREMERGE]], 24 +; CHECK: %[[EDGEMASK_FORBODY6:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[CMP4]], i1 false +; CHECK: %[[NOT_CMP4:.+]] = xor i1 %[[CMP4]], true +; CHECK: %[[EDGEMASK_FORINC12:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[NOT_CMP4]], i1 false +; CHECK: %[[FORINC12LOOPEXITMASKUPDATE]] = or i1 %[[PREVEXITMASK]], %[[EDGEMASK_FORINC12]] +; CHECK: br label %[[FORBODY6:.+]] + +; CHECK: [[FORBODY6]]: +; CHECK: %[[MGL:.+]] = call i32 @__vecz_b_masked_load4_ju3ptrb(ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]]) +; CHECK: %[[ADD8:.+]] = add i32 %{{.+}}, %[[MGL]] +; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 %[[ADD8]], ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]]) +; CHECK: %[[FORBODY6EXITMASK_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORBODY6EXITMASK]]) +; CHECK: br i1 %[[FORBODY6EXITMASK_ANY]], label %[[FORCOND3:.+]], label %[[FORINC12:.+]] + +; CHECK: [[FORINC12]]: +; CHECK: %[[FORINC12LOOPEXITMASKUPDATE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORINC12LOOPEXITMASKUPDATE]]) +; CHECK: br i1 %[[FORINC12LOOPEXITMASKUPDATE_ANY]], label %[[FORCOND:.+]], label %[[FOREND14:.+]] + +; CHECK: [[FOREND14]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll 
new file mode 100644
index 0000000000000..e6c92b8290d92
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
@@ -0,0 +1,208 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test_nested_loops -vecz-passes=cfg-convert -vecz-simd-width=4 -d 2 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) {
+entry:
+ %cmp = icmp eq i32 %a, 1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx1, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
+entry:
+ %conv = sext i32 %a to i64
+ %call = call i64 @__mux_get_global_id(i32 2)
+ %cmp = icmp eq i64 %conv, %call
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx2, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 2)
+ %conv = trunc i64 %call to i32
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 2)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 2)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.inc12, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end14
+
+for.body: ; preds = %for.cond
+ %sub2 = sub nsw i32 24, %conv
+ br label %for.cond3
+
+for.cond3: ; preds = %for.body6, %for.body
+ %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ]
+ %cmp4 = icmp slt i32 %storemerge, 24
+ br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6: ; preds = %for.cond3
+ %add = add nsw i32 %storemerge1, %conv
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %add7 = add i32 %storemerge1, %storemerge
+ %add8 = add i32 %add7, %0
+ %add9 = add nsw i32 %storemerge, %conv
+ %idxprom10 = sext i32 %add9 to i64
+ %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+ store i32 %add8, i32* %arrayidx11, align 4
+ %inc = add nsw i32 %storemerge1, 1
+ br label %for.cond3
+
+for.inc12: ; preds = %for.cond3
+ %inc13 = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end14: ; preds = %for.cond
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; A nested loop, in the form of
+;
+; int gid = get_global_id(2);
+; for (int i = 16 - gid; i < 16; ++i) {
+; for (int j = 24 - gid; i < 24; ++j) {
+; b[i + gid] = a[j + gid] + i + j;
+; }
+; }
+;
+; The important point is that the iteration ranges of both loops depend on
+; the global ID.
+; CHECK: define spir_kernel void @__vecz_v4_test_nested_loops(ptr %a, ptr %b)
+; CHECK: entry:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: %[[ENTRYMASK_FORCOND:.+]] = phi i1 [ true, %entry ], [ %[[FORINC12EXITMASK3:.+]], %[[FORINC12:.+]] ]
+; CHECK: %[[EXITMASK1:.+]] = phi i1 [ false, %entry ], [ %[[LOOPEXITMASK2:.+]], %[[FORINC12]] ]
+; CHECK: %[[CMP:.+]] = icmp slt i32 %[[STOREMERGE:.+]], 16
+; CHECK: %[[EDGEMASK_FORBODY:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[CMP]], i1 false
+; CHECK: %[[NOT_CMP:.+]] = xor i1 %[[CMP]], true
+; CHECK: %[[EDGEMASK_FOREND14:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[NOT_CMP]], i1 false
+; CHECK: %[[LOOPEXITMASK2]] = or i1 %[[EXITMASK1]], %[[EDGEMASK_FOREND14]]
+; CHECK: br label %[[FORBODY:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND3:.+]]
+
+; CHECK: [[FORCOND3]]:
+; CHECK: %[[ENTRYMASK_FORCOND3:.+]] = phi i1 [ %[[EDGEMASK_FORBODY:.+]], %[[FORBODY]] ], [ %[[FORBODY6EXITMASK:.+]], %[[FORBODY6:.+]] ]
+; CHECK: %[[PREVEXITMASK:.+]] = phi i1 [ false, %[[FORBODY]] ], [ %[[FORINC12LOOPEXITMASKUPDATE:.+]], %[[FORBODY6]] ]
+; CHECK: %[[CMP4:.+]] = icmp slt i32 %[[STOREMERGE]], 24
+; CHECK: %[[EDGEMASK_FORBODY6:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[CMP4]], i1 false
+; CHECK: %[[NOT_CMP4:.+]] = xor i1 %[[CMP4]], true
+; CHECK: %[[EDGEMASK_FORINC12:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[NOT_CMP4]], i1 false
+; CHECK: %[[FORINC12LOOPEXITMASKUPDATE]] = or i1 %[[PREVEXITMASK]], %[[EDGEMASK_FORINC12]]
+; CHECK: br label %[[FORBODY6:.+]]
+
+; CHECK: [[FORBODY6]]:
+; CHECK: %[[MGL:.+]] = call i32 @__vecz_b_masked_load4_ju3ptrb(ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]])
+; CHECK: %[[ADD8:.+]] = add i32 %{{.+}}, %[[MGL]]
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 %[[ADD8]], ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]])
+; CHECK: %[[FORBODY6EXITMASK_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORBODY6EXITMASK]])
+; CHECK: br i1 %[[FORBODY6EXITMASK_ANY]], label %[[FORCOND3:.+]], label %[[FORINC12:.+]]
+
+; CHECK: [[FORINC12]]:
+; CHECK: %[[FORINC12LOOPEXITMASKUPDATE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORINC12LOOPEXITMASKUPDATE]])
+; CHECK: br i1 %[[FORINC12LOOPEXITMASKUPDATE_ANY]], label %[[FORCOND:.+]], label %[[FOREND14:.+]]
+
+; CHECK: [[FOREND14]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
new file mode 100644
index 0000000000000..99c0a220d0727
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
@@ -0,0 +1,54 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes=cfg-convert,define-builtins -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @test_varying_if_ptr(i32 %a, ptr %b, ptr %on_true, ptr %on_false) {
+entry:
+ %conv = sext i32 %a to i64
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %cmp = icmp eq i64 %conv, %call
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds ptr, ptr %b, i64 %idxprom
+ store ptr %on_true, ptr %arrayidx, align 4
+ br label %if.end
+
+if.else:
+ %arrayidx2 = getelementptr inbounds ptr, ptr %b, i64 42
+ store ptr %on_false, ptr %arrayidx2, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+; CHECK: define void @__vecz_b_masked_store4_u3ptru3ptrb(ptr [[A:%.*]], ptr [[B:%.*]], i1 [[MASK:%.*]]) [[ATTRS:#[0-9]+]] {
+; CHECK: br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: store ptr [[A]], ptr [[B]], align 4
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
new file mode 100644
index 0000000000000..b8a23afb5a39c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
@@ -0,0 +1,168 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test_uniform_if -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) {
+entry:
+ %cmp = icmp eq i32 %a, 1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx1, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
+entry:
+ %conv = sext i32 %a to i64
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %cmp = icmp eq i64 %conv, %call
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx2, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.inc12, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end14
+
+for.body: ; preds = %for.cond
+ %sub2 = sub nsw i32 24, %conv
+ br label %for.cond3
+
+for.cond3: ; preds = %for.body6, %for.body
+ %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ]
+ %cmp4 = icmp slt i32 %storemerge, 24
+ br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6: ; preds = %for.cond3
+ %add = add nsw i32 %storemerge1, %conv
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %add7 = add i32 %storemerge1, %storemerge
+ %add8 = add i32 %add7, %0
+ %add9 = add nsw i32 %storemerge, %conv
+ %idxprom10 = sext i32 %add9 to i64
+ %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+ store i32 %add8, i32* %arrayidx11, align 4
+ %inc = add nsw i32 %storemerge1, 1
+ br label %for.cond3
+
+for.inc12: ; preds = %for.cond3
+ %inc13 = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end14: ; preds = %for.cond
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; This tests a uniform if statement that shouldn't be touched by the CFC pass
+; CHECK: define spir_kernel void @__vecz_v4_test_uniform_if(i32 %a, ptr %b)
+; CHECK: br i1 %cmp, label %if.then, label %if.else
+
+; CHECK: if.then:
+; CHECK: store i32 11, ptr %arrayidx, align 4
+
+; CHECK: if.else:
+; CHECK: store i32 13, ptr %arrayidx1, align 4
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
new file mode 100644
index 0000000000000..508d105fa78f7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
@@ -0,0 +1,176 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test_uniform_loop -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) {
+entry:
+ %cmp = icmp eq i32 %a, 1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx1, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
+entry:
+ %conv = sext i32 %a to i64
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %cmp = icmp eq i64 %conv, %call
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx2, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.inc12, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end14
+
+for.body: ; preds = %for.cond
+ %sub2 = sub nsw i32 24, %conv
+ br label %for.cond3
+
+for.cond3: ; preds = %for.body6, %for.body
+ %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ]
+ %cmp4 = icmp slt i32 %storemerge, 24
+ br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6: ; preds = %for.cond3
+ %add = add nsw i32 %storemerge1, %conv
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %add7 = add i32 %storemerge1, %storemerge
+ %add8 = add i32 %add7, %0
+ %add9 = add nsw i32 %storemerge, %conv
+ %idxprom10 = sext i32 %add9 to i64
+ %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+ store i32 %add8, i32* %arrayidx11, align 4
+ %inc = add nsw i32 %storemerge1, 1
+ br label %for.cond3
+
+for.inc12: ; preds = %for.cond3
+ %inc13 = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end14: ; preds = %for.cond
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; This tests a uniform loop that should remain untouched by the CFC pass
+; CHECK: define spir_kernel void @__vecz_v4_test_uniform_loop(i32 %a, ptr %b)
+; CHECK: br label %for.cond
+
+; CHECK: for.cond:
+; CHECK: %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+; CHECK: %cmp = icmp slt i32 %storemerge, 16
+; CHECK: br i1 %cmp, label %for.body, label %for.end
+
+; CHECK: for.body:
+; CHECK: %add = add nsw i32 %storemerge, %a
+; CHECK: %idxprom = sext i32 %add2 to i64
+; CHECK: %arrayidx = getelementptr i32, ptr %b, i64 %idxprom
+; CHECK: store i32 %add, ptr %arrayidx, align 4
+; CHECK: %inc = add nsw i32 %storemerge, 1
+; CHECK: br label %for.cond
+
+; CHECK: for.end:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
new file mode 100644
index 0000000000000..c4a2b075b4664
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
@@ -0,0 +1,166 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test_varying_if -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) {
+entry:
+ %cmp = icmp eq i32 %a, 1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx1, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
+entry:
+ %conv = sext i32 %a to i64
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %cmp = icmp eq i64 %conv, %call
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx2, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.inc12, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end14
+
+for.body: ; preds = %for.cond
+ %sub2 = sub nsw i32 24, %conv
+ br label %for.cond3
+
+for.cond3: ; preds = %for.body6, %for.body
+ %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ]
+ %cmp4 = icmp slt i32 %storemerge, 24
+ br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6: ; preds = %for.cond3
+ %add = add nsw i32 %storemerge1, %conv
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %add7 = add i32 %storemerge1, %storemerge
+ %add8 = add i32 %add7, %0
+ %add9 = add nsw i32 %storemerge, %conv
+ %idxprom10 = sext i32 %add9 to i64
+ %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+ store i32 %add8, i32* %arrayidx11, align 4
+ %inc = add nsw i32 %storemerge1, 1
+ br label %for.cond3
+
+for.inc12: ; preds = %for.cond3
+ %inc13 = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end14: ; preds = %for.cond
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; Check for a varying if that needs masked operations (see the note after the
+; CHECK lines)
+; CHECK: define spir_kernel void @__vecz_v4_test_varying_if(i32 %a, ptr %b)
+; CHECK: %cmp = icmp eq i64 %conv, %call
+; CHECK: %cmp.not = xor i1 %cmp, true
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 11, ptr %arrayidx, i1 %cmp)
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 13, ptr %arrayidx2, i1 %cmp.not)
+
+; Note that the entry mask would be removed by any DCE pass
+; CHECK: ret void
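+; For illustration only (not checked by FileCheck): conceptually, the pass
+; linearizes the varying if/else so that every lane executes both sides under
+; complementary masks, roughly
+;
+;   bool mask = (a == gid);
+;   masked_store(11, &b[a],  mask);
+;   masked_store(13, &b[42], !mask);
+;
+; where masked_store stands in for the __vecz_b_masked_store4_ju3ptrb builtin
+; checked above; the names in this sketch are illustrative, not vecz output.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
new file mode 100644
index 0000000000000..77184596228ce
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
@@ -0,0 +1,185 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.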
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test_varying_loop -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) {
+entry:
+ %cmp = icmp eq i32 %a, 1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx1, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
+entry:
+ %conv = sext i32 %a to i64
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %cmp = icmp eq i64 %conv, %call
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx2, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.inc12, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end14
+
+for.body: ; preds = %for.cond
+ %sub2 = sub nsw i32 24, %conv
+ br label %for.cond3
+
+for.cond3: ; preds = %for.body6, %for.body
+ %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ]
+ %cmp4 = icmp slt i32 %storemerge, 24
+ br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6: ; preds = %for.cond3
+ %add = add nsw i32 %storemerge1, %conv
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %add7 = add i32 %storemerge1, %storemerge
+ %add8 = add i32 %add7, %0
+ %add9 = add nsw i32 %storemerge, %conv
+ %idxprom10 = sext i32 %add9 to i64
+ %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+ store i32 %add8, i32* %arrayidx11, align 4
+ %inc = add nsw i32 %storemerge1, 1
+ br label %for.cond3
+
+for.inc12: ; preds = %for.cond3
+ %inc13 = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end14: ; preds = %for.cond
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; The loop's starting value depends on the global ID.
+; Note that the mask names are hardcoded in vecz; if they change, they need to
+; be changed here as well. We do need them, though, to make sure that we are
+; checking the correct values. Since we don't have any duplicate names, they
+; should all be deterministic. (A sketch of the per-lane semantics follows the
+; CHECK lines.)
+; CHECK: define spir_kernel void @__vecz_v4_test_varying_loop(i32 %a, ptr %b)
+; CHECK: br label %for.cond
+
+; CHECK: for.cond:
+; CHECK: %for.cond.entry_mask = phi i1 [ true, %entry ], [ %for.body.exit_mask, %for.body ]
+; CHECK: %for.end.loop_exit_mask = phi i1 [ false, %entry ], [ %for.end.loop_exit_mask.update, %for.body ]
+; CHECK: %cmp = icmp slt i32 %storemerge, 16
+; CHECK: %for.body.exit_mask = select i1 %for.cond.entry_mask, i1 %cmp, i1 false
+; CHECK: %cmp.not = xor i1 %cmp, true
+; CHECK: %for.end.exit_mask = select i1 %for.cond.entry_mask, i1 %cmp.not, i1 false
+; CHECK: %for.end.loop_exit_mask.update = or i1 %for.end.loop_exit_mask, %for.end.exit_mask
+; CHECK: br label %for.body

+; CHECK: for.body:
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 %add, ptr %arrayidx, i1 %for.body.exit_mask)
+; CHECK: %[[EXIT_MASK_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %for.body.exit_mask)
+; CHECK: br i1 %[[EXIT_MASK_ANY]], label %for.cond, label %for.cond.pure_exit
+
+; CHECK: for.cond.pure_exit:
+; CHECK: br label %for.end
+
+; CHECK: for.end:
+; CHECK: ret void
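+; For illustration only (not checked by FileCheck): each lane effectively runs
+;
+;   for (int i = 16 - gid; i < 16; ++i)
+;     b[i + gid] = i + a;
+;
+; so lanes enter the loop with different trip counts. The vectorized loop
+; keeps iterating while __vecz_b_divergence_any reports any lane still
+; active, and the store is guarded by the per-lane mask checked above.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
new file mode 100644
index 0000000000000..07d638f131350
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
@@ -0,0 +1,65 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.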
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k convert3 -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @convert3(i64 addrspace(1)* %src, float addrspace(1)* %dest) local_unnamed_addr {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0)
+ %call1 = tail call spir_func <3 x i64> @_Z6vload3mPU3AS1Kl(i64 %call, i64 addrspace(1)* %src)
+ %call2 = tail call spir_func <3 x float> @_Z14convert_float3Dv3_l(<3 x i64> %call1)
+ tail call spir_func void @_Z7vstore3Dv3_fmPU3AS1f(<3 x float> %call2, i64 %call, float addrspace(1)* %dest)
+ ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func void @_Z7vstore3Dv3_fmPU3AS1f(<3 x float>, i64, float addrspace(1)*) local_unnamed_addr
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func <3 x float> @_Z14convert_float3Dv3_l(<3 x i64>) local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func <3 x i64> @_Z6vload3mPU3AS1Kl(i64, i64 addrspace(1)*) local_unnamed_addr
+
+; Note that we have to declare the scalar version, because when we vectorize
+; an already-vector builtin, we have to scalarize it first. The scalar call
+; exists during the intermediate stage between scalarization and packetization,
+; and so has to exist in the module. (A rough sketch of this intermediate
+; stage follows the CHECK lines below.)
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func float @_Z13convert_floatl(i64) local_unnamed_addr
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func <2 x float> @_Z14convert_float2Dv2_l(<2 x i64>) local_unnamed_addr
+
+; With SIMD width 2, should have 3 x convert_float2.
+
+; CHECK: define spir_kernel void @__vecz_v2_convert3
+; CHECK: call <2 x i64> @__vecz_b_interleaved_load8_3
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK-NOT: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call void @__vecz_b_interleaved_store4_3_Dv2_fu3ptrU3AS1(<2 x float>
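+; For illustration only (not checked by FileCheck): a rough sketch of the
+; intermediate stage. Scalarization first splits the vector builtin into
+; lane-wise scalar calls, e.g. for lane 0:
+;
+;   %l0 = extractelement <3 x i64> %call1, i32 0
+;   %f0 = call spir_func float @_Z13convert_floatl(i64 %l0)
+;
+; and packetization then re-widens each lane across the two work-items into
+; the <2 x i64> -> <2 x float> calls checked above. The value names %l0 and
+; %f0 are illustrative, not taken from actual vecz output.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
new file mode 100644
index 0000000000000..422e2be0e3237
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
@@ -0,0 +1,61 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.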
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k convert4 -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nofree nounwind
+define spir_kernel void @convert4(<4 x i64> addrspace(1)* nocapture readonly %in, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %in, i64 %call
+ %0 = load <4 x i64>, <4 x i64> addrspace(1)* %arrayidx, align 32
+ %call1 = tail call spir_func <4 x float> @_Z14convert_float4Dv4_l(<4 x i64> %0)
+ %arrayidx2 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+ store <4 x float> %call1, <4 x float> addrspace(1)* %arrayidx2, align 16
+ ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func <4 x float> @_Z14convert_float4Dv4_l(<4 x i64>) local_unnamed_addr
+
+; Note that we have to declare the scalar version, because when we vectorize
+; an already-vector builtin, we have to scalarize it first. The scalar call
+; exists during the intermediate stage between scalarization and packetization,
+; and so has to exist in the module.
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func float @_Z13convert_floatl(i64) local_unnamed_addr
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func <2 x float> @_Z14convert_float2Dv2_l(<2 x i64>) local_unnamed_addr
+
+; With SIMD width 2, should have 4 x convert_float2.
+
+; CHECK: call <2 x i64> @__vecz_b_interleaved_load8_4
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK-NOT: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call void @__vecz_b_interleaved_store4_4_Dv2_fu3ptrU3AS1(<2 x float>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
new file mode 100644
index 0000000000000..f4f363b7e5c17
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
@@ -0,0 +1,47 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k convert_contiguity -w 4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @convert_contiguity(float addrspace(1)* %m_ptr) {
+ %1 = call i64 @__mux_get_global_id(i32 0)
+ %2 = call spir_func i32 @_Z12convert_uintm(i64 %1)
+ %3 = icmp slt i32 %2, 100
+ %4 = select i1 %3, float 1.000000e+00, float 0.000000e+00
+ %5 = call spir_func i64 @_Z12convert_longi(i32 %2)
+ %6 = getelementptr inbounds float, float addrspace(1)* %m_ptr, i64 %5
+ store float %4, float addrspace(1)* %6, align 4
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare spir_func i32 @_Z12convert_uintm(i64)
+
+; Function Attrs: nounwind readnone
+declare spir_func i64 @_Z12convert_longi(i32)
+
+; Function Attrs: nounwind readonly
+declare i64 @__mux_get_global_id(i32)
+
+; This checks that the store address was identified as contiguous through the
+; OpenCL convert builtin functions (see the note after the CHECK lines)
+
+; CHECK: void @__vecz_v4_convert_contiguity
+; CHECK: store <4 x float>
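+; For illustration only (not checked by FileCheck): the address chain is
+;
+;   i64 gid -> _Z12convert_uintm -> i32 -> _Z12convert_longi -> i64 index
+;
+; Both converts are treated as preserving the unit stride between consecutive
+; global IDs, so consecutive lanes address consecutive floats and the scalar
+; stores combine into the single <4 x float> store checked above.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
new file mode 100644
index 0000000000000..48bfa3ad25429
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
@@ -0,0 +1,45 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.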
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(i64 %a, i64 %b, i64* %c) {
+entry:
+ %gid = call i64 @__mux_get_global_id(i32 0)
+ %cond = icmp eq i64 %a, %gid
+ %c0 = getelementptr i64, i64* %c, i64 %gid
+ store i64 %b, i64* %c0, align 4
+ %c1 = getelementptr i64, i64* %c, i64 0
+ store i64 0, i64* %c1, align 4
+ %c2 = select i1 %cond, i64* %c0, i64* %c1
+ %c3 = getelementptr i64, i64* %c2, i64 %gid
+ %c3.load = load i64, i64* %c3, align 4
+ %c4 = getelementptr i64, i64* %c3, i64 %gid
+ store i64 %c3.load, i64* %c4, align 4
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; Test if the gather load is defined correctly (see the note after the CHECK
+; lines)
+; CHECK: define <4 x i64> @__vecz_b_gather_load4_Dv4_mDv4_u3ptr(<4 x ptr>{{( %0)?}}) [[ATTRS:#[0-9]+]] {
+; CHECK: %[[V1:[0-9]+]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %0, i32{{( immarg)?}} 4, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}},
+; CHECK: ret <4 x i64> %[[V1]]
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
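+; For illustration only: %c2 selects between a lane-dependent pointer (%c0)
+; and a uniform one (%c1) under a lane-dependent condition, so the vectorized
+; load sees a divergent <4 x ptr> address vector; that is why it becomes a
+; gather rather than a contiguous or interleaved load.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
new file mode 100644
index 0000000000000..bcbf179616d32
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
@@ -0,0 +1,45 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.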
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(i64 %a, i64 %b, i64* %c) {
+entry:
+ %gid = call i64 @__mux_get_global_id(i32 0)
+ %cond = icmp eq i64 %a, %gid
+ %c0 = getelementptr i64, i64* %c, i64 %gid
+ store i64 %b, i64* %c0, align 4
+ %c1 = getelementptr i64, i64* %c, i64 0
+ store i64 0, i64* %c1, align 4
+ %c2 = select i1 %cond, i64* %c0, i64* %c1
+ %c3 = getelementptr i64, i64* %c2, i64 %gid
+ %c3.load = load i64, i64* %c3, align 4
+ %c4 = getelementptr i64, i64* %c3, i64 %gid
+ store i64 %c3.load, i64* %c4, align 4
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; Test if the gather load is defined correctly
+; CHECK: define <4 x i64> @__vecz_b_gather_load4_Dv4_mDv4_u3ptr(<4 x ptr>{{( %0)?}}) [[ATTRS:#[0-9]+]] {
+; CHECK: call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %0, i32{{( immarg)?}} 4, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x i64> poison)
+; CHECK: ret <4 x i64>
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
new file mode 100644
index 0000000000000..d54d31595e7f8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
@@ -0,0 +1,62 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0) #3
+ %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+ %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
+ %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+ call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
+ store double 1.600000e+01, double addrspace(1)* %.cast, align 8
+ %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+ %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32>
+ %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32>
+ %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+ %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+ %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+ %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+ %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+ %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+ %div = fdiv <4 x double> %3, %4
+ %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div)
+ %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+ %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+ %sub = fsub <4 x double> %6, %5
+ store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+declare void @__mux_work_group_barrier(i32, i32, i32)
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)
+
+; Test if the interleaved load is defined correctly (see the note after the
+; CHECK lines)
+; CHECK: define <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1(ptr addrspace(1){{( %0)?}})
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %0, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
+; CHECK: %[[TMP1:.*]] = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64>
+; CHECK: %[[TMP2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %[[TMP1]], i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x double> poison)
+; CHECK: ret <4 x double> %[[TMP2]]
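+; For illustration only: each work-item loads a whole <4 x double>, so after
+; scalarization the per-lane element loads of adjacent work-items are spaced
+; one vector apart; this strided ("interleaved") access is what the builtin
+; implements via the masked gather checked above.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
new file mode 100644
index 0000000000000..ca5b39de6e149
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
@@ -0,0 +1,79 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM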
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0) #3
+ %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+ %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
+ %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+ call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
+ store double 1.600000e+01, double addrspace(1)* %.cast, align 8
+ %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+ %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32>
+ %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32>
+ %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+ %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+ %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+ %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+ %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+ %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+ %div = fdiv <4 x double> %3, %4
+ %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div)
+ %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+ %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+ %sub = fsub <4 x double> %6, %5
+ store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32) #1
+
+declare void @__mux_work_group_barrier(i32, i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nobuiltin nounwind }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"}
+!4 = !{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"}
+!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""}
+!6 = !{!"clang version 3.8.1 "}
+
+; Test if the interleaved load is defined correctly
+; CHECK: define <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1(ptr addrspace(1){{( %0)?}})
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %0, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
+; CHECK: %1 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64>
+; CHECK: %2 = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %1, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x double> poison)
+; CHECK: ret <4 x double> %2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
new file mode 100644
index 0000000000000..1e8c1c3f67979
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
@@ -0,0 +1,63 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare void @__mux_work_group_barrier(i32, i32, i32) + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) + +; Test if the interleaved store is defined correctly +; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}}) +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer +; CHECK: %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12> +; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll new file mode 100644 index 0000000000000..5fd7ad27aa856 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll @@ -0,0 +1,80 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with 
LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (<4 x 
double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"} +!4 = !{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"} +!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""} +!6 = !{!"clang version 3.8.1 "} + +; Test if the interleaved store is defined correctly +; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}}) +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer +; CHECK: %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12> +; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll new file mode 100644 index 0000000000000..8de9ec81b534c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll @@ -0,0 +1,32 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { + %b = bitcast i32 addrspace(2)* %in to <4 x i32> addrspace(2)* + %v = call <4 x i32> @__vecz_b_masked_load4_Dv4_jPU3AS2Dv4_jDv4_b(<4 x i32> addrspace(2)* %b, <4 x i1> zeroinitializer) + ret void +} + +declare <4 x i32> @__vecz_b_masked_load4_Dv4_jPU3AS2Dv4_jDv4_b(<4 x i32> addrspace(2)*, <4 x i1>) +; CHECK-LABEL: define <4 x i32> @__vecz_b_masked_load4_Dv4_jPU3AS2Dv4_jDv4_b(ptr addrspace(2){{.*}}, <4 x i1>{{.*}}) { +; CHECK: %2 = call <4 x i32> @llvm.masked.load.v4i32.p2(ptr addrspace(2) %0, i32 4, <4 x i1> %1, <4 x i32> poison) +; CHECK: ret <4 x i32> %2 +; CHECK: } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll new file mode 100644 index 0000000000000..394eb61e4aaff --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll @@ -0,0 +1,83 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k masked_gather -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %rem = urem i64 %call, 3 + %cmp = icmp eq i64 %rem, 0 + br i1 %cmp, label %if.else, label %if.then + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %idxprom = sext i32 %1 to i64 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call + %2 = load i32, i32 addrspace(1)* %arrayidx3, align 4 + %idxprom4 = sext i32 %2 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom4 + store i32 42, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %rem = urem i64 %call, 3 + %cmp = icmp eq i64 %rem, 0 + br i1 %cmp, label %if.else, label %if.then + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a_index, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + store i32 42, i32 addrspace(1)* %arrayidx3, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; Test if the masked gather load is defined correctly +; CHECK: define <4 x i32> @__vecz_b_masked_gather_load4_Dv4_jDv4_u3ptrU3AS1Dv4_b(<4 x ptr addrspace(1)>{{( %0)?}}, <4 x i1>{{( %1)?}}) +; CHECK: entry: +; CHECK: %2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p1(<4 x ptr addrspace(1)> %0, i32{{( immarg)?}} 4, <4 x i1> %1, <4 x i32> poison) +; CHECK: ret <4 x i32> %2 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll new file mode 100644 index 0000000000000..1b7e191cce0a3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll @@ -0,0 +1,90 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_local_id(i32 0) #5 + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %conv, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %call2 = call i64 @__mux_get_global_id(i32 0) #5 + %conv3 = trunc i64 %call2 to i32 + %idxprom = sext i32 %conv3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 + %idxprom4 = sext i32 %conv3 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom4 + store i32 %0, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.else: ; preds = %entry + %call8 = call i64 @__mux_get_local_size(i32 0) #5 + %call9 = call i64 @__mux_get_group_id(i32 0) #5 + %mul = mul i64 %call9, %call8 + %add = add i64 %mul, %call + %sext = shl i64 %add, 32 + %idxprom11 = ashr exact i64 %sext, 32 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 42, i32 addrspace(1)* %arrayidx12, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_local_id(i32) #1 + +declare i64 @__mux_get_global_id(i32) #1 + +declare i64 @__mux_get_local_size(i32) #1 + +declare i64 @__mux_get_group_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { noinline } +attributes #3 = { argmemonly nounwind } +attributes #4 = { argmemonly nounwind readonly } +attributes #5 = { nobuiltin nounwind } +attributes #6 = { nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(2)*, i32 addrspace(1)*)* @dont_mask_workitem_builtins, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 2, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"int*", !"int*"} +!4 = !{!"kernel_arg_base_type", !"int*", !"int*"} +!5 = !{!"kernel_arg_type_qual", !"const", !""} +!6 = !{!"clang version 3.8.1 "} + + + +; Test if the masked load is defined correctly +; CHECK: define <4 x i32> @__vecz_b_masked_load4_Dv4_ju3ptrU3AS2Dv4_b(ptr addrspace(2){{( %0)?}}, <4 x i1>{{( %1)?}}) +; CHECK: entry: +; CHECK: %2 = call <4 x i32> 
@llvm.masked.load.v4i32.p2(ptr addrspace(2) %0, i32{{( immarg)?}} 4, <4 x i1> %1, <4 x i32> poison) +; CHECK: ret <4 x i32> %2 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll new file mode 100644 index 0000000000000..bc33844fafee2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll @@ -0,0 +1,85 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k masked_scatter -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %rem = urem i64 %call, 3 + %cmp = icmp eq i64 %rem, 0 + br i1 %cmp, label %if.else, label %if.then + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %idxprom = sext i32 %1 to i64 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call + %2 = load i32, i32 addrspace(1)* %arrayidx3, align 4 + %idxprom4 = sext i32 %2 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom4 + store i32 42, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %rem = urem i64 %call, 3 + %cmp = icmp eq i64 %rem, 0 + br i1 %cmp, label %if.else, label %if.then + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a_index, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + store i32 42, i32 addrspace(1)* %arrayidx3, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + 
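+; A note on why this test expects a scatter builtin: inside %if.then and
+; %if.else the stores execute under a divergent condition, so the packetizer
+; cannot emit a plain vector store and instead calls a vecz-generated builtin
+; taking the packetized values, addresses, and a lane mask. A minimal sketch
+; of the expected call site (the %vals/%ptrs/%mask names are illustrative,
+; not part of what this test checks):
+;   %vals = ... ; <4 x i32> of per-lane stored values
+;   %ptrs = ... ; <4 x ptr addrspace(1)> of per-lane addresses
+;   %mask = ... ; <4 x i1> lane mask derived from %cmp
+;   call void @__vecz_b_masked_scatter_store4_Dv4_jDv4_u3ptrU3AS1Dv4_b(<4 x i32> %vals, <4 x ptr addrspace(1)> %ptrs, <4 x i1> %mask)
+; The CHECK lines below then verify that the builtin's generated body lowers
+; to llvm.masked.scatter.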
+declare i64 @__mux_get_global_id(i32) + +; Test if the masked scatter store is defined correctly +; CHECK: define void @__vecz_b_masked_scatter_store4_Dv4_jDv4_u3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, <4 x ptr addrspace(1)>{{( %1)?}}, <4 x i1>{{( %2)?}}) +; CHECK: entry: +; CHECK: call void @llvm.masked.scatter.v4i32.v4p1(<4 x i32> %0, <4 x ptr addrspace(1)> %1, i32{{( immarg)?}} 4, <4 x i1> %2) #[[ATTRS:[0-9]+]] +; CHECK: ret void + +; CHECK: attributes #[[ATTRS]] = { diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll new file mode 100644 index 0000000000000..21412fc239186 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll @@ -0,0 +1,90 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_local_id(i32 0) #5 + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %conv, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %call2 = call i64 @__mux_get_global_id(i32 0) #5 + %conv3 = trunc i64 %call2 to i32 + %idxprom = sext i32 %conv3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 + %idxprom4 = sext i32 %conv3 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom4 + store i32 %0, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.else: ; preds = %entry + %call8 = call i64 @__mux_get_local_size(i32 0) #5 + %call9 = call i64 @__mux_get_group_id(i32 0) #5 + %mul = mul i64 %call9, %call8 + %add = add i64 %mul, %call + %sext = shl i64 %add, 32 + %idxprom11 = ashr exact i64 %sext, 32 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 42, i32 addrspace(1)* %arrayidx12, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_local_id(i32) #1 + +declare i64 @__mux_get_global_id(i32) #1 + +declare i64 @__mux_get_local_size(i32) #1 + +declare i64 @__mux_get_group_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" 
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { noinline } +attributes #3 = { argmemonly nounwind } +attributes #4 = { argmemonly nounwind readonly } +attributes #5 = { nobuiltin nounwind } +attributes #6 = { nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(2)*, i32 addrspace(1)*)* @dont_mask_workitem_builtins, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 2, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"int*", !"int*"} +!4 = !{!"kernel_arg_base_type", !"int*", !"int*"} +!5 = !{!"kernel_arg_type_qual", !"const", !""} +!6 = !{!"clang version 3.8.1 "} + + + +; Test if the masked store is defined correctly +; CHECK: define void @__vecz_b_masked_store4_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}) +; CHECK: entry: +; CHECK: call void @llvm.masked.store.v4i32.p1(<4 x i32> %0, ptr addrspace(1) %1, i32 4, <4 x i1> %2) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll new file mode 100644 index 0000000000000..1f736694807fa --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll @@ -0,0 +1,46 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 0 + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + %c3.load = load i64, i64* %c3, align 4 + %c4 = getelementptr i64, i64* %c3, i64 %gid + store i64 %c3.load, i64* %c4, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; Test if the scatter store is defined correctly +; CHECK: define void @__vecz_b_scatter_store4_Dv4_mDv4_u3ptr(<4 x i64>{{( %0)?}}, <4 x ptr>{{( %1)?}}) [[ATTRS:#[0-9]+]] { +; CHECK: entry +; CHECK: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %0, <4 x ptr> %1, i32{{( immarg)?}} 4, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) +; CHECK: ret void + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll new file mode 100644 index 0000000000000..326b7cf69d6a0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll @@ -0,0 +1,46 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 0 + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + %c3.load = load i64, i64* %c3, align 4 + %c4 = getelementptr i64, i64* %c3, i64 %gid + store i64 %c3.load, i64* %c4, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; Test if the scatter store is defined correctly +; CHECK: define void @__vecz_b_scatter_store4_Dv4_mDv4_u3ptr(<4 x i64>{{( %0)?}}, <4 x ptr>{{( %1)?}}) [[ATTRS:#[0-9]+]] { +; CHECK: entry: +; CHECK: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %0, <4 x ptr> %1, i32{{( immarg)?}} 4, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) +; CHECK: ret void + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll new file mode 100644 index 0000000000000..2d31999d37d37 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { + ; Dummy uses of the builtins, as we don't define any with zero uses. 
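+ ; For reference, a fixed-width add scan is a log-step recurrence: shift the
+ ; vector down one lane and add, then down two lanes and add; the exclusive
+ ; form rotates the inclusive result by one lane and inserts the identity
+ ; (0 for add) into lane 0. A hedged sketch on some value %x (the exact lane
+ ; constants are pinned down by the CHECK lines below):
+ ;   %s1 = shufflevector <4 x i32> %x, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
+ ;   %x1 = add <4 x i32> %x, %s1
+ ;   %s2 = shufflevector <4 x i32> %x1, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ;   %inc = add <4 x i32> %x1, %s2  ; inclusive scan of %x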
+ %a = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> zeroinitializer) + %b = call <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_Dv4_j(<4 x i32> zeroinitializer) + ret void +} + +declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32>) +; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> %0) { +; CHECK: entry: +; CHECK: %[[SHUF1:.+]] = shufflevector <4 x i32> %0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 4, i32 0, i32 1, i32 2> +; CHECK: %[[ADD1:.+]] = add <4 x i32> %0, %[[SHUF1]] +; CHECK: %[[SHUF2:.+]] = shufflevector <4 x i32> %[[ADD1]], <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 4, i32 5, i32 0, i32 1> +; CHECK: %[[RESULT:.+]] = add <4 x i32> %[[ADD1]], %[[SHUF2]] +; CHECK: ret <4 x i32> %[[RESULT]] +; CHECK: } + +declare <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_Dv4_j(<4 x i32>) +; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_Dv4_j(<4 x i32> %0) { +; CHECK: entry: +; CHECK: %[[SHUF1:.+]] = shufflevector <4 x i32> %0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 4, i32 0, i32 1, i32 2> +; CHECK: %[[ADD1:.+]] = add <4 x i32> %0, %[[SHUF1]] +; CHECK: %[[SHUF2:.+]] = shufflevector <4 x i32> %[[ADD1]], <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 4, i32 5, i32 0, i32 1> +; CHECK: %[[ADD2:.+]] = add <4 x i32> %[[ADD1]], %[[SHUF2]] +; CHECK: %[[ROTATE:.+]] = shufflevector <4 x i32> %[[ADD2]], <4 x i32> poison, <4 x i32> <i32 3, i32 0, i32 1, i32 2> +; CHECK: %[[RESULT:.+]] = insertelement <4 x i32> %[[ROTATE]], i32 0, i64 0 +; CHECK: ret <4 x i32> %[[RESULT]] +; CHECK: } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll new file mode 100644 index 0000000000000..1ce3ddc2368c6 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll @@ -0,0 +1,77 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k memop_loop_dep -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @memop_loop_dep(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %i, i32 %e) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.addr.0 = phi i32 [ %i, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.addr.0, %e + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %call1 = call spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64 %call, i32 addrspace(1)* %in) + call spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32> %call1, i64 %call, i32 addrspace(1)* %out) + %0 = extractelement <4 x i32> %call1, i64 0 + %tobool = icmp ne i32 %0, 0 + %tobool2 = icmp eq i64 %call, 0 + %or.cond = and i1 %tobool2, %tobool + br i1 %or.cond, label %while.cond, label %for.inc + +while.cond: ; preds = %while.cond, %for.body + %tobool3 = icmp eq i64 %call, 0 + br i1 %tobool3, label %for.inc, label %while.cond + +for.inc: ; preds = %for.body, %while.cond + %inc = add nsw i32 %i.addr.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64, i32 addrspace(1)*) + +declare spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32>, i64, i32 addrspace(1)*) + +; CHECK: define spir_kernel void @__vecz_v4_memop_loop_dep + +; Check if we have the packetized and only the packetized version of the memop. +; Vecz should assert if this test fails, as we will not define the interleaved +; op with width of 1. +; Interleaved Group Combine gets rid of all the interleaved loads created by +; the re-vectorization process +; CHECK: load <4 x i32> +; CHECK: load <4 x i32> +; CHECK: load <4 x i32> +; CHECK: load <4 x i32> +; CHECK-NOT: call {{.*}}i32 @__vecz_b_interleaved_load4_ju3ptrU3AS1 + +; CHECK: ret void + +; Check if the declaration is missing as well +; CHECK-NOT: @__vecz_b_interleaved_load4_ju3ptrU3AS1 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/device-sg-size-auto.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/device-sg-size-auto.ll new file mode 100644 index 0000000000000..8bfa6cd569ea9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/device-sg-size-auto.ll @@ -0,0 +1,58 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Let vecz pick the right vectorization factor for this kernel +; RUN: veczc --vecz-auto -k foo -k bar --device-sg-sizes 6,7,8,9 -S < %s | FileCheck %s +; RUN: veczc --vecz-auto -k foo:4 -k bar:4 --device-sg-sizes 6,7,8,9 -S < %s | FileCheck %s + +; Check we auto-vectorize to 8, despite any other options telling us a +; different vectorization factor. A factor of 8 is 'best' here because it's a +; power of two. +; CHECK: define void @__vecz_v8_foo( +define void @foo(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { + %id = call i64 @__mux_get_global_id(i32 0) + %in.addr = getelementptr i32, ptr addrspace(1) %in, i64 %id + %x = load i32, ptr addrspace(1) %in.addr + %sglid = call i32 @__mux_get_sub_group_local_id() +; CHECK: = add <8 x i32> + %y = add i32 %x, %sglid + %out.addr = getelementptr i32, ptr addrspace(1) %out, i64 %id + store i32 %y, ptr addrspace(1) %out.addr + ret void +} + +; Check we auto-vectorize to 7, despite any other options telling us a +; different vectorization factor. A factor of 8 is 'best' here because it's a +; power of two, but a factor of 7 works well because it won't need a tail. +; CHECK: define void @__vecz_v7_bar( +define void @bar(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { + %id = call i64 @__mux_get_global_id(i32 0) + %in.addr = getelementptr i32, ptr addrspace(1) %in, i64 %id + %x = load i32, ptr addrspace(1) %in.addr + %sglid = call i32 @__mux_get_sub_group_local_id() +; CHECK: = add <7 x i32> + %y = add i32 %x, %sglid + %out.addr = getelementptr i32, ptr addrspace(1) %out, i64 %id + store i32 %y, ptr addrspace(1) %out.addr + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare i32 @__mux_get_sub_group_local_id() + +attributes #0 = { "mux-kernel"="entry-point" } + +!0 = !{i64 14, i64 1, i64 1} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll new file mode 100644 index 0000000000000..cca59611013d3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll @@ -0,0 +1,186 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=cfg-convert -S < %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK: define spir_kernel void @__vecz_v4_uniform_if_then_in_divergent_block( +; CHECK-SAME: ptr addrspace(1) %accum_ptr, i32 %threshold, ptr addrspace(1) %out) +define spir_kernel void @uniform_if_then_in_divergent_block(ptr addrspace(1) %accum_ptr, i32 %threshold, ptr addrspace(1) %out) #4 !reqd_work_group_size !10 { +; CHECK: entry: +; CHECK: [[CMP_NOT:%.*]] = icmp slt i32 %0, %threshold +; CHECK: %cmp.not.ROSCC = icmp eq i1 [[CMP_NOT]], false +; CHECK: %cmp.not.ROSCC_any = call i1 @__vecz_b_divergence_any(i1 %cmp.not.ROSCC) +; CHECK: br i1 %cmp.not.ROSCC_any, label %entry.ROSCC, label %entry.if.end17_crit_edge +entry: + %cosa = alloca float, align 4 + %call = tail call i64 @__mux_get_global_id(i32 0) #5 + %sext = mul i64 %call, 51539607552 + %idx.ext = ashr exact i64 %sext, 32 + %add.ptr = getelementptr inbounds i32, ptr addrspace(1) %accum_ptr, i64 %idx.ext + %0 = load i32, ptr addrspace(1) %add.ptr, align 4 + %cmp.not = icmp slt i32 %0, %threshold + br i1 %cmp.not, label %entry.if.end17_crit_edge, label %if.then + +; CHECK: entry.ROSCC: +; CHECK: [[CMP_NOT_NOT:%.*]] = xor i1 [[CMP_NOT]], true +; CHECK: br label %if.then + +entry.if.end17_crit_edge: ; preds = %entry + br label %if.end17 + +; Ensure that only active lanes (masked by %cmp.not.not) contribute towards the +; %or.cond branch. +; CHECK: if.then: +; CHECK: call void @__vecz_b_masked_store4_fu3ptrb(float 0.000000e+00, ptr %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %1 = call spir_func float @__vecz_b_masked__Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %2 = call float @__vecz_b_masked_load4_fu3ptrb(ptr %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %mul7 = fmul float %2, -2.950000e+01 +; CHECK: %cmp11 = fcmp uge float %mul7, 0.000000e+00 +; CHECK: %cmp14 = fcmp ult float %mul7, 6.400000e+01 +; CHECK: %or.cond = and i1 %cmp11, %cmp14 +; CHECK: %or.cond_active = select i1 [[CMP_NOT_NOT]], i1 %or.cond, i1 false +; CHECK: %or.cond_active_any = call i1 @__vecz_b_divergence_any(i1 %or.cond_active) +; CHECK: br i1 %or.cond_active_any, label %if.then.if.end_crit_edge, label %if.then16 +if.then: ; preds = %entry + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cosa) #6 + store float 0.000000e+00, ptr %cosa, align 4 + %call4 = call spir_func float @_Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa) #7 + %1 = load float, ptr %cosa, align 4 + %mul7 = fmul float %1, -2.950000e+01 + %cmp11 = fcmp uge float %mul7, 0.000000e+00 + %cmp14 = fcmp ult float %mul7, 6.400000e+01 + %or.cond = and i1 %cmp11, %cmp14 + br i1 %or.cond, label %if.then.if.end_crit_edge, label %if.then16 + +if.then.if.end_crit_edge: ; preds = %if.then + br label %if.end + +if.then16: ; preds = %if.then + %sext2 = shl i64 %call, 32 + %idxprom = ashr exact i64 %sext2, 32 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %out, i64 %idxprom + store float %mul7, ptr addrspace(1) %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %if.then.if.end_crit_edge, %if.then16 + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cosa) #6 + br label %if.end17 + +if.end17: ; preds = %entry.if.end17_crit_edge, %if.end + ret void +} + +define spir_kernel void @uniform_if_else_in_divergent_block(ptr addrspace(1) %accum_ptr, i32 %threshold, ptr addrspace(1) %out) #4 
!reqd_work_group_size !10 { +; CHECK: entry: +; CHECK: [[CMP_NOT:%.*]] = icmp slt i32 %0, %threshold +; CHECK: %cmp.not.ROSCC = icmp eq i1 [[CMP_NOT]], false +; CHECK: %cmp.not.ROSCC_any = call i1 @__vecz_b_divergence_any(i1 %cmp.not.ROSCC) +; CHECK: br i1 %cmp.not.ROSCC_any, label %entry.ROSCC, label %entry.if.end17_crit_edge +entry: + %cosa = alloca float, align 4 + %call = tail call i64 @__mux_get_global_id(i32 0) #5 + %sext = mul i64 %call, 51539607552 + %idx.ext = ashr exact i64 %sext, 32 + %add.ptr = getelementptr inbounds i32, ptr addrspace(1) %accum_ptr, i64 %idx.ext + %0 = load i32, ptr addrspace(1) %add.ptr, align 4 + %cmp.not = icmp slt i32 %0, %threshold + br i1 %cmp.not, label %entry.if.end17_crit_edge, label %if.then + +; CHECK: entry.ROSCC: +; CHECK: [[CMP_NOT_NOT:%.*]] = xor i1 [[CMP_NOT]], true +; CHECK: br label %if.then + +entry.if.end17_crit_edge: ; preds = %entry + br label %if.end17 + +; Ensure that only active lanes (masked by %cmp.not.not) contribute towards the +; %or.cond branch. +; CHECK: if.then: +; CHECK: call void @__vecz_b_masked_store4_fu3ptrb(float 0.000000e+00, ptr %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %1 = call spir_func float @__vecz_b_masked__Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %2 = call float @__vecz_b_masked_load4_fu3ptrb(ptr %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %mul7 = fmul float %2, -2.950000e+01 +; CHECK: %cmp11 = fcmp uge float %mul7, 0.000000e+00 +; CHECK: %cmp14 = fcmp ult float %mul7, 6.400000e+01 +; CHECK: %or.cond = and i1 %cmp11, %cmp14 +; CHECK: %or.cond_active = select i1 [[CMP_NOT_NOT]], i1 %or.cond, i1 false +; CHECK: %or.cond_active_any = call i1 @__vecz_b_divergence_any(i1 %or.cond_active) +; CHECK: br i1 %or.cond_active_any, label %if.else.crit_edge, label %if.then16 +if.then: ; preds = %entry + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cosa) #6 + store float 0.000000e+00, ptr %cosa, align 4 + %call4 = call spir_func float @_Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa) #7 + %1 = load float, ptr %cosa, align 4 + %mul7 = fmul float %1, -2.950000e+01 + %cmp11 = fcmp uge float %mul7, 0.000000e+00 + %cmp14 = fcmp ult float %mul7, 6.400000e+01 + %or.cond = and i1 %cmp11, %cmp14 + br i1 %or.cond, label %if.else.crit_edge, label %if.then16 + +if.else.crit_edge: ; preds = %if.then + br label %if.else + +if.then16: ; preds = %if.then + %sext2 = shl i64 %call, 32 + %idxprom = ashr exact i64 %sext2, 32 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %out, i64 %idxprom + store float %mul7, ptr addrspace(1) %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %idxprom + store float 1.0, ptr addrspace(1) %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then16 + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cosa) #6 + br label %if.end17 + +if.end17: ; preds = %entry.if.end17_crit_edge, %if.end + ret void +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind +declare spir_func float @_Z6sincosfPf(float, ptr) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: alwaysinline norecurse nounwind memory(read) +declare i64 @__mux_get_global_id(i32) #3 + +attributes #0 = { norecurse nounwind 
"mux-kernel"="entry-point" "mux-local-mem-usage"="0" "mux-no-subgroups" "mux-orig-fn"="get_lines" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "vecz-mode"="auto" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) "vecz-mode"="auto" } +attributes #2 = { nounwind "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="0" "stackrealign" "vecz-mode"="auto" } +attributes #3 = { alwaysinline norecurse nounwind memory(read) "vecz-mode"="auto" } +attributes #4 = { norecurse nounwind "mux-base-fn-name"="get_lines" "mux-kernel"="entry-point" "mux-local-mem-usage"="0" "mux-no-subgroups" "mux-orig-fn"="get_lines" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "vecz-mode"="auto" } +attributes #5 = { alwaysinline norecurse nounwind memory(read) } +attributes #6 = { nounwind } +attributes #7 = { nobuiltin nounwind "no-builtins" } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!2} +!opencl.spir.version = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 1, i32 2} +!10 = !{i32 2, i32 1, i32 1} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll new file mode 100644 index 0000000000000..f300fff8801f7 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_local_id(i32); +declare i32 @__mux_get_local_size(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %id = call i32 @__mux_get_local_id(i32 0) + %size = call i32 @__mux_get_local_size(i32 0) + br label %loop + +loop: + %index = phi i32 [0, %entry], [%inc, %loop] + %load = load i32, i32 addrspace(1)* %in + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %index + store i32 %load, i32 addrspace(1)* %slot + %inc = add i32 %index, 1 + %cmp = icmp ne i32 %inc, %id + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: loop: +; CHECK: %[[BITCAST:[0-9]+]] = bitcast <4 x i1> %loop.entry_mask{{[0-9]*}} to i4 +; CHECK: %[[MASK:[^ ]+]] = icmp ne i4 %[[BITCAST]], 0 +; CHECK: %[[LOAD:.+]] = call i32 @__vecz_b_masked_load4_ju3ptrU3AS1b(ptr addrspace(1) %in, i1 %[[MASK]]) +; CHECK: call void @__vecz_b_masked_store4_ju3ptrU3AS1b(i32 %[[LOAD]], ptr addrspace(1) %{{.+}}, i1 %[[MASK]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll new file mode 100644 index 0000000000000..2f720c7a49ec0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll @@ -0,0 +1,64 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_local_id(i32); +declare i32 @__mux_get_local_size(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %id = call i32 @__mux_get_local_id(i32 0) + %size = call i32 @__mux_get_local_size(i32 0) + br label %loop + +loop: + %index = phi i32 [0, %entry], [%inc, %nested_merge] + br label %koop + +koop: + %kndex = phi i32 [%index, %loop], [%knc, %koop] + %load = load i32, i32 addrspace(1)* %in + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %index + store i32 %load, i32 addrspace(1)* %slot + %knc = add i32 %kndex, 1 + %kmp = icmp ne i32 %knc, %id + br i1 %kmp, label %koop, label %nested_merge + +nested_merge: + %old = atomicrmw add i32 addrspace(1)* %in, i32 42 acq_rel + %inc = add i32 %index, 1 + %cmp = icmp ne i32 %inc, %size + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: koop: +; CHECK: %[[BITCAST:[0-9]+]] = bitcast <4 x i1> %koop.entry_mask{{[0-9]*}} to i4 +; CHECK: %[[MASK:[^ ]+]] = icmp ne i4 %[[BITCAST]], 0 +; CHECK: %[[LOAD:.+]] = call i32 @__vecz_b_masked_load4_ju3ptrU3AS1b(ptr addrspace(1) %in, i1 %[[MASK]]) +; CHECK: call void @__vecz_b_masked_store4_ju3ptrU3AS1b(i32 %[[LOAD]], ptr addrspace(1) %{{.+}}, i1 %[[MASK]]) +; CHECK: nested_merge: +; CHECK: atomicrmw add ptr addrspace(1) %in, i32 42 acq_rel +; CHECK: atomicrmw add ptr addrspace(1) %in, i32 42 acq_rel +; CHECK: atomicrmw add ptr addrspace(1) %in, i32 42 acq_rel +; CHECK: atomicrmw add ptr addrspace(1) %in, i32 42 acq_rel diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll new file mode 100644 index 0000000000000..a20dc32f71b38 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll @@ -0,0 +1,78 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k multiple_dimensions_0 -vecz-simd-width 4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +source_filename = "kernel.opencl" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_size(i32) #1 + +; Function Attrs: convergent nounwind +define spir_kernel void @multiple_dimensions_0(i32 addrspace(1)* %output) #2 { +entry: + %call.i = call i64 @__mux_get_global_id(i32 0) #3 + %call1.i = call i64 @__mux_get_global_size(i32 1) #3 + %mul.i = mul i64 %call1.i, %call.i + %call2.i = call i64 @__mux_get_global_size(i32 2) #3 + %mul3.i = mul i64 %mul.i, %call2.i + %call4.i = call i64 @__mux_get_global_id(i32 1) #3 + %mul6.i = mul i64 %call2.i, %call4.i + %add.i = add i64 %mul6.i, %mul3.i + %call7.i = call i64 @__mux_get_global_id(i32 2) #3 + %add8.i = add i64 %add.i, %call7.i + %conv = trunc i64 %add8.i to i32 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %add8.i + store i32 %conv, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!llvm.ident = !{!2} +!opencl.kernels = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{!"clang version 8.0.0 (https://github.com/llvm-mirror/clang.git bfbe338a893dde6ba65b2bed6ffea1652a592819) (https://github.com/llvm-mirror/llvm.git a99d6d2122ca2f208e1c4bcaf02ff5930f244f34)"} +!3 = !{void (i32 addrspace(1)*)* @multiple_dimensions_0, !4, !5, !6, !7, !8, !9} +!4 = !{!"kernel_arg_addr_space", i32 1} +!5 = !{!"kernel_arg_access_qual", !"none"} +!6 = !{!"kernel_arg_type", !"int*"} +!7 = !{!"kernel_arg_base_type", !"int*"} +!8 = !{!"kernel_arg_type_qual", !""} +!9 = !{!"kernel_arg_name", !"output"} + +; Function start +; CHECK: define spir_kernel void 
@__vecz_v4_multiple_dimensions_0 + +; make sure the stride calculation uses the correct operand of the multiply +; CHECK: %[[CALL1:.+]] = call i64 @__mux_get_global_size(i32 1) +; CHECK: %[[CALL2:.+]] = call i64 @__mux_get_global_size(i32 2) +; CHECK: %[[NEWMUL:.+]] = mul i64 %[[CALL1]], %[[CALL2]] +; CHECK: call void @__vecz_b_interleaved_store4_V_Dv4_ju3ptrU3AS1({{.+}} %[[NEWMUL]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll new file mode 100644 index 0000000000000..bc4270c9e2a8c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll @@ -0,0 +1,200 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k entry -vecz-passes="builtin-inlining,function(instcombine,early-cse),cfg-convert,packetizer" -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Laid out, this struct is 80 bytes +%struct.S2 = type { i16, [7 x i32], i32, <16 x i8>, [4 x i32] } + +; Function Attrs: norecurse nounwind +define spir_kernel void @entry(i64 addrspace(1)* %result, %struct.S2* %result2) { +entry: + %gid = call i64 @__mux_get_local_id(i32 0) + %sa = alloca %struct.S2, align 16 + %sb = alloca %struct.S2, align 16 + %sa_i8 = bitcast %struct.S2* %sa to i8* + %sb_i8 = bitcast %struct.S2* %sb to i8* + %sb_i8as = addrspacecast i8* %sb_i8 to i8 addrspace(1)* + %rsi = ptrtoint i64 addrspace(1)* %result to i64 + %rsit = trunc i64 %rsi to i8 + call void @llvm.memset.p0i8.i64(i8* %sa_i8, i8 %rsit, i64 80, i32 16, i1 false) + call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %sb_i8as, i8 0, i64 80, i32 16, i1 false) + %lr = addrspacecast %struct.S2* %result2 to %struct.S2 addrspace(1)* + %lri = bitcast %struct.S2 addrspace(1)* %lr to i64 addrspace(1)* + %cond = icmp eq i64 addrspace(1)* %result, %lri + br i1 %cond, label %middle, label %end + +middle: + call void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* %sb_i8as, i8* %sa_i8, i64 80, i32 16, i1 false) + br label %end + +end: + %g_343 = getelementptr inbounds %struct.S2, %struct.S2* %sa, i64 0, i32 0 + %g_343_load = load i16, i16* %g_343 + %g_343_zext = zext i16 %g_343_load to i64 + %resp = getelementptr i64, i64 addrspace(1)* %result, i64 %gid + store i64 %g_343_zext, i64 addrspace(1)* %resp, align 8 + %result2_i8 = bitcast %struct.S2* %result2 to i8* + call void @llvm.memcpy.p0i8.p1i8.i64(i8* %result2_i8, i8 addrspace(1)* %sb_i8as, i64 80, i32 16, i1 false) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) +declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i32, i1) + +; Function Attrs: argmemonly nounwind +declare void 
@llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture readonly, i64, i32, i1) +declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) + +declare i64 @__mux_get_local_id(i32) + +; Note: Between LLVM 17 and LLVM 18, optimizations to alignments were moved to +; their own pass. We don't run that pass here, resulting in a difference in +; alignment values between LLVM versions. Because of that, we don't check +; alignment of any loads or stores + +; Sanity checks: Make sure the non-vecz entry function is still in place and +; contains memset and memcpy. This is done in order to prevent future bafflement +; in case some pass optimizes them out. +; CHECK: define spir_kernel void @entry +; CHECK: entry: +; CHECK: call void @llvm.memset +; CHECK: call void @llvm.memset +; CHECK: middle: +; CHECK: call void @llvm.memcpy +; CHECK: end: +; CHECK: call void @llvm.memcpy + +; And now for the actual checks + +; Check if the kernel was vectorized +; CHECK: define spir_kernel void @__vecz_v{{[0-9]+}}_entry +; CHECK: %[[SB_I8AS:.*]] = addrspacecast ptr %sb to ptr addrspace(1) + +; Check if the memset and memcpy calls have been removed +; CHECK-NOT: call void @llvm.memset +; CHECK-NOT: call void @llvm.memcpy + +; Check if the calculation of the stored value for the second memset is in place +; CHECK: %ms64val + +; Check if the generated loads and stores are in place +; Check the stores for the first memset +; CHECK: store i64 %ms64val, ptr %sa +; CHECK: %[[V14:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 8 +; CHECK: store i64 %ms64val, ptr %[[V14]] +; CHECK: %[[V15:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 16 +; CHECK: store i64 %ms64val, ptr %[[V15]] +; CHECK: %[[V16:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 24 +; CHECK: store i64 %ms64val, ptr %[[V16]] +; CHECK: %[[V17:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 32 +; CHECK: store i64 %ms64val, ptr %[[V17]] +; CHECK: %[[V18:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 40 +; CHECK: store i64 %ms64val, ptr %[[V18]] +; CHECK: %[[V19:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 48 +; CHECK: store i64 %ms64val, ptr %[[V19]] +; CHECK: %[[V20:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 56 +; CHECK-EQ14: %[[V20:[0-9]+]] = getelementptr inbounds {{(nuw )?}}%struct.S2, %struct.S2* %sa, i64 0, i32 3, i64 8 +; CHECK: %[[V21:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 64 +; CHECK: %[[V22:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 72 + +; Check the stores for the second memset +; CHECK: store i64 0, ptr addrspace(1) %[[SB_I8AS]] +; CHECK: %[[V24:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 8 +; CHECK: store i64 0, ptr addrspace(1) %[[V24]] +; CHECK: %[[V26:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 16 +; CHECK: store i64 0, ptr addrspace(1) %[[V26]] +; CHECK: %[[V28:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 24 +; CHECK: store i64 0, ptr addrspace(1) %[[V28]] +; CHECK: %[[V30:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 32 +; CHECK: store i64 0, ptr addrspace(1) %[[V30]] +; CHECK: %[[V32:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 40 +; CHECK: store i64 0, ptr addrspace(1) %[[V32]] +; CHECK: %[[V33:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, 
ptr addrspace(1) %[[SB_I8AS]], i64 48 +; CHECK: store i64 0, ptr addrspace(1) %[[V33]] +; CHECK: %[[V35T:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 56 +; CHECK-EQ14: %[[V35T:[0-9]+]] = getelementptr inbounds {{(nuw )?}}%struct.S2, %struct.S2* %sb, i64 0, i32 3, i64 8 +; CHECK-EQ14: %[[V35:[0-9]+]] = bitcast i8* %[[V35T]] to i64* +; CHECK-EQ14: %[[SB_I8AS18:.+]] = addrspacecast i64* %[[V35]] to i64 addrspace(1)* +; CHECK: store i64 0, ptr addrspace(1) %[[V35T]] +; CHECK: %[[V36:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 64 +; CHECK: store i64 0, ptr addrspace(1) %[[V36]] +; CHECK: %[[V38:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 72 +; CHECK: store i64 0, ptr addrspace(1) %[[V38]] + + +; Check the loads and stores for the first memcpy +; CHECK:middle: ; preds = %entry +; CHECK: %[[SA_I822:.+]] = load i64, ptr %sa +; CHECK: store i64 %[[SA_I822]], ptr addrspace(1) %[[SB_I8AS]] +; CHECK: %[[SA_I824:.+]] = load i64, ptr %[[V14]] +; CHECK: store i64 %[[SA_I824]], ptr addrspace(1) %[[V24]] +; CHECK: %[[SA_I826:.+]] = load i64, ptr %[[V15]] +; CHECK: store i64 %[[SA_I826]], ptr addrspace(1) %[[V26]] +; CHECK: %[[SA_I828:.+]] = load i64, ptr %[[V16]] +; CHECK: store i64 %[[SA_I828]], ptr addrspace(1) %[[V28]] +; CHECK: %[[SA_I830:.+]] = load i64, ptr %[[V17]] +; CHECK: store i64 %[[SA_I830]], ptr addrspace(1) %[[V30]] +; CHECK: %[[SA_I832:.+]] = load i64, ptr %[[V18]] +; CHECK: store i64 %[[SA_I832]], ptr addrspace(1) %[[V32]] +; CHECK: %[[SA_I834:.+]] = load i64, ptr %[[V19]] +; CHECK: store i64 %[[SA_I834]], ptr addrspace(1) %[[V33]] +; CHECK: %[[SA_I836:.+]] = load i64, ptr %[[V20]] +; CHECK: store i64 %[[SA_I836]], ptr addrspace(1) %[[V35T]] +; CHECK: %[[SA_I838:.+]] = load i64, ptr %[[V21]] +; CHECK: store i64 %[[SA_I838]], ptr addrspace(1) %[[V36]] +; CHECK: %[[SA_I840:.+]] = load i64, ptr %[[V22]] +; CHECK: store i64 %[[SA_I840]], ptr addrspace(1) %[[V38]] + +; Check the loads and stores for the second memcpy +; CHECK:end: ; preds = %middle, %entry +; CHECK: %[[SB_I8AS42:.+]] = load i64, ptr addrspace(1) %[[SB_I8AS]] +; CHECK: store i64 %[[SB_I8AS42]], ptr %result2 +; CHECK: %[[V42:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 8 +; CHECK: %[[SB_I8AS44:.+]] = load i64, ptr addrspace(1) %[[V24]] +; CHECK: store i64 %[[SB_I8AS44]], ptr %[[V42]] +; CHECK: %[[V43:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 16 +; CHECK: %[[SB_I8AS46:.+]] = load i64, ptr addrspace(1) %[[V26]] +; CHECK: store i64 %[[SB_I8AS46]], ptr %[[V43]] +; CHECK: %[[V44:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 24 +; CHECK: %[[SB_I8AS48:.+]] = load i64, ptr addrspace(1) %[[V28]] +; CHECK: store i64 %[[SB_I8AS48]], ptr %[[V44]] +; CHECK: %[[V45:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 32 +; CHECK: %[[SB_I8AS50:.+]] = load i64, ptr addrspace(1) %[[V30]] +; CHECK: store i64 %[[SB_I8AS50]], ptr %[[V45]] +; CHECK: %[[V46:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 40 +; CHECK: %[[SB_I8AS52:.+]] = load i64, ptr addrspace(1) %[[V32]] +; CHECK: store i64 %[[SB_I8AS52]], ptr %[[V46]] +; CHECK: %[[V47:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 48 +; CHECK: %[[SB_I8AS54:.+]] = load i64, ptr addrspace(1) %[[V33]] +; CHECK: store i64 %[[SB_I8AS54]], ptr %[[V47]] +; CHECK: %[[V48:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 56 +; CHECK-EQ14: %[[V48:[0-9]+]] = 
getelementptr inbounds {{(nuw )?}}%struct.S2, %struct.S2* %result2, i64 0, i32 3, i64 8 +; CHECK: %[[SB_I8AS56:.+]] = load i64, ptr addrspace(1) %[[V35T]] +; CHECK: store i64 %[[SB_I8AS56]], ptr %[[V48]] +; CHECK: %[[V49:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 64 +; CHECK: %[[SB_I8AS58:.+]] = load i64, ptr addrspace(1) %[[V36]] +; CHECK: store i64 %[[SB_I8AS58]], ptr %[[V49]] +; CHECK: %[[V50:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 72 +; CHECK: %[[SB_I8AS60:.+]] = load i64, ptr addrspace(1) %[[V38]] +; CHECK: store i64 %[[SB_I8AS60]], ptr %[[V50]] + +; End of function +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll new file mode 100644 index 0000000000000..cf228937ec2bc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll @@ -0,0 +1,91 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k entry -vecz-passes=cfg-convert,packetizer -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Laid out, this struct is 80 bytes +%struct.S2 = type { i16, [7 x i32], i32, <16 x i8>, [4 x i32] } + +; Function Attrs: norecurse nounwind +define spir_kernel void @entry(i64 addrspace(1)* %result, %struct.S2* %result2) { +entry: + %gid = call i64 @__mux_get_local_id(i32 0) + %sa = alloca %struct.S2, align 16 + %sb = alloca %struct.S2, align 16 + %sa_i8 = bitcast %struct.S2* %sa to i8* + %sb_i8 = bitcast %struct.S2* %sb to i8* + %sb_i8as = addrspacecast i8* %sb_i8 to i8 addrspace(1)* + %rsi = ptrtoint i64 addrspace(1)* %result to i64 + %rsit = trunc i64 %rsi to i8 + call void @llvm.memset.p0i8.i64(i8* %sa_i8, i8 %rsit, i64 80, i32 4, i1 false) + call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %sb_i8as, i8 0, i64 80, i32 4, i1 false) + %lr = addrspacecast %struct.S2* %result2 to %struct.S2 addrspace(1)* + %lri = bitcast %struct.S2 addrspace(1)* %lr to i64 addrspace(1)* + %cond = icmp eq i64 addrspace(1)* %result, %lri + br i1 %cond, label %middle, label %end + +middle: + call void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* %sb_i8as, i8* %sa_i8, i64 80, i32 4, i1 false) + br label %end + +end: + %g_343 = getelementptr inbounds %struct.S2, %struct.S2* %sa, i64 0, i32 0 + %g_343_load = load i16, i16* %g_343 + %g_343_zext = zext i16 %g_343_load to i64 + %resp = getelementptr i64, i64 addrspace(1)* %result, i64 %gid + store i64 %g_343_zext, i64 addrspace(1)* %resp, align 8 + %result2_i8 = bitcast %struct.S2* %result2 to i8* + call void @llvm.memcpy.p0i8.p1i8.i64(i8* %result2_i8, i8 addrspace(1)* %sb_i8as, i64 80, i32 4, i1 false) + call void @llvm.memcpy.p0i8.p1i8.i64(i8* 
%result2_i8, i8 addrspace(1)* %sb_i8as, i64 80, i32 4, i1 false) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) +declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i32, i1) + +; Function Attrs: argmemonly nounwind +declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture readonly, i64, i32, i1) +declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) + +declare i64 @__mux_get_local_id(i32) + +; Sanity checks: Make sure the non-vecz entry function is still in place and +; contains memset and memcpy. This is done in order to prevent future bafflement +; in case some pass optimizes them out. +; CHECK: define spir_kernel void @entry +; CHECK: entry: +; CHECK: call void @llvm.memset +; CHECK: call void @llvm.memset +; CHECK: middle: +; CHECK: call void @llvm.memcpy +; CHECK: end: +; CHECK: call void @llvm.memcpy + +; And now for the actual checks + +; Check if the kernel was vectorized +; CHECK: define spir_kernel void @__vecz_v{{[0-9]+}}_entry + +; Check if the memset and memcpy calls are still there +; CHECK: call void @llvm.memset +; CHECK: call void @llvm.memcpy + +; End of function +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll new file mode 100644 index 0000000000000..ebf2ef88aa2c8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll @@ -0,0 +1,87 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +declare void @llvm.assume(i1) +declare i32 @llvm.expect.i32(i32, i32) + +; CHECK: define spir_kernel void @__vecz_v4_assume( +; CHECK: [[A:%.*]] = load <4 x i32>, ptr %arrayidxa, align 4 +; CHECK: [[B:%.*]] = load <4 x i32>, ptr %arrayidxb, align 4 +; CHECK: [[SUM:%.*]] = add <4 x i32> %0, %1 +; CHECK: [[CMP:%.*]] = icmp sgt <4 x i32> [[SUM]], zeroinitializer +; CHECK: [[E0:%.*]] = extractelement <4 x i1> [[CMP]], i64 0 +; CHECK: [[E1:%.*]] = extractelement <4 x i1> [[CMP]], i64 1 +; CHECK: [[E2:%.*]] = extractelement <4 x i1> [[CMP]], i64 2 +; CHECK: [[E3:%.*]] = extractelement <4 x i1> [[CMP]], i64 3 +; CHECK: call void @llvm.assume(i1 [[E0]]) +; CHECK: call void @llvm.assume(i1 [[E1]]) +; CHECK: call void @llvm.assume(i1 [[E2]]) +; CHECK: call void @llvm.assume(i1 [[E3]]) +; CHECK: store <4 x i32> [[SUM]], ptr %arrayidxz, align 4 +define spir_kernel void @assume(ptr %aptr, ptr %bptr, ptr %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, ptr %aptr, i64 %idx + %arrayidxb = getelementptr inbounds i32, ptr %bptr, i64 %idx + %arrayidxz = getelementptr inbounds i32, ptr %zptr, i64 %idx + %a = load i32, ptr %arrayidxa, align 4 + %b = load i32, ptr %arrayidxb, align 4 + %sum = add i32 %a, %b + %cond = icmp sgt i32 %sum, 0 + call void @llvm.assume(i1 %cond) + store i32 %sum, ptr %arrayidxz, align 4 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_expect( +; CHECK: [[A:%.*]] = load <4 x i32>, ptr %arrayidxa, align 4 +; CHECK: [[B:%.*]] = load <4 x i32>, ptr %arrayidxb, align 4 +; CHECK: [[SUM:%.*]] = add <4 x i32> %0, %1 +; CHECK: [[E0:%.*]] = extractelement <4 x i32> [[SUM]], i64 0 +; CHECK: [[E1:%.*]] = extractelement <4 x i32> [[SUM]], i64 1 +; CHECK: [[E2:%.*]] = extractelement <4 x i32> [[SUM]], i64 2 +; CHECK: [[E3:%.*]] = extractelement <4 x i32> [[SUM]], i64 3 +; CHECK: [[EX0:%.*]] = call i32 @llvm.expect.i32(i32 [[E0]], i32 42) +; CHECK: [[EX1:%.*]] = call i32 @llvm.expect.i32(i32 [[E1]], i32 42) +; CHECK: [[EX2:%.*]] = call i32 @llvm.expect.i32(i32 [[E2]], i32 42) +; CHECK: [[EX3:%.*]] = call i32 @llvm.expect.i32(i32 [[E3]], i32 42) +; CHECK: [[C0:%.*]] = insertelement <4 x i32> poison, i32 [[EX0]], i64 0 +; CHECK: [[C1:%.*]] = insertelement <4 x i32> [[C0]], i32 [[EX1]], i64 1 +; CHECK: [[C2:%.*]] = insertelement <4 x i32> [[C1]], i32 [[EX2]], i64 2 +; CHECK: [[C3:%.*]] = insertelement <4 x i32> [[C2]], i32 [[EX3]], i64 3 +; CHECK: store <4 x i32> [[C3]], ptr %arrayidxz, align 4 + +define spir_kernel void @expect(ptr %aptr, ptr %bptr, ptr %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, ptr %aptr, i64 %idx + %arrayidxb = getelementptr inbounds i32, ptr %bptr, i64 %idx + %arrayidxz = getelementptr inbounds i32, ptr %zptr, i64 %idx + %a = load i32, ptr %arrayidxa, align 4 + %b = load i32, ptr %arrayidxb, align 4 + %sum = add i32 %a, %b + %cond = icmp sgt i32 %sum, 0 + %v = call i32 @llvm.expect.i32(i32 %sum, i32 42) + store i32 %v, ptr %arrayidxz, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll new file mode 100644 index 
0000000000000..a9b0dbaad5388 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll @@ -0,0 +1,40 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k extract_constant_index -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @extract_constant_index(<4 x float> addrspace(1)* %in, i32 %x, float addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 4 + %vecext = extractelement <4 x float> %0, i32 0; + %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %vecext, float addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +; CHECK: define spir_kernel void @__vecz_v4_extract_constant_index +; CHECK: call <4 x float> @__vecz_b_interleaved_load4_4_Dv4 +; CHECK: getelementptr float +; CHECK: store <4 x float> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll new file mode 100644 index 0000000000000..4512408948dc3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll @@ -0,0 +1,50 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: nounwind +define spir_kernel void @extract_runtime_index(<4 x float> addrspace(1)* %in, i32 %x, float addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 4 + %vecext = extractelement <4 x float> %0, i32 %x + %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %vecext, float addrspace(1)* %arrayidx1, align 4 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_extract_runtime_index +; CHECK: load float, {{(ptr|float)}} +; CHECK: load float, {{(ptr|float)}} +; CHECK: load float, {{(ptr|float)}} +; CHECK: load float, {{(ptr|float)}} +; CHECK: icmp eq i32 0, %x +; CHECK: select i1 +; CHECK: icmp eq i32 1, %x +; CHECK: select i1 +; CHECK: icmp eq i32 2, %x +; CHECK: select i1 +; CHECK: icmp eq i32 3, %x +; CHECK: select i1 +; CHECK: store float +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll new file mode 100644 index 0000000000000..55d15033ecfc2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll @@ -0,0 +1,76 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S -vecz-passes="function(mem2reg,instcombine),cfg-convert,gvn,packetizer" < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%struct.testStruct = type { [2 x i32] } + + +; Check that we de-duplicate the GEPs used across this kernel (using a +; combination of instcombine and GVN). 
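+; Roughly what happens, for reference (not verbatim pass output): instcombine
+; folds the zero-index GEPs, so %x, %x1, %x3, %x6 and the element-0 array
+; GEPs all collapse to %myStruct itself, and GVN then merges the remaining
+; element-1 GEPs (%arrayidx2 and %arrayidx7) into a single instruction,
+; which newer LLVM prints as a byte-offset GEP along the lines of
+;   %arrayidx2 = getelementptr inbounds i8, ptr %myStruct, i64 4
+; hence the {{\[2 x i32]|i8}} and {{i64 0, i64 1|i64 4}} alternations below.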
+; CHECK: spir_kernel void @__vecz_v{{[0-9]+}}_gep_duplication +; CHECK: entry: +; CHECK: getelementptr inbounds {{(nuw )?}}{{\[2 x i32]|i8}}, ptr %myStruct, {{i64 0, i64 1|i64 4}} +; CHECK-NOT: getelementptr {{.*}}%myStruct +define spir_kernel void @gep_duplication(ptr addrspace(1) align 4 %out) { +entry: + %out.addr = alloca ptr addrspace(1), align 8 + %global_id = alloca i64, align 8 + %myStruct = alloca %struct.testStruct, align 4 + store ptr addrspace(1) %out, ptr %out.addr, align 8 + %call = call i64 @__mux_get_global_id(i32 0) #2 + store i64 %call, ptr %global_id, align 8 + %x = getelementptr inbounds %struct.testStruct, ptr %myStruct, i32 0, i32 0 + %arrayidx = getelementptr inbounds [2 x i32], ptr %x, i64 0, i64 0 + store i32 0, ptr %arrayidx, align 4 + %x1 = getelementptr inbounds %struct.testStruct, ptr %myStruct, i32 0, i32 0 + %arrayidx2 = getelementptr inbounds [2 x i32], ptr %x1, i64 0, i64 1 + store i32 1, ptr %arrayidx2, align 4 + %0 = load i64, ptr %global_id, align 8 + %and = and i64 %0, 1 + %tobool = icmp ne i64 %and, 0 + br i1 %tobool, label %if.then, label %if.else + +if.then: ; preds = %entry + %x3 = getelementptr inbounds %struct.testStruct, ptr %myStruct, i32 0, i32 0 + %arrayidx4 = getelementptr inbounds [2 x i32], ptr %x3, i64 0, i64 0 + %1 = load i32, ptr %arrayidx4, align 4 + %2 = load ptr addrspace(1), ptr %out.addr, align 8 + %3 = load i32, ptr %global_id, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx5 = getelementptr inbounds i32, ptr addrspace(1) %2, i64 %idxprom + store i32 %1, ptr addrspace(1) %arrayidx5, align 4 + br label %if.end + +if.else: ; preds = %entry + %x6 = getelementptr inbounds %struct.testStruct, ptr %myStruct, i32 0, i32 0 + %arrayidx7 = getelementptr inbounds [2 x i32], ptr %x6, i64 0, i64 1 + %4 = load i32, ptr %arrayidx7, align 4 + %5 = load ptr addrspace(1), ptr %out.addr, align 8 + %6 = load i32, ptr %global_id, align 4 + %idxprom8 = sext i32 %6 to i64 + %arrayidx9 = getelementptr inbounds i32, ptr addrspace(1) %5, i64 %idxprom8 + store i32 %4, ptr addrspace(1) %arrayidx9, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll new file mode 100644 index 0000000000000..005dbcaa64966 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll @@ -0,0 +1,53 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -vecz-passes=gep-elim -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i32:32-f80:128-n8:16:32:64-S128" +target triple = "spir-unknown-unknown" + +%struct.mystruct = type { [2 x i32], ptr } + +; Function Attrs: norecurse nounwind +define spir_kernel void @test(ptr addrspace(1) nocapture writeonly align 4 %output) { +entry: + %foo = alloca [4 x %struct.mystruct], align 4 + %call = tail call spir_func i32 @__mux_get_global_id(i32 0) + store i32 20, ptr %foo, align 4 + %arrayidx4 = getelementptr inbounds [2 x i32], ptr %foo, i32 0, i32 1 + store i32 22, ptr %arrayidx4, align 4 + %y31 = getelementptr inbounds %struct.mystruct, ptr %foo, i32 0, i32 1 + store ptr %foo, ptr %y31, align 4 + %mul = shl nuw nsw i32 %call, 2 + store i32 1, ptr %foo, align 4 + %0 = load ptr, ptr %y31, align 4 + %1 = load i32, ptr %0, align 4 + %add98 = add nsw i32 %mul, %1 + %arrayidx117 = getelementptr inbounds i32, ptr addrspace(1) %output, i32 %mul + store i32 %add98, ptr addrspace(1) %arrayidx117, align 4 + ret void +} + +declare i32 @__mux_get_global_id(i32) + +; CHECK: define spir_kernel void @__vecz_v4_test( + +; Make sure all three GEPs are retained +; CHECK: %arrayidx4 = getelementptr inbounds [2 x i32], ptr %foo, i32 0, i32 1 +; CHECK: %y31 = getelementptr inbounds %struct.mystruct, ptr %foo, i32 0, i32 1 +; CHECK: %arrayidx117 = getelementptr inbounds i32, ptr addrspace(1) %output, i32 %mul +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll new file mode 100644 index 0000000000000..b45e215814d49 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll @@ -0,0 +1,30 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(void (i32)*, i32) { +entry: + call void %0 (i32 %1) + ret void +} + +; This is really a check to see if opt crashed or not +; CHECK: define spir_kernel void @test( diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll new file mode 100644 index 0000000000000..ce041960424b9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll @@ -0,0 +1,141 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Check VECZ debug info for inlined DILocation metadata nodes + +; RUN: veczc -k functions_one -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = '/tmp/inlined_function.ll' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: alwaysinline +define spir_func i32 @k_one(i32 %x, i32 %y) #0 !dbg !4 { +entry: + call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !9, metadata !38), !dbg !39 + call void @llvm.dbg.value(metadata i32 %y, i64 0, metadata !10, metadata !38), !dbg !39 + %mul = mul nsw i32 %x, %y, !dbg !40 + ret i32 %mul, !dbg !40 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +; Function Attrs: nounwind +define spir_kernel void @functions_one(i32 addrspace(1)* %in1i, i32 addrspace(1)* %in2i, float addrspace(1)* %in1f, float addrspace(1)* %in2f, i32 addrspace(1)* %out1i, float addrspace(1)* %out1f) #2 !dbg !11 { +entry: + call void @llvm.dbg.value(metadata i32 addrspace(1)* %in1i, i64 0, metadata !18, metadata !38), !dbg !41 + call void @llvm.dbg.value(metadata i32 addrspace(1)* %in2i, i64 0, metadata !19, metadata !38), !dbg !41 + call void @llvm.dbg.value(metadata float addrspace(1)* %in1f, i64 0, metadata !20, metadata !38), !dbg !41 + call void @llvm.dbg.value(metadata float addrspace(1)* %in2f, i64 0, metadata !21, metadata !38), !dbg !41 + call void @llvm.dbg.value(metadata i32 addrspace(1)* %out1i, i64 0, metadata !22, metadata !38), !dbg !41 + call void @llvm.dbg.value(metadata float addrspace(1)* %out1f, i64 0, metadata !23, metadata !38), !dbg !41 + %call = call i64 @__mux_get_global_id(i32 0) #4, !dbg !42 + call void @llvm.dbg.value(metadata i64 %call, i64 0, metadata !24, metadata !38), !dbg !42 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in1i, i64 %call, !dbg !43 + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !43 + %arrayidx1 = getelementptr inbounds 
i32, i32 addrspace(1)* %in2i, i64 %call, !dbg !43 + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4, !dbg !43 + call void @llvm.dbg.value(metadata i32 %0, i64 0, metadata !9, metadata !38), !dbg !44 + call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !10, metadata !38), !dbg !44 + %mul.i = mul nsw i32 %0, %1, !dbg !46 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out1i, i64 %call, !dbg !43 + store i32 %mul.i, i32 addrspace(1)* %arrayidx3, align 4, !dbg !43 + ret void, !dbg !47 +} + +declare i64 @__mux_get_global_id(i32) #3 + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 + +attributes #0 = { alwaysinline } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nobuiltin } + +!llvm.dbg.cu = !{!0} +!opencl.kernels = !{!29} +!llvm.module.flags = !{!36} +!llvm.ident = !{!37} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.1 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2) +!1 = !DIFile(filename: "kernel.opencl", directory: "Aorta/vecz_build") +!2 = !{} +!3 = !{!4, !11} +!4 = distinct !DISubprogram(name: "k_one", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !8) +!5 = !DISubroutineType(types: !6) +!6 = !{!7, !7, !7} +!7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!8 = !{!9, !10} +!9 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !1, line: 1, type: !7) +!10 = !DILocalVariable(name: "y", arg: 2, scope: !4, file: !1, line: 1, type: !7) +!11 = distinct !DISubprogram(name: "functions_one", scope: !1, file: !1, line: 6, type: !12, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17) +!12 = !DISubroutineType(types: !13) +!13 = !{null, !14, !14, !15, !15, !14, !15} +!14 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64, align: 64) +!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64, align: 64) +!16 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float) +!17 = !{!18, !19, !20, !21, !22, !23, !24} +!18 = !DILocalVariable(name: "in1i", arg: 1, scope: !11, file: !1, line: 6, type: !14) +!19 = !DILocalVariable(name: "in2i", arg: 2, scope: !11, file: !1, line: 6, type: !14) +!20 = !DILocalVariable(name: "in1f", arg: 3, scope: !11, file: !1, line: 6, type: !15) +!21 = !DILocalVariable(name: "in2f", arg: 4, scope: !11, file: !1, line: 6, type: !15) +!22 = !DILocalVariable(name: "out1i", arg: 5, scope: !11, file: !1, line: 6, type: !14) +!23 = !DILocalVariable(name: "out1f", arg: 6, scope: !11, file: !1, line: 6, type: !15) +!24 = !DILocalVariable(name: "tid", scope: !11, file: !1, line: 7, type: !25) +!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !26, line: 33, baseType: !27) +!26 = !DIFile(filename: 
"Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "Aorta/vecz_build") +!27 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !26, line: 31, baseType: !28) +!28 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) +!29 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i32 addrspace(1)*, float addrspace(1)*)* @functions_one, !30, !31, !32, !33, !34, !35} +!30 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1} +!31 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"} +!32 = !{!"kernel_arg_type", !"int*", !"int*", !"float*", !"float*", !"int*", !"float*"} +!33 = !{!"kernel_arg_base_type", !"int*", !"int*", !"float*", !"float*", !"int*", !"float*"} +!34 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""} +!35 = !{!"reqd_work_group_size", i32 32, i32 1, i32 1} +!36 = !{i32 2, !"Debug Info Version", i32 3} +!37 = !{!"clang version 3.8.1 "} +!38 = !DIExpression() +!39 = !DILocation(line: 1, scope: !4) +!40 = !DILocation(line: 2, scope: !4) +!41 = !DILocation(line: 6, scope: !11) +!42 = !DILocation(line: 7, scope: !11) +!43 = !DILocation(line: 8, scope: !11) +!44 = !DILocation(line: 1, scope: !4, inlinedAt: !45) +!45 = distinct !DILocation(line: 8, scope: !11) +!46 = !DILocation(line: 2, scope: !4, inlinedAt: !45) +!47 = !DILocation(line: 9, scope: !11) + +; CHECK: spir_func i32 @k_one +; CHECK-SAME: !dbg [[HELPER_DI:![0-9]+]] + +; CHECK: define spir_kernel void @__vecz_v4_functions_one +; CHECK-SAME: !dbg [[KERN_DI:![0-9]+]] + +; CHECK: %[[LOAD1:[0-9]+]] = load i32, ptr addrspace(1) %{{.*}}, align 4 +; CHECK: %[[LOAD2:[0-9]+]] = load i32, ptr addrspace(1) %{{.*}}, align 4 +; CHECK: #dbg_value(i32 %[[LOAD1]], !{{[0-9]+}}, !DIExpression(), [[DI_LOC1:![0-9]+]] +; CHECK: #dbg_value(i32 %[[LOAD2]], !{{[0-9]+}}, !DIExpression(), [[DI_LOC1]] +; CHECK: %{{.*}} = mul nsw i32 %[[LOAD1]], %[[LOAD2]], !dbg [[DI_LOC2:![0-9]+]] + +; CHECK: [[HELPER_SUBPROGRAM:![0-9]+]] = distinct !DISubprogram(name: "k_one", + +; CHECK: [[DI_LOC1]] = !DILocation(line: 1, scope: [[HELPER_SUBPROGRAM]], inlinedAt: [[DI_INLINED_AT:![0-9]+]]) +; CHECK: [[DI_INLINED_AT]] = distinct !DILocation(line: 8, +; CHECK: [[DI_LOC2]] = !DILocation(line: 2, scope: [[HELPER_SUBPROGRAM]], inlinedAt: [[DI_INLINED_AT]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll new file mode 100644 index 0000000000000..24947313c290b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll @@ -0,0 +1,148 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Regression test for debug info bug related to creating llvm.dbg.value +; intrinsics across all lanes even when scalarization masks disable some +; of the lanes. This occurs when we scalarize insertelement instructions. + +; RUN: veczc -k unaligned_load -vecz-passes="function(instcombine,adce),scalarize,packetizer,instcombine" -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK: define spir_kernel void @__vecz_v4_unaligned_load +define spir_kernel void @unaligned_load(i32 addrspace(1)* %in, i32 addrspace(1)* %offsets, i32 addrspace(1)* %out) #0 !dbg !7 { +entry: + %in.addr = alloca i32 addrspace(1)*, align 8 + %offsets.addr = alloca i32 addrspace(1)*, align 8 + %out.addr = alloca i32 addrspace(1)*, align 8 +; CHECK: %tmp = alloca <16 x i32>, align 16 + %tid = alloca i32, align 4 + %tmp = alloca <3 x i32>, align 16 + store i32 addrspace(1)* %in, i32 addrspace(1)** %in.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in.addr, metadata !11, metadata !29), !dbg !30 + store i32 addrspace(1)* %offsets, i32 addrspace(1)** %offsets.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %offsets.addr, metadata !12, metadata !29), !dbg !30 + store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30 + call void @llvm.dbg.declare(metadata i32* %tid, metadata !14, metadata !29), !dbg !31 + %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31 + %conv = trunc i64 %call to i32, !dbg !31 + store i32 %conv, i32* %tid, align 4, !dbg !31 + call void @llvm.dbg.declare(metadata <3 x i32>* %tmp, metadata !15, metadata !29), !dbg !32 + %0 = load i32 addrspace(1)*, i32 addrspace(1)** %in.addr, align 8, !dbg !32 +; CHECK: %[[TMP_LD:.+]] = call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr(ptr nonnull %tmp) +; FIXME: This llvm.dbg.value marks a 'kill location' and denotes the +; termination of the previous value assigned to %tmp - we could probably do +; better here by manifesting a vectorized value? 
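+; (Context: a #dbg_value whose operand is poison acts as a "kill" record; it
+; tells debug-info consumers that no location is available for the variable
+; from this point onwards. A vectorized alternative would look something like
+;   #dbg_value(<16 x i32> %packetized.tmp, !15, !DIExpression(), !32)
+; where %packetized.tmp is a hypothetical name for the widened value; vecz
+; does not currently emit such a record.)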
+; CHECK: #dbg_value(i32 poison, [[VAR:![0-9]+]],
+; CHECK-SAME: !DIExpression({{.*}}),
+; CHECK-SAME: !{{[0-9]+}}
+  %1 = load i32, i32* %tid, align 4, !dbg !32
+  %mul = mul nsw i32 3, %1, !dbg !32
+  %idx.ext = sext i32 %mul to i64, !dbg !32
+  %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 %idx.ext, !dbg !32
+  %call1 = call spir_func <3 x i32> @_Z6vload3mPKU3AS1i(i64 0, i32 addrspace(1)* %add.ptr) #3, !dbg !32
+  %extractVec = shufflevector <3 x i32> %call1, <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>, !dbg !32
+  %storetmp = bitcast <3 x i32>* %tmp to <4 x i32>*, !dbg !32
+  store <4 x i32> %extractVec, <4 x i32>* %storetmp, align 16, !dbg !32
+  %2 = load <3 x i32>, <3 x i32>* %tmp, align 16, !dbg !33
+  %3 = extractelement <3 x i32> %2, i64 0, !dbg !33
+  %4 = load i32, i32* %tid, align 4, !dbg !33
+  %mul2 = mul nsw i32 3, %4, !dbg !33
+  %idxprom = sext i32 %mul2 to i64, !dbg !33
+  %5 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !33
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %5, i64 %idxprom, !dbg !33
+  store i32 %3, i32 addrspace(1)* %arrayidx, align 4, !dbg !33
+  %6 = load <3 x i32>, <3 x i32>* %tmp, align 16, !dbg !34
+  %7 = extractelement <3 x i32> %6, i64 1, !dbg !34
+  %8 = load i32, i32* %tid, align 4, !dbg !34
+  %mul3 = mul nsw i32 3, %8, !dbg !34
+  %add = add nsw i32 %mul3, 1, !dbg !34
+  %idxprom4 = sext i32 %add to i64, !dbg !34
+  %9 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !34
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %idxprom4, !dbg !34
+  store i32 %7, i32 addrspace(1)* %arrayidx5, align 4, !dbg !34
+  %10 = load <3 x i32>, <3 x i32>* %tmp, align 16, !dbg !35
+  %11 = extractelement <3 x i32> %10, i64 2, !dbg !35
+  %12 = load i32, i32* %tid, align 4, !dbg !35
+  %mul6 = mul nsw i32 3, %12, !dbg !35
+  %add7 = add nsw i32 %mul6, 2, !dbg !35
+  %idxprom8 = sext i32 %add7 to i64, !dbg !35
+  %13 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !35
+  %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %13, i64 %idxprom8, !dbg !35
+  store i32 %11, i32 addrspace(1)* %arrayidx9, align 4, !dbg !35
+  ret void, !dbg !36
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare i64 @__mux_get_global_id(i32) #2
+
+declare spir_func <3 x i32> @_Z6vload3mPKU3AS1i(i64, i32 addrspace(1)*) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nobuiltin }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!21}
+!llvm.module.flags = !{!27}
+!llvm.ident = !{!28}
+
+; Now check we're actually looking at the right variable.
+; CHECK: [[VAR]] = !DILocalVariable(name: "tmp", + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.1 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !3) +!1 = !DIFile(filename: "kernel.opencl", directory: "/home/Aorta/vecz_build") +!2 = !{} +!3 = !{!4} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64, align: 64) +!5 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!6 = !{!7} +!7 = distinct !DISubprogram(name: "unaligned_load", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !10) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !4, !4, !4} +!10 = !{!11, !12, !13, !14, !15} +!11 = !DILocalVariable(name: "in", arg: 1, scope: !7, file: !1, line: 1, type: !4) +!12 = !DILocalVariable(name: "offsets", arg: 2, scope: !7, file: !1, line: 1, type: !4) +!13 = !DILocalVariable(name: "out", arg: 3, scope: !7, file: !1, line: 1, type: !4) +!14 = !DILocalVariable(name: "tid", scope: !7, file: !1, line: 2, type: !5) +!15 = !DILocalVariable(name: "tmp", scope: !7, file: !1, line: 3, type: !16) +!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "int3", file: !17, line: 64, baseType: !18) +!17 = !DIFile(filename: "/home//Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "/home/Aorta/vecz_build") +!18 = !DICompositeType(tag: DW_TAG_array_type, baseType: !5, size: 128, align: 128, flags: DIFlagVector, elements: !19) +!19 = !{!20} +!20 = !DISubrange(count: 3) +!21 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*)* @unaligned_load, !22, !23, !24, !25, !26} +!22 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1} +!23 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!24 = !{!"kernel_arg_type", !"int*", !"int*", !"int*"} +!25 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*"} +!26 = !{!"kernel_arg_type_qual", !"", !"", !""} +!27 = !{i32 2, !"Debug Info Version", i32 3} +!28 = !{!"clang version 3.8.1 "} +!29 = !DIExpression() +!30 = !DILocation(line: 1, scope: !7) +!31 = !DILocation(line: 2, scope: !7) +!32 = !DILocation(line: 3, scope: !7) +!33 = !DILocation(line: 4, scope: !7) +!34 = !DILocation(line: 5, scope: !7) +!35 = !DILocation(line: 6, scope: !7) +!36 = !DILocation(line: 7, scope: !7) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll new file mode 100644 index 0000000000000..0ecccdb14e767 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll @@ -0,0 +1,48 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k constant_index -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @constant_index(<4 x i32>* %in, <4 x i32>* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call + %0 = load <4 x i32>, <4 x i32>* %arrayidx + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call + %vecins = insertelement <4 x i32> %0, i32 42, i32 2 + store <4 x i32> %vecins, <4 x i32>* %arrayidx2 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_constant_index + +; We should only have 3 loads since one of the elements will be replaced +; CHECK: call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr +; CHECK: call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr +; CHECK: call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr + +; We should have four stores, one of which would use the constant given +; CHECK: store <4 x i32> +; CHECK: store <4 x i32> +; CHECK: store <4 x i32> +; CHECK: store <4 x i32> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll new file mode 100644 index 0000000000000..146f7d15f0d0d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k runtime_index -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @runtime_index(<4 x i32>* %in, <4 x i32>* %out, i32* %index) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call + %0 = load <4 x i32>, <4 x i32>* %arrayidx + %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call + store <4 x i32> %0, <4 x i32>* %arrayidx1 + %arrayidx2 = getelementptr inbounds i32, i32* %index, i64 %call + %1 = load i32, i32* %arrayidx2 + %arrayidx3 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call + %vecins = insertelement <4 x i32> %0, i32 42, i32 %1 + store <4 x i32> %vecins, <4 x i32>* %arrayidx3 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_runtime_index + +; Four icmps and selects +; CHECK: icmp eq <4 x i32> %{{.+}}, zeroinitializer +; CHECK: select <4 x i1> %{{.+}}, <4 x i32> {{<(i32 42(, )?)+>|splat \(i32 42\)}} +; CHECK: icmp eq <4 x i32> %{{.+}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}} +; CHECK: select <4 x i1> %{{.+}}, <4 x i32> {{<(i32 42(, )?)+>|splat \(i32 42\)}} +; CHECK: icmp eq <4 x i32> %{{.+}}, {{<(i32 2(, )?)+>|splat \(i32 2\)}} +; CHECK: select <4 x i1> %{{.+}}, <4 x i32> {{<(i32 42(, )?)+>|splat \(i32 42\)}} +; CHECK: icmp eq <4 x i32> %{{.+}}, {{<(i32 3(, )?)+>|splat \(i32 3\)}} +; CHECK: select <4 x i1> %{{.+}}, <4 x i32> {{<(i32 42(, )?)+>|splat \(i32 42\)}} + +; Four stores +; CHECK: store <4 x i32> +; CHECK: store <4 x i32> +; CHECK: store <4 x i32> +; CHECK: store <4 x i32> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll new file mode 100644 index 0000000000000..990b7cdcec49f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll @@ -0,0 +1,95 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width 4 -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @test(half addrspace(1)* nocapture readonly %p, float addrspace(1)* nocapture %f) local_unnamed_addr #0 { +entry: + %data = alloca [1 x i16], align 2 + %0 = bitcast [1 x i16]* %data to i8* + %arraydecay = getelementptr inbounds [1 x i16], [1 x i16]* %data, i64 0, i64 0 + %1 = bitcast [1 x i16]* %data to half* + %call = tail call i64 @__mux_get_global_id(i32 0) #5 + %arrayidx7 = getelementptr inbounds half, half addrspace(1)* %p, i64 %call + %arrayidx = bitcast half addrspace(1)* %arrayidx7 to i16 addrspace(1)* + %2 = load i16, i16 addrspace(1)* %arrayidx, align 2, !tbaa !9 + store i16 %2, i16* %arraydecay, align 2, !tbaa !9 + %call2 = call spir_func float @_Z11vloada_halfmPKDh(i64 0, half* nonnull %1) #6 + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %f, i64 %call + store float %call2, float addrspace(1)* %arrayidx3, align 4, !tbaa !13 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) local_unnamed_addr #2 + +; Function Attrs: convergent nounwind +declare spir_func float @_Z11vloada_halfmPKDh(i64, half*) local_unnamed_addr #3 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } +attributes #5 = { convergent nobuiltin nounwind readonly } +attributes #6 = { convergent nobuiltin nounwind } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} +!host.build_options = !{!8} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (half addrspace(1)*, float addrspace(1)*)* @test, !3, !4, !5, !6, !7} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 1} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"half*", !"float*"} +!6 = !{!"kernel_arg_base_type", !"half*", !"float*"} +!7 = !{!"kernel_arg_type_qual", !"const", !""} +!8 
= !{!""} +!9 = !{!10, !10, i64 0} +!10 = !{!"short", !11, i64 0} +!11 = !{!"omnipotent char", !12, i64 0} +!12 = !{!"Simple C/C++ TBAA"} +!13 = !{!14, !14, i64 0} +!14 = !{!"float", !11, i64 0} + +; This test checks that an instantiated call with a constant operand gets +; that operand instantiated (packet-broadcast) correctly instead of causing the +; instantiation of the call to fail, thereby causing the packetization of the +; store to fail. +; CHECK: define spir_kernel void @__vecz_v4_test + +; CHECK: %[[C0:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}}) +; CHECK: %[[C1:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}}) +; CHECK: %[[C2:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}}) +; CHECK: %[[C3:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}}) +; CHECK: %[[G0:.+]] = insertelement <4 x float> poison, float %[[C0]], {{(i32|i64)}} 0 +; CHECK: %[[G1:.+]] = insertelement <4 x float> %[[G0]], float %[[C1]], {{(i32|i64)}} 1 +; CHECK: %[[G2:.+]] = insertelement <4 x float> %[[G1]], float %[[C2]], {{(i32|i64)}} 2 +; CHECK: %[[G3:.+]] = insertelement <4 x float> %[[G2]], float %[[C3]], {{(i32|i64)}} 3 +; CHECK: store <4 x float> %[[G3]], ptr addrspace(1) %{{.+}} +; CHECK-NOT: store float + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll new file mode 100644 index 0000000000000..5f95b1edde16f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll @@ -0,0 +1,73 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -w 4 -vecz-passes=cfg-convert,packetizer -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK: @.str = private unnamed_addr addrspace(2) constant [8 x i8] c"blah %d\00", align 1
+@.str = private unnamed_addr addrspace(2) constant [8 x i8] c"blah %d\00", align 1
+@.strf = private unnamed_addr addrspace(2) constant [7 x i8] c"%#16A\0A\00", align 1
+
+; Function Attrs: nounwind
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_printf_kernel(
+; CHECK: if.then:
+; CHECK: [[ELT0:%.*]] = extractelement
+; CHECK: [[ELT1:%.*]] = extractelement
+; CHECK: [[ELT2:%.*]] = extractelement
+; CHECK: [[ELT3:%.*]] = extractelement
+; CHECK: = call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2jb(ptr addrspace(2) @.str, i32 [[ELT0]]
+; CHECK: = call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2jb(ptr addrspace(2) @.str, i32 [[ELT1]]
+; CHECK: = call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2jb(ptr addrspace(2) @.str, i32 [[ELT2]]
+; CHECK: = call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2jb(ptr addrspace(2) @.str, i32 [[ELT3]]
+; CHECK: ret void
+define spir_kernel void @printf_kernel(i32 addrspace(1)* %in, i32 addrspace(1)* %stridesX, i32 addrspace(1)* %dst, i32 %width, i32 %height) #0 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0) #3
+  %cmp = icmp eq i64 %call, 13
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(2)* @.str, i64 0, i64 0), i32 %0) #3
+  br label %if.end
+
+if.end: ; preds = %if.then, %entry
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32) #1
+
+declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) 
#1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*, i32, i32)* @printf_kernel, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 0, i32 0} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"int*", !"int*", !"int*", !"int", !"int"} +!4 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*", !"int", !"int"} +!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !""} +!6 = !{!"clang version 3.8.0 "} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll new file mode 100644 index 0000000000000..ec254f12ab85f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll @@ -0,0 +1,87 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k load16 -vecz-simd-width 4 -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-p:32:32-f64:64-i64:64-v128:64-v64:64-v32:32-v16:16-n8:16:32-S64" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) #0 !shave_original_kernel !10 { +entry: + %call = call i32 @__mux_get_global_id(i32 0) #2 + %call1 = call i32 @__mux_get_global_id(i32 1) #2 + %mul = mul nsw i32 %call1, %stride + %add = add nsw i32 %mul, %call + %mul2 = shl nsw i32 %add, 1 + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i32 %mul2 + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %mul3 = mul nsw i32 %call1, %stride + %add4 = add nsw i32 %mul3, %call + %mul5 = shl nsw i32 %add4, 1 + %add6 = add i32 %mul5, 3 + %arrayidx7 = getelementptr inbounds i8, i8 addrspace(1)* %in, i32 %add6 + %1 = load i8, i8 addrspace(1)* %arrayidx7, align 1 + %add9 = add i8 %1, %0 + %mul11 = mul nsw i32 %call1, %stride + %add12 = add nsw i32 %mul11, %call + %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i32 %add12 + store i8 %add9, i8 addrspace(1)* %arrayidx13, align 1 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i32 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!llvm.ident = !{!2} +!opencl.kernels = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{!"clang version 7.0.0 (tags/RELEASE_700/final) (based on LLVM 7.0.0)"} +!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i32)* @load16, !4, !5, !6, !7, !8, !9} +!4 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 0} +!5 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!6 = !{!"kernel_arg_type", !"uchar*", !"uchar*", !"int"} +!7 = !{!"kernel_arg_base_type", !"uchar*", !"uchar*", !"int"} +!8 = !{!"kernel_arg_type_qual", !"", !"", !""} +!9 = !{!"kernel_arg_name", !"out", !"in", !"stride"} +!10 = !{!"load16"} + +; Function start +; CHECK: define spir_kernel void @__vecz_v4_load16 + +; There should be exactly 2 interleaved loads in the code +; CHECK: call <4 x i8> @__vecz_b_interleaved_load1_2_Dv4_hu3ptrU3AS1 +; CHECK: call <4 x i8> @__vecz_b_interleaved_load1_2_Dv4_hu3ptrU3AS1 + +; There shouldn't be any more interleaved loads or stores left +; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load + +; There definitely shouldn't be any 
gather loads +; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load + +; Function end +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll new file mode 100644 index 0000000000000..d2fda25173763 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll @@ -0,0 +1,66 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc --vecz-passes=interleave-combine-loads -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; This test checks that we can optimize interleaved accesses out of order. + +define dso_local spir_kernel void @interleaved_load_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %conv, %mul + %mul3 = shl nsw i32 %add, 1 + ; LLVM will not generate an add, but the precise form of the or instruction + ; that gets generated depends on the LLVM version. + ; LLVM 17-: %add4 = or i32 %mul3, 1 + ; LLVM 18+: %add4 = or disjoint i32 %mul3, 1 + ; The LLVM 17 form is not recognized as an add by LLVM 18, and the LLVM 18 + ; form uses a flag which does not exist in LLVM 17. As this is not the + ; purpose of the test, use an add instruction here for now, and revisit this + ; once our minimum version of LLVM is LLVM 18. 
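+; (For reference: the `disjoint` flag asserts that the two operands have no
+; set bits in common, which is what makes the `or` equivalent to an `add`
+; here: %mul3 comes from a left shift by one, so its low bit is always zero.)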
+ %add4 = add nsw nuw i32 %mul3, 1 + %idxprom = sext i32 %add4 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %0 = call <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)* %arrayidx) + %idxprom8 = sext i32 %mul3 to i64 + %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom8 + %1 = call <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)* %arrayidx9) + %sub1 = sub nsw <4 x i32> %0, %1 + %idxprom12 = sext i32 %add to i64 + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom12 + %2 = bitcast i32 addrspace(1)* %arrayidx13 to <4 x i32> addrspace(1)* + store <4 x i32> %sub1, <4 x i32> addrspace(1)* %2, align 4 + ret void +} + +; CHECK: __vecz_v4_interleaved_load_4( +; CHECK: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[PTR:%.*]], align 4 +; CHECK: [[TMP2:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i32 4 +; CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 4 +; CHECK: %deinterleave = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK: %deinterleave1 = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK: %sub1 = sub nsw <4 x i32> %deinterleave1, %deinterleave + + +declare i64 @__mux_get_global_id(i32) +declare <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)*) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll new file mode 100644 index 0000000000000..9af442b68e1a2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll @@ -0,0 +1,95 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-simd-width 4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"} +!4 = !{!"kernel_arg_base_type", !"double 
__attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"}
+!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !""}
+!6 = !{!"clang version 3.8.1 "}
+
+; Function start
+; CHECK: define spir_kernel void @__vecz_v4_f
+; CHECK: call i64 @__mux_get_global_id(i32 0)
+
+; There should be exactly 4 interleaved loads and one store in the code
+; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
+; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
+
+; And in between them there should be a barrier call
+; CHECK: call void @__mux_work_group_barrier
+; CHECK: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> {{<(double 1.600000e\+01(, )?)+>|splat \(double 1.600000e\+01\)}}
+; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
+; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
+
+; There shouldn't be any more interleaved loads or stores left
+; CHECK-NOT: call <4 x double> @__vecz_b_interleaved_load4_Dv4_du3ptrU3AS1
+; CHECK-NOT: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> {{<(double 1.600000e\+01(, )?)+>|splat \(double 1.600000e\+01\)}}
+
+; There should be some shufflevector instructions after the simplification
+; CHECK: shufflevector
+
+; Function end
+; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll new file mode 100644 index 0000000000000..0b94abd180c31 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll @@ -0,0 +1,207 @@ +; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k ctpop -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix CTPOP +; RUN: veczc -k ctlz -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix CTLZ +; RUN: veczc -k cttz -vecz-simd-width=8 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix CTTZ +; RUN: veczc -k sadd_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix SADD_SAT +; RUN: veczc -k uadd_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix UADD_SAT +; RUN: veczc -k ssub_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix SSUB_SAT +; RUN: veczc -k usub_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix USUB_SAT + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; It checks that the scalar intrinsics get vectorized, +; and the vector intrinsics get scalarized and then re-vectorized. + +define spir_kernel void @ctpop(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %ctpopi32 = call i32 @llvm.ctpop.i32(i32 %a) + %ctpopv2i8 = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %b) + store i32 %ctpopi32, i32* %arrayidxy, align 4 + store <2 x i8> %ctpopv2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @ctlz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %ctlzi32 = call i32 @llvm.ctlz.i32(i32 %a, i1 false) + %ctlzv2i8 = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %b, i1 false) + store i32 %ctlzi32, i32* %arrayidxy, align 4 + store <2 x i8> %ctlzv2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @cttz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %cttzi32 = call i32 @llvm.cttz.i32(i32 %a, i1 false) + %cttzv2i8 = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %b, i1 false) + store i32 %cttzi32, i32* %arrayidxy, align 4 + store <2 x i8> %cttzv2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @sadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + 
%arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @uadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @ssub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.ssub.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @usub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.usub.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +declare i32 @llvm.ctpop.i32(i32) +declare <2 x i8> @llvm.ctpop.v2i8(<2 x i8>) + +declare i32 @llvm.ctlz.i32(i32, i1) +declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) + +declare i32 @llvm.cttz.i32(i32, i1) +declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1) + +declare i32 
@llvm.sadd.sat.i32(i32, i32) +declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i32 @llvm.uadd.sat.i32(i32, i32) +declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i32 @llvm.ssub.sat.i32(i32, i32) +declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i32 @llvm.usub.sat.i32(i32, i32) +declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i64 @__mux_get_global_id(i32) + +; CTPOP: void @__vecz_v2_ctpop +; CTPOP: = call {{.*}}<2 x i32> @llvm.ctpop.v2i32(<2 x i32> %{{.*}}) +; CTPOP: = call {{.*}}<2 x i8> @llvm.ctpop.v2i8(<2 x i8> %{{.*}}) +; CTPOP: = call {{.*}}<2 x i8> @llvm.ctpop.v2i8(<2 x i8> %{{.*}}) + +; CTLZ: void @__vecz_v4_ctlz +; CTLZ: = call {{.*}}<4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false) +; CTLZ: = call {{.*}}<4 x i8> @llvm.ctlz.v4i8(<4 x i8> %{{.*}}, i1 false) +; CTLZ: = call {{.*}}<4 x i8> @llvm.ctlz.v4i8(<4 x i8> %{{.*}}, i1 false) + +; CTTZ: void @__vecz_v8_cttz +; CTTZ: = call {{.*}}<8 x i32> @llvm.cttz.v8i32(<8 x i32> %{{.*}}, i1 false) +; CTTZ: = call {{.*}}<8 x i8> @llvm.cttz.v8i8(<8 x i8> %{{.*}}, i1 false) +; CTTZ: = call {{.*}}<8 x i8> @llvm.cttz.v8i8(<8 x i8> %{{.*}}, i1 false) + +; SADD_SAT: void @__vecz_v2_sadd_sat +; SADD_SAT: = call <2 x i32> @llvm.sadd.sat.v2i32( +; SADD_SAT: = call <2 x i8> @llvm.sadd.sat.v2i8( +; SADD_SAT: = call <2 x i8> @llvm.sadd.sat.v2i8( + +; UADD_SAT: void @__vecz_v2_uadd_sat +; UADD_SAT: = call <2 x i32> @llvm.uadd.sat.v2i32( +; UADD_SAT: = call <2 x i8> @llvm.uadd.sat.v2i8( +; UADD_SAT: = call <2 x i8> @llvm.uadd.sat.v2i8( + +; SSUB_SAT: void @__vecz_v2_ssub_sat +; SSUB_SAT: = call <2 x i32> @llvm.ssub.sat.v2i32( +; SSUB_SAT: = call <2 x i8> @llvm.ssub.sat.v2i8( +; SSUB_SAT: = call <2 x i8> @llvm.ssub.sat.v2i8( + +; USUB_SAT: void @__vecz_v2_usub_sat +; USUB_SAT: = call <2 x i32> @llvm.usub.sat.v2i32( +; USUB_SAT: = call <2 x i8> @llvm.usub.sat.v2i8( +; USUB_SAT: = call <2 x i8> @llvm.usub.sat.v2i8( diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll new file mode 100644 index 0000000000000..d74607eea657e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll @@ -0,0 +1,200 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k ctpop -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix CTPOP +; RUN: veczc -k ctlz -vecz-simd-width=4 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s --check-prefix CTLZ +; RUN: veczc -k cttz -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s --check-prefix CTTZ +; RUN: veczc -k sadd_sat -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix SADD_SAT +; RUN: veczc -k uadd_sat -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix UADD_SAT +; RUN: veczc -k ssub_sat -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix SSUB_SAT +; RUN: veczc -k usub_sat -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix USUB_SAT + +; It checks that the scalar intrinsics get vectorized, +; and the vector intrinsics get widened. + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @ctpop(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %ctpopi32 = call i32 @llvm.ctpop.i32(i32 %a) + %ctpopv2i8 = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %b) + store i32 %ctpopi32, i32* %arrayidxy, align 4 + store <2 x i8> %ctpopv2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @ctlz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %ctlzi32 = call i32 @llvm.ctlz.i32(i32 %a, i1 false) + %ctlzv2i8 = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %b, i1 false) + store i32 %ctlzi32, i32* %arrayidxy, align 4 + store <2 x i8> %ctlzv2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @cttz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %cttzi32 = call i32 @llvm.cttz.i32(i32 %a, i1 false) + %cttzv2i8 = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %b, i1 false) + store i32 %cttzi32, i32* %arrayidxy, align 4 + store <2 x i8> %cttzv2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @sadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, 
align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @uadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @ssub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.ssub.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @usub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.usub.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +declare i32 @llvm.ctpop.i32(i32) +declare <2 x i8> @llvm.ctpop.v2i8(<2 x i8>) + +declare i32 @llvm.ctlz.i32(i32, i1) +declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) + +declare i32 @llvm.cttz.i32(i32, i1) +declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1) + +declare i32 @llvm.sadd.sat.i32(i32, i32) +declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i32 @llvm.uadd.sat.i32(i32, i32) +declare <2 x i8> 
@llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i32 @llvm.ssub.sat.i32(i32, i32) +declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i32 @llvm.usub.sat.i32(i32, i32) +declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i64 @__mux_get_global_id(i32) + +; CTPOP: void @__vecz_v2_ctpop +; CTPOP: = call {{.*}}<2 x i32> @llvm.ctpop.v2i32(<2 x i32> %{{.*}}) +; CTPOP: = call {{.*}}<4 x i8> @llvm.ctpop.v4i8(<4 x i8> %{{.*}}) + +; CTLZ: void @__vecz_v4_ctlz +; CTLZ: = call {{.*}}<4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false) +; CTLZ: = call {{.*}}<8 x i8> @llvm.ctlz.v8i8(<8 x i8> %{{.*}}, i1 false) + +; CTTZ: void @__vecz_v8_cttz +; CTTZ: = call {{.*}}<8 x i32> @llvm.cttz.v8i32(<8 x i32> %{{.*}}, i1 false) +; CTTZ: = call {{.*}}<16 x i8> @llvm.cttz.v16i8(<16 x i8> %{{.*}}, i1 false) + +; SADD_SAT: void @__vecz_v2_sadd_sat +; SADD_SAT: = call <2 x i32> @llvm.sadd.sat.v2i32( +; SADD_SAT: = call <4 x i8> @llvm.sadd.sat.v4i8( + +; UADD_SAT: void @__vecz_v2_uadd_sat +; UADD_SAT: = call <2 x i32> @llvm.uadd.sat.v2i32( +; UADD_SAT: = call <4 x i8> @llvm.uadd.sat.v4i8( + +; SSUB_SAT: void @__vecz_v2_ssub_sat +; SSUB_SAT: = call <2 x i32> @llvm.ssub.sat.v2i32( +; SSUB_SAT: = call <4 x i8> @llvm.ssub.sat.v4i8( + +; USUB_SAT: void @__vecz_v2_usub_sat +; USUB_SAT: = call <2 x i32> @llvm.usub.sat.v2i32( +; USUB_SAT: = call <4 x i8> @llvm.usub.sat.v4i8( diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_assumption_regression.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_assumption_regression.ll new file mode 100644 index 0000000000000..5f68305bfc205 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_assumption_regression.ll @@ -0,0 +1,44 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Just check that we correctly clean up the assumption cache when vectorizing
+; this function:
+; RUN: veczc -k foo -w 2 -S < %s
+; RUN: not veczc -k foo -w 2 -vecz-scalable -S < %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @foo(ptr addrspace(1) nocapture readonly %_arg_v_acc) #0 {
+entry:
+  %v4 = tail call i64 @__mux_get_global_id(i32 0) #2
+  %v5 = tail call i64 @__mux_get_global_offset(i32 0) #2
+  %v6 = sub i64 %v4, %v5
+  %v7 = icmp ult i64 %v6, 2147483648
+  tail call void @llvm.assume(i1 %v7)
+  %arrayidx.i.i = getelementptr inbounds i32, ptr addrspace(1) %_arg_v_acc, i64 %v6
+  %v8 = load i32, ptr addrspace(1) %arrayidx.i.i, align 4
+  ret void
+}
+
+declare void @llvm.assume(i1 noundef) #1
+
+declare i64 @__mux_get_global_id(i32) #2
+declare i64 @__mux_get_global_offset(i32) #2
+
+attributes #0 = { convergent nounwind "mux-kernel"="entry-point" "mux-orig-fn"="foo" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn inaccessiblememonly }
+attributes #2 = { alwaysinline norecurse nounwind readonly } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_vu_regression.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_vu_regression.ll new file mode 100644 index 0000000000000..6b42e5fe4ca62 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_vu_regression.ll @@ -0,0 +1,36 @@ +; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: not veczc -k noduplicate:4,8 -S < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @noduplicate(i32 addrspace(1)* %in1, i32 addrspace(1)* %out) { +entry: + %tid = call i64 @__mux_get_global_id(i32 0) #3 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid + %i1 = load i32, i32 addrspace(1)* %arrayidx, align 16 + %dec = call i32 @llvm.loop.decrement.reg.i32(i32 %i1, i32 4) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid + store i32 %dec, i32 addrspace(1)* %arrayidx2, align 16 + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare i32 @llvm.loop.decrement.reg.i32(i32, i32) + +;CHECK: Failed to vectorize function 'noduplicate' diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll new file mode 100644 index 0000000000000..770a31740a8b2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k irreducible_loop -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @irreducible_loop(i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %call + %ld = load i32, i32 addrspace(1)* %arrayidx4, align 4 + %cmp = icmp sgt i32 %ld, -1 + br i1 %cmp, label %label, label %do.body + +do.body: ; preds = %entry, %label + %id.0 = phi i64 [ %conv10, %label ], [ %call, %entry ] + br label %label + +label: ; preds = %entry, %do.body + %id.1 = phi i64 [ %id.0, %do.body ], [ %call, %entry ] + %conv10 = add i64 %id.1, 1 + %cmp11 = icmp slt i64 %conv10, 16 + br i1 %cmp11, label %do.body, label %do.end + +do.end: ; preds = %label + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) + +; CHECK: define spir_kernel void @__vecz_v4_irreducible_loop +; CHECK: entry: +; CHECK: br label %irr.guard + +; CHECK: irr.guard: +; CHECK: br i1 %{{.+}}, label %irr.guard.pure_exit, label %irr.guard + +; CHECK: irr.guard.pure_exit: ; preds = %irr.guard +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll new file mode 100644 index 0000000000000..4ffad2c31b104 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll @@ -0,0 +1,45 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @load_add_store(ptr %aptr, ptr %bptr, ptr %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, ptr %aptr, i64 %idx + %arrayidxb = getelementptr inbounds i32, ptr %bptr, i64 %idx + %arrayidxz = getelementptr inbounds i32, ptr %zptr, i64 %idx + %a = load i32, ptr %arrayidxa, align 4 + %b = load i32, ptr %arrayidxb, align 4 + %sum = add i32 %a, %b + store i32 %sum, ptr %arrayidxz, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_load_add_store(ptr %aptr, ptr %bptr, ptr %zptr) +; CHECK: %idx = call i64 @__mux_get_global_id(i32 0) +; CHECK: %arrayidxa = getelementptr i32, ptr %aptr, i64 %idx +; CHECK: %arrayidxb = getelementptr i32, ptr %bptr, i64 %idx +; CHECK: %arrayidxz = getelementptr i32, ptr %zptr, i64 %idx +; CHECK: %[[TMP0:.*]] = load <4 x i32>, ptr %arrayidxa, align 4 +; CHECK: %[[TMP1:.*]] = load <4 x i32>, ptr %arrayidxb, align 4 +; CHECK: %sum1 = add <4 x i32> %[[TMP0]], %[[TMP1]] +; CHECK: store <4 x i32> %sum1, ptr %arrayidxz, align 4 +; CHECK: ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll new file mode 100644 index 0000000000000..5f661497b794b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll @@ -0,0 +1,63 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-choices=InstantiateCallsInLoops -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [23 x i8] c"Hello from %d with %d\0A\00", align 1 +@.str.1 = private unnamed_addr addrspace(2) constant [14 x i8] c"Hello from %d\00", align 1 + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([23 x i8], [23 x i8] addrspace(2)* @.str, i64 0, i64 0), i64 %call, i32 %0) + %call2 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([14 x i8], [14 x i8] addrspace(2)* @.str.1, i64 0, i64 0), i64 %call) + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) 
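+
+; With InstantiateCallsInLoops, vecz does not widen the two printf call
+; sites. Instead, each call site is wrapped in a loop over the four lanes
+; that extracts that lane's operands and calls the scalar printf once per
+; lane, which is what the two instloop header/body pairs below check for.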
+ +; CHECK: define spir_kernel void @__vecz_v4_test(ptr addrspace(1) %in) + +; CHECK: [[LOOPHEADER1:instloop.header.*]]: +; CHECK: %[[INSTANCE1:instance.*]] = phi i32 [ 0, {{.+}} ], [ %[[V7:[0-9]+]], %[[LOOPBODY1:instloop.body.*]] ] +; CHECK: %[[V3:[0-9]+]] = icmp {{(samesign )?}}ult i32 %[[INSTANCE1]], 4 +; CHECK: br i1 %[[V3]], label %[[LOOPBODY1]], label {{.+}} + +; CHECK: [[LOOPBODY1]]: +; CHECK: %[[V4:[0-9]+]] = extractelement <4 x i64> %0, i32 %[[INSTANCE1]] +; CHECK: %[[V5:[0-9]+]] = extractelement <4 x i32> %{{.+}}, i32 %[[INSTANCE1]] +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @{{.+}}, i64 %[[V4]], i32 %[[V5]]) +; CHECK: %[[V7]] = add {{(nuw |nsw )*}}i32 %[[INSTANCE1]], 1 +; CHECK: br label %[[LOOPHEADER1]] + +; CHECK: [[LOOPHEADER2:instloop.header.*]]: +; CHECK: %[[INSTANCE3:.+]] = phi i32 [ %[[V11:[0-9]+]], %[[LOOPBODY2:instloop.body.*]] ], [ 0, {{.+}} ] +; CHECK: %[[V8:[0-9]+]] = icmp {{(samesign )?}}ult i32 %[[INSTANCE3]], 4 +; CHECK: br i1 %[[V8]], label %[[LOOPBODY2]], label {{.+}} + +; CHECK: [[LOOPBODY2]]: +; CHECK: %[[V9:[0-9]+]] = extractelement <4 x i64> %0, i32 %[[INSTANCE3]] +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @{{.+}}, i64 %[[V9]]) +; CHECK: %[[V11]] = add {{(nuw |nsw )*}}i32 %[[INSTANCE3]], 1 +; CHECK: br label %[[LOOPHEADER2]] + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll new file mode 100644 index 0000000000000..deef39666e8a1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll @@ -0,0 +1,89 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=cfg-convert,verify,packetizer,define-builtins,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p) +define spir_kernel void @test_fn(ptr %p) { +entry: +; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0 +; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> {{<(i64 3(, )?)+>|splat \(i64 3\)}}, + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp sgt i64 3, %call +; CHECK: [[VEC_PTR:%.*]] = getelementptr i32, ptr %p, <4 x i64> + %wi_p_i32 = getelementptr i32, ptr %p, i64 %call + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry +; CHECK: = call <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> {{<(i32 1(, )?)+>|splat \(i32 1\)}}, <4 x i1> [[CMP]] + %old0 = atomicrmw add ptr %p, i32 1 acquire +; CHECK: = call <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x i32> {{<(i32 1(, )?)+>|splat \(i32 1\)}}, <4 x i1> [[CMP]] + %old1 = atomicrmw add ptr %wi_p_i32, i32 1 acquire +; CHECK: = call <4 x i32> @__vecz_b_v4_masked_atomicrmw_umin_align2_monotonic_1_Dv4_u3ptrDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x i32> {{<(i32 1(, )?)+>|splat \(i32 1\)}}, <4 x i1> [[CMP]] + %old2 = atomicrmw umin ptr %wi_p_i32, i32 1 monotonic, align 2 +; CHECK: = call <4 x float> @__vecz_b_v4_masked_atomicrmw_volatile_fmax_align4_seqcst_0_Dv4_u3ptrDv4_fDv4_b( +; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x float> {{<(float 1.000000e\+00(, )?)+>|splat \(float 1.000000e\+00\)}}, <4 x i1> [[CMP]] + %old3 = atomicrmw volatile fmax ptr %wi_p_i32, float 1.0 syncscope("singlethread") seq_cst + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; CHECK: define <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[VALS:%1]], <4 x i1> [[MASK:%2]]) [[ATTRS:#[0-9]+]] { +; CHECK: entry: +; CHECK: br label %loopIR + +; CHECK: loopIR: +; CHECK: [[IDX:%.*]] = phi i32 [ 0, %entry ], [ [[IDX_NEXT:%.*]], %if.else ] +; CHECK: [[PREV:%.*]] = phi <4 x i32> [ poison, %entry ], [ [[MERGE:%.*]], %if.else ] +; CHECK: [[MASKELT:%.*]] = extractelement <4 x i1> [[MASK]], i32 [[IDX]] +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[PTR:%.*]] = extractelement <4 x ptr> [[PTRS]], i32 [[IDX]] +; CHECK: [[VAL:%.*]] = extractelement <4 x i32> [[VALS]], i32 [[IDX]] +; CHECK: [[ATOM:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VAL]] acquire, align 4 +; CHECK: [[RET:%.*]] = insertelement <4 x i32> [[PREV]], i32 [[ATOM]], i32 [[IDX]] +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[MERGE]] = phi <4 x i32> [ [[PREV]], %loopIR ], [ [[RET]], %if.then ] +; CHECK: [[IDX_NEXT]] = add i32 [[IDX]], 1 + +; CHECK: exit: +; CHECK: ret <4 x i32> [[MERGE]] + +; Assume that all masked atomicrmw operations follow the logic above. Just +; check that the right atomicrmw instruction is being generated. 
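+; (The mangled builtin name encodes the operation, the alignment and the
+; memory ordering; the trailing digit appears to encode the syncscope, with
+; 1 for the default scope and 0 for syncscope("singlethread").)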
+; CHECK: define <4 x i32> @__vecz_b_v4_masked_atomicrmw_umin_align2_monotonic_1_Dv4_u3ptrDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[VALS:%1]], <4 x i1> [[MASK:%2]]) [[ATTRS]] { +; CHECK: atomicrmw umin ptr {{%.*}}, i32 {{%.*}} monotonic, align 2 + + +; CHECK: define <4 x float> @__vecz_b_v4_masked_atomicrmw_volatile_fmax_align4_seqcst_0_Dv4_u3ptrDv4_fDv4_b(<4 x ptr> [[PTRS:%0]], <4 x float> [[VALS:%1]], <4 x i1> [[MASK:%2]]) [[ATTRS]] { +; CHECK: atomicrmw volatile fmax ptr {{%.*}}, float {{%.*}} syncscope("singlethread") seq_cst, align 4 + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll new file mode 100644 index 0000000000000..5c061dadd28fc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll @@ -0,0 +1,48 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=define-builtins,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_fn(ptr %p) { + %ret = call i32 @__vecz_b_v1_masked_atomicrmw_add_align4_acquire_1_u3ptrjb(ptr %p, i32 1, i1 true) + ret void +} + +declare i32 @__vecz_b_v1_masked_atomicrmw_add_align4_acquire_1_u3ptrjb(ptr %p, i32 %val, i1 %mask) + +; CHECK: define i32 @__vecz_b_v1_masked_atomicrmw_add_align4_acquire_1_u3ptrjb(ptr %p, i32 %val, i1 %mask) { +; CHECK: entry: +; CHECK: br label %loopIR + +; CHECK: loopIR: +; CHECK: [[RET_PREV:%.*]] = phi i32 [ poison, %entry ], [ [[RET:%.*]], %if.else ] +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 %mask, false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[ATOM:%.*]] = atomicrmw add ptr %p, i32 %val acquire, align 4 +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[RET]] = phi i32 [ [[RET_PREV]], %loopIR ], [ [[ATOM]], %if.then ] +; CHECK: [[CMP:%.*]] = icmp ult i32 %{{.*}}, 1 +; CHECK: br i1 [[CMP]], label %loopIR, label %exit + +; CHECK: exit: +; CHECK: ret i32 [[RET]] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll new file mode 100644 index 0000000000000..65811dcc45ff2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll @@ -0,0 +1,81 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Check if the call to max in the if block has been replaced with its vector +; equivalent +; CHECK: call spir_func <[[WIDTH:[0-9]+]] x i32> @_Z3maxDv[[WIDTH]]_iS_(<[[WIDTH]] x i32> {{.+}}, <[[WIDTH]] x i32> {{.+}}) +; CHECK: call spir_func <[[WIDTH]] x i32> @_Z3maxDv[[WIDTH]]_iS_(<[[WIDTH]] x i32> {{.+}}, <[[WIDTH]] x i32> {{.+}}) + +; There shouldn't be any masked versions of max +; CHECK-NOT: masked_Z3max + +define spir_kernel void @entry(ptr addrspace(1) %input, ptr addrspace(1) %output) { +entry: + %call = tail call i64 @__mux_get_local_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %output, i64 %call + %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %add = add nsw i32 %0, 1 + %add3 = add nsw i32 %1, 1 + %call4 = tail call spir_func i32 @_Z3maxii(i32 %add, i32 %add3) + %add.i = shl nsw i32 %call4, 1 + %idxprom.i = sext i32 %add.i to i64 + %arrayidx.i = getelementptr inbounds i32, ptr addrspace(1) %output, i64 %idxprom.i + store i32 %add.i, ptr addrspace(1) %arrayidx.i, align 4 + %2 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %3 = load i32, ptr addrspace(1) %arrayidx, align 4 + %4 = icmp eq i32 %2, -2147483648 + %5 = icmp eq i32 %3, -1 + %6 = and i1 %4, %5 + %7 = icmp eq i32 %3, 0 + %8 = or i1 %7, %6 + %9 = select i1 %8, i32 1, i32 %3 + %10 = icmp eq i32 %9, -1 + %11 = and i1 %4, %10 + %12 = select i1 %11, i32 1, i32 %9 + %rem = srem i32 %2, %12 + %tobool.not = icmp eq i32 %rem, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %call9 = tail call spir_func i32 @_Z3maxii(i32 %0, i32 %1) + %add.i27 = shl nsw i32 %call9, 1 + %idxprom.i28 = sext i32 %add.i27 to i64 + %arrayidx.i29 = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %idxprom.i28 + store i32 %add.i27, ptr addrspace(1) %arrayidx.i29, align 4 + br label %if.end + +if.end: + %idxprom.i31.pre-phi = phi i64 [ %idxprom.i28, %if.then ], [ %idxprom.i, %entry ] + %add.i30.pre-phi = phi i32 [ %add.i27, %if.then ], [ %add.i, %entry ] + %r.0 = phi i32 [ %call9, %if.then ], [ %call4, %entry ] + %arrayidx.i32 = getelementptr inbounds i32, ptr addrspace(1) %output, i64 %idxprom.i31.pre-phi + store i32 %add.i30.pre-phi, ptr addrspace(1) %arrayidx.i32, align 4 + store i32 %r.0, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +declare i64 @__mux_get_local_id(i32) + +declare spir_func i32 @_Z3maxii(i32, i32) + +declare spir_func <4 x i32> @_Z3maxDv4_iS_(<4 x i32>, <4 x i32>) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll new file mode 100644 index 0000000000000..bc6d2bf2b7ab7 --- /dev/null +++ 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll @@ -0,0 +1,107 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=cfg-convert,verify,packetizer,define-builtins,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p, ptr %q, ptr %r) +define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) { +entry: +; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0 +; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> {{<(i64 3(, )?)+>|splat \(i64 3\)}}, + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp sgt i64 3, %call +; CHECK: [[VEC_PTR:%.*]] = getelementptr i32, ptr %p, <4 x i64> + %wi_p_i32 = getelementptr i32, ptr %p, i64 %call + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry +; CHECK: [[CALL:%.*]] = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> {{<(i32 1(, )?)+>|splat \(i32 1\)}}, +; CHECK-SAME: <4 x i32> {{<(i32 2(, )?)+>|splat \(i32 2\)}}, <4 x i1> [[CMP]] + %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic + %val0 = extractvalue { i32, i1 } %old0, 0 + %success0 = extractvalue { i32, i1 } %old0, 1 + + %out = getelementptr i32, ptr %q, i64 %call + store i32 %val0, ptr %out, align 4 + + %outsuccess = getelementptr i8, ptr %r, i64 %call + %outbyte = zext i1 %success0 to i8 + store i8 %outbyte, ptr %outsuccess, align 1 + + ; Test a couple of insert/extract patterns +; CHECK: [[INS:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[CALL]], <4 x i1> [[CMP]], 1 +; CHECK: [[EXT:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS]], 1 + %testinsert = insertvalue { i32, i1 } %old0, i1 %cmp, 1 + %testextract = extractvalue { i32, i1 } %testinsert, 1 + + %outbyte0 = zext i1 %testextract to i8 + store i8 %outbyte0, ptr %outsuccess, align 1 + +; CHECK: = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_weak_volatile_align8_monotonic_seqcst_0_Dv4_u3ptrDv4_jDv4_jDv4_b( + %old1 = cmpxchg weak volatile ptr %wi_p_i32, i32 1, i32 2 syncscope("singlethread") monotonic seq_cst, align 8 + + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; CHECK: define { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[CMPS:%1]], <4 x i32> [[NEWS:%2]], <4 x i1> [[MASK:%3]]) [[ATTRS:#[0-9]+]] { +; CHECK: entry: +; CHECK: br label %loopIR + +; CHECK: loopIR: +; CHECK: [[IDX:%.*]] = phi i32 [ 0, %entry ], [ [[IDX_NEXT:%.*]], %if.else ] +; CHECK: [[PREV:%.*]] = phi <4 x i32> [ 
poison, %entry ], [ [[MERGE:%.*]], %if.else ] +; CHECK: [[PREVSUCCESS:%.*]] = phi <4 x i1> [ poison, %entry ], [ [[MERGESUCCESS:%.*]], %if.else ] +; CHECK: [[MASKELT:%.*]] = extractelement <4 x i1> [[MASK]], i32 [[IDX]] +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[PTR:%.*]] = extractelement <4 x ptr> [[PTRS]], i32 [[IDX]] +; CHECK: [[CMP:%.*]] = extractelement <4 x i32> [[CMPS]], i32 [[IDX]] +; CHECK: [[NEW:%.*]] = extractelement <4 x i32> [[NEWS]], i32 [[IDX]] +; CHECK: [[ATOM:%.*]] = cmpxchg ptr [[PTR]], i32 [[CMP]], i32 [[NEW]] acquire monotonic, align 4 +; CHECK: [[VAL:%.*]] = extractvalue { i32, i1 } [[ATOM]], 0 +; CHECK: [[RET:%.*]] = insertelement <4 x i32> [[PREV]], i32 [[VAL]], i32 [[IDX]] +; CHECK: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[ATOM]], 1 +; CHECK: [[RETSUCCESS:%.*]] = insertelement <4 x i1> [[PREVSUCCESS]], i1 [[SUCCESS]], i32 [[IDX]] +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[MERGE]] = phi <4 x i32> [ [[PREV]], %loopIR ], [ [[RET]], %if.then ] +; CHECK: [[MERGESUCCESS]] = phi <4 x i1> [ [[PREVSUCCESS]], %loopIR ], [ [[RETSUCCESS]], %if.then ] +; CHECK: [[IDX_NEXT]] = add i32 [[IDX]], 1 + +; CHECK: exit: +; CHECK: [[INS0:%.*]] = insertvalue { <4 x i32>, <4 x i1> } poison, <4 x i32> [[MERGE]], 0 +; CHECK: [[INS1:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[INS0]], <4 x i1> [[MERGESUCCESS]], 1 +; CHECK: ret { <4 x i32>, <4 x i1> } [[INS1]] + +; Assume that all masked cmpxchg operations follow the logic above. Just +; check that the right cmpxchg instruction is being generated. +; CHECK: define { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_weak_volatile_align8_monotonic_seqcst_0_Dv4_u3ptrDv4_jDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[CMPS:%1]], <4 x i32> [[NEWS:%2]], <4 x i1> [[MASK:%3]]) [[ATTRS]] { +; CHECK: cmpxchg weak volatile ptr {{%.*}}, i32 {{%.*}}, i32 {{%.*}} syncscope("singlethread") monotonic seq_cst, align 8 + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll new file mode 100644 index 0000000000000..6340be83b9f66 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll @@ -0,0 +1,54 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=define-builtins,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_fn(ptr %p) { + %ret = call { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 1, i32 2, i1 true) + ret void +} + +declare { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 %cmp, i32 %newval, i1 %mask) + +; CHECK: define { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 %cmp, i32 %newval, i1 %mask) { +; CHECK: entry: +; CHECK: br label %loopIR + +; CHECK: loopIR: +; CHECK: [[RETVAL_PREV:%.*]] = phi i32 [ poison, %entry ], [ [[RETVAL:%.*]], %if.else ] +; CHECK: [[RETSUCC_PREV:%.*]] = phi i1 [ poison, %entry ], [ [[RETSUCC:%.*]], %if.else ] +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 %mask, false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[ATOM:%.*]] = cmpxchg ptr %p, i32 %cmp, i32 %newval acquire monotonic, align 4 +; CHECK: [[EXT0:%.*]] = extractvalue { i32, i1 } [[ATOM]], 0 +; CHECK: [[EXT1:%.*]] = extractvalue { i32, i1 } [[ATOM]], 1 +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[RETVAL]] = phi i32 [ [[RETVAL_PREV]], %loopIR ], [ [[EXT0]], %if.then ] +; CHECK: [[RETSUCC]] = phi i1 [ [[RETSUCC_PREV]], %loopIR ], [ [[EXT1]], %if.then ] +; CHECK: [[CMP:%.*]] = icmp ult i32 %{{.*}}, 1 +; CHECK: br i1 [[CMP]], label %loopIR, label %exit + +; CHECK: exit: +; CHECK: [[INS0:%.*]] = insertvalue { i32, i1 } poison, i32 [[RETVAL]], 0 +; CHECK: [[INS1:%.*]] = insertvalue { i32, i1 } [[INS0]], i1 [[RETSUCC]], 1 +; CHECK: ret { i32, i1 } [[INS1]] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_group_collective.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_group_collective.ll new file mode 100644 index 0000000000000..464c6b89db6d9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_group_collective.ll @@ -0,0 +1,45 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes="cfg-convert" -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id() +declare i32 @__mux_work_group_scan_inclusive_smax_i32(i32, i32) + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_foo() +; CHECK-NOT: @__vecz_b_masked___mux_work_group_scan_inclusive_smax_i32 +define spir_kernel void @foo() { +entry: + %0 = call i64 @__mux_get_local_id() + br i1 false, label %for.body.i11, label %if.end.i105.i + +for.body.i11: + %1 = icmp slt i64 %0, 0 + br i1 %1, label %if.end.i13, label %if.end.i13 + +if.end.i13: + br i1 false, label %exit, label %if.end.i105.i + +if.end.i105.i: + %2 = call i32 @__mux_work_group_scan_inclusive_smax_i32(i32 0, i32 0) + br label %exit + +exit: + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll new file mode 100644 index 0000000000000..d88ad53d87e01 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll @@ -0,0 +1,76 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_fn -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @test_fn(i32 addrspace(1)* %results) #0 { +entry: + %results.addr = alloca i32 addrspace(1)*, align 8 + %tid = alloca i32, align 4 + store i32 addrspace(1)* %results, i32 addrspace(1)** %results.addr, align 8 + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + store i32 %conv, i32* %tid, align 4 + %0 = load i32, i32* %tid, align 4 + %cmp = icmp sgt i32 3, %0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load i32, i32* %tid, align 4 + %mul = mul nsw i32 2, %1 + %add = add nsw i32 %mul, 2 + %idxprom = sext i32 %add to i64 + %2 = load i32 addrspace(1)*, i32 addrspace(1)** %results.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 %idxprom + store i32 5, i32 addrspace(1)* %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(1)*)* @test_fn, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1} +!2 = !{!"kernel_arg_access_qual", !"none"} +!3 = !{!"kernel_arg_type", !"int*"} +!4 = !{!"kernel_arg_base_type", !"int*"} +!5 = !{!"kernel_arg_type_qual", !""} +!6 = !{!"clang version 3.8.0 "} + + +; CHECK: define void @__vecz_b_masked_interleaved_store4_2_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}) [[ATTRS:#[0-9]+]] { +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer +; CHECK: %3 = getelementptr i32, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> +; CHECK: call void @llvm.masked.scatter.v4i32.v4p1(<4 x i32> %0, <4 x ptr addrspace(1)> %3, i32{{( immarg)?}} 4, <4 x i1> %2) # +; CHECK: ret void + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll new file mode 100644 index 0000000000000..3999f2cf44a80 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll @@ -0,0 +1,77 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_fn -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @test_fn(i32 addrspace(1)* %results) #0 { +entry: + %results.addr = alloca i32 addrspace(1)*, align 8 + %tid = alloca i32, align 4 + store i32 addrspace(1)* %results, i32 addrspace(1)** %results.addr, align 8 + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + store i32 %conv, i32* %tid, align 4 + %0 = load i32, i32* %tid, align 4 + %cmp = icmp sgt i32 3, %0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load i32, i32* %tid, align 4 + %mul = mul nsw i32 2, %1 + %add = add nsw i32 %mul, 2 + %idxprom = sext i32 %add to i64 + %2 = load i32 addrspace(1)*, i32 addrspace(1)** %results.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 %idxprom + store i32 5, i32 addrspace(1)* %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(1)*)* @test_fn, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1} +!2 = !{!"kernel_arg_access_qual", !"none"} +!3 = !{!"kernel_arg_type", !"int*"} +!4 = !{!"kernel_arg_base_type", !"int*"} +!5 = !{!"kernel_arg_type_qual", !""} +!6 = !{!"clang version 3.8.0 "} + + +; CHECK: define void @__vecz_b_masked_interleaved_store4_2_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}) [[ATTRS:#[0-9]+]] { + +; Check for the address splat +; CHECK: %[[BROADCASTADDRSPLATINSERT:.+]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %{{.+}}, {{i32|i64}} 0 +; CHECK: %[[BROADCASTADDRSPLAT:.+]] = shufflevector <4 x ptr addrspace(1)> %[[BROADCASTADDRSPLATINSERT]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer +; CHECK: getelementptr i32, <4 x ptr addrspace(1)> %[[BROADCASTADDRSPLAT]], <4 x i64> + +; CHECK: ret void + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll new file mode 100644 index 0000000000000..0e2d567fd426c --- /dev/null +++ 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll @@ -0,0 +1,99 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k mask -vecz-simd-width=16 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +source_filename = "kernel.opencl" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @mask(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %call.tr = trunc i64 %call to i32 + %conv = shl i32 %call.tr, 1 + %idx.ext = sext i32 %conv to i64 + %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idx.ext + %0 = load i8, i8 addrspace(1)* %add.ptr, align 1 + %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr, i64 1 + %1 = load i8, i8 addrspace(1)* %arrayidx1, align 1 + %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idx.ext + %conv4 = sext i8 %0 to i32 + %conv5 = sext i8 %1 to i32 + %add = add nsw i32 %conv5, %conv4 + %cmp = icmp slt i32 %add, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %arrayidx7 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr3, i64 1 + store i8 %0, i8 addrspace(1)* %arrayidx7, align 1 + br label %if.end + +if.else: ; preds = %entry + store i8 %1, i8 addrspace(1)* %add.ptr3, align 1 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!llvm.ident = !{!2} +!opencl.kernels = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = 
!{!"clang version 8.0.0 (https://github.com/llvm-mirror/clang.git bfbe338a893dde6ba65b2bed6ffea1652a592819) (https://github.com/llvm-mirror/llvm.git a99d6d2122ca2f208e1c4bcaf02ff5930f244f34)"} +!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*)* @mask, !4, !5, !6, !7, !8, !9} +!4 = !{!"kernel_arg_addr_space", i32 1, i32 1} +!5 = !{!"kernel_arg_access_qual", !"none", !"none"} +!6 = !{!"kernel_arg_type", !"char*", !"char*"} +!7 = !{!"kernel_arg_base_type", !"char*", !"char*"} +!8 = !{!"kernel_arg_type_qual", !"", !""} +!9 = !{!"kernel_arg_name", !"out", !"in"} + +; This test makes sure we combine a group of masked interleaved stores +; into a single masked interleaved store using interleave operations. +; We expect the interleaved stores to come out unaltered. + +; CHECK: entry: + +; The data to store gets interleaved: +; CHECK: %interleave{{.*}} = shufflevector <16 x i8> +; CHECK: %interleave{{.*}} = shufflevector <16 x i8> + +; The masks get interleaved: +; CHECK: %interleave{{.*}} = shufflevector <16 x i1> +; CHECK: %interleave{{.*}} = shufflevector <16 x i1> + +; The stores are masked stores: +; CHECK: call void @llvm.masked.store.v16i8.p1(<16 x i8> +; CHECK: call void @llvm.masked.store.v16i8.p1(<16 x i8> + +; Definitely no unmasked stores: +; CHECK-NOT: store <16 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll new file mode 100644 index 0000000000000..5b7492f8c1761 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll @@ -0,0 +1,118 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k mask -vecz-simd-width=16 -S -vecz-choices=TargetIndependentPacketization < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +source_filename = "kernel.opencl" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @mask(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(1)* %doit) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %call.tr = trunc i64 %call to i32 + %conv = shl i32 %call.tr, 1 + %idx.ext = sext i32 %conv to i64 + %doit.ptr = getelementptr inbounds i8, i8 addrspace(1)* %doit, i64 %idx.ext + %ldbool = load i8, i8 addrspace(1)* %doit.ptr, align 1 + %skip = icmp slt i8 %ldbool, 0 + br i1 %skip, label %if.end, label %yes + +yes: ; preds = %entry + %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idx.ext + %0 = load i8, i8 addrspace(1)* %add.ptr, align 1 + %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr, i64 1 + %1 = load i8, i8 addrspace(1)* %arrayidx1, align 1 + %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idx.ext + %conv4 = sext i8 %0 to i32 + %conv5 = sext i8 %1 to i32 + %add = add nsw i32 %conv5, %conv4 + %cmp = icmp slt i32 %add, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %yes + %arrayidx7 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr3, i64 1 + store i8 %0, i8 addrspace(1)* %arrayidx7, align 1 + br label %if.end + +if.else: ; preds = %yes + store i8 %1, i8 addrspace(1)* %add.ptr3, align 1 + br label %if.end + +if.end: ; preds = %if.else, %if.then, %entry + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!llvm.ident = !{!2} +!opencl.kernels = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{!"clang version 8.0.0 (https://github.com/llvm-mirror/clang.git bfbe338a893dde6ba65b2bed6ffea1652a592819) (https://github.com/llvm-mirror/llvm.git a99d6d2122ca2f208e1c4bcaf02ff5930f244f34)"} +!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @mask, !4, !5, !6, !7, !8, !9} +!4 = !{!"kernel_arg_addr_space", i32 1, i32 1} +!5 = !{!"kernel_arg_access_qual", !"none", !"none"} +!6 = !{!"kernel_arg_type", !"char*", !"char*"} +!7 = !{!"kernel_arg_base_type", !"char*", !"char*"} +!8 = 
!{!"kernel_arg_type_qual", !"", !""} +!9 = !{!"kernel_arg_name", !"out", !"in"} + +; This test makes sure we combine a group of masked interleaved stores +; into a single masked interleaved store using interleave operations. +; We expect the interleaved stores to come out unaltered. + +; CHECK: entry: +; CHECK: yes: + +; The masks get interleaved: +; CHECK: %interleave{{.*}} = shufflevector <16 x i1> +; CHECK: %interleave{{.*}} = shufflevector <16 x i1> + +; The loads are masked loads: +; CHECK: call <16 x i8> @llvm.masked.load.v16i8.p1(ptr +; CHECK: call <16 x i8> @llvm.masked.load.v16i8.p1(ptr + +; The loaded data gets deinterleaved: +; CHECK: %deinterleave{{.*}} = shufflevector <16 x i8> +; CHECK: %deinterleave{{.*}} = shufflevector <16 x i8> + +; The data to store gets interleaved: +; CHECK: %interleave{{.*}} = shufflevector <16 x i8> +; CHECK: %interleave{{.*}} = shufflevector <16 x i8> + +; The masks get interleaved: +; CHECK: %interleave{{.*}} = shufflevector <16 x i1> +; CHECK: %interleave{{.*}} = shufflevector <16 x i1> + +; The stores are masked stores: +; CHECK: call void @llvm.masked.store.v16i8.p1(<16 x i8> +; CHECK: call void @llvm.masked.store.v16i8.p1(<16 x i8> + +; Definitely no unmasked stores: +; CHECK-NOT: store <16 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_store.ll new file mode 100644 index 0000000000000..5be3ef46596f0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_store.ll @@ -0,0 +1,84 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=cfg-convert,define-builtins -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_varying_if(i32 %a, ptr %b, float %on_true, float %on_false) { +entry: + %conv = sext i32 %a to i64 + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %conv, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds ptr, ptr %b, i64 %idxprom + store float %on_true, ptr %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds ptr, ptr %b, i64 42 + store float %on_false, ptr %arrayidx2, align 4 + br label %if.end + +if.end: + ret void +} + +define spir_kernel void @test_varying_if_as3(i32 %a, ptr addrspace(3) %b, float %on_true, float %on_false) { +entry: + %conv = sext i32 %a to i64 + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %conv, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds ptr, ptr addrspace(3) %b, i64 %idxprom + store float %on_true, ptr addrspace(3) %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds ptr, ptr addrspace(3) %b, i64 42 + store float %on_false, ptr addrspace(3) %arrayidx2, align 4 + br label %if.end + +if.end: + ret void +} + +; CHECK: define void @__vecz_b_masked_store4_fu3ptrb(float [[A:%.*]], ptr [[B:%.*]], i1 [[MASK:%.*]]) [[ATTRS:#[0-9]+]] { +; CHECK: br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: store float [[A]], ptr [[B]], align 4 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void + +; CHECK: define void @__vecz_b_masked_store4_fu3ptrU3AS3b(float [[A:%.*]], ptr addrspace(3) [[B:%.*]], i1 [[MASK:%.*]]) [[ATTRS]] { +; CHECK: br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: store float [[A]], ptr addrspace(3) [[B]], align 4 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll new file mode 100644 index 0000000000000..19fb3bda34b03 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll @@ -0,0 +1,70 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -vecz-passes=cfg-convert,packetizer -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [18 x i8] c"Doing stuff, yay!\00", align 1 + +define spir_kernel void @test(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %add = add i64 %call, 1 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %add + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + br label %entry.1 + +entry.1: ; preds = %entry + %add1 = add i64 %call, 1 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %add1 + store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 + %cmp = icmp eq i64 %call, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry.1 + %call3 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([18 x i8], [18 x i8] addrspace(2)* @.str, i64 0, i64 0)) + br label %if.end + +if.end: ; preds = %if.then, %entry.1 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx4, align 4 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + br label %if.end1 + +if.end1: ; preds = %if.end + store i32 %1, i32 addrspace(1)* %arrayidx5, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) + +; CHECK: define spir_kernel void @__vecz_v4_test + +; Check if the divergent block is masked correctly +; CHECK: @__vecz_b_masked_printf_u3ptrU3AS2b +; CHECK: @__vecz_b_masked_printf_u3ptrU3AS2b +; CHECK: @__vecz_b_masked_printf_u3ptrU3AS2b +; CHECK: @__vecz_b_masked_printf_u3ptrU3AS2b + +; Check if the exit block is not masked +; CHECK: load <4 x i32> +; CHECK: store <4 x i32> + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll new file mode 100644 index 0000000000000..ac1d5fc674484 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll @@ -0,0 +1,72 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k entry -vecz-passes="function(mem2reg),vecz-mem2reg" -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +%struct.S2 = type { i16, [7 x i32], i32, <16 x i8>, [4 x i32] } + +; Function Attrs: norecurse nounwind +define spir_kernel void @entry(%struct.S2** %result) #0 { +entry: + %c_640 = alloca %struct.S2, align 16 + %p_639 = alloca %struct.S2*, align 8 + store %struct.S2* %c_640, %struct.S2** %p_639, align 8 + %0 = load %struct.S2*, %struct.S2** %p_639, align 8 + store %struct.S2* %0, %struct.S2** %result, align 8 + ret void +} + +define spir_func void @func_10(%struct.S2* %p_484, i64** %ret) { +entry: + %l_462 = alloca i64, align 8 + %l_461 = alloca i64*, align 8 + %.cast = ptrtoint %struct.S2* %p_484 to i64 + store i64 %.cast, i64* %l_462, align 8 + store i64* %l_462, i64** %l_461, align 8 + store i64* %l_462, i64** %ret, align 8 + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1 + +attributes #0 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } + +!llvm.ident = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0} +!opencl.kernels = !{!1} + +!0 = !{!"clang version 3.8.1 "} +!1 = !{void (%struct.S2**)* @entry, !2, !3, !4, !5, !6} +!2 = !{!"kernel_arg_addr_space", i32 1} +!3 = !{!"kernel_arg_access_qual", !"none"} +!4 = !{!"kernel_arg_type", !"ulong*"} +!5 = !{!"kernel_arg_base_type", !"ulong*"} +!6 = !{!"kernel_arg_type_qual", !""} + +; CHECK: @__vecz_v4_entry + +; Check if the alloca with no value (c_640) is still here +; CHECK: %c_640 = alloca %struct.S2, align 16 + +; Check if the alloca with value (p_639) has been promoted +; CHECK-NOT: %p_639 = alloca %struct.S2*, align 8 +; CHECK-NOT: store %struct.S2* %c_640, %struct.S2** %p_639, align 8 +; CHECK: ret diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll new file mode 100644 index 0000000000000..060ca2bc249fd --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll @@ -0,0 +1,72 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k func_10 -vecz-passes="function(mem2reg),vecz-mem2reg" -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +%struct.S2 = type { i16, [7 x i32], i32, <16 x i8>, [4 x i32] } + +; Function Attrs: norecurse nounwind +define spir_kernel void @entry(%struct.S2** %result) #0 { +entry: + %c_640 = alloca %struct.S2, align 16 + %p_639 = alloca %struct.S2*, align 8 + store %struct.S2* %c_640, %struct.S2** %p_639, align 8 + %0 = load %struct.S2*, %struct.S2** %p_639, align 8 + store %struct.S2* %0, %struct.S2** %result, align 8 + ret void +} + +define spir_func void @func_10(%struct.S2* %p_484, i64** %ret) { +entry: + %l_462 = alloca i64, align 8 + %l_461 = alloca i64*, align 8 + %.cast = ptrtoint %struct.S2* %p_484 to i64 + store i64 %.cast, i64* %l_462, align 8 + store i64* %l_462, i64** %l_461, align 8 + store i64* %l_462, i64** %ret, align 8 + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1 + +attributes #0 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } + +!llvm.ident = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0} +!opencl.kernels = !{!1} + +!0 = !{!"clang version 3.8.1 "} +!1 = !{void (%struct.S2**)* @entry, !2, !3, !4, !5, !6} +!2 = !{!"kernel_arg_addr_space", i32 1} +!3 = !{!"kernel_arg_access_qual", !"none"} +!4 = !{!"kernel_arg_type", !"ulong*"} +!5 = !{!"kernel_arg_base_type", !"ulong*"} +!6 = !{!"kernel_arg_type_qual", !""} + +; Check if the alloca used for its pointer is still here +; CHECK: @__vecz_v4_func_10 +; CHECK: %l_462 = alloca i64, align 8 + +; Check that the other alloca(s) have been promoted +; CHECK-NOT: alloca + +; Check if the store using the alloca is still here +; CHECK: store i64 %.cast, ptr %l_462, align 8 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll new file mode 100644 index 0000000000000..f3875519a10e8 --- /dev/null +++ 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll @@ -0,0 +1,34 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %call + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: store < diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll new file mode 100644 index 0000000000000..aa872c84a60b3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll @@ -0,0 +1,38 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = add nsw i32 %conv, %n + %mul = mul nsw i32 %add, %conv + %idxprom = sext i32 %mul to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _scatter_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll new file mode 100644 index 0000000000000..54d11670a365e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll @@ -0,0 +1,38 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = add nsw i32 %conv, %n + %mul = mul nsw i32 %add, 9 + %idxprom = sext i32 %mul to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll new file mode 100644 index 0000000000000..a9ed4f24f16ab --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll @@ -0,0 +1,38 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = add nsw i32 %conv, %n + %mul = mul nsw i32 %add, %n + %idxprom = sext i32 %mul to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll new file mode 100644 index 0000000000000..aa872c84a60b3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll @@ -0,0 +1,38 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = add nsw i32 %conv, %n + %mul = mul nsw i32 %add, %conv + %idxprom = sext i32 %mul to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _scatter_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll new file mode 100644 index 0000000000000..ba49af776ff08 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll @@ -0,0 +1,35 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %mul = mul nuw nsw i64 %call, 18 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll new file mode 100644 index 0000000000000..0281dad79916b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll @@ -0,0 +1,38 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = shl i32 %n, 1 + %mul = mul i32 %add, %conv + %idxprom = sext i32 %mul to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll new file mode 100644 index 0000000000000..d99f4a812a6ed --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll @@ -0,0 +1,36 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = shl nuw nsw i64 %call, 1 + %mul = mul nuw nsw i64 %add, %call + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _scatter_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll new file mode 100644 index 0000000000000..767d17bb96b86 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll @@ -0,0 +1,40 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %mul = mul nsw i32 %conv2, %n + %add = add nsw i32 %mul, %conv + %idxprom = sext i32 %add to i64 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: store < diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll new file mode 100644 index 0000000000000..a2cf76be8fab5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll @@ -0,0 +1,37 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 addrspace(1)* readnone %r) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = add nuw nsw i64 %call, 255 + %idxprom = and i64 %conv, 255 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %src, i64 %idxprom + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %call + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _gather_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll new file mode 100644 index 0000000000000..b6c74f6bfed51 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll @@ -0,0 +1,35 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = add nuw nsw i64 %call, 9 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %add + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: store < diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll new file mode 100644 index 0000000000000..ab533910ee8b2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll @@ -0,0 +1,35 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %mul = mul nuw nsw i64 %call, 9 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll new file mode 100644 index 0000000000000..d1eb22ce6c643 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll @@ -0,0 +1,37 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = add nsw i32 %conv, %n + %idxprom = sext i32 %add to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: store < diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll new file mode 100644 index 0000000000000..92dd028dc1ee0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll @@ -0,0 +1,35 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %mul = mul nuw nsw i64 %call, 5 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll new file mode 100644 index 0000000000000..1a0b92bfb652f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll @@ -0,0 +1,35 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = shl nuw nsw i64 %call, 1 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %add + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll new file mode 100644 index 0000000000000..4dc7b34841204 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll @@ -0,0 +1,35 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %mul = mul nuw nsw i64 %call, %call + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _scatter_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll new file mode 100644 index 0000000000000..549b3a30626dc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll @@ -0,0 +1,36 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %1 = mul nuw nsw i64 %call, 9 + %mul = add nuw nsw i64 %1, 81 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll new file mode 100644 index 0000000000000..744df39852de9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll @@ -0,0 +1,38 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = add nuw nsw i32 %conv, 9 + %mul = mul nsw i32 %add, %n + %idxprom = sext i32 %mul to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll new file mode 100644 index 0000000000000..65031454b1470 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll @@ -0,0 +1,63 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k multiple_exit_blocks -vecz-passes="function(simplifycfg,dce),mergereturn,cfg-convert" -S < %s | FileCheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind readnone
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @multiple_exit_blocks(i64 %n) {
+entry:
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %lid = tail call i64 @__mux_get_local_id(i32 0)
+  %cmp1 = icmp slt i64 %lid, %n
+  %cmp2 = icmp slt i64 %gid, %n
+  br i1 %cmp2, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+  %cmp3 = and i1 %cmp1, %cmp2
+  br i1 %cmp3, label %if.then2, label %if.else2
+
+if.then2: ; preds = %if.then
+  br label %if.else2
+
+if.else2: ; preds = %if.then, %if.then2
+  br i1 %cmp1, label %if.then3, label %if.end
+
+if.then3: ; preds = %if.else2
+  br label %if.end
+
+if.end: ; preds = %entry, %if.else2, %if.then3
+  ret void
+}
+
+; The purpose of this test is to make sure we do not have a kernel that has more
+; than one exit block after running the preparation pass.
+
+; CHECK: define spir_kernel void @__vecz_v4_multiple_exit_blocks
+
+; We don't want to generate any ROSCC branches:
+; CHECK-NOT: entry.ROSCC:
+
+; Only one return statement:
+; CHECK: ret void
+; CHECK-NOT: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
new file mode 100644
index 0000000000000..dfb67303ad8ed
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
@@ -0,0 +1,51 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k foo3 -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define void @foo1(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+define void @foo2(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  call void @foo1(i32 addrspace(1)* %in, i32 addrspace(1)* %out)
+  ret void
+}
+
+define spir_kernel void @foo3(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  call void @foo2(i32 addrspace(1)* %in, i32 addrspace(1)* %out)
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_foo3(ptr addrspace(1) %in, ptr addrspace(1) %out)
+; CHECK-NOT: call spir_kernel
+; CHECK: call i64 @__mux_get_global_id(i32 0)
+; CHECK: load <4 x i32>, ptr addrspace(1) %{{.+}}, align 4
+; CHECK: store <4 x i32> %{{.+}}, ptr addrspace(1) %{{.+}}, align 4
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
new file mode 100644
index 0000000000000..6adc22cf4efe8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
@@ -0,0 +1,53 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that some basic properties of the veczc command line interface for
+; multiple vectorizations work in various configurations. The kernel outputs
+; here are not interesting, only their names.
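+; As a rough guide to the width-spec syntax exercised below, inferred only
+; from this test's RUN and CHECK lines rather than from any separate veczc
+; documentation: each -k entry takes a comma-separated list of widths, an
+; empty entry falls back to the -w default (8 here), and a trailing 's'
+; requests a scalable width, which appears as an 'nxv' prefix in the name
+; of the vectorized kernel (the extra '.2@32' qualifier is not covered here):
+;
+;   foo:4,8,16.2@32s  ->  __vecz_v4_foo, __vecz_v8_foo, __vecz_nxv16_foo
+;   bar:,64s          ->  __vecz_v8_bar, __vecz_nxv64_bar
+;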
+; RUN: veczc -w 8 -k foo:4,8,16.2@32s -k bar:,64s -S < %s | FileCheck %s + +; CHECK-DAG: define spir_kernel void @foo +; CHECK-DAG: define spir_kernel void @bar +; CHECK-DAG: define spir_kernel void @__vecz_v4_foo +; CHECK-DAG: define spir_kernel void @__vecz_v8_foo +; CHECK-DAG: define spir_kernel void @__vecz_nxv16_foo +; CHECK-DAG: define spir_kernel void @__vecz_v8_bar +; CHECK-DAG: define spir_kernel void @__vecz_nxv64_bar + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @foo(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) { +entry: + ret void +} + +define spir_kernel void @bar(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) { +entry: + ret void +} + + diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll new file mode 100644 index 0000000000000..a30fafd7a5b56 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll @@ -0,0 +1,133 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Check that veczc can vectorize a kernel multiple times in one go, with a +; correct mapping between the vectorized versions of the kernels and their +; scalar base +; RUN: veczc -k add:4,8,16 -S < %s | FileCheck %s + +; CHECK: define spir_kernel void @add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {{.*}} !codeplay_ca_vecz.base ![[BASE_1:[0-9]+]] !codeplay_ca_vecz.base ![[BASE_2:[0-9]+]] !codeplay_ca_vecz.base ![[BASE_3:[0-9]+]] { +; CHECK: define spir_kernel void @__vecz_v[[DERIVED_1_VF:[0-9]+]]_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {{.*}} !codeplay_ca_vecz.derived ![[DERIVED_1:[0-9]+]] { +; CHECK: define spir_kernel void @__vecz_v[[DERIVED_2_VF:[0-9]+]]_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {{.*}} !codeplay_ca_vecz.derived ![[DERIVED_2:[0-9]+]] { +; CHECK: define spir_kernel void @__vecz_v[[DERIVED_3_VF:[0-9]+]]_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {{.*}} !codeplay_ca_vecz.derived ![[DERIVED_3:[0-9]+]] { + +; CHECK: ![[BASE_1]] = !{![[VFMD_1:[0-9]+]], {{.*}} @__vecz_v[[DERIVED_1_VF]]_add +; CHECK: ![[VFMD_1]] = !{i32 [[DERIVED_1_VF]], i32 0, i32 0, i32 0} +; CHECK: ![[BASE_2]] = !{![[VFMD_2:[0-9]+]], {{.*}} @__vecz_v[[DERIVED_2_VF]]_add +; CHECK: ![[VFMD_2]] = !{i32 [[DERIVED_2_VF]], i32 0, i32 0, i32 0} +; CHECK: ![[BASE_3]] = !{![[VFMD_3:[0-9]+]], {{.*}} @__vecz_v[[DERIVED_3_VF]]_add +; CHECK: ![[VFMD_3]] = !{i32 [[DERIVED_3_VF]], i32 0, i32 0, i32 0} + +; CHECK: ![[DERIVED_1]] = !{![[VFMD_1]], {{.*}} @add +; CHECK: ![[DERIVED_2]] = !{![[VFMD_2]], {{.*}} @add +; CHECK: ![[DERIVED_3]] = !{![[VFMD_3]], {{.*}} @add + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) #0 !dbg !4 { +entry: + %in1.addr = alloca i32 addrspace(1)*, align 8 + %in2.addr = alloca i32 addrspace(1)*, align 8 + %out.addr = alloca i32 addrspace(1)*, align 8 + %tid = alloca i64, align 8 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + store i32 addrspace(1)* %in1, i32 addrspace(1)** %in1.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in1.addr, metadata !11, metadata !29), !dbg !30 + store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in2.addr, metadata !12, metadata !29), !dbg !30 + store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30 + call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31 + %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31 + store i64 %call, i64* %tid, align 8, !dbg !31 + call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32 + %0 = load i64, i64* %tid, align 8, !dbg !32 + %1 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8, !dbg !32 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 %0, !dbg !32 + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !32 + store i32 %2, i32* %a, align 4, !dbg !32 + call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !29), !dbg !33 + %3 = load i64, i64* %tid, align 8, !dbg !33 + %4 = load i32 addrspace(1)*, i32 
addrspace(1)** %in2.addr, align 8, !dbg !33 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %4, i64 %3, !dbg !33 + %5 = load i32, i32 addrspace(1)* %arrayidx1, align 4, !dbg !33 + store i32 %5, i32* %b, align 4, !dbg !33 + %6 = load i32, i32* %a, align 4, !dbg !34 + %7 = load i32, i32* %b, align 4, !dbg !34 + %add = add nsw i32 %6, %7, !dbg !34 + %8 = load i64, i64* %tid, align 8, !dbg !34 + %9 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !34 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %8, !dbg !34 + store i32 %add, i32 addrspace(1)* %arrayidx2, align 4, !dbg !34 + ret void, !dbg !35 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +declare i64 @__mux_get_global_id(i32) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nobuiltin } + +!llvm.dbg.cu = !{!0} +!opencl.kernels = !{!21} +!llvm.module.flags = !{!27} +!llvm.ident = !{!28} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2) +!1 = !DIFile(filename: "", directory: "/tmp") +!2 = !{} +!3 = !{!4} +!4 = distinct !DISubprogram(name: "add", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !10) +!5 = !DIFile(filename: "kernel.opencl", directory: "/tmp") +!6 = !DISubroutineType(types: !7) +!7 = !{null, !8, !8, !8} +!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, align: 64) +!9 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!10 = !{!11, !12, !13, !14, !19, !20} +!11 = !DILocalVariable(name: "in1", arg: 1, scope: !4, file: !5, line: 1, type: !8) +!12 = !DILocalVariable(name: "in2", arg: 2, scope: !4, file: !5, line: 1, type: !8) +!13 = !DILocalVariable(name: "out", arg: 3, scope: !4, file: !5, line: 1, type: !8) +!14 = !DILocalVariable(name: "tid", scope: !4, file: !5, line: 3, type: !15) +!15 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !16, line: 33, baseType: !17) +!16 = !DIFile(filename: "/Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "/tmp") +!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !16, line: 31, baseType: !18) +!18 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) +!19 = !DILocalVariable(name: "a", scope: !4, file: !5, line: 5, type: !9) +!20 = !DILocalVariable(name: "b", scope: !4, file: !5, line: 6, type: !9) +!21 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*)* @add, !22, !23, !24, !25, !26} +!22 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1} +!23 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!24 = !{!"kernel_arg_type", !"int*", !"int*", !"int*"} +!25 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*"} +!26 = !{!"kernel_arg_type_qual", !"", !"", !""} +!27 = !{i32 2, !"Debug 
Info Version", i32 3}
+!28 = !{!"clang version 3.8.0 "}
+!29 = !DIExpression()
+!30 = !DILocation(line: 1, scope: !4)
+!31 = !DILocation(line: 3, scope: !4)
+!32 = !DILocation(line: 5, scope: !4)
+!33 = !DILocation(line: 6, scope: !4)
+!34 = !DILocation(line: 7, scope: !4)
+!35 = !DILocation(line: 8, scope: !4)
+
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
new file mode 100644
index 0000000000000..3aa408292b16e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
@@ -0,0 +1,50 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; Check that veczc can vectorize a kernel then vectorize the vectorized kernel,
+; with base mappings from 1->2 and 2->3 and derived mappings back from 2->1 and
+; 3->2.
+; RUN: veczc -k add:2 -S < %s > %t2
+; RUN: veczc -k __vecz_v2_add:4 -S < %t2 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) {
+entry:
+  %tid = call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid
+  %i1 = load i32, i32 addrspace(1)* %arrayidx, align 16
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in2, i64 %tid
+  %i2 = load i32, i32 addrspace(1)* %arrayidx1, align 16
+  %add = add nsw i32 %i1, %i2
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid
+  store i32 %add, i32 addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK: define spir_kernel void @add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out){{.*}} !codeplay_ca_vecz.base ![[BASE_1:[0-9]+]]
+; CHECK: define spir_kernel void @__vecz_v2_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out){{.*}} !codeplay_ca_vecz.base ![[BASE_2:[0-9]+]] !codeplay_ca_vecz.derived ![[DERIVED_1:[0-9]+]] {
+; CHECK: define spir_kernel void @__vecz_v4___vecz_v2_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out){{.*}} !codeplay_ca_vecz.derived ![[DERIVED_2:[0-9]+]] {
+
+; CHECK: ![[BASE_1]] = !{![[VMD_1:[0-9]+]], {{.*}} @__vecz_v2_add}
+; CHECK: ![[VMD_1]] = !{i32 2, i32 0, i32 0, i32 0}
+; CHECK: ![[BASE_2]] = !{![[VMD_2:[0-9]+]], {{.*}} @__vecz_v4___vecz_v2_add}
+; CHECK: ![[VMD_2]] = !{i32 4, i32 0, i32 0, i32 0}
+; CHECK: ![[DERIVED_1]] = !{![[VMD_1]], {{.*}} @add}
+; CHECK: ![[DERIVED_2]] = !{![[VMD_2]], {{.*}} @__vecz_v2_add}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
new file mode 100644
index 0000000000000..ed574554f9426
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
@@ -0,0 +1,39 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that veczc can vectorize a kernel multiple times in one go, at the
+; same width, but with one version enabling vector predication.
+; RUN: veczc -k add:1s,1sp -S < %s | FileCheck %s
+
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK: define spir_kernel void @add(
+define spir_kernel void @add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {
+entry:
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.in1 = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %idx
+  %arrayidx.in2 = getelementptr inbounds i32, ptr addrspace(1) %in2, i64 %idx
+  %in1.v = load i32, ptr addrspace(1) %arrayidx.in1, align 4
+  %in2.v = load i32, ptr addrspace(1) %arrayidx.in2, align 4
+  %add.v = add i32 %in1.v, %in2.v
+  %arrayidx.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %idx
+  store i32 %add.v, ptr addrspace(1) %arrayidx.out
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv1_add
+
+; CHECK: define spir_kernel void @__vecz_nxv1_vp_add
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
new file mode 100644
index 0000000000000..3656854643217
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
@@ -0,0 +1,66 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k priv -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @priv(i32 addrspace(3)* %a) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp ult i32 %storemerge, %conv + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %a, i64 %idxprom + store i32 %conv, i32 addrspace(3)* %arrayidx, align 4 + %inc = add i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(3)*)* @priv, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 3} +!2 = !{!"kernel_arg_access_qual", !"none"} +!3 = !{!"kernel_arg_type", !"int*"} +!4 = !{!"kernel_arg_base_type", !"int*"} +!5 = !{!"kernel_arg_type_qual", !""} +!6 = !{!"clang version 3.8.0 "} + + +; Test if the masked store is defined correctly +; CHECK: call void @__vecz_b_masked_scatter_store4_Dv4_jDv4_u3ptrU3AS3Dv4_b +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll new file mode 100644 index 0000000000000..4ebcd9ec22693 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll @@ -0,0 +1,68 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k memop_loop_dep -vecz-passes=builtin-inlining,scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @memop_loop_dep(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %i, i32 %e) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+  %i.addr.0 = phi i32 [ %i, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.addr.0, %e
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+  %call1 = call spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64 %call, i32 addrspace(1)* %in)
+  call spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32> %call1, i64 %call, i32 addrspace(1)* %out)
+  %0 = extractelement <4 x i32> %call1, i64 0
+  %tobool = icmp ne i32 %0, 0
+  %tobool2 = icmp eq i64 %call, 0
+  %or.cond = and i1 %tobool2, %tobool
+  br i1 %or.cond, label %while.cond, label %for.inc
+
+while.cond: ; preds = %while.cond, %for.body
+  %tobool3 = icmp eq i64 %call, 0
+  br i1 %tobool3, label %for.inc, label %while.cond
+
+for.inc: ; preds = %for.body, %while.cond
+  %inc = add nsw i32 %i.addr.0, 1
+  br label %for.cond
+
+for.end: ; preds = %for.cond
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+declare spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64, i32 addrspace(1)*)
+
+declare spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32>, i64, i32 addrspace(1)*)
+
+; CHECK: define spir_kernel void @__vecz_v4_memop_loop_dep
+
+; Make sure Scalarization only results in four loads, NOT FIVE
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+; CHECK-NOT: load i32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
new file mode 100644
index 0000000000000..d61a641d5251e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
@@ -0,0 +1,43 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s -vecz-auto | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @no_vecz1(i32 addrspace(1)* %out, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %call, 0 + br i1 %cmp, label %for.cond.preheader, label %if.end + +for.cond.preheader: ; preds = %entry + %cmp19 = icmp sgt i32 %n, 0 + %spec.select = select i1 %cmp19, i32 %n, i32 0 + store i32 %spec.select, i32 addrspace(1)* %out, align 4 + br label %if.end + +if.end: ; preds = %for.cond.preheader, %entry + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK-NOT: insertelement +; CHECK-NOT: shufflevector +; CHECK-NOT: extractelement +; CHECK-NOT: define void @__vecz_b_masked_store diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll new file mode 100644 index 0000000000000..709ae760784a9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s -vecz-auto | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @no_vecz2(i32 addrspace(1)* %out, i32 %n, i32 addrspace(1)* %m) { +entry: + %0 = load i32, i32 addrspace(1)* %m, align 4 + %call = tail call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %call, 0 + br i1 %cmp, label %for.cond.preheader, label %if.end + +for.cond.preheader: ; preds = %entry + %cmp167 = icmp sgt i32 %n, 0 + br i1 %cmp167, label %for.body29.lr.ph, label %for.cond.cleanup28 + +for.body29.lr.ph: ; preds = %for.cond.preheader + %add = add i32 %0, 1 + %factor = shl i32 %0, 2 + %1 = shl i32 %n, 2 + %2 = add i32 %1, -4 + %reass.mul = mul i32 %2, %add + %3 = add i32 %factor, 4 + %4 = add i32 %3, %reass.mul + br label %for.cond.cleanup28 + +for.cond.cleanup28: ; preds = %for.body29.lr.ph, %for.cond.preheader + %ret.3.lcssa = phi i32 [ %4, %for.body29.lr.ph ], [ 0, %for.cond.preheader ] + store i32 %ret.3.lcssa, i32 addrspace(1)* %out, align 4 + br label %if.end + +if.end: ; preds = %for.cond.cleanup28, %entry + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @{{(__vecz_v16_)?}}no_vecz2 +; CHECK-NOT: extractelement +; CHECK-NOT: define void @__vecz_b_masked_store +; CHECK: store i32 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll new file mode 100644 index 0000000000000..b455570f66c49 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll @@ -0,0 +1,55 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k offset_info_analysis -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'Unknown buffer'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @offset_info_analysis(i8 addrspace(1)* noalias %in, i8 addrspace(1)* noalias %out, i32 %width) #0 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0) #2
+  %conv = trunc i64 %call to i32
+  %call1 = call i64 @__mux_get_global_id(i32 1) #2
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %width
+  %0 = xor i32 %width, -1
+  %add = add i32 %conv, %0
+  %add5 = add i32 %add, %mul
+  %idxprom = sext i32 %add5 to i64
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom
+  %1 = load i8, i8 addrspace(1)* %arrayidx, align 1
+  %mul10 = mul nsw i32 %conv2, %width
+  %add11 = add nsw i32 %mul10, %conv
+  %idxprom15 = sext i32 %add11 to i64
+  %arrayidx16 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom15
+  store i8 %1, i8 addrspace(1)* %arrayidx16, align 1
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; This test checks that a 'xor' as a binop operand is correctly analyzed
+; and properly masked.
+; CHECK: define spir_kernel void @__vecz_v4_offset_info_analysis
+; CHECK: load <4 x i8>, ptr addrspace(1)
+; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load_Dv4_hDv4_u3ptrU3AS1
+; CHECK: ret void
+
+; Check the gather load definition is not generated.
+; CHECK-NOT: declare <4 x i8> @__vecz_b_gather_load_Dv4_hDv4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
new file mode 100644
index 0000000000000..fc1d4b9eac4b0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
@@ -0,0 +1,50 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isfiniteDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isfiniteDv4_d +; CHECK: call i64 @__mux_get_global_id(i32 0) +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll new file mode 100644 index 0000000000000..1cff6e5415803 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isfiniteDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isfiniteDv4_f +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll new file mode 100644 index 0000000000000..0ce5bfdfdd701 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll @@ -0,0 +1,268 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isfinited -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call 
i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + 
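+; Only the kernel selected by -k in the RUN line above (test_isfinited) is
+; vectorized and matched by the CHECK lines at the end of this file; the
+; other kernels are presumably included as module context covering the
+; remaining relational builtins.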
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds 
i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isfinited +; CHECK: and <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll new file mode 100644 index 0000000000000..168bf625a4c37 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll @@ -0,0 +1,268 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isfinitef -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call 
i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + 
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds 
i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isfinitef +; CHECK: and <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll new file mode 100644 index 0000000000000..c11210f1097ea --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isinfDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isinfDv4_d +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll new file mode 100644 index 0000000000000..67b641587a6af --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isinfDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isinfDv4_f +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll new file mode 100644 index 0000000000000..56129f29e5ddd --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll @@ -0,0 +1,268 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isinfd -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 
@__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define 
spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 
addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isinfd +; CHECK: and <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll new file mode 100644 index 0000000000000..ef9cadee9528c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll @@ -0,0 +1,268 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isinff -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 
@__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define 
spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 
addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isinff +; CHECK: and <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll new file mode 100644 index 0000000000000..75862737a2c86 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll @@ -0,0 +1,61 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnanDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnanDv4_d +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: and <4 x i1> +; CHECK: and <4 x i1> +; CHECK: and <4 x i1> +; CHECK: and <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll new file mode 100644 index 0000000000000..0d2c7e0073757 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll @@ -0,0 +1,61 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnanDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnanDv4_f +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: and <4 x i1> +; CHECK: and <4 x i1> +; CHECK: and <4 x i1> +; CHECK: and <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll new file mode 100644 index 0000000000000..3b885da041f3f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll @@ -0,0 +1,271 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnand -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 
@__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define 
spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 
addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnand +; CHECK: and <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: and <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: and <4 x i1> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll new file mode 100644 index 0000000000000..1a5b038b5489d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll @@ -0,0 +1,271 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnanf -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 
@__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define 
spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 
addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnanf +; CHECK: and <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: and <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: and <4 x i1> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll new file mode 100644 index 0000000000000..6dee2711d597c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll @@ -0,0 +1,53 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnormalDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnormalDv4_d +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: add nsw <4 x i64> +; CHECK: add nsw <4 x i64> +; CHECK: add nsw <4 x i64> +; CHECK: add nsw <4 x i64> +; CHECK: icmp ult <4 x i64> +; CHECK: icmp ult <4 x i64> +; CHECK: icmp ult <4 x i64> +; CHECK: icmp ult <4 x i64> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll new file mode 100644 index 0000000000000..6ffb049b982e0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll @@ -0,0 +1,53 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnormalDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnormalDv4_f +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: icmp ult <4 x i32> +; CHECK: icmp ult <4 x i32> +; CHECK: icmp ult <4 x i32> +; CHECK: icmp ult <4 x i32> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll new file mode 100644 index 0000000000000..880bb8d621d10 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll @@ -0,0 +1,269 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnormald -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call 
i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + 
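+; Note: only @test_isnormald is vectorized by the RUN line above; the CHECK +; lines at the end of this file expect vecz to lower the scalar isnormal call +; via integer bit tests: clear the sign bit (and), re-bias the exponent +; (add nsw), then range-check it with an unsigned compare (icmp ult) before +; widening the i1 result (zext).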
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds 
i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnormald +; CHECK: and <4 x i64> +; CHECK: add nsw <4 x i64> +; CHECK: icmp ult <4 x i64> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll new file mode 100644 index 0000000000000..0e0c0a7574e83 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll @@ -0,0 +1,269 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnormalf -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call 
i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + 
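+; Note: only @test_isnormalf is vectorized by the RUN line above; the trailing +; CHECK lines expect the same bit-test lowering as the double variant, but on +; <4 x i32> values: and (clear the sign bit), add nsw (re-bias the exponent), +; icmp ult (range check), zext (widen the i1 result).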
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds 
i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnormalf +; CHECK: and <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: icmp ult <4 x i32> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll new file mode 100644 index 0000000000000..eaf7917c6dfa0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll @@ -0,0 +1,77 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(2)* %in, i32 addrspace(1)* %out, i8 addrspace(2)* %text, double %f) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %call + %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +define spir_kernel void @second_test(i32 %a, i32 %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + ret void +} + +declare i64 @__mux_get_global_id(i32) + +!opencl.kernels = !{!0, !6} +!opencl.kernel_wg_size_info = !{!12} +!llvm.ident = !{!13} + +!0 = !{void (i32 addrspace(2)*, i32 addrspace(1)*, i8 addrspace(2)*, double)* @test, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 2, i32 1, i32 2, i32 0} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"int*", !"int*", !"char*", !"double"} +!4 = !{!"kernel_arg_base_type", !"int*", !"int*", !"char*", !"double"} +!5 = !{!"kernel_arg_type_qual", !"const", !"", !"const", !""} +!6 = !{void (i32, i32)* @second_test, !7, !8, !9, !10, !11} +!7 = !{!"kernel_arg_addr_space", i32 0, i32 0} +!8 = !{!"kernel_arg_access_qual", !"none", !"none"} +!9 = !{!"kernel_arg_type", !"int", !"int"} +!10 = !{!"kernel_arg_base_type", !"int", !"int"} +!11 = !{!"kernel_arg_type_qual", !"", !""} +!12 = !{void (i32 addrspace(2)*, i32 addrspace(1)*, i8 addrspace(2)*, double)* @test, i32 16, i32 1, i32 1, i1 true} +!13 = !{!"clang version 3.8.1 "} + +; Sanity checking +;CHECK-DAG: define spir_kernel void @test(ptr addrspace(2) %in, ptr addrspace(1) %out, ptr addrspace(2) %text, double %f) +;CHECK-DAG: define spir_kernel void @__vecz_v4_test(ptr addrspace(2) %in, ptr addrspace(1) %out, ptr addrspace(2) %text, double %f) + +; Check if we have the metadata for the kernels +; CHECK: !opencl.kernels = !{![[MD0:[0-9]+]], ![[MD6:[0-9]+]], ![[MD12:[0-9]+]]} +; CHECK: !opencl.kernel_wg_size_info = !{![[MD13:[0-9]+]], ![[MD14:[0-9]+]]} +; CHECK: !llvm.ident = !{![[MD15:[0-9]+]]} + +; Check the actual metadata +; CHECK: ![[MD0]] = !{ptr @test, ![[MD1:[0-9]+]], ![[MD2:[0-9]+]], ![[MD3:[0-9]+]], ![[MD4:[0-9]+]], ![[MD5:[0-9]+]]} +; CHECK: ![[MD1]] = !{!"kernel_arg_addr_space", i32 2, i32 1, i32 2, i32 0} +; CHECK: ![[MD2]] = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"} +; CHECK: ![[MD3]] = !{!"kernel_arg_type", !"int*", !"int*", !"char*", !"double"} +; CHECK: ![[MD4]] = !{!"kernel_arg_base_type", !"int*", !"int*", !"char*", !"double"} +; CHECK: ![[MD5]] = !{!"kernel_arg_type_qual", !"const", !"", !"const", !""} +; CHECK: ![[MD12]] = !{ptr @__vecz_v4_test, ![[MD1]], ![[MD2]], ![[MD3]], ![[MD4]], ![[MD5]]} +; CHECK: ![[MD13]] = !{ptr @test, i32 16, i32 1, i32 1, i1 true} +; CHECK: ![[MD14]] = !{ptr @__vecz_v4_test, i32 16, i32 1, i32 1, i1 true} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll new file mode 100644 index 0000000000000..0438341148fdc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll @@ -0,0 +1,76 @@ +; Copyright (C) Codeplay Software 
Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k second_test -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(2)* %in, i32 addrspace(1)* %out, i8 addrspace(2)* %text, double %f) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %call + %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +define spir_kernel void @second_test(i32 %a, i32 %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + ret void +} + +declare i64 @__mux_get_global_id(i32) + +!opencl.kernels = !{!0, !6} +!opencl.kernel_wg_size_info = !{!12} +!llvm.ident = !{!13} + +!0 = !{void (i32 addrspace(2)*, i32 addrspace(1)*, i8 addrspace(2)*, double)* @test, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 2, i32 1, i32 2, i32 0} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"int*", !"int*", !"char*", !"double"} +!4 = !{!"kernel_arg_base_type", !"int*", !"int*", !"char*", !"double"} +!5 = !{!"kernel_arg_type_qual", !"const", !"", !"const", !""} +!6 = !{void (i32, i32)* @second_test, !7, !8, !9, !10, !11} +!7 = !{!"kernel_arg_addr_space", i32 0, i32 0} +!8 = !{!"kernel_arg_access_qual", !"none", !"none"} +!9 = !{!"kernel_arg_type", !"int", !"int"} +!10 = !{!"kernel_arg_base_type", !"int", !"int"} +!11 = !{!"kernel_arg_type_qual", !"", !""} +!12 = !{void (i32 addrspace(2)*, i32 addrspace(1)*, i8 addrspace(2)*, double)* @test, i32 16, i32 1, i32 1, i1 true} +!13 = !{!"clang version 3.8.1 "} + +; Sanity checking +; CHECK: define spir_kernel void @second_test(i32 %a, i32 %b) +; CHECK: define spir_kernel void @__vecz_v4_second_test(i32 %a, i32 %b) + +; Check if we have the metadata for the kernels +; CHECK: !opencl.kernels = !{![[MD0:[0-9]+]], ![[MD6:[0-9]+]], ![[MD12:[0-9]+]]} +; CHECK: !opencl.kernel_wg_size_info = !{![[MD13:[0-9]+]]} +; CHECK: !llvm.ident = !{![[MD14:[0-9]+]]} + +; Check the actual metadata +; CHECK: ![[MD6]] = !{ptr @second_test, ![[MD7:[0-9]+]], ![[MD8:[0-9]+]], ![[MD9:[0-9]+]], ![[MD10:[0-9]+]], ![[MD11:[0-9]+]]} +; CHECK: ![[MD7]] = !{!"kernel_arg_addr_space", i32 0, i32 0} +; CHECK: ![[MD8]] = !{!"kernel_arg_access_qual", !"none", !"none"} +; CHECK: ![[MD9]] = !{!"kernel_arg_type", !"int", !"int"} +; CHECK: ![[MD10]] = !{!"kernel_arg_base_type", !"int", !"int"} +; CHECK: ![[MD11]] = !{!"kernel_arg_type_qual", !"", !""} +; CHECK: ![[MD12]] = !{ptr @__vecz_v4_second_test, ![[MD7]], ![[MD8]], ![[MD9]], ![[MD10]], ![[MD11]]} +; CHECK: ![[MD13]] = !{ptr @test, i32 16, i32 1, i32 1, i1 true} diff --git 
a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll new file mode 100644 index 0000000000000..ae11c9692391e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll @@ -0,0 +1,80 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -vecz-auto -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@entry_test_alloca.lm = external unnamed_addr addrspace(3) constant [16 x <2 x float>], align 8 + +define spir_kernel void @test(<2 x float> addrspace(1)* nocapture readonly %in, <2 x float> addrspace(1)* nocapture %out, i32 %offset) local_unnamed_addr { +entry: + %a.sroa.0 = alloca <2 x float>, align 16 + %b.sroa.2 = alloca <2 x float>, align 16 + %call = tail call i64 @__mux_get_global_id(i32 0) + %call1 = tail call i64 @__mux_get_local_id(i32 0) + %a.sroa.0.0..sroa_cast = bitcast <2 x float>* %a.sroa.0 to i8* + %b.sroa.2.0..sroa_cast = bitcast <2 x float>* %b.sroa.2 to i8* + %arrayidx2 = getelementptr inbounds [16 x <2 x float>], [16 x <2 x float>] addrspace(3)* @entry_test_alloca.lm, i64 0, i64 %call1 + %0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx2, align 8 + %conv = sext i32 %offset to i64 + %add = add i64 %call1, %conv + %arrayidx4 = getelementptr inbounds [16 x <2 x float>], [16 x <2 x float>] addrspace(3)* @entry_test_alloca.lm, i64 0, i64 %add + %1 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx4, align 8 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup10 + %mul.le.le = fmul <2 x float> %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0., %b.sroa.2.0.b.sroa.2.0.b.sroa.2.8. + %arrayidx17 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i64 %call + store <2 x float> %mul.le.le, <2 x float> addrspace(1)* %arrayidx17, align 8 + ret void + +for.body: ; preds = %for.cond.cleanup10, %entry + %i.038 = phi i32 [ 0, %entry ], [ %inc15, %for.cond.cleanup10 ] + store volatile <2 x float> %0, <2 x float>* %a.sroa.0, align 8 + store volatile <2 x float> %1, <2 x float>* %b.sroa.2, align 8 + br label %for.body11 + +for.cond.cleanup10: ; preds = %for.body11 + %inc15 = add nuw nsw i32 %i.038, 1 + %cmp = icmp ult i32 %inc15, 16 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body11: ; preds = %for.body11, %for.body + %i6.037 = phi i32 [ 0, %for.body ], [ %inc, %for.body11 ] + %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0. = load volatile <2 x float>, <2 x float>* %a.sroa.0, align 8 + %b.sroa.2.0.b.sroa.2.0.b.sroa.2.8. 
= load volatile <2 x float>, <2 x float>* %b.sroa.2, align 8 + %inc = add nuw nsw i32 %i6.037, 1 + %cmp8 = icmp ult i32 %inc, 16 + br i1 %cmp8, label %for.body11, label %for.cond.cleanup10 +} + +declare i64 @__mux_get_global_id(i32) local_unnamed_addr +declare i64 @__mux_get_local_id(i32) local_unnamed_addr + +; Check that all the allocas come before anything else +; CHECK: define spir_kernel void @__vecz_v4_test( +; CHECK-NEXT: entry: +; CHECK-NEXT: %a.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 +; CHECK-NEXT: %a.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 +; CHECK-NEXT: %a.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 +; CHECK-NEXT: %a.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 +; CHECK-NEXT: %b.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 +; CHECK-NEXT: %b.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 +; CHECK-NEXT: %b.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 +; CHECK-NEXT: %b.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll new file mode 100644 index 0000000000000..16b63d1e1c451 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll @@ -0,0 +1,66 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_branch -vecz-passes=cfg-convert,packetizer -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_branch(i32 %a, i32* %b) { +entry: + %conv = sext i32 %a to i64 + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %conv, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42 + store i32 13, i32* %arrayidx2, align 4 + br label %if.end + +if.end: + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This test checks if the branch conditions and the branch BBs are vectorized +; and masked properly +; CHECK: define spir_kernel void @__vecz_v4_test_branch(i32 %a, ptr %b) +; CHECK: %conv = sext i32 %a to i64 +; CHECK: %[[A_SPLATINSERT:.+]] = insertelement <4 x i64> poison, i64 %conv, {{i32|i64}} 0 +; CHECK: %[[A_SPLAT:.+]] = shufflevector <4 x i64> %[[A_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: %call = call i64 @__mux_get_global_id(i32 0) +; CHECK: %[[GID_SPLATINSERT:.+]] = insertelement <4 x i64> poison, i64 %call, {{i32|i64}} 0 +; CHECK: %[[GID_SPLAT:.+]] = shufflevector <4 x i64> %[[GID_SPLATINSERT:.+]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: %[[GID:.+]] = add <4 x i64> %[[GID_SPLAT]], +; CHECK: %[[CMP3:.+]] = icmp eq <4 x i64> %[[A_SPLAT]], %[[GID]] +; CHECK: %[[NOT_CMP4:.+]] = xor <4 x i1> %[[CMP3]], {{<(i1 true(, )?)+>|splat \(i1 true\)}} + +; CHECK: %[[IDX:.+]] = sext i32 %a to i64 +; CHECK: %[[GEP1:.+]] = getelementptr inbounds i32, ptr %b, i64 %[[IDX]] +; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 11, ptr %[[GEP1]], i1 %{{any_of_mask[0-9]*}}) + +; CHECK: %[[GEP2:.+]] = getelementptr inbounds i32, ptr %b, i64 42 +; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 13, ptr %[[GEP2]], i1 %{{any_of_mask[0-9]*}}) + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll new file mode 100644 index 0000000000000..2d953c5daa499 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll @@ -0,0 +1,149 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Check that debug info is preserved in the vectorized kernel. +; Specifically that the packetization pass creates vector types +; in the DI for the variables. 
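+; As an illustrative sketch (assumed shapes, not checked verbatim): a scalar
+; local such as
+;   !19 = !DILocalVariable(name: "a", scope: !4, file: !5, line: 5, type: !9)
+; attached to @add should reappear re-scoped to the cloned subprogram of
+; @__vecz_v4_add, with its #dbg_value locations rewritten by the packetizer;
+; the CHECK lines below pin down exactly this re-scoping for in1/in2/out/tid/a/b.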
+; RUN: veczc -k add -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Vectorized kernel function +; CHECK: @__vecz_v[[WIDTH:[0-9]+]]_add({{.*}} !dbg [[VECZ_SUBPROG:![0-9]+]] +; Check that intrinsics for user variable locations are still present +define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) #0 !dbg !4 { +entry: + %in1.addr = alloca i32 addrspace(1)*, align 8 + %in2.addr = alloca i32 addrspace(1)*, align 8 + %out.addr = alloca i32 addrspace(1)*, align 8 + %tid = alloca i64, align 8 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + store i32 addrspace(1)* %in1, i32 addrspace(1)** %in1.addr, align 8 +; CHECK: #dbg_value(ptr addrspace(1) %in1, [[DI_IN1:![0-9]+]], [[EXPR:!DIExpression()]] +; CHECK-SAME: [[PARAM_LOC:![0-9]+]] + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in1.addr, metadata !11, metadata !29), !dbg !30 + store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8 +; CHECK: #dbg_value(ptr addrspace(1) %in2, [[DI_IN2:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[PARAM_LOC]] + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in2.addr, metadata !12, metadata !29), !dbg !30 + store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8 +; CHECK: #dbg_value(ptr addrspace(1) %out, [[DI_OUT:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[PARAM_LOC]] + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30 +; CHECK: #dbg_value(i64 %call, [[DI_TID:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[TID_LOC:![0-9]+]] + call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31 + %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31 + store i64 %call, i64* %tid, align 8, !dbg !31 +; CHECK: #dbg_value(i32 poison, [[DI_A:![0-9]+]], !DIExpression(), +; CHECK-SAME: [[A_LOC:![0-9]+]] + call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32 + %0 = load i64, i64* %tid, align 8, !dbg !32 + %1 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8, !dbg !32 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 %0, !dbg !32 + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !32 + store i32 %2, i32* %a, align 4, !dbg !32 +; CHECK: #dbg_value(i32 poison, [[DI_B:![0-9]+]], !DIExpression(), +; CHECK-SAME: [[B_LOC:![0-9]+]] + call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !29), !dbg !33 + %3 = load i64, i64* %tid, align 8, !dbg !33 + %4 = load i32 addrspace(1)*, i32 addrspace(1)** %in2.addr, align 8, !dbg !33 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %4, i64 %3, !dbg !33 + %5 = load i32, i32 addrspace(1)* %arrayidx1, align 4, !dbg !33 + store i32 %5, i32* %b, align 4, !dbg !33 + %6 = load i32, i32* %a, align 4, !dbg !34 + %7 = load i32, i32* %b, align 4, !dbg !34 + %add = add nsw i32 %6, %7, !dbg !34 + %8 = load i64, i64* %tid, align 8, !dbg !34 + %9 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !34 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %8, !dbg !34 + store i32 %add, i32 addrspace(1)* %arrayidx2, align 4, !dbg !34 + ret void, !dbg !35 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +declare i64 @__mux_get_global_id(i32) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" 
"no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nobuiltin } + +!llvm.dbg.cu = !{!0} +!opencl.kernels = !{!21} +!llvm.module.flags = !{!27} +!llvm.ident = !{!28} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2) +!1 = !DIFile(filename: "", directory: "/tmp") +!2 = !{} +!3 = !{!4} +!4 = distinct !DISubprogram(name: "add", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !10) +!5 = !DIFile(filename: "kernel.opencl", directory: "/tmp") +!6 = !DISubroutineType(types: !7) +!7 = !{null, !8, !8, !8} +!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, align: 64) +!9 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!10 = !{!11, !12, !13, !14, !19, !20} +!11 = !DILocalVariable(name: "in1", arg: 1, scope: !4, file: !5, line: 1, type: !8) +!12 = !DILocalVariable(name: "in2", arg: 2, scope: !4, file: !5, line: 1, type: !8) +!13 = !DILocalVariable(name: "out", arg: 3, scope: !4, file: !5, line: 1, type: !8) +!14 = !DILocalVariable(name: "tid", scope: !4, file: !5, line: 3, type: !15) +!15 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !16, line: 33, baseType: !17) +!16 = !DIFile(filename: "/Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "/tmp") +!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !16, line: 31, baseType: !18) +!18 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) +!19 = !DILocalVariable(name: "a", scope: !4, file: !5, line: 5, type: !9) +!20 = !DILocalVariable(name: "b", scope: !4, file: !5, line: 6, type: !9) +!21 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*)* @add, !22, !23, !24, !25, !26} +!22 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1} +!23 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!24 = !{!"kernel_arg_type", !"int*", !"int*", !"int*"} +!25 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*"} +!26 = !{!"kernel_arg_type_qual", !"", !"", !""} +!27 = !{i32 2, !"Debug Info Version", i32 3} +!28 = !{!"clang version 3.8.0 "} +!29 = !DIExpression() +!30 = !DILocation(line: 1, scope: !4) +!31 = !DILocation(line: 3, scope: !4) +!32 = !DILocation(line: 5, scope: !4) +!33 = !DILocation(line: 6, scope: !4) +!34 = !DILocation(line: 7, scope: !4) +!35 = !DILocation(line: 8, scope: !4) + + +; Debug info metadata entries +; CHECK:[[PTR_TYPE:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[DI_BASE:![0-9]+]], size: 64, align: 64) +; CHECK:[[DI_BASE]] = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) + +; CHECK: [[VECZ_SUBPROG]] = distinct !DISubprogram(name: "add", +; CHECK-SAME: retainedNodes: [[VECZ_VARS:![0-9]+]] + +; CHECK: [[VECZ_VARS]] = !{[[DI_IN1]], [[DI_IN2]], [[DI_OUT]], [[DI_TID]], [[DI_A:![0-9]+]], [[DI_B:![0-9]+]]} +; CHECK: [[DI_IN1]] = !DILocalVariable(name: "in1", arg: 1, scope: [[VECZ_SUBPROG]], +; CHECK-SAME:line: 1, type: 
[[PTR_TYPE]] +; CHECK: [[DI_IN2]] = !DILocalVariable(name: "in2", arg: 2, scope: [[VECZ_SUBPROG]], +; CHECK-SAME:line: 1, type: [[PTR_TYPE]] +; CHECK: [[DI_OUT]] = !DILocalVariable(name: "out", arg: 3, scope: [[VECZ_SUBPROG]], +; CHECK-SAME: line: 1, type: [[PTR_TYPE]] + +; CHECK: [[DI_TID]] = !DILocalVariable(name: "tid", scope: [[VECZ_SUBPROG]] +; CHECK: [[DI_A]] = !DILocalVariable(name: "a", scope: [[VECZ_SUBPROG]], diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll new file mode 100644 index 0000000000000..9750f6bae94d0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll @@ -0,0 +1,93 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_nonvarying_loadstore -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_branch(i32 %a, i32* %b) { +entry: + %conv = sext i32 %a to i64 + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %conv, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42 + store i32 13, i32* %arrayidx2, align 4 + br label %if.end + +if.end: + ret void +} + +define spir_kernel void @test_uniform_branch(i32 %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i32 %a, 42 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %idxprom = sext i32 %a to i64 + %idxadd = add i64 %idxprom, %call + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxadd + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %call + store i32 13, i32* %arrayidx2, align 4 + br label %if.end + +if.end: + %ptr = phi i32* [ %arrayidx, %if.then ], [ %arrayidx2, %if.else ] + %ptrplus = getelementptr inbounds i32, i32* %ptr, i64 %call + store i32 17, i32* %ptrplus, align 4 + ret void +} + +define spir_func void @test_nonvarying_loadstore(i32* %a, i32* %b, i32* %c) { + %index = call i64 @__mux_get_global_id(i32 0) + %a.i = getelementptr i32, i32* %a, i64 %index + %b.i = getelementptr i32, i32* %b, i64 %index + %c.i = getelementptr i32, i32* %c, i64 %index + %a.load = load i32, i32* %a.i, align 4 + %b.load = load i32, i32* %b.i, align 4 + %add = add i32 %a.load, %b.load + store i32 %add, i32* %c.i + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This test checks if a simple kernel is vectorized without any masks +; 
CHECK: define spir_func void @__vecz_v4_test_nonvarying_loadstore(ptr %a, ptr %b, ptr %c) +; CHECK: %index = call i64 @__mux_get_global_id(i32 0) +; CHECK: %a.i = getelementptr i32, ptr %a, i64 %index +; CHECK: %b.i = getelementptr i32, ptr %b, i64 %index +; CHECK: %c.i = getelementptr i32, ptr %c, i64 %index +; CHECK: %[[LAV:.+]] = load <4 x i32>, ptr %a.i{{(, align 4)?}} +; CHECK: %[[LBV:.+]] = load <4 x i32>, ptr %b.i{{(, align 4)?}} +; CHECK: %[[ADD1:.+]] = add <4 x i32> %[[LAV]], %[[LBV]] +; CHECK: store <4 x i32> %[[ADD1]], ptr %c.i{{(, align 4)?}} +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll new file mode 100644 index 0000000000000..9f82652ea8a23 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll @@ -0,0 +1,105 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_uniform_branch -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_branch(i32 %a, i32* %b) { +entry: + %conv = sext i32 %a to i64 + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %conv, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42 + store i32 13, i32* %arrayidx2, align 4 + br label %if.end + +if.end: + ret void +} + +define spir_kernel void @test_uniform_branch(i32 %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i32 %a, 42 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %idxprom = sext i32 %a to i64 + %idxadd = add i64 %idxprom, %call + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxadd + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %call + store i32 13, i32* %arrayidx2, align 4 + br label %if.end + +if.end: + %ptr = phi i32* [ %arrayidx, %if.then ], [ %arrayidx2, %if.else ] + %ptrplus = getelementptr inbounds i32, i32* %ptr, i64 %call + store i32 17, i32* %ptrplus, align 4 + ret void +} + +define spir_func void @test_nonvarying_loadstore(i32* %a, i32* %b, i32* %c) { + %index = call i64 @__mux_get_global_id(i32 0) + %a.i = getelementptr i32, i32* %a, i64 %index + %b.i = getelementptr i32, i32* %b, i64 %index + %c.i = getelementptr i32, i32* %c, i64 %index + %a.load = load i32, i32* %a.i, align 4 + %b.load = load i32, i32* %b.i, align 4 + %add = add i32 %a.load, 
%b.load + store i32 %add, i32* %c.i + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This test checks if the if blocks are vectorized without masks and if the phi +; node is also vectorized properly +; CHECK: define spir_kernel void @__vecz_v4_test_uniform_branch(i32 %a, ptr %b) +; CHECK: %call = call i64 @__mux_get_global_id(i32 0) +; CHECK: %[[SPLATINSERT:.+]] = insertelement <4 x i64> poison, i64 %call, {{i32|i64}} 0 +; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i64> %[[SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: %[[GID:.+]] = add <4 x i64> %[[SPLAT]], +; CHECK: %cmp = icmp eq i32 %a, 42 +; CHECK: br i1 %cmp, label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: %[[GEP1:.+]] = getelementptr i32, ptr %b, <4 x i64> +; CHECK: store <4 x i32> {{<(i32 11(, )?)+>|splat \(i32 11\)}}, ptr %{{.+}}, align 4 +; CHECK: br label %if.end + +; CHECK: if.else: +; CHECK: %[[GEP2:.+]] = getelementptr i32, ptr %b, <4 x i64> +; CHECK: store <4 x i32> {{<(i32 13(, )?)+>|splat \(i32 13\)}}, ptr %{{.+}}, align 4 +; CHECK: br label %if.end + +; CHECK: if.end: +; CHECK: %[[PTR:.+]] = phi <4 x ptr> [ %[[GEP1]], %if.then ], [ %[[GEP2]], %if.else ] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_i48.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_i48.ll new file mode 100644 index 0000000000000..7b27991e6740f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_i48.ll @@ -0,0 +1,50 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) + +define spir_kernel void @test(ptr %0, ptr %1) { +entry: + %lid = tail call i64 @__mux_get_local_id(i32 0) + %ptr.0 = getelementptr i32, ptr %0, i64 %lid + %ptr.1 = getelementptr i32, ptr %1, i64 %lid + %val = load i48, ptr %ptr.0 + store i48 %val, ptr %ptr.1 + ret void +} + +; CHECK-LABEL: define spir_kernel void @test +; CHECK: load i48 +; CHECK-NOT: load i48 +; CHECK: store i48 +; CHECK-NOT: store i48 + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_test +; CHECK: load i48 +; CHECK: load i48 +; CHECK: load i48 +; CHECK: load i48 +; CHECK-NOT: load i48 +; CHECK: store i48 +; CHECK: store i48 +; CHECK: store i48 +; CHECK: store i48 +; CHECK-NOT: store i48 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_phi_struct.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_phi_struct.ll new file mode 100644 index 0000000000000..4723d16da1af5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_phi_struct.ll @@ -0,0 +1,41 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_foo()
+define spir_kernel void @foo() {
+; CHECK-LABEL: entry:
+entry:
+  ; CHECK: %0 = call { <4 x i64>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align8_monotonic_monotonic_1_Dv4_u3ptrDv4_mDv4_mDv4_b(
+  %0 = cmpxchg ptr null, i64 0, i64 0 monotonic monotonic, align 8
+  ; CHECK: br label %bb.1
+  br label %bb.1
+
+; CHECK-LABEL: bb.1:
+bb.1:
+  ; CHECK: %1 = phi { <4 x i64>, <4 x i1> } [ %0, %bb.1 ], [ %0, %entry ]
+  %1 = phi { i64, i1 } [ %0, %bb.1 ], [ %0, %entry ]
+  ; CHECK: %2 = extractvalue { <4 x i64>, <4 x i1> } %1, 0
+  %2 = extractvalue { i64, i1 } %1, 0
+  ; CHECK: %3 = call { <4 x i64>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align8_monotonic_monotonic_1_Dv4_u3ptrDv4_mDv4_mDv4_b(
+  %3 = cmpxchg ptr null, i64 0, i64 %2 monotonic monotonic, align 8
+  ; CHECK: br label %bb.1
+  br label %bb.1
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
new file mode 100644
index 0000000000000..c23396643f9d1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
@@ -0,0 +1,46 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+%struct.T = type { i32, i8, float, i64 }
+
+; Function Attrs: nounwind
+define spir_kernel void @test(%struct.T addrspace(1)* %in, %struct.T addrspace(1)* %out, i32 addrspace(1)* %offsets) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %offsets, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %conv = sext i32 %0 to i64
+  %add = add i64 %conv, %call
+  %c = getelementptr inbounds %struct.T, %struct.T addrspace(1)* %in, i64 %add, i32 2
+  %1 = load float, float addrspace(1)* %c, align 8
+  %c3 = getelementptr inbounds %struct.T, %struct.T addrspace(1)* %out, i64 %add, i32 2
+  store float %1, float addrspace(1)* %c3, align 8
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; Check if we can packetize GEPs on structs
+; Note that we only need to packetize the non-uniform operands.
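+; As an illustrative sketch (not checked verbatim; %c.v and %add.v are
+; hypothetical names), the scalar GEP
+;   %c = getelementptr inbounds %struct.T, %struct.T addrspace(1)* %in, i64 %add, i32 2
+; is expected to widen to something like
+;   %c.v = getelementptr %struct.T, ptr addrspace(1) %in, <4 x i64> %add.v, i32 2
+; where only the varying index %add becomes a <4 x i64> vector, while the
+; uniform base pointer and the constant field index (i32 2) stay scalar.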
+; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: getelementptr %struct.T, ptr addrspace(1) %{{.+}}, <4 x i64> %{{.+}}, i32 2 +; CHECK: getelementptr %struct.T, ptr addrspace(1) %{{.+}}, <4 x i64> %{{.+}}, i32 2 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll new file mode 100644 index 0000000000000..4c5d2b32da7f4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll @@ -0,0 +1,159 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k conditional -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* 
%arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %0 = load i32, i32 addrspace(1)* %in, align 4 + %rem1 = and i32 %0, 1 + %tobool = icmp eq i32 %rem1, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + +; This test checks if the "packetize uniform" Vecz choice works on uniform +; values used by varying values, but not on uniform values used by other uniform +; values only. + +; CHECK: define spir_kernel void @__vecz_v4_conditional(ptr addrspace(1) %in, ptr addrspace(1) %out) +; CHECK: insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %in, {{(i32|i64)}} 0 +; CHECK: shufflevector <4 x ptr addrspace(1)> +; CHECK: call <4 x i32> @__vecz_b_gather_load4_Dv4_jDv4_u3ptrU3AS1 +; CHECK: store <4 x i32> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll new file mode 100644 index 0000000000000..89d5118c8fbc1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll @@ -0,0 +1,159 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k conditional -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 
addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %0 = load i32, i32 addrspace(1)* %in, align 4 + %rem1 = and i32 %0, 1 + %tobool = icmp eq i32 %rem1, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + +; This test checks the kernel when the "packetize uniform" Vecz choice is not +; explicitly set. Currently, this means that the uniform values should not be +; packetized. + +; CHECK: define spir_kernel void @__vecz_v4_conditional(ptr addrspace(1) %in, ptr addrspace(1) %out) +; CHECK: load i32, ptr +; CHECK: insertelement <4 x i32> poison +; CHECK: shufflevector <4 x i32> +; CHECK: store <4 x i32> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll new file mode 100644 index 0000000000000..982b2352ced3a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll @@ -0,0 +1,159 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k noreduce -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 
addrspace(1)* %out) #0 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0) #3
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %rem1 = and i32 %0, 1
+  %tobool = icmp eq i32 %rem1, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4
+  br label %if.end
+
+if.end: ; preds = %entry, %if.then
+  ret void
+}
+
+; This test checks how the kernel is vectorized when the "packetize uniform"
+; Vecz choice is not explicitly set. Currently this means that uniform values
+; should not be packetized.
+
+; CHECK: define spir_kernel void @__vecz_v4_noreduce(ptr addrspace(3) %in, ptr addrspace(3) %out)
+; CHECK: icmp ugt i64
+; CHECK: and i32{{.*}}, 3
+; CHECK: icmp eq i32
+; CHECK: shl i32
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
new file mode 100644
index 0000000000000..2e5a7b31a1665
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
@@ -0,0 +1,73 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
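+;
+; An approximate OpenCL C equivalent of the noreduce2 kernel tested below (an
+; editorial reconstruction from the IR, not the original source):
+;
+;   __kernel void noreduce2(__local int *in, __local int *out) {
+;     size_t lid = get_local_id(0); // computed but unused in this kernel
+;     for (unsigned i = 1; i < get_local_size(0); i *= 2)
+;       if (37 % (i == 8 ? 17 : i) == 0) out[i] = 5;
+;   }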
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k noreduce2 -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_local_size(i32)
+
+; Function Attrs: nounwind
+define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:
+  %call = call i64 @__mux_get_local_id(i32 0)
+  br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
+  %conv = zext i32 %storemerge to i64
+  %call1 = call i64 @__mux_get_local_size(i32 0)
+  %cmp = icmp ult i64 %conv, %call1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 8
+  %1 = select i1 %0, i32 17, i32 %storemerge
+  %rem = urem i32 37, %1
+  %cmp3 = icmp eq i32 %rem, 0
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+  %idxprom = zext i32 %storemerge to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1
+  br label %for.cond
+
+for.end: ; preds = %for.cond
+  ret void
+}
+
+; This test checks how the kernel is vectorized when the "packetize uniform"
+; Vecz choice is not explicitly set. Currently this means that uniform values
+; should not be packetized.
+
+; CHECK: define spir_kernel void @__vecz_v4_noreduce2(ptr addrspace(3) %in, ptr addrspace(3) %out)
+; CHECK: icmp ugt i64 %{{.+}}, 1
+; CHECK: phi i32
+; CHECK: icmp eq i32 %{{.+}}, 8
+; CHECK: urem i32 37
+; CHECK: icmp eq i32 %{{.+}}, 0
+; CHECK: store i32 5
+; CHECK: shl i32 %{{.+}}, 1
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
new file mode 100644
index 0000000000000..6fc47a670a781
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
@@ -0,0 +1,165 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
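+;
+; An approximate OpenCL C equivalent of the reduce kernel tested below (an
+; editorial reconstruction from the IR, not the original source; the select
+; guarding the divisor in the IR protects against i * 3 being zero):
+;
+;   __kernel void reduce(__local int *in, __local int *out) {
+;     size_t lid = get_local_id(0);
+;     for (unsigned i = 1; i < get_local_size(0); i *= 2)
+;       if (lid % (i * 3) == 0) out[lid] = 5;
+;   }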
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k reduce -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 
addrspace(1)* %out) #0 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0) #3
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %rem1 = and i32 %0, 1
+  %tobool = icmp eq i32 %rem1, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4
+  br label %if.end
+
+if.end: ; preds = %entry, %if.then
+  ret void
+}
+
+; This test checks how the kernel is vectorized when the "packetize uniform"
+; Vecz choice is not explicitly set. Currently this means that uniform values
+; should not be packetized.
+
+; CHECK: define spir_kernel void @__vecz_v4_reduce(ptr addrspace(3) %in, ptr addrspace(3) %out)
+; CHECK: insertelement <4 x i64> poison, i64
+; CHECK: shufflevector <4 x i64>
+; CHECK: %[[LOCAL_SIZE:[^ ]+]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK: icmp {{(ugt|ult)}} i64 %[[LOCAL_SIZE]], {{(1|2)}}
+; CHECK-NEXT: br
+; CHECK: phi i32
+; CHECK: mul i32 %{{.+}}, 3
+; CHECK: icmp eq <4 x i64> %{{.+}}, zeroinitializer
+; CHECK: call void @__vecz_b_masked_store4_Dv4_ju3ptrU3AS3Dv4_b(<4 x i32> {{<(i32 5(, )?)+>|splat \(i32 5\)}}
+; CHECK: shl i32 %{{.+}}, 1
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
new file mode 100644
index 0000000000000..1a4a89972205f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
@@ -0,0 +1,160 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
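+;
+; An approximate OpenCL C equivalent of the conditional kernel tested below
+; (an editorial reconstruction from the IR, not the original source):
+;
+;   __kernel void conditional(__global int *in, __global int *out) {
+;     size_t gid = get_global_id(0);
+;     int c = in[0];
+;     if (c & 1) out[gid] = in[c];
+;   }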
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k conditional -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void 
@conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %0 = load i32, i32 addrspace(1)* %in, align 4 + %rem1 = and i32 %0, 1 + %tobool = icmp eq i32 %rem1, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + +; This test checks if the "packetize uniform in loops" Vecz choice works on +; uniform values used by varying values in loops, but not on uniform values used +; by other uniform values only. + +; CHECK: define spir_kernel void @__vecz_v4_conditional(ptr addrspace(1) %in, ptr addrspace(1) %out) +; CHECK: load i32, {{(ptr|i32)}} +; CHECK: load i32, {{(ptr|i32)}} +; CHECK: insertelement <4 x i32> poison +; CHECK: shufflevector <4 x i32> +; CHECK: store <4 x i32> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll new file mode 100644 index 0000000000000..fd65118718f99 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll @@ -0,0 +1,159 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
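+;
+; Note: these are the same kernels as in the packetize_uniform_default tests;
+; the only intended difference is -vecz-choices=PacketizeUniformInLoops on the
+; RUN line. For noreduce the uniform loop values only feed other uniform
+; values, so the CHECKs below still expect scalar code.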
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k noreduce -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void 
@conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %0 = load i32, i32 addrspace(1)* %in, align 4 + %rem1 = and i32 %0, 1 + %tobool = icmp eq i32 %rem1, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + +; This test checks if the "packetize uniform in loops" Vecz choice works on +; uniform values used by varying values in loops, but not on uniform values used +; by other uniform values only. + +; CHECK: define spir_kernel void @__vecz_v4_noreduce(ptr addrspace(3) %in, ptr addrspace(3) %out) +; CHECK: icmp ugt i64 +; CHECK: and i32{{.*}}, 3 +; CHECK: icmp eq i32 +; CHECK: shl i32 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll new file mode 100644 index 0000000000000..9e0a24b6879e6 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll @@ -0,0 +1,73 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
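+;
+; As in the other noreduce2 variants, the whole urem chain here is uniform
+; (the divisor select (i == 8 ? 17 : i) has no varying uses), so even with
+; PacketizeUniformInLoops the CHECKs below expect a scalar phi and urem.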
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k noreduce2 -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 8 + %1 = select i1 %0, i32 17, i32 %storemerge + %rem = urem i32 37, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; This test checks if the "packetize uniform in loops" Vecz choice works on +; uniform values used by varying values in loops, but not on uniform values used +; by other uniform values only. + +; CHECK: define spir_kernel void @__vecz_v4_noreduce2(ptr addrspace(3) %in, ptr addrspace(3) %out) +; CHECK: icmp ugt i64 +; CHECK: phi i32 +; CHECK: icmp eq i32 +; CHECK: urem i32 37 +; CHECK: icmp eq i32 +; CHECK: store i32 5 +; CHECK: shl i32 %{{.+}}, 1 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll new file mode 100644 index 0000000000000..e251cc4bd07e1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll @@ -0,0 +1,76 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
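+;
+; In the reduce kernel the uniform loop counter feeds the varying computation
+; lid % (3 * i), so with PacketizeUniformInLoops the CHECKs below expect the
+; counter phi and the multiply to be packetized into <4 x i32> vectors.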
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k reduce -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; This test checks if the "packetize uniform in loops" Vecz choice works on +; uniform values used by varying values in loops, but not on uniform values used +; by other uniform values only. + +; CHECK: define spir_kernel void @__vecz_v4_reduce(ptr addrspace(3) %in, ptr addrspace(3) %out) +; CHECK: insertelement <4 x i64> poison, i64 %{{.+}}, {{(i32|i64)}} 0 +; CHECK: shufflevector <4 x i64> %{{.+}}, <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: phi <4 x i32> +; CHECK: mul <4 x i32> %{{.+}}, {{<(i32 3(, )?)+>|splat \(i32 3\)}} +; CHECK: urem <4 x i64> +; CHECK: icmp eq <4 x i64> %{{.+}}, zeroinitializer + +; The branch condition is actually Uniform, despite the divergence analysis +; CHECK: icmp ugt i64 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll new file mode 100644 index 0000000000000..93634442feb66 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll @@ -0,0 +1,159 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
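+;
+; Compared with PacketizeUniformInLoops, the plain PacketizeUniform choice (as
+; exercised by these tests) packetizes uniform values with any varying use,
+; not only uses inside loops. Nothing in noreduce qualifies, so the CHECKs
+; below still expect scalar code.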
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k noreduce -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @conditional(i32 
addrspace(1)* %in, i32 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %0 = load i32, i32 addrspace(1)* %in, align 4 + %rem1 = and i32 %0, 1 + %tobool = icmp eq i32 %rem1, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + +; This test checks if the "packetize uniform" Vecz choice works on uniform +; values used by varying values, but not on uniform values used by other uniform +; values only. + +; CHECK: define spir_kernel void @__vecz_v4_noreduce(ptr addrspace(3) %in, ptr addrspace(3) %out) +; CHECK: icmp ugt i64 +; CHECK: and i32{{.*}}, 3 +; CHECK: icmp eq i32 +; CHECK: shl i32 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll new file mode 100644 index 0000000000000..716ee2540db66 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll @@ -0,0 +1,73 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
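+;
+; Same noreduce2 kernel again: its uniform urem chain has no varying uses, so
+; the CHECKs below expect it to stay scalar under PacketizeUniform as well.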
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k noreduce2 -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 8 + %1 = select i1 %0, i32 17, i32 %storemerge + %rem = urem i32 37, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; This test checks if the "packetize uniform" Vecz choice works on uniform +; values used by varying values, but not on uniform values used by other uniform +; values only. + +; CHECK: define spir_kernel void @__vecz_v4_noreduce2(ptr addrspace(3) %in, ptr addrspace(3) %out) +; CHECK: icmp ugt i64 +; CHECK: phi i32 +; CHECK: icmp eq i32 +; CHECK: urem i32 37 +; CHECK: icmp eq i32 +; CHECK: store i32 5 +; CHECK: shl i32 %{{.+}}, 1 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll new file mode 100644 index 0000000000000..3815fce8e6637 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll @@ -0,0 +1,76 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
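+;
+; As with the PacketizeUniformInLoops variant, the loop counter in reduce
+; feeds the varying lid % (3 * i) computation, so the CHECKs below expect the
+; counter phi and multiply in packetized <4 x i32> form.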
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k reduce -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; This test checks if the "packetize uniform" Vecz choice works on uniform +; values used by varying values, but not on uniform values used by other uniform +; values only. + +; CHECK: define spir_kernel void @__vecz_v4_reduce(ptr addrspace(3) %in, ptr addrspace(3) %out) +; CHECK: insertelement <4 x i64> poison, i64 %{{.+}}, {{(i32|i64)}} 0 +; CHECK: shufflevector <4 x i64> %{{.+}}, <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: phi <4 x i32> +; CHECK: mul <4 x i32> %{{.+}}, {{<(i32 3(, )?)+>|splat \(i32 3\)}} +; CHECK: urem <4 x i64> +; CHECK: icmp eq <4 x i64> %{{.+}}, zeroinitializer + +; The branch condition is actually Uniform, despite the divergence analysis +; CHECK: icmp ugt i64 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll new file mode 100644 index 0000000000000..873ea7a983eae --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll @@ -0,0 +1,377 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization0 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; \ / +; d +; | +; e +; / \ +; / \ +; f g +; / \ / \ +; h i j k +; \ / \ / +; l m +; \ / +; \ / +; n +; +; * where node e is a uniform branch, and nodes a, f and g are varying +; branches. +; * where nodes b, c, d, h, i, j, k, l, m are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; c +; | +; b +; | +; d +; | +; e +; / \ +; f g +; | | +; i k +; | | +; h j +; | | +; l m +; \ / +; n +; +; instead of: +; +; a +; | +; b +; | +; c +; | +; d +; | +; e +; | +; g +; | +; j +; | +; k +; | +; m +; | +; f +; | +; i +; | +; h +; | +; l +; | +; n +; +; __kernel void partial_linearization0(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (id % 5 == 0) { +; for (int i = 0; i < n * 2; i++) ret++; +; } else { +; for (int i = 0; i < n / 4; i++) ret++; +; } +; +; if (n > 10) { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n + 10; i++) ret++; +; } else { // varying +; for (int i = 0; i < n + 10; i++) ret *= 2; +; } +; ret += id * 10; +; } else { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n + 8; i++) ret++; +; } else { // varying +; for (int i = 0; i < n + 8; i++) ret *= 2; +; } +; ret += id / 2; +; } +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization0(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %rem = srem i32 %conv, 5 + %cmp = icmp eq i32 %rem, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ 0, %if.then ], [ %inc, %for.body ] + %storemerge8 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge8, %mul + br i1 %cmp2, label %for.body, label %if.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.0, 1 + %inc4 = add nsw i32 %storemerge8, 1 + br label %for.cond + +if.else: ; preds = %entry + br label %for.cond6 + +for.cond6: ; preds = %for.body9, %if.else + %ret.1 = phi i32 [ 0, %if.else ], [ %inc10, %for.body9 ] + %storemerge = phi i32 [ 0, %if.else ], [ %inc12, %for.body9 ] + %div = sdiv i32 %n, 4 + %cmp7 = icmp slt i32 %storemerge, %div + br i1 %cmp7, label %for.body9, label %if.end + +for.body9: ; preds = %for.cond6 + %inc10 = add nsw i32 %ret.1, 1 + %inc12 = add nsw i32 %storemerge, 1 + br label %for.cond6 + +if.end: ; preds = %for.cond6, %for.cond + %ret.2 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond6 ] + %cmp14 = icmp sgt i32 %n, 10 + %rem175 = and i32 %conv, 1 + %cmp18 = icmp eq i32 %rem175, 0 + br i1 %cmp14, label %if.then16, label %if.else44 + +if.then16: ; preds = %if.end + br i1 %cmp18, label %if.then20, label %if.else30 + +if.then20: ; preds = %if.then16 + br label %for.cond22 + +for.cond22: ; preds = %for.body25, %if.then20 + %ret.3 = phi i32 [ %ret.2, %if.then20 ], [ %inc26, %for.body25 ] + %storemerge7 = phi i32 [ 0, %if.then20 ], [ %inc28, %for.body25 ] + %add = add nsw i32 %n, 10 + %cmp23 = icmp slt i32 
%storemerge7, %add + br i1 %cmp23, label %for.body25, label %if.end41 + +for.body25: ; preds = %for.cond22 + %inc26 = add nsw i32 %ret.3, 1 + %inc28 = add nsw i32 %storemerge7, 1 + br label %for.cond22 + +if.else30: ; preds = %if.then16 + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.else30 + %ret.4 = phi i32 [ %ret.2, %if.else30 ], [ %mul37, %for.body36 ] + %storemerge6 = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ] + %add33 = add nsw i32 %n, 10 + %cmp34 = icmp slt i32 %storemerge6, %add33 + br i1 %cmp34, label %for.body36, label %if.end41 + +for.body36: ; preds = %for.cond32 + %mul37 = shl nsw i32 %ret.4, 1 + %inc39 = add nsw i32 %storemerge6, 1 + br label %for.cond32 + +if.end41: ; preds = %for.cond32, %for.cond22 + %ret.5 = phi i32 [ %ret.3, %for.cond22 ], [ %ret.4, %for.cond32 ] + %mul42 = mul nsw i32 %conv, 10 + %add43 = add nsw i32 %ret.5, %mul42 + br label %if.end73 + +if.else44: ; preds = %if.end + br i1 %cmp18, label %if.then48, label %if.else59 + +if.then48: ; preds = %if.else44 + br label %for.cond50 + +for.cond50: ; preds = %for.body54, %if.then48 + %ret.6 = phi i32 [ %ret.2, %if.then48 ], [ %inc55, %for.body54 ] + %storemerge4 = phi i32 [ 0, %if.then48 ], [ %inc57, %for.body54 ] + %add51 = add nsw i32 %n, 8 + %cmp52 = icmp slt i32 %storemerge4, %add51 + br i1 %cmp52, label %for.body54, label %if.end70 + +for.body54: ; preds = %for.cond50 + %inc55 = add nsw i32 %ret.6, 1 + %inc57 = add nsw i32 %storemerge4, 1 + br label %for.cond50 + +if.else59: ; preds = %if.else44 + br label %for.cond61 + +for.cond61: ; preds = %for.body65, %if.else59 + %ret.7 = phi i32 [ %ret.2, %if.else59 ], [ %mul66, %for.body65 ] + %storemerge2 = phi i32 [ 0, %if.else59 ], [ %inc68, %for.body65 ] + %add62 = add nsw i32 %n, 8 + %cmp63 = icmp slt i32 %storemerge2, %add62 + br i1 %cmp63, label %for.body65, label %if.end70 + +for.body65: ; preds = %for.cond61 + %mul66 = shl nsw i32 %ret.7, 1 + %inc68 = add nsw i32 %storemerge2, 1 + br label %for.cond61 + +if.end70: ; preds = %for.cond61, %for.cond50 + %ret.8 = phi i32 [ %ret.6, %for.cond50 ], [ %ret.7, %for.cond61 ] + %div71 = sdiv i32 %conv, 2 + %add72 = add nsw i32 %ret.8, %div71 + br label %if.end73 + +if.end73: ; preds = %if.end70, %if.end41 + %storemerge3 = phi i32 [ %add72, %if.end70 ], [ %add43, %if.end41 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 
1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization0, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization0 +; CHECK: br label %[[FORCOND6PREHEADER:.+]] + +; CHECK: [[FORCOND6PREHEADER]]: +; CHECK: br label %[[FORCOND6:.+]] + +; CHECK: [[FORCONDPREHEADER:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{((%[0-9A-Za-z\.]+))|(false)}}, label %[[FORBODY:.+]], label %[[IFENDLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND6]]: +; CHECK: %[[CMP7:.+]] = icmp +; CHECK: br i1 %[[CMP7]], label %[[FORBODY9:.+]], label %[[IFENDLOOPEXIT6:.+]] + +; CHECK: [[FORBODY9]]: +; CHECK: br label %[[FORCOND6]] + +; CHECK: [[IFENDLOOPEXIT]]: +; CHECK: br label %[[IFEND:.+]] + +; CHECK: [[IFENDLOOPEXIT6]]: +; CHECK: br label %[[FORCONDPREHEADER]] + +; CHECK: [[IFEND]]: +; CHECK: %[[CMP14:.+]] = icmp +; CHECK: br i1 %[[CMP14]], label %[[IFTHEN16:.+]], label %[[IFELSE44:.+]] + +; CHECK: [[IFTHEN16]]: +; CHECK: br label %[[FORCOND32PREHEADER:.+]] + +; CHECK: [[FORCOND32PREHEADER:.+]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[FORCOND22PREHEADER:.+]]: +; CHECK: br label %[[FORCOND22:.+]] + +; CHECK: [[FORCOND22]]: +; CHECK: br i1 {{(%([0-9A-Za-z\.])+)|(false)}}, label %[[FORBODY25:.+]], label %[[IFEND41LOOPEXIT:.+]] + +; CHECK: [[FORBODY25]]: +; CHECK: br label %[[FORCOND22]] + +; CHECK: [[FORCOND32]]: +; CHECK: %[[CMP34:.+]] = icmp +; CHECK: br i1 %[[CMP34]], label %[[FORBODY36:.+]], label %[[IFEND41LOOPEXIT4:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[IFEND41LOOPEXIT]]: +; CHECK: br label %[[IFEND41:.+]] + +; CHECK: [[IFEND41LOOPEXIT4]]: +; CHECK: br label %[[FORCOND22PREHEADER]] + +; CHECK: [[IFEND41]]: +; CHECK: br label %[[IFEND73:.+]] + +; CHECK: [[IFELSE44]]: +; CHECK: br label %[[FORCOND61PREHEADER:.+]] + +; CHECK: [[FORCOND61PREHEADER]]: +; CHECK: br label %[[FORCOND61:.+]] + +; CHECK: [[FORCOND50PREHEADER:.+]]: +; CHECK: br label %[[FORCOND50:.+]] + +; CHECK: [[FORCOND50]]: +; CHECK: br i1 {{(%([0-9A-Za-z\.])+)|(false)}}, label %[[FORBODY54:.+]], label %[[IFEND70LOOPEXIT:.+]] + +; CHECK: [[FORBODY54]]: +; CHECK: br label %[[FORCOND50]] + +; CHECK: [[FORCOND61]]: +; CHECK: %[[CMP63:.+]] = icmp +; CHECK: br i1 %[[CMP63]], label %[[FORBODY65:.+]], label %[[IFEND70LOOPEXIT5:.+]] + +; CHECK: [[FORBODY65]]: +; CHECK: br label %[[FORCOND61]] + +; CHECK: [[IFEND70LOOPEXIT]]: +; CHECK: br label %[[IFEND70:.+]] + +; CHECK: [[IFEND70LOOPEXIT5]]: +; CHECK: br label %[[FORCOND50PREHEADER]] + +; CHECK: [[IFEND70]]: +; CHECK: br label %[[IFEND73]] + +; CHECK: [[IFEND73]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll new file mode 100644 index 0000000000000..7e5becea883fa --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll @@ -0,0 +1,261 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in 
compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization1 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-. +; / \ | +; c d | +; / \ / | +; e f --' +; \ | +; \ g +; \| +; h +; +; * where nodes c and f are uniform branches, and node b is a varying +; branch. +; * where nodes c, d, e, f, g and h are divergent. +; +; With partial linearization, it can be transformed in the following way: +; +; a +; | +; b <. +; | | +; d | +; | | +; c | +; | | +; f -' +; | +; g +; | +; e +; | +; h +; +; __kernel void partial_linearization1(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (id + i % 2 == 0) { +; if (n > 2) { +; goto e; +; } +; } else { +; for (int i = 0; i < n + 10; i++) ret++; +; } +; if (n <= 2) break; +; } +; +; ret += n * 2; +; for (int i = 0; i < n * 2; i++) ret -= i; +; ret /= n; +; goto early; +; +; e: +; for (int i = 0; i < n + 5; i++) ret /= 2; +; ret -= n; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization1(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end14, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %ret.2, %if.end14 ] + %cmp = icmp eq i32 %conv, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + %cmp2 = icmp sgt i32 %n, 2 + br i1 %cmp2, label %e, label %if.end10 + +if.else: ; preds = %while.body + br label %for.cond + +for.cond: ; preds = %for.body, %if.else + %ret.1 = phi i32 [ %ret.0, %if.else ], [ %inc, %for.body ] + %storemerge = phi i32 [ 0, %if.else ], [ %inc9, %for.body ] + %add6 = add nsw i32 %n, 10 + %cmp7 = icmp slt i32 %storemerge, %add6 + br i1 %cmp7, label %for.body, label %if.end10 + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.1, 1 + %inc9 = add nsw i32 %storemerge, 1 + br label %for.cond + +if.end10: ; preds = %for.cond, %if.then + %ret.2 = phi i32 [ %ret.0, %if.then ], [ %ret.1, %for.cond ] + %cmp11 = icmp slt i32 %n, 3 + br i1 %cmp11, label %while.end, label %if.end14 + +if.end14: ; preds = %if.end10 + br label %while.body + +while.end: ; preds = %if.end10 + %mul = mul i32 %n, 2 + %add15 = add nsw i32 %ret.2, %mul + br label %for.cond17 + +for.cond17: ; preds = %for.body21, %while.end + %ret.3 = phi i32 [ %add15, %while.end ], [ %sub, %for.body21 ] + %storemerge1 = phi i32 [ 0, %while.end ], [ %inc23, %for.body21 ] + %mul18 = shl nsw i32 %n, 1 + %cmp19 = icmp slt i32 %storemerge1, %mul18 + br i1 %cmp19, label %for.body21, label %for.end24 + +for.body21: ; preds = %for.cond17 + %sub = sub nsw i32 %ret.3, %storemerge1 + %inc23 = add nsw i32 
%storemerge1, 1 + br label %for.cond17 + +for.end24: ; preds = %for.cond17 + %0 = icmp eq i32 %ret.3, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %ret.3, %5 + br label %early + +e: ; preds = %if.then + br label %for.cond26 + +for.cond26: ; preds = %for.body30, %e + %ret.4 = phi i32 [ %ret.0, %e ], [ %div31, %for.body30 ] + %storemerge3 = phi i32 [ 0, %e ], [ %inc33, %for.body30 ] + %add27 = add nsw i32 %n, 5 + %cmp28 = icmp slt i32 %storemerge3, %add27 + br i1 %cmp28, label %for.body30, label %for.end34 + +for.body30: ; preds = %for.cond26 + %div31 = sdiv i32 %ret.4, 2 + %inc33 = add nsw i32 %storemerge3, 1 + br label %for.cond26 + +for.end34: ; preds = %for.cond26 + %sub35 = sub nsw i32 %ret.4, %n + br label %early + +early: ; preds = %for.end34, %for.end24 + %storemerge2 = phi i32 [ %div, %for.end24 ], [ %sub35, %for.end34 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization1, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization1 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[IFTHEN:.+]]: +; CHECK: br label %[[IFEND10:.+]] + +; CHECK: [[FORCOND26PREHEADER:.+]]: +; CHECK: br label %[[FORCOND26:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[IFEND10LOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[IFEND10LOOPEXIT]]: +; CHECK: br label %[[IFTHEN]] + +; CHECK: [[IFEND10]]: +; CHECK: %[[CMP11:.+]] = icmp +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[WHILEEND:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: br label %[[FORCOND17:.+]] + +; CHECK: [[WHILEENDELSE:.+]]: +; CHECK: br 
label %[[FORCOND26PREHEADER]] + +; CHECK: [[FORCOND17]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY21:.+]], label %[[FOREND24:.+]] + +; CHECK: [[FORBODY21]]: +; CHECK: br label %[[FORCOND17]] + +; CHECK: [[FOREND24]]: +; CHECK: br label %[[WHILEENDELSE]] + +; CHECK: [[FORCOND26]]: +; CHECK: %[[CMP28:.+]] = icmp +; CHECK: br i1 %[[CMP28]], label %[[FORBODY30:.+]], label %[[FOREND34:.+]] + +; CHECK: [[FORBODY30]]: +; CHECK: br label %[[FORCOND26]] + +; CHECK: [[FOREND34]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll new file mode 100644 index 0000000000000..17d186cc11900 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll @@ -0,0 +1,465 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization10 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; / g <---. | +; / / \ | | +; / h i | | +; f / \ / \ | | +; | j k l | | +; | /| / \ / | | +; | m | n o --' | +; | / |/ | +; |/ q ----------' +; p | +; \ r +; \ / +; s +; +; * where nodes b, c, g, h, j, k and q are uniform branches, and node i is a +; varying branch. +; * where nodes k, l, o, n, m, p, q, r and s are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; / g <---. 
| +; f / \ | | +; | / \ | | +; | h i | | +; | / \ | | | +; | j | l | | +; | | \ / | | +; | | k | | +; | \ | | | +; | \ o ---' | +; | \ / | +; | n | +; \ | | +; \ q -------' +; \ / +; m +; | +; r +; | +; p +; | +; s +; +; __kernel void partial_linearization10(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0) { // b +; // c +; for (int i = 0; i < n * 2; i++) ret++; +; if (n <= 10) { +; // f +; goto f; +; } +; } else { +; // d +; for (int i = 0; i < n / 4; i++) ret++; +; } +; // e +; ret++; +; while (1) { +; if (n & 1) { // g +; // h +; if (n < 3) { +; // j +; goto j; +; } +; } else { +; // i +; if (ret + id >= n) { +; // l +; ret /= n * n + ret; +; goto o; +; } +; } +; // k +; if (n & 1) { +; // n +; ret += n * ret; +; goto n; +; } +; // o +; o: +; ret++; +; } +; j: +; if (n < 2) { +; // m +; ret += n * 2 + 20; +; goto p; +; } else { +; goto q; +; } +; n: +; ret *= 4; +; q: +; if (n & 1) { +; // r +; ret++; +; goto r; +; } +; } +; +; r: +; for (int i = 0; i < n / 4; i++) ret++; +; goto s; +; +; f: +; ret /= n; +; goto p; +; +; p: +; for (int i = 0; i < n * 2; i++) ret++; +; +; s: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization10(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end55, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %ret.5, %if.end55 ] + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.1 = phi i32 [ %ret.0, %if.then ], [ %inc, %for.body ] + %storemerge5 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge5, %mul + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.1, 1 + %inc4 = add nsw i32 %storemerge5, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp5 = icmp slt i32 %n, 11 + br i1 %cmp5, label %f, label %if.end17 + +if.else: ; preds = %while.body + br label %for.cond9 + +for.cond9: ; preds = %for.body12, %if.else + %ret.2 = phi i32 [ %ret.0, %if.else ], [ %inc13, %for.body12 ] + %storemerge = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div = sdiv i32 %n, 4 + %cmp10 = icmp slt i32 %storemerge, %div + br i1 %cmp10, label %for.body12, label %if.end17 + +for.body12: ; preds = %for.cond9 + %inc13 = add nsw i32 %ret.2, 1 + %inc15 = add nsw i32 %storemerge, 1 + br label %for.cond9 + +if.end17: ; preds = %for.cond9, %for.end + %ret.3 = phi i32 [ %ret.1, %for.end ], [ %ret.2, %for.cond9 ] + br label %while.body20 + +while.body20: ; preds = %o, %if.end17 + %storemerge1.in = phi i32 [ %ret.3, %if.end17 ], [ %ret.4, %o ] + %storemerge1 = add nsw i32 %storemerge1.in, 1 + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %if.else26, label %if.then21 + +if.then21: ; preds = %while.body20 + %cmp22 = icmp slt i32 %n, 3 + br i1 %cmp22, label %j, label %if.end34 + +if.else26: ; preds = %while.body20 + %add = add nsw i32 %storemerge1, %conv + %cmp27 = icmp slt i32 %add, %n + br i1 %cmp27, label %if.end34, label %if.then29 + +if.then29: ; preds = %if.else26 + %mul30 = mul nsw i32 %n, %n + %add31 = add nsw i32 
%storemerge1, %mul30 + %0 = icmp eq i32 %add31, 0 + %1 = select i1 %0, i32 1, i32 %add31 + %div32 = sdiv i32 %storemerge1, %1 + br label %o + +if.end34: ; preds = %if.else26, %if.then21 + %and35 = and i32 %n, 1 + %tobool36 = icmp eq i32 %and35, 0 + br i1 %tobool36, label %o, label %if.then37 + +if.then37: ; preds = %if.end34 + %mul38 = mul nsw i32 %storemerge1, %n + %add39 = add nsw i32 %mul38, %storemerge1 + %mul50 = shl nsw i32 %add39, 2 + br label %q + +o: ; preds = %if.end34, %if.then29 + %ret.4 = phi i32 [ %div32, %if.then29 ], [ %storemerge1, %if.end34 ] + br label %while.body20 + +j: ; preds = %if.then21 + %cmp42 = icmp eq i32 %n, 2 + br i1 %cmp42, label %q, label %if.then44 + +if.then44: ; preds = %j + %mul45 = mul i32 %n, 2 + %add46 = add nsw i32 %mul45, 20 + %add47 = add nsw i32 %add46, %storemerge1 + br label %p + +q: ; preds = %j, %if.then37 + %ret.5 = phi i32 [ %mul50, %if.then37 ], [ %storemerge1, %j ] + %and51 = and i32 %n, 1 + %tobool52 = icmp eq i32 %and51, 0 + br i1 %tobool52, label %if.end55, label %if.then53 + +if.then53: ; preds = %q + br label %for.cond57 + +if.end55: ; preds = %q + br label %while.body + +for.cond57: ; preds = %for.body61, %if.then53 + %ret.6.in = phi i32 [ %ret.5, %if.then53 ], [ %ret.6, %for.body61 ] + %storemerge2 = phi i32 [ 0, %if.then53 ], [ %inc64, %for.body61 ] + %ret.6 = add nsw i32 %ret.6.in, 1 + %div58 = sdiv i32 %n, 4 + %cmp59 = icmp slt i32 %storemerge2, %div58 + br i1 %cmp59, label %for.body61, label %s + +for.body61: ; preds = %for.cond57 + %inc64 = add nsw i32 %storemerge2, 1 + br label %for.cond57 + +f: ; preds = %for.end + %2 = icmp eq i32 %ret.1, -2147483648 + %3 = icmp eq i32 %n, -1 + %4 = and i1 %3, %2 + %5 = icmp eq i32 %n, 0 + %6 = or i1 %5, %4 + %7 = select i1 %6, i32 1, i32 %n + %div66 = sdiv i32 %ret.1, %7 + br label %p + +p: ; preds = %f, %if.then44 + %storemerge3 = phi i32 [ %add47, %if.then44 ], [ %div66, %f ] + br label %for.cond68 + +for.cond68: ; preds = %for.body72, %p + %ret.7 = phi i32 [ %storemerge3, %p ], [ %inc73, %for.body72 ] + %storemerge4 = phi i32 [ 0, %p ], [ %inc75, %for.body72 ] + %mul69 = shl nsw i32 %n, 1 + %cmp70 = icmp slt i32 %storemerge4, %mul69 + br i1 %cmp70, label %for.body72, label %s + +for.body72: ; preds = %for.cond68 + %inc73 = add nsw i32 %ret.7, 1 + %inc75 = add nsw i32 %storemerge4, 1 + br label %for.cond68 + +s: ; preds = %for.cond68, %for.cond57 + %ret.8 = phi i32 [ %ret.6, %for.cond57 ], [ %ret.7, %for.cond68 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.8, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin 
nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization10, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization10 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]] + +; CHECK: [[FORCOND9PREHEADER]]: +; CHECK: br label %[[FORCOND9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]] + +; CHECK: [[FORCOND9]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND9]] + +; CHECK: [[IFEND17LOOPEXIT]]: +; CHECK: br label %[[IFEND17]] + +; CHECK: [[IFEND17]]: +; CHECK: br label %[[WHILEBODY20:.+]] + +; CHECK: [[WHILEBODY20]]: +; CHECK: %[[AND:.+]] = and i32 +; CHECK: %[[TOBOOL:.+]] = icmp eq i32 %[[AND]] +; CHECK: br i1 %[[TOBOOL]], label %[[IFELSE26:.+]], label %[[IFTHEN21:.+]] + +; CHECK: [[IFTHEN21]]: +; CHECK: %[[CMP22:.+]] = icmp +; CHECK: br i1 %[[CMP22]], label %[[J:.+]], label %[[IFEND34:.+]] + +; CHECK: [[IFELSE26]]: +; CHECK: br label %[[IFTHEN29:.+]] + +; CHECK: [[IFTHEN29]]: +; CHECK: br label %[[IFEND34]] + +; CHECK: [[IFEND34]]: +; CHECK: br label %[[O:.+]] + +; CHECK: [[IFTHEN37:.+]]: +; CHECK: br label %[[IFTHEN37ELSE:.+]] + +; CHECK: [[IFTHEN37ELSE]]: +; CHECK: br i1 %{{.+}}, label %[[JELSE:.+]], label %[[JSPLIT:.+]] + +; CHECK: [[O]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]] + +; CHECK: [[WHILEBODY20PUREEXIT]]: +; CHECK: br label %[[IFTHEN37]] + +; CHECK: [[J]]: +; CHECK: br label %[[WHILEBODY20PUREEXIT]] + +; CHECK: [[JELSE]]: +; CHECK: br label %[[Q:.+]] + +; CHECK: [[JSPLIT]]: +; CHECK: br label %[[Q]] + +; CHECK: [[IFTHEN44:.+]]: +; CHECK: br label %[[IFTHEN44ELSE:.+]] + +; CHECK: [[IFTHEN44ELSE]]: +; CHECK: br label %[[FORCOND57PREHEADER:.+]] + +; CHECK: [[Q]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFTHEN44]] + +; CHECK: [[FORCOND57PREHEADER]]: +; CHECK: br label %[[FORCOND57:.+]] + +; CHECK: [[FORCOND57PREHEADERELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]] + +; CHECK: [[FORCOND57]]: +; CHECK: %[[CMP59:.+]] = icmp +; CHECK: br i1 %[[CMP59]], label %[[FORBODY61:.+]], label %[[SLOOPEXIT2:.+]] + +; CHECK: [[FORBODY61]]: +; CHECK: br label %[[FORCOND57]] + +; CHECK: [[F]]: +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[FELSE]]: +; CHECK: br label %[[P:.+]] + +; CHECK: [[FSPLIT]]: +; CHECK: br label %[[P]] + +; CHECK: [[P]]: +; CHECK: br label %[[FORCOND68:.+]] + +; CHECK: [[FORCOND68]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label 
%[[FORBODY72:.+]], label %[[SLOOPEXIT:.+]] + +; CHECK: [[FORBODY72]]: +; CHECK: br label %[[FORCOND68]] + +; CHECK: [[SLOOPEXIT]]: +; CHECK: br label %[[S:.+]] + +; CHECK: [[SLOOPEXIT2]]: +; CHECK: br label %[[FORCOND57PREHEADERELSE]] + +; CHECK: [[S]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll new file mode 100644 index 0000000000000..7721a7577a09a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll @@ -0,0 +1,357 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization11 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-------. +; | | +; c <---. | +; / \ | | +; d e | | +; / \ / \ | | +; i f g | | +; | / \ / \| | +; | j h --' | +; | | \ | +; | | k | +; | \ / | +; | \ / | +; | \ / | +; | \ / | +; | l -----' +; | / +; \ m +; \ / +; n +; +; * where nodes c, d, f, g, and l are uniform branches, and node e is a +; varying branch. +; * where nodes i, f, g, j, h, k, l, m and n are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; b <----. +; | | +; c <--. 
| +; / \ | | +; d e | | +; | | | | +; | g | | +; \ / | | +; f | | +; | | | +; h ---' | +; | | +; k | +; | | +; j | +; | | +; l -----' +; | +; m +; | +; i +; | +; n +; +; __kernel void partial_linearization11(__global int *out, int n) { +; // a +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; // b +; while (1) { +; if (n < 5) { // c +; // d +; for (int i = 0; i < n * 2; i++) ret++; +; if (n <= 3) { +; // i +; goto i; +; } +; } else { +; // e +; if (ret + id >= n) { +; // g +; ret /= n * n + ret; +; if (n <= 10) { +; goto k; +; } else { +; goto h; +; } +; } +; } +; // f +; ret *= n; +; if (n & 1) { +; goto j; +; } +; +; // h +; h: +; ret++; +; } +; +; j: +; ret += n * 2 + 20; +; goto l; +; +; k: +; ret *= n; +; goto l; +; +; l: +; if (n & 1) { +; // m +; ret++; +; goto m; +; } +; } +; +; m: +; for (int i = 0; i < n / 4; i++) ret++; +; goto n; +; +; i: +; ret /= n; +; +; n: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization11(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end33, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %storemerge, %if.end33 ] + br label %while.body2 + +while.body2: ; preds = %h, %while.body + %ret.1 = phi i32 [ %ret.0, %while.body ], [ %inc24, %h ] + %cmp = icmp slt i32 %n, 5 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body2 + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.2 = phi i32 [ %ret.1, %if.then ], [ %inc, %for.body ] + %storemerge2 = phi i32 [ 0, %if.then ], [ %inc6, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp4 = icmp slt i32 %storemerge2, %mul + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.2, 1 + %inc6 = add nsw i32 %storemerge2, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp7 = icmp slt i32 %n, 4 + br i1 %cmp7, label %i44, label %if.end20 + +if.else: ; preds = %while.body2 + %add = add nsw i32 %ret.1, %conv + %cmp10 = icmp slt i32 %add, %n + br i1 %cmp10, label %if.end20, label %if.then12 + +if.then12: ; preds = %if.else + %mul13 = mul nsw i32 %n, %n + %add14 = add nsw i32 %ret.1, %mul13 + %0 = icmp eq i32 %ret.1, -2147483648 + %1 = icmp eq i32 %add14, -1 + %2 = and i1 %0, %1 + %3 = icmp eq i32 %add14, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %add14 + %div = sdiv i32 %ret.1, %5 + %cmp15 = icmp slt i32 %n, 11 + br i1 %cmp15, label %k, label %h + +if.end20: ; preds = %if.else, %for.end + %ret.3 = phi i32 [ %ret.2, %for.end ], [ %ret.1, %if.else ] + %mul21 = mul nsw i32 %ret.3, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %h, label %j + +h: ; preds = %if.end20, %if.then12 + %ret.4 = phi i32 [ %div, %if.then12 ], [ %mul21, %if.end20 ] + %inc24 = add nsw i32 %ret.4, 1 + br label %while.body2 + +j: ; preds = %if.end20 + %mul25 = mul i32 %n, 2 + %add26 = add nsw i32 %mul25, 20 + %add27 = add nsw i32 %add26, %mul21 + br label %l + +k: ; preds = %if.then12 + %mul28 = mul nsw i32 %div, %n + br label %l + +l: ; preds = %k, %j + %storemerge = phi i32 [ %add27, %j ], [ %mul28, %k ] + %and29 = and i32 %n, 1 + %tobool30 = icmp eq i32 %and29, 0 + br i1 %tobool30, label %if.end33, label %if.then31 + +if.then31: ; preds = %l + br label 
%for.cond35 + +if.end33: ; preds = %l + br label %while.body + +for.cond35: ; preds = %for.body39, %if.then31 + %ret.5.in = phi i32 [ %storemerge, %if.then31 ], [ %ret.5, %for.body39 ] + %storemerge1 = phi i32 [ 0, %if.then31 ], [ %inc42, %for.body39 ] + %ret.5 = add nsw i32 %ret.5.in, 1 + %div36 = sdiv i32 %n, 4 + %cmp37 = icmp slt i32 %storemerge1, %div36 + br i1 %cmp37, label %for.body39, label %n46 + +for.body39: ; preds = %for.cond35 + %inc42 = add nsw i32 %storemerge1, 1 + br label %for.cond35 + +i44: ; preds = %for.end + %6 = icmp eq i32 %ret.2, -2147483648 + %7 = icmp eq i32 %n, -1 + %8 = and i1 %7, %6 + %9 = icmp eq i32 %n, 0 + %10 = or i1 %9, %8 + %11 = select i1 %10, i32 1, i32 %n + %div45 = sdiv i32 %ret.2, %11 + br label %n46 + +n46: ; preds = %i44, %for.cond35 + %ret.6 = phi i32 [ %div45, %i44 ], [ %ret.5, %for.cond35 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization11, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization11 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[WHILEBODY2:.+]] + +; CHECK: [[WHILEBODY2]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[IFELSE:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: br label %[[IFEND20:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[IFTHEN12:.+]] + +; CHECK: [[IFTHEN12]]: +; CHECK: br label %[[IFEND20]] + +; CHECK: [[IFEND20]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[H]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY2]], label %[[WHILEBODY2PUREEXIT:.+]] + +; CHECK: [[WHILEBODY2PUREEXIT:.+]]: +; CHECK: br label %[[K:.+]] + +; CHECK: [[J:.+]]: +; CHECK: br label %[[L:.+]] + +; CHECK: [[K]]: +; CHECK: br label %[[KELSE:.+]] + +; 
CHECK: [[KELSE]]: +; CHECK: br label %[[J]] + +; CHECK: [[L]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[FORCOND35PREHEADER:.+]] + +; CHECK: [[FORCOND35PREHEADER]]: +; CHECK: br label %[[FORCOND35:.+]] + +; CHECK: [[FORCOND35PREHEADERELSE:.+]]: +; CHECK: br label %[[I44:.+]] + +; CHECK: [[FORCOND35]]: +; CHECK: %[[CMP37:.+]] = icmp +; CHECK: br i1 %[[CMP37]], label %[[FORBODY39:.+]], label %[[N46LOOPEXIT:.+]] + +; CHECK: [[FORBODY39]]: +; CHECK: br label %[[FORCOND35]] + +; CHECK: [[I44]]: +; CHECK: br label %[[N46:.+]] + +; CHECK: [[N46LOOPEXIT]]: +; CHECK: br label %[[FORCOND35PREHEADERELSE]] + +; CHECK: [[N46]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll new file mode 100644 index 0000000000000..be2f0f909e0c3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll @@ -0,0 +1,627 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization12 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; / g <---. | +; f / \ | | +; | h i | | +; | / / \ | | +; | / k l | | +; | / |\ /| | | +; |/ |/ \| | | +; j m n | | +; /| / \ / | | +; / | o p --' | +; / | / / | +; | | / r | +; | | / | | +; | |/ s ------' +; | | / +; | /| t +; | / | / +; |/ | / +; q | / +; | |/ +; | u +; \ / +; v +; +; * where nodes b, c, g, j, k, l, m, p and s are uniform branches, +; and node i is a varying branch. +; * where nodes k, l, o, n, m, p, q, s, r, t and v are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; b <----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; f g <--. 
| +; | / \ | | +; | h i | | +; | / | | | +; | / l | | +; |/ | | | +; j k | | +; |\ | | | +; | \ n | | +; | \ | | | +; | | m | | +; | | | | | +; | | p -' | +; | | / | +; | | r | +; | | | | +; | | s -----' +; | |/ +; | o +; | / +; | t +; |/ +; u +; | +; q +; | +; v +; +; __kernel void partial_linearization12(__global int *out, int n) { +; // a +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0) { // b +; // c +; for (int i = 0; i < n * 2; i++) ret++; +; if (n < 5) { +; // f +; goto f; +; } +; } else { +; // d +; for (int i = 0; i < n / 4; i++) ret++; +; } +; // e +; ret++; +; while (1) { +; if (n <= 2) { // g +; // h +; ret -= n * ret; +; for (int i = 0; i < n * 2; i++) ret++; +; // j +; goto j; +; } else { +; // i +; if (ret + id >= n) { +; // k +; ret /= n * n + ret; +; if (n < 5) { +; // m +; ret -= n; +; goto m; +; } else { +; // n +; ret += n; +; goto n; +; } +; } else { +; // l +; if (n >= 5) { +; // m +; ret += n; +; goto m; +; } else { +; // n +; ret -= n; +; goto n; +; } +; } +; } +; // m +; m: +; if (n & 1) { +; // o +; ret *= n; +; goto q; +; } else { +; // p +; goto p; +; } +; +; // n +; n: +; ret *= ret; +; // p +; p: +; if (n > 3) { +; goto r; +; } +; ret++; +; } +; +; // r +; r: +; ret *= 4; +; for (int i = 0; i < n / 4; i++) ret++; +; +; // s +; if (n & 1) { +; goto t; +; } +; ret++; +; } +; +; f: +; ret /= n; +; goto j; +; +; j: +; if (n == 2) { +; goto q; +; } else { +; goto u; +; } +; +; t: +; for (int i = 0; i < n + 1; i++) ret++; +; goto u; +; +; q: +; for (int i = 0; i < n / 4; i++) ret++; +; goto v; +; +; u: +; for (int i = 0; i < n * 2; i++) ret++; +; +; v: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization12(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end79, %entry + %storemerge = phi i32 [ 0, %entry ], [ %inc80, %if.end79 ] + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ %storemerge, %if.then ], [ %inc, %for.body ] + %storemerge10 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge10, %mul + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.0, 1 + %inc4 = add nsw i32 %storemerge10, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp5 = icmp slt i32 %n, 5 + br i1 %cmp5, label %f, label %if.end17 + +if.else: ; preds = %while.body + br label %for.cond9 + +for.cond9: ; preds = %for.body12, %if.else + %ret.1 = phi i32 [ %storemerge, %if.else ], [ %inc13, %for.body12 ] + %storemerge1 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div = sdiv i32 %n, 4 + %cmp10 = icmp slt i32 %storemerge1, %div + br i1 %cmp10, label %for.body12, label %if.end17 + +for.body12: ; preds = %for.cond9 + %inc13 = add nsw i32 %ret.1, 1 + %inc15 = add nsw i32 %storemerge1, 1 + br label %for.cond9 + +if.end17: ; preds = %for.cond9, %for.end + %ret.2 = phi i32 [ %ret.0, %for.end ], [ %ret.1, %for.cond9 ] + br label %while.body20 + +while.body20: ; preds = %if.end63, %if.end17 + %storemerge2.in = phi i32 [ %ret.2, %if.end17 ], [ %ret.4, %if.end63 ] 
+ %storemerge2 = add nsw i32 %storemerge2.in, 1 + %cmp21 = icmp slt i32 %n, 3 + br i1 %cmp21, label %if.then23, label %if.else35 + +if.then23: ; preds = %while.body20 + %mul24 = mul nsw i32 %storemerge2, %n + %sub = sub nsw i32 %storemerge2, %mul24 + br label %for.cond26 + +for.cond26: ; preds = %for.body30, %if.then23 + %ret.3 = phi i32 [ %sub, %if.then23 ], [ %inc31, %for.body30 ] + %storemerge9 = phi i32 [ 0, %if.then23 ], [ %inc33, %for.body30 ] + %mul27 = shl nsw i32 %n, 1 + %cmp28 = icmp slt i32 %storemerge9, %mul27 + br i1 %cmp28, label %for.body30, label %j + +for.body30: ; preds = %for.cond26 + %inc31 = add nsw i32 %ret.3, 1 + %inc33 = add nsw i32 %storemerge9, 1 + br label %for.cond26 + +if.else35: ; preds = %while.body20 + %add = add nsw i32 %storemerge2, %conv + %cmp36 = icmp slt i32 %add, %n + br i1 %cmp36, label %if.else48, label %if.then38 + +if.then38: ; preds = %if.else35 + %mul39 = mul nsw i32 %n, %n + %add40 = add nsw i32 %storemerge2, %mul39 + %0 = icmp eq i32 %add40, 0 + %1 = select i1 %0, i32 1, i32 %add40 + %div41 = sdiv i32 %storemerge2, %1 + %cmp42 = icmp slt i32 %n, 5 + br i1 %cmp42, label %if.then44, label %if.else46 + +if.then44: ; preds = %if.then38 + %sub45 = sub nsw i32 %div41, %n + br label %m + +if.else46: ; preds = %if.then38 + %add47 = add nsw i32 %div41, %n + br label %n58 + +if.else48: ; preds = %if.else35 + %cmp49 = icmp sgt i32 %n, 4 + br i1 %cmp49, label %if.then51, label %if.else53 + +if.then51: ; preds = %if.else48 + %add52 = add nsw i32 %storemerge2, %n + br label %m + +if.else53: ; preds = %if.else48 + %sub54 = sub nsw i32 %storemerge2, %n + br label %n58 + +m: ; preds = %if.then51, %if.then44 + %storemerge7 = phi i32 [ %add52, %if.then51 ], [ %sub45, %if.then44 ] + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %p, label %if.then55 + +if.then55: ; preds = %m + %mul56 = mul nsw i32 %storemerge7, %n + br label %q + +n58: ; preds = %if.else53, %if.else46 + %storemerge3 = phi i32 [ %sub54, %if.else53 ], [ %add47, %if.else46 ] + %mul59 = mul nsw i32 %storemerge3, %storemerge3 + br label %p + +p: ; preds = %n58, %m + %ret.4 = phi i32 [ %mul59, %n58 ], [ %storemerge7, %m ] + %cmp60 = icmp sgt i32 %n, 3 + br i1 %cmp60, label %r, label %if.end63 + +if.end63: ; preds = %p + br label %while.body20 + +r: ; preds = %p + %mul65 = shl nsw i32 %ret.4, 2 + br label %for.cond67 + +for.cond67: ; preds = %for.body71, %r + %ret.5 = phi i32 [ %mul65, %r ], [ %inc72, %for.body71 ] + %storemerge4 = phi i32 [ 0, %r ], [ %inc74, %for.body71 ] + %div68 = sdiv i32 %n, 4 + %cmp69 = icmp slt i32 %storemerge4, %div68 + br i1 %cmp69, label %for.body71, label %for.end75 + +for.body71: ; preds = %for.cond67 + %inc72 = add nsw i32 %ret.5, 1 + %inc74 = add nsw i32 %storemerge4, 1 + br label %for.cond67 + +for.end75: ; preds = %for.cond67 + %and76 = and i32 %n, 1 + %tobool77 = icmp eq i32 %and76, 0 + br i1 %tobool77, label %if.end79, label %t + +if.end79: ; preds = %for.end75 + %inc80 = add nsw i32 %ret.5, 1 + br label %while.body + +f: ; preds = %for.end + %2 = icmp eq i32 %n, 0 + %3 = select i1 %2, i32 1, i32 %n + %div81 = sdiv i32 %ret.0, %3 + br label %j + +j: ; preds = %f, %for.cond26 + %ret.6 = phi i32 [ %div81, %f ], [ %ret.3, %for.cond26 ] + %cmp82 = icmp eq i32 %n, 2 + br i1 %cmp82, label %q, label %u + +t: ; preds = %for.end75 + br label %for.cond87 + +for.cond87: ; preds = %for.body91, %t + %ret.7 = phi i32 [ %ret.5, %t ], [ %inc92, %for.body91 ] + %storemerge5 = phi i32 [ 0, %t ], [ %inc94, %for.body91 ] + %cmp89 = icmp sgt i32 %storemerge5, 
%n + br i1 %cmp89, label %u, label %for.body91 + +for.body91: ; preds = %for.cond87 + %inc92 = add nsw i32 %ret.7, 1 + %inc94 = add nsw i32 %storemerge5, 1 + br label %for.cond87 + +q: ; preds = %j, %if.then55 + %ret.8 = phi i32 [ %mul56, %if.then55 ], [ %ret.6, %j ] + br label %for.cond97 + +for.cond97: ; preds = %for.body101, %q + %ret.9 = phi i32 [ %ret.8, %q ], [ %inc102, %for.body101 ] + %storemerge8 = phi i32 [ 0, %q ], [ %inc104, %for.body101 ] + %div98 = sdiv i32 %n, 4 + %cmp99 = icmp slt i32 %storemerge8, %div98 + br i1 %cmp99, label %for.body101, label %v + +for.body101: ; preds = %for.cond97 + %inc102 = add nsw i32 %ret.9, 1 + %inc104 = add nsw i32 %storemerge8, 1 + br label %for.cond97 + +u: ; preds = %for.cond87, %j + %ret.10 = phi i32 [ %ret.6, %j ], [ %ret.7, %for.cond87 ] + br label %for.cond107 + +for.cond107: ; preds = %for.body111, %u + %ret.11 = phi i32 [ %ret.10, %u ], [ %inc112, %for.body111 ] + %storemerge6 = phi i32 [ 0, %u ], [ %inc114, %for.body111 ] + %mul108 = shl nsw i32 %n, 1 + %cmp109 = icmp slt i32 %storemerge6, %mul108 + br i1 %cmp109, label %for.body111, label %v + +for.body111: ; preds = %for.cond107 + %inc112 = add nsw i32 %ret.11, 1 + %inc114 = add nsw i32 %storemerge6, 1 + br label %for.cond107 + +v: ; preds = %for.cond107, %for.cond97 + %ret.12 = phi i32 [ %ret.9, %for.cond97 ], [ %ret.11, %for.cond107 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.12, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization12, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization12 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]] + +; CHECK: [[FORCOND9PREHEADER]]: +; CHECK: br label %[[FORCOND9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; 
CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]] + +; CHECK: [[FORCOND9]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND9]] + +; CHECK: [[IFEND17LOOPEXIT]]: +; CHECK: br label %[[IFEND17]] + +; CHECK: [[IFEND17]]: +; CHECK: br label %[[WHILEBODY20:.+]] + +; CHECK: [[WHILEBODY20]]: +; CHECK: %[[CMP21:.+]] = icmp +; CHECK: br i1 %[[CMP21]], label %[[IFTHEN23:.+]], label %[[IFELSE35:.+]] + +; CHECK: [[IFTHEN23]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[IFTHEN23ELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]] + +; CHECK: [[IFTHEN23SPLIT:.+]]: +; CHECK: br label %[[FORCOND26:.+]] + +; CHECK: [[FORCOND26]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30:.+]], label %[[JLOOPEXIT:.+]] + +; CHECK: [[FORBODY30]]: +; CHECK: br label %[[FORCOND26]] + +; CHECK: [[IFELSE35]]: +; CHECK: br label %[[IFTHEN38:.+]] + +; CHECK: [[IFTHEN38]]: +; CHECK: %[[CMP42:.+]] = icmp slt i32 +; CHECK: br i1 %[[CMP42]], label %[[IFTHEN44:.+]], label %[[IFELSE46:.+]] + +; CHECK: [[IFTHEN44]]: +; CHECK: br label %[[IFELSE48:.+]] + +; CHECK: [[IFELSE46]]: +; CHECK: br label %[[IFELSE48]] + +; CHECK: [[IFELSE48]]: +; CHECK: %[[CMP49:.+]] = icmp +; CHECK: br i1 %[[CMP49]], label %[[IFTHEN51:.+]], label %[[IFELSE53:.+]] + +; CHECK: [[IFTHEN51]]: +; CHECK: br label %[[N58:.+]] + +; CHECK: [[IFELSE53]]: +; CHECK: br label %[[N58]] + +; CHECK: [[M:.+]]: +; CHECK: br label %[[P:.+]] + +; CHECK: [[IFTHEN55:.+]]: +; CHECK: br label %[[IFTHEN55ELSE:.+]] + +; CHECK: [[IFTHEN55ELSE]]: +; CHECK: br label %[[FORCOND87PREHEADER:.+]] + +; CHECK: [[N58]]: +; CHECK: br label %[[M]] + +; CHECK: [[P]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]] + +; CHECK: [[WHILEBODY20PUREEXIT]]: +; CHECK: br label %[[R:.+]] + +; CHECK: [[R]]: +; CHECK: br label %[[FORCOND67:.+]] + +; CHECK: [[FORCOND67]]: +; CHECK: %[[CMP69:.+]] = icmp +; CHECK: br i1 %[[CMP69]], label %[[FORBODY71:.+]], label %[[FOREND75:.+]] + +; CHECK: [[FORBODY71]]: +; CHECK: br label %[[FORCOND67]] + +; CHECK: [[FOREND75]]: +; CHECK: br label %[[IFEND79:.+]] + +; CHECK: [[FORCOND87PREHEADER]]: +; CHECK: br label %[[FORCOND87:.+]] + +; CHECK: [[FORCOND87PREHEADERELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN23ELSE]], label %[[IFTHEN23SPLIT]] + +; CHECK: [[IFEND79]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFTHEN55]] + +; CHECK: [[F]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[FELSE]]: +; CHECK: br label %[[U:.+]] + +; CHECK: [[FSPLIT]]: +; CHECK: br label %[[J:.+]] + +; CHECK: [[JLOOPEXIT]]: +; CHECK: br label %[[J]] + +; CHECK: [[J]]: +; CHECK: %[[CMP82:.+]] = icmp +; CHECK: br i1 %[[CMP82]], label %[[Q:.+]], label %[[U]] + +; CHECK: [[FORCOND87]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[ULOOPEXIT:.+]], label %[[FORBODY91:.+]] + +; CHECK: [[FORBODY91]]: +; CHECK: br label %[[FORCOND87]] + +; CHECK: [[Q]]: +; CHECK: br label %[[FORCOND97:.+]] + +; CHECK: [[FORCOND97]]: +; CHECK: %[[CMP99:.+]] = icmp +; CHECK: br i1 %[[CMP99]], label %[[FORBODY101:.+]], label %[[VLOOPEXIT:.+]] + +; CHECK: [[FORBODY101]]: +; CHECK: br label %[[FORCOND97]] + +; CHECK: [[ULOOPEXIT]]: +; CHECK: br label %[[FORCOND87PREHEADERELSE]] + +; CHECK: [[U]]: +; 
CHECK: br label %[[FORCOND107:.+]] + +; CHECK: [[FORCOND107]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY111:.+]], label %[[VLOOPEXIT2:.+]] + +; CHECK: [[FORBODY111]]: +; CHECK: br label %[[FORCOND107]] + +; CHECK: [[VLOOPEXIT]]: +; CHECK: br label %[[V:.+]] + +; CHECK: [[VLOOPEXIT2]]: +; CHECK: br label %[[Q]] + +; CHECK: [[V]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll new file mode 100644 index 0000000000000..e044bef6c2a43 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll @@ -0,0 +1,222 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization13 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; \ / \ +; | \ +; | d +; | / \ +; | | e +; | \ / +; | f +; | / \ +; | | g +; | \ / +; \ h +; \ / +; i +; +; * where nodes d and f are uniform branches, and nodes a and c are varying +; branches. +; * where nodes b, c, i are divergent. 
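+; * note: a "varying" branch is one whose condition may differ between
+;   work-items, and a "divergent" node is one that is not guaranteed to be
+;   reached by all work-items together; this is the sense in which these
+;   terms are used throughout the partial linearization tests.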
+; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; c +; | +; d +; / \ +; | e +; \ / +; f +; / \ +; | g +; \ / +; h +; | +; b +; | +; i +; +; __kernel void partial_linearization13(__global int *out, int n) { +; size_t tid = get_global_id(0); +; size_t size = get_global_size(0); +; // a +; if (tid + 1 < size) { +; // b +; out[tid] = n; +; } else if (tid + 1 == size) { // c +; size_t leftovers = 1 + (size & 1); +; switch (leftovers) { // d +; case 2: // e +; out[tid] = 2 * n + 1; +; // fall through +; case 1: // f +; out[tid] += 3 * n - 1; +; break; +; } +; switch (leftovers) { // g +; case 2: +; out[tid] /= n; +; // fall through +; case 1: // h +; out[tid]--; +; break; +; } +; } +; // i +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization13(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %call1 = call i64 @__mux_get_global_size(i32 0) #2 + %add = add i64 %call, 1 + %cmp = icmp ult i64 %add, %call1 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %n, i32 addrspace(1)* %arrayidx, align 4 + br label %if.end17 + +if.else: ; preds = %entry + %add2 = add i64 %call, 1 + %cmp3 = icmp eq i64 %add2, %call1 + br i1 %cmp3, label %if.then4, label %if.end17 + +if.then4: ; preds = %if.else + %0 = and i64 %call1, 1 + %trunc = icmp eq i64 %0, 0 + br i1 %trunc, label %sw.bb8, label %sw.bb + +sw.bb: ; preds = %if.then4 + %mul = shl nsw i32 %n, 1 + %add6 = or i32 %mul, 1 + %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %add6, i32 addrspace(1)* %arrayidx7, align 4 + br label %sw.bb8 + +sw.bb8: ; preds = %sw.bb, %if.then4 + %mul9 = mul nsw i32 %n, 3 + %sub = add nsw i32 %mul9, -1 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx10, align 4 + %add11 = add nsw i32 %sub, %1 + store i32 %add11, i32 addrspace(1)* %arrayidx10, align 4 + %2 = and i64 %call1, 1 + %trunc2 = icmp ne i64 %2, 0 + %trunc2.off = add i1 %trunc2, true + %switch = icmp ult i1 %trunc2.off, true + br i1 %switch, label %sw.bb12, label %sw.bb14 + +sw.bb12: ; preds = %sw.bb8 + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + %3 = load i32, i32 addrspace(1)* %arrayidx13, align 4 + %4 = icmp eq i32 %3, -2147483648 + %5 = icmp eq i32 %n, -1 + %6 = and i1 %5, %4 + %7 = icmp eq i32 %n, 0 + %8 = or i1 %7, %6 + %9 = select i1 %8, i32 1, i32 %n + %div = sdiv i32 %3, %9 + store i32 %div, i32 addrspace(1)* %arrayidx13, align 4 + br label %sw.bb14 + +sw.bb14: ; preds = %sw.bb12, %sw.bb8 + %arrayidx15 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + %10 = load i32, i32 addrspace(1)* %arrayidx15, align 4 + %dec = add nsw i32 %10, -1 + store i32 %dec, i32 addrspace(1)* %arrayidx15, align 4 + br label %if.end17 + +if.end17: ; preds = %sw.bb14, %if.else, %if.then + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_size(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" 
"no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization13, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization13 +; CHECK: br label %[[IFELSE:.+]] + +; CHECK: [[IFTHEN:.+]]: +; CHECK: br label %[[IFEND17:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[IFTHEN4:.+]] + +; CHECK: [[IFTHEN4]]: +; CHECK: %[[TMP:.+]] = and i64 %call1, 1 +; CHECK: %[[TRUNC:.+]] = icmp eq i64 %[[TMP]], 0 +; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even +; on inactive lanes. +; CHECK: %[[TRUNC_ACTIVE:.+]] = select i1 {{%.*}}, i1 %[[TRUNC]], i1 false +; CHECK: %[[TRUNC_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[TRUNC_ACTIVE]]) +; CHECK: br i1 %[[TRUNC_ACTIVE_ANY]], label %[[SWBB8:.+]], label %[[SWBB:.+]] + +; CHECK: [[SWBB]]: +; CHECK: br label %[[SWBB8]] + +; CHECK: [[SWBB8]]: +; CHECK: %[[TMP2:.+]] = and i64 %call1, 1 +; CHECK: %[[TRUNC2:.+]] = icmp eq i64 %[[TMP2]], 0 +; CHECK: br i1 %[[TRUNC2]], label %[[SWBB14:.+]], label %[[SWBB12:.+]] + +; CHECK: [[SWBB12]]: +; CHECK: br label %[[SWBB14]] + +; CHECK: [[SWBB14]]: +; CHECK: br label %[[IFTHEN]] + +; CHECK: [[IFEND17]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll new file mode 100644 index 0000000000000..165092cd8c1ba --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll @@ -0,0 +1,292 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization14 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c <-. +; | / \ | +; | d e | +; |/ \ / | +; f g --' +; \ | +; \ h +; \| +; i +; +; * where nodes a, d and g are uniform branches, and node c is a varying +; branch. +; * where nodes d, e, f, g, h and i are divergent. +; +; With partial linearization, it can be transformed in the following way: +; +; a +; / \ +; b c <. +; | | | +; | e | +; | | | +; | d | +; | | | +; | g -' +; \ | +; \ h +; \| +; f +; | +; i +; +; __kernel void partial_linearization14(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; if (n < 5) { +; for (int i = 0; i < n + 10; i++) ret++; +; goto f; +; } else { +; while (1) { +; if (id + i % 2 == 0) { +; if (n > 2) { +; goto f; +; } +; } else { +; for (int i = 0; i < n + 10; i++) ret++; +; } +; if (n <= 2) break; +; } +; } +; +; ret += n * 2; +; for (int i = 0; i < n * 2; i++) ret -= i; +; ret /= n; +; goto early; +; +; f: +; for (int i = 0; i < n + 5; i++) ret /= 2; +; ret -= n; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization14(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 5 + br i1 %cmp, label %for.cond, label %while.body + +for.cond: ; preds = %for.body, %entry + %ret.0 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %storemerge4 = phi i32 [ %inc5, %for.body ], [ 0, %entry ] + %add = add nsw i32 %n, 10 + %cmp3 = icmp slt i32 %storemerge4, %add + br i1 %cmp3, label %for.body, label %f + +for.body: ; preds = %for.cond + %inc = add nuw nsw i32 %ret.0, 1 + %inc5 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond + +while.body: ; preds = %if.end24, %entry + %ret.1 = phi i32 [ 0, %entry ], [ %ret.3, %if.end24 ] + %cmp7 = icmp eq i32 %conv, 0 + br i1 %cmp7, label %if.then9, label %for.cond15 + +if.then9: ; preds = %while.body + %cmp10 = icmp sgt i32 %n, 2 + br i1 %cmp10, label %f, label %if.end24 + +for.cond15: ; preds = %for.body19, %while.body + %ret.2 = phi i32 [ %inc20, %for.body19 ], [ %ret.1, %while.body ] + %storemerge = phi i32 [ %inc22, %for.body19 ], [ 0, %while.body ] + %add16 = add nsw i32 %n, 10 + %cmp17 = icmp slt i32 %storemerge, %add16 + br i1 %cmp17, label %for.body19, label %if.end24 + +for.body19: ; preds = %for.cond15 + %inc20 = add nsw i32 %ret.2, 1 + %inc22 = add nuw nsw i32 %storemerge, 1 + br label %for.cond15 + +if.end24: ; preds = %for.cond15, %if.then9 + %ret.3 = phi i32 [ %ret.1, %if.then9 ], [ %ret.2, %for.cond15 ] + %cmp25 = icmp slt i32 %n, 3 + br i1 %cmp25, label %if.end29, label %while.body + +if.end29: ; preds = %if.end24 + %mul = mul i32 %n, 2 + %add30 = add nsw i32 %ret.3, %mul + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.end29 + %ret.4 = phi i32 [ %add30, %if.end29 ], [ %sub, %for.body36 ] + %storemerge1 = phi i32 [ 0, %if.end29 ], [ %inc38, %for.body36 ] + %mul33 = shl nsw i32 %n, 1 + %cmp34 = icmp slt i32 %storemerge1, %mul33 + br i1 %cmp34, label %for.body36, label %for.end39 + +for.body36: ; preds = %for.cond32 + %sub = sub nsw i32 %ret.4, 
%storemerge1 + %inc38 = add nuw nsw i32 %storemerge1, 1 + br label %for.cond32 + +for.end39: ; preds = %for.cond32 + %0 = icmp eq i32 %ret.4, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %ret.4, %5 + br label %early + +f: ; preds = %if.then9, %for.cond + %ret.5 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %if.then9 ] + br label %for.cond41 + +for.cond41: ; preds = %for.body45, %f + %ret.6 = phi i32 [ %ret.5, %f ], [ %div46, %for.body45 ] + %storemerge3 = phi i32 [ 0, %f ], [ %inc48, %for.body45 ] + %add42 = add nsw i32 %n, 5 + %cmp43 = icmp slt i32 %storemerge3, %add42 + br i1 %cmp43, label %for.body45, label %for.end49 + +for.body45: ; preds = %for.cond41 + %div46 = sdiv i32 %ret.6, 2 + %inc48 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond41 + +for.end49: ; preds = %for.cond41 + %sub50 = sub nsw i32 %ret.6, %n + br label %early + +early: ; preds = %for.end49, %for.end39 + %storemerge2 = phi i32 [ %div, %for.end39 ], [ %sub50, %for.end49 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization14, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization14 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODYPREHEADER:.+]] + +; CHECK: [[WHILEBODYPREHEADER]]: +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCOND15PREHEADER:.+]] + +; CHECK: [[FORCOND15PREHEADER]]: +; CHECK: br label %[[FORCOND15:.+]] + +; CHECK: 
[[IFTHEN9:.+]]: +; CHECK: br label %[[IFEND24:.+]] + +; CHECK: [[FORCOND15]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19:.+]], label %[[IFEND24LOOPEXIT:.+]] + +; CHECK: [[FORBODY19]]: +; CHECK: br label %[[FORCOND15]] + +; CHECK: [[IFEND24LOOPEXIT]]: +; CHECK: br label %[[IFTHEN9]] + +; CHECK: [[IFEND24]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFEND29:.+]] + +; CHECK: [[IFEND29]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[IFEND29ELSE:.+]]: +; CHECK: br label %[[FLOOPEXIT2:.+]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36:.+]], label %[[FOREND39:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[FOREND39]]: +; CHECK: br label %[[IFEND29ELSE]] + +; CHECK: [[FLOOPEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[FLOOPEXIT2]]: +; CHECK: br label %[[F]] + +; CHECK: [[F]]: +; CHECK: br label %[[FORCOND41:.+]] + +; CHECK: [[FORCOND41]]: +; CHECK: %[[CMP43:.+]] = icmp +; CHECK: br i1 %[[CMP43]], label %[[FORBODY45:.+]], label %[[FOREND49:.+]] + +; CHECK: [[FORBODY45]]: +; CHECK: br label %[[FORCOND41]] + +; CHECK: [[FOREND49]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll new file mode 100644 index 0000000000000..96155f725946f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll @@ -0,0 +1,385 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization15 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; / g <---. | +; / / \ | | +; f h i | | +; | / \ / \ | | +; | | j k | | +; | \ / \ / | | +; | l m --' | +; | / | +; | o ----------' +; | | +; n p +; \ / +; q +; +; * where nodes b, c, g, h, j and o are uniform branches, and node i is a +; varying branch. +; * where nodes j, k, m, l, and o are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; f e | +; | | | +; | g <---. 
| +; | / \ | | +; | h i | | +; | | | | | +; | | k | | +; | \ / | | +; | j | | +; | | | | +; | m ----' | +; | | | +; | l | +; | | | +; | o ------' +; | | +; n p +; \ / +; q +; +; __kernel void partial_linearization15(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0) { // b +; // c +; for (int i = 0; i < n * 2; i++) ret++; +; if (n <= 10) { +; // f +; goto f; +; } +; } else { +; // d +; for (int i = 0; i < n / 4; i++) ret++; +; } +; // e +; ret++; +; while (1) { +; if (n & 1) { // g +; // h +; if (n < 3) { +; goto l; +; } +; } else { +; // i +; if (ret + id >= n) { +; // k +; ret /= n * n + ret; +; goto m; +; } +; } +; // j +; if (n & 1) { +; goto l; +; } +; // m +; m: +; ret++; +; } +; l: +; ret *= 4; +; o: +; if (n & 1) { +; // p +; ret++; +; goto p; +; } +; } +; +; p: +; for (int i = 0; i < n / 4; i++) ret++; +; goto q; +; +; f: +; ret /= n; +; goto n; +; +; n: +; for (int i = 0; i < n * 2; i++) ret++; +; +; q: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization15(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %l, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %mul40, %l ] + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %for.cond, label %for.cond9 + +for.cond: ; preds = %for.body, %while.body + %ret.1 = phi i32 [ %inc, %for.body ], [ %ret.0, %while.body ] + %storemerge3 = phi i32 [ %inc4, %for.body ], [ 0, %while.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge3, %mul + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.1, 1 + %inc4 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp5 = icmp slt i32 %n, 11 + br i1 %cmp5, label %f, label %if.end17 + +for.cond9: ; preds = %for.body12, %while.body + %ret.2 = phi i32 [ %inc13, %for.body12 ], [ %ret.0, %while.body ] + %storemerge = phi i32 [ %inc15, %for.body12 ], [ 0, %while.body ] + %div = sdiv i32 %n, 4 + %cmp10 = icmp slt i32 %storemerge, %div + br i1 %cmp10, label %for.body12, label %if.end17 + +for.body12: ; preds = %for.cond9 + %inc13 = add nsw i32 %ret.2, 1 + %inc15 = add nuw nsw i32 %storemerge, 1 + br label %for.cond9 + +if.end17: ; preds = %for.cond9, %for.end + %ret.3 = phi i32 [ %ret.1, %for.end ], [ %ret.2, %for.cond9 ] + br label %while.body20 + +while.body20: ; preds = %m, %if.end17 + %storemerge1.in = phi i32 [ %ret.3, %if.end17 ], [ %ret.4, %m ] + %storemerge1 = add nsw i32 %storemerge1.in, 1 + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %if.else26, label %if.then21 + +if.then21: ; preds = %while.body20 + %cmp22 = icmp slt i32 %n, 3 + br i1 %cmp22, label %l, label %if.end34 + +if.else26: ; preds = %while.body20 + %add = add nsw i32 %storemerge1, %conv + %cmp27 = icmp slt i32 %add, %n + br i1 %cmp27, label %if.end34, label %if.then29 + +if.then29: ; preds = %if.else26 + %mul30 = mul nsw i32 %n, %n + %add31 = add nsw i32 %storemerge1, %mul30 + %0 = icmp eq i32 %add31, 0 + %1 = select i1 %0, i32 1, i32 %add31 + %div32 = sdiv i32 %storemerge1, %1 + br label %m + +if.end34: ; preds = %if.else26, %if.then21 + %and35 = and i32 %n, 1 + %tobool36 = icmp eq i32 %and35, 0 + br i1 
%tobool36, label %m, label %l + +m: ; preds = %if.end34, %if.then29 + %ret.4 = phi i32 [ %div32, %if.then29 ], [ %storemerge1, %if.end34 ] + br label %while.body20 + +l: ; preds = %if.end34, %if.then21 + %mul40 = shl nsw i32 %storemerge1, 2 + %and41 = and i32 %n, 1 + %tobool42 = icmp eq i32 %and41, 0 + br i1 %tobool42, label %while.body, label %if.then43 + +if.then43: ; preds = %l + %inc44 = or i32 %mul40, 1 + br label %for.cond47 + +for.cond47: ; preds = %for.body51, %if.then43 + %ret.5 = phi i32 [ %inc44, %if.then43 ], [ %inc52, %for.body51 ] + %storemerge2 = phi i32 [ 0, %if.then43 ], [ %inc54, %for.body51 ] + %div48 = sdiv i32 %n, 4 + %cmp49 = icmp slt i32 %storemerge2, %div48 + br i1 %cmp49, label %for.body51, label %q + +for.body51: ; preds = %for.cond47 + %inc52 = add nsw i32 %ret.5, 1 + %inc54 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond47 + +f: ; preds = %for.end + %2 = icmp eq i32 %ret.1, -2147483648 + %3 = icmp eq i32 %n, -1 + %4 = and i1 %3, %2 + %5 = icmp eq i32 %n, 0 + %6 = or i1 %5, %4 + %7 = select i1 %6, i32 1, i32 %n + %div56 = sdiv i32 %ret.1, %7 + br label %for.cond59 + +for.cond59: ; preds = %for.body63, %f + %ret.6 = phi i32 [ %div56, %f ], [ %inc64, %for.body63 ] + %storemerge4 = phi i32 [ 0, %f ], [ %inc66, %for.body63 ] + %mul60 = shl nsw i32 %n, 1 + %cmp61 = icmp slt i32 %storemerge4, %mul60 + br i1 %cmp61, label %for.body63, label %q + +for.body63: ; preds = %for.cond59 + %inc64 = add nsw i32 %ret.6, 1 + %inc66 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond59 + +q: ; preds = %for.cond59, %for.cond47 + %ret.7 = phi i32 [ %ret.5, %for.cond47 ], [ %ret.6, %for.cond59 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.7, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization15, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization15 +; CHECK: br label %[[WHILEBODY:.+]] 
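+
+; (Reading note: on the CHECK lines in this test, a pattern such as
+; %[[WHILEBODY:.+]] binds a FileCheck variable to whatever label the
+; vectorizer actually emitted, and a later [[WHILEBODY]] reuses that binding,
+; so the test pins down the linearized branch structure without hard-coding
+; block names.)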
+ +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]] + +; CHECK: [[FORCOND9PREHEADER]]: +; CHECK: br label %[[FORCOND9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 false, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]] + +; CHECK: [[FORCOND9]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND9]] + +; CHECK: [[IFEND17LOOPEXIT]]: +; CHECK: br label %[[IFEND17]] + +; CHECK: [[IFEND17]]: +; CHECK: br label %[[WHILEBODY20:.+]] + +; CHECK: [[WHILEBODY20]]: +; CHECK: %[[AND:.+]] = and i32 +; CHECK: %[[TOBOOL:.+]] = icmp eq i32 %[[AND]] +; CHECK: br i1 %[[TOBOOL]], label %[[IFELSE26:.+]], label %[[IFTHEN21:.+]] + +; CHECK: [[IFTHEN21]]: +; CHECK: br label %[[M:.+]] + +; CHECK: [[IFELSE26]]: +; CHECK: br label %[[IFTHEN29:.+]] + +; CHECK: [[IFTHEN29]]: +; CHECK: br label %[[IFEND34:.+]] + +; CHECK: [[IFEND34]]: +; CHECK: br label %[[M:.+]] + +; CHECK: [[M]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]] + +; CHECK: [[WHILEBODY20PUREEXIT]]: +; CHECK: br label %[[L:.+]] + +; CHECK: [[L]]: +; CHECK: %[[TOBOOL42:.+]] = icmp +; CHECK: br i1 %[[TOBOOL42]], label %[[WHILEBODY]], label %[[IFTHEN43:.+]] + +; CHECK: [[IFTHEN43]]: +; CHECK: br label %[[FORCOND47:.+]] + +; CHECK: [[FORCOND47]]: +; CHECK: %[[CMP49:.+]] = icmp +; CHECK: br i1 %[[CMP49]], label %[[FORBODY51:.+]], label %[[QLOOPEXIT2:.+]] + +; CHECK: [[FORBODY51]]: +; CHECK: br label %[[FORCOND47]] + +; CHECK: [[F]]: +; CHECK: br label %[[FORCOND59:.+]] + +; CHECK: [[FORCOND59]]: +; CHECK: br i1 false, label %[[FORBODY63:.+]], label %[[QLOOPEXIT:.+]] + +; CHECK: [[FORBODY63]]: +; CHECK: br label %[[FORCOND59]] + +; CHECK: [[QLOOPEXIT]]: +; CHECK: br label %[[Q:.+]] + +; CHECK: [[QLOOPEXIT2]]: +; CHECK: br label %[[Q]] + +; CHECK: [[Q]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll new file mode 100644 index 0000000000000..48295e243c7ab --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll @@ -0,0 +1,319 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization16 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c <-. 
+; / / \ | +; | d e | +; | / \ / | +; | f g --' +; |/ | +; h i +; \ / +; \ / +; j +; +; * where nodes a, d and g are uniform branches, and node c is a varying +; branch. +; * where nodes d, e, f, g, i and j are divergent. +; +; With partial linearization, it can be transformed in the following way: +; +; a +; / \ +; b c <. +; | | | +; | e | +; | | | +; | d | +; | | | +; | g -' +; | | +; | i +; \ | +; \ f +; \| +; h +; | +; j +; +; __kernel void partial_linearization16(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; if (n < 5) { +; for (int i = 0; i < n + 10; i++) ret++; +; goto h; +; } else { +; while (1) { +; if (id + i % 2 == 0) { +; if (n > 2) { +; goto f; +; } +; } else { +; for (int i = 0; i < n + 10; i++) ret++; +; } +; if (n <= 2) break; +; } +; } +; +; ret += n * 2; +; for (int i = 0; i < n * 2; i++) ret -= i; +; ret /= n; +; goto early; +; +; f: +; for (int i = 0; i < n + 5; i++) ret /= 2; +; ret -= n; +; +; h: +; for (int i = 0; i < n * 2; i++) ret -= i; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization16(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 5 + br i1 %cmp, label %for.cond, label %while.body + +for.cond: ; preds = %for.body, %entry + %ret.0 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %storemerge4 = phi i32 [ %inc5, %for.body ], [ 0, %entry ] + %add = add nsw i32 %n, 10 + %cmp3 = icmp slt i32 %storemerge4, %add + br i1 %cmp3, label %for.body, label %h + +for.body: ; preds = %for.cond + %inc = add nuw nsw i32 %ret.0, 1 + %inc5 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond + +while.body: ; preds = %if.end24, %entry + %ret.1 = phi i32 [ 0, %entry ], [ %ret.3, %if.end24 ] + %cmp7 = icmp eq i32 %conv, 0 + br i1 %cmp7, label %if.then9, label %for.cond15 + +if.then9: ; preds = %while.body + %cmp10 = icmp sgt i32 %n, 2 + br i1 %cmp10, label %for.cond41, label %if.end24 + +for.cond15: ; preds = %for.body19, %while.body + %ret.2 = phi i32 [ %inc20, %for.body19 ], [ %ret.1, %while.body ] + %storemerge = phi i32 [ %inc22, %for.body19 ], [ 0, %while.body ] + %add16 = add nsw i32 %n, 10 + %cmp17 = icmp slt i32 %storemerge, %add16 + br i1 %cmp17, label %for.body19, label %if.end24 + +for.body19: ; preds = %for.cond15 + %inc20 = add nsw i32 %ret.2, 1 + %inc22 = add nuw nsw i32 %storemerge, 1 + br label %for.cond15 + +if.end24: ; preds = %for.cond15, %if.then9 + %ret.3 = phi i32 [ %ret.1, %if.then9 ], [ %ret.2, %for.cond15 ] + %cmp25 = icmp slt i32 %n, 3 + br i1 %cmp25, label %if.end29, label %while.body + +if.end29: ; preds = %if.end24 + %mul = mul i32 %n, 2 + %add30 = add nsw i32 %ret.3, %mul + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.end29 + %ret.4 = phi i32 [ %add30, %if.end29 ], [ %sub, %for.body36 ] + %storemerge1 = phi i32 [ 0, %if.end29 ], [ %inc38, %for.body36 ] + %mul33 = shl nsw i32 %n, 1 + %cmp34 = icmp slt i32 %storemerge1, %mul33 + br i1 %cmp34, label %for.body36, label %for.end39 + +for.body36: ; preds = %for.cond32 + %sub = sub nsw i32 %ret.4, %storemerge1 + %inc38 = add nuw nsw i32 %storemerge1, 1 + br label %for.cond32 + +for.end39: ; preds = %for.cond32 + %0 = icmp eq i32 %ret.4, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = 
icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %ret.4, %5 + br label %early + +for.cond41: ; preds = %for.body45, %if.then9 + %ret.5 = phi i32 [ %div46, %for.body45 ], [ %ret.1, %if.then9 ] + %storemerge2 = phi i32 [ %inc48, %for.body45 ], [ 0, %if.then9 ] + %add42 = add nsw i32 %n, 5 + %cmp43 = icmp slt i32 %storemerge2, %add42 + br i1 %cmp43, label %for.body45, label %for.end49 + +for.body45: ; preds = %for.cond41 + %div46 = sdiv i32 %ret.5, 2 + %inc48 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond41 + +for.end49: ; preds = %for.cond41 + %sub50 = sub nsw i32 %ret.5, %n + br label %h + +h: ; preds = %for.end49, %for.cond + %ret.6 = phi i32 [ %sub50, %for.end49 ], [ %ret.0, %for.cond ] + br label %for.cond52 + +for.cond52: ; preds = %for.body56, %h + %ret.7 = phi i32 [ %ret.6, %h ], [ %sub57, %for.body56 ] + %storemerge3 = phi i32 [ 0, %h ], [ %inc59, %for.body56 ] + %mul53 = shl nsw i32 %n, 1 + %cmp54 = icmp slt i32 %storemerge3, %mul53 + br i1 %cmp54, label %for.body56, label %early + +for.body56: ; preds = %for.cond52 + %sub57 = sub nsw i32 %ret.7, %storemerge3 + %inc59 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond52 + +early: ; preds = %for.cond52, %for.end39 + %ret.8 = phi i32 [ %div, %for.end39 ], [ %ret.7, %for.cond52 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.8, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization16, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization16 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODYPREHEADER:.+]] + +; CHECK: [[WHILEBODYPREHEADER]]: +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label 
%[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCOND15PREHEADER:.+]] + +; CHECK: [[FORCOND15PREHEADER]]: +; CHECK: br label %[[FORCOND15:.+]] + +; CHECK: [[IFTHEN9:.+]]: +; CHECK: br label %[[IFEND24:.+]] + +; CHECK: [[FORCOND41PREHEADER:.+]]: +; CHECK: br label %[[FORCOND41:.+]] + +; CHECK: [[FORCOND15]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19:.+]], label %[[IFEND24LOOPEXIT:.+]] + +; CHECK: [[FORBODY19]]: +; CHECK: br label %[[FORCOND15]] + +; CHECK: [[IFEND24LOOPEXIT]]: +; CHECK: br label %[[IFTHEN9]] + +; CHECK: [[IFEND24]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFEND29:.+]] + +; CHECK: [[IFEND29]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[IFEND29ELSE:.+]]: +; CHECK: br label %[[FORCOND41PREHEADER]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36:.+]], label %[[FOREND39:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[FOREND39]]: +; CHECK: br label %[[IFEND29ELSE]] + +; CHECK: [[FORCOND41]]: +; CHECK: %[[CMP43:.+]] = icmp +; CHECK: br i1 %[[CMP43]], label %[[FORBODY45:.+]], label %[[FOREND49:.+]] + +; CHECK: [[FORBODY45]]: +; CHECK: br label %[[FORCOND41]] + +; CHECK: [[FOREND49]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[H]]: +; CHECK: br label %[[FORCOND52:.+]] + +; CHECK: [[FORCOND52]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY56:.+]], label %[[EARLYLOOPEXIT:.+]] + +; CHECK: [[FORBODY56]]: +; CHECK: br label %[[FORCOND52]] + +; CHECK: [[EARLYLOOPEXIT]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll new file mode 100644 index 0000000000000..0ed3fe5c32596 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll @@ -0,0 +1,377 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization17 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <----. +; / \ | +; c d | +; / / \ | +; e f g -' +; / \ | | +; .--> h | i j +; | / \ | \ / +; '- k l '-> m +; | \ / +; n \ / +; \ o +; \ / +; \ / +; p +; +; * where nodes b, d, and h are uniform branches, and nodes e and g are varying +; branches. +; * where nodes h, j, m, o, and p are divergent. 
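+; (Informally: a branch is uniform when every work-item in a vector group
+; takes the same direction, and varying when work-items may disagree; a
+; block is divergent when reaching it depends on a varying branch, so only
+; a subset of the work-items may need to execute it.)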
+; +; With partial linearization, it can be transformed in the following way: +; +; a +; | +; b <----. +; / \ | +; c d | +; / / \ | +; e f g -' +; / | | +; .--> h i | +; | / \ | | +; '- k l | | +; \ \ | / +; n \ | / +; \ \|/ +; `-> j +; | +; m +; | +; o +; | +; p +; +; __kernel void partial_linearization17(__global int *out, int n, int x) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (n > 10) { +; goto c; +; } else if (n < 5) { +; goto f; +; } +; if (id + i++ % 2 == 0) { +; break; +; } +; } +; +; // j +; for (int i = 0; i < n + 10; i++) ret++; +; goto m; +; +; f: +; ret += x / 2; +; for (int i = 0; i < x / 2; i++) ret += i; +; goto m; +; +; c: +; for (int i = 0; i < n - 5; i++) ret += 2; +; // e +; if (id % 2 == 0) { +; goto h; +; } else { +; goto m; +; } +; +; m: +; ret <<= 2; +; goto o; +; +; h: +; for (int i = 0; i < x / 2; i++) { +; if (x < 5) { +; goto l; +; } +; } +; // n +; ret += id << 3; +; goto p; +; +; l: +; ret += id << 3; +; +; o: +; for (int i = 0; i < x / 2; i++) ret += i; +; +; p: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 noundef %n, i32 noundef %x) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end5, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end5 ] + %cmp = icmp sgt i32 %n, 10 + br i1 %cmp, label %for.cond28, label %if.else + +if.else: ; preds = %while.body + %cmp2 = icmp slt i32 %n, 5 + br i1 %cmp2, label %f, label %if.end5 + +if.end5: ; preds = %if.else + %inc = add nuw nsw i32 %i.0, 1 + %rem = and i32 %i.0, 1 + %add = sub nsw i32 0, %rem + %cmp6 = icmp eq i32 %conv, %add + br i1 %cmp6, label %for.cond, label %while.body + +for.cond: ; preds = %for.body, %if.end5 + %ret.0 = phi i32 [ %inc14, %for.body ], [ 0, %if.end5 ] + %storemerge = phi i32 [ %inc15, %for.body ], [ 0, %if.end5 ] + %add11 = add nsw i32 %n, 10 + %cmp12 = icmp slt i32 %storemerge, %add11 + br i1 %cmp12, label %for.body, label %m + +for.body: ; preds = %for.cond + %inc14 = add nuw nsw i32 %ret.0, 1 + %inc15 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +f: ; preds = %if.else + %div = sdiv i32 %x, 2 + br label %for.cond18 + +for.cond18: ; preds = %for.body22, %f + %ret.1 = phi i32 [ %div, %f ], [ %add23, %for.body22 ] + %storemerge3 = phi i32 [ 0, %f ], [ %inc25, %for.body22 ] + %div19 = sdiv i32 %x, 2 + %cmp20 = icmp slt i32 %storemerge3, %div19 + br i1 %cmp20, label %for.body22, label %m + +for.body22: ; preds = %for.cond18 + %add23 = add nsw i32 %storemerge3, %ret.1 + %inc25 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond18 + +for.cond28: ; preds = %for.body32, %while.body + %ret.2 = phi i32 [ %add33, %for.body32 ], [ 0, %while.body ] + %storemerge4 = phi i32 [ %inc35, %for.body32 ], [ 0, %while.body ] + %add29 = add nsw i32 %n, 5 + %cmp30 = icmp slt i32 %storemerge4, %add29 + br i1 %cmp30, label %for.body32, label %for.end36 + +for.body32: ; preds = %for.cond28 + %add33 = add nuw nsw i32 %ret.2, 2 + %inc35 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond28 + +for.end36: ; preds = %for.cond28 + %rem375 = and i32 %conv, 1 + %cmp38 = icmp eq i32 %rem375, 0 + br i1 %cmp38, label %for.cond43, label %m + +m: ; preds = %for.end36, %for.cond18, %for.cond + %ret.3 = phi i32 [ 
%ret.0, %for.cond ], [ %ret.1, %for.cond18 ], [ %ret.2, %for.end36 ] + %shl = shl i32 %ret.3, 2 + br label %o + +for.cond43: ; preds = %for.inc52, %for.end36 + %storemerge6 = phi i32 [ %inc53, %for.inc52 ], [ 0, %for.end36 ] + %div44 = sdiv i32 %x, 2 + %cmp45 = icmp slt i32 %storemerge6, %div44 + br i1 %cmp45, label %for.body47, label %for.end54 + +for.body47: ; preds = %for.cond43 + %cmp48 = icmp slt i32 %x, 5 + br i1 %cmp48, label %l, label %for.inc52 + +for.inc52: ; preds = %for.body47 + %inc53 = add nuw nsw i32 %storemerge6, 1 + br label %for.cond43 + +for.end54: ; preds = %for.cond43 + %shl55 = mul i32 %conv, 8 + %add56 = add nsw i32 %ret.2, %shl55 + br label %p + +l: ; preds = %for.body47 + %shl57 = mul i32 %conv, 8 + %add58 = add nsw i32 %ret.2, %shl57 + br label %o + +o: ; preds = %l, %m + %storemerge1 = phi i32 [ %shl, %m ], [ %add58, %l ] + br label %for.cond60 + +for.cond60: ; preds = %for.body64, %o + %ret.4 = phi i32 [ %storemerge1, %o ], [ %add65, %for.body64 ] + %storemerge2 = phi i32 [ 0, %o ], [ %inc67, %for.body64 ] + %div61 = sdiv i32 %x, 2 + %cmp62 = icmp slt i32 %storemerge2, %div61 + br i1 %cmp62, label %for.body64, label %p + +for.body64: ; preds = %for.cond60 + %add65 = add nsw i32 %storemerge2, %ret.4 + %inc67 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond60 + +p: ; preds = %for.cond60, %for.end54 + %ret.5 = phi i32 [ %add56, %for.end54 ], [ %ret.4, %for.cond60 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.5, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization17, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization17 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCOND28PREHEADER:.+]], label %[[IFELSE:.+]] + +; CHECK: [[FORCOND28PREHEADER]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] 
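+
+; (The *ELSE, *SPLIT and *PUREEXIT labels matched below do not exist in the
+; input IR; they are presumably blocks introduced by the cfg-convert step
+; named in the RUN line as it splits and duplicates regions during partial
+; linearization. The test only constrains how they branch, not their names.)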
+ +; CHECK: [[FORCOND28PREHEADERELSE:.+]]: +; CHECK: br label %[[M:.+]] + +; CHECK: [[FORCOND28PREHEADERSPLIT:.+]]: +; CHECK: br label %[[FORCOND28:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: %[[CMP2:.+]] = icmp +; CHECK: br i1 %[[CMP2]], label %[[F:.+]], label %[[IFEND5:.+]] + +; CHECK: [[IFEND5]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCONDPREHEADERELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[MLOOPEXIT2:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[F]]: +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[FELSE]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND28PREHEADERELSE]], label %[[FORCOND28PREHEADERSPLIT]] + +; CHECK: [[FSPLIT]]: +; CHECK: br label %[[FORCOND18:.+]] + +; CHECK: [[FORCOND18]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22:.+]], label %[[MLOOPEXIT:.+]] + +; CHECK: [[FORBODY22]]: +; CHECK: br label %[[FORCOND18]] + +; CHECK: [[FORCOND28]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY32:.+]], label %[[FOREND36:.+]] + +; CHECK: [[FORBODY32]]: +; CHECK: br label %[[FORCOND28]] + +; CHECK: [[FOREND36]]: +; CHECK: br label %[[FORCOND43PREHEADER:.+]] + +; CHECK: [[FORCOND43PREHEADER]]: +; CHECK: br label %[[FORCOND43:.+]] + +; CHECK: [[MLOOPEXIT]]: +; CHECK: br label %[[M]] + +; CHECK: [[MLOOPEXIT2]]: +; CHECK: br label %[[FORCONDPREHEADERELSE]] + +; CHECK: [[M]]: +; CHECK: br label %[[O:.+]] + +; CHECK: [[FORCOND43]]: +; CHECK: %[[CMP14:.+]] = icmp +; CHECK: br i1 %[[CMP14]], label %[[FORBODY47:.+]], label %[[FOREND54:.+]] + +; CHECK: [[FORBODY47]]: +; CHECK: %[[CMP48:.+]] = icmp +; CHECK: br i1 %[[CMP48]], label %[[L:.+]], label %[[FORINC52:.+]] + +; CHECK: [[FORINC52]]: +; CHECK: br label %[[FORCOND43]] + +; CHECK: [[FOREND54]]: +; CHECK: br label %[[M]] + +; CHECK: [[L]]: +; CHECK: br label %[[M]] + +; CHECK: [[O]]: +; CHECK: br label %[[FORCOND60:.+]] + +; CHECK: [[FORCOND60]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY64:.+]], label %[[PLOOPEXIT:.+]] + +; CHECK: [[FORBODY64]]: +; CHECK: br label %[[FORCOND60]] + +; CHECK: [[PLOOPEXIT]]: +; CHECK: br label %[[P:.+]] + +; CHECK: [[P]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll new file mode 100644 index 0000000000000..903ba12b02fd9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll @@ -0,0 +1,289 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization18 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <--. +; / \ | +; c d -' +; / \ | +; e f | +; | \| +; | g +; | / +; | h +; \ / \ +; i j +; \ / +; k +; +; * where nodes b, and h are uniform branches, and nodes c and d are varying +; branches. +; * where nodes e, f, g, i and k are divergent. +; +; With partial linearization, it can be transformed in the following way: +; +; a +; | +; b <--. +; / \ | +; c d -' +; | | +; f | +; | | +; e | +; \ / +; g +; | +; h +; / \ +; | j +; \ / +; i +; | +; k +; +; __kernel void partial_linearization18(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (n > 5) { +; if (id + i % 2 == 0) { +; goto e; +; } else { +; goto f; +; } +; } +; if (++i + id > 3) { +; goto g; +; } +; } +; +; f: +; for (int i = 0; i < n + 5; i++) ret += 2; +; goto g; +; +; g: +; for (int i = 1; i < n * 2; i++) ret *= i; +; goto h; +; +; e: +; for (int i = 0; i < n + 5; i++) ret++; +; goto i; +; +; h: +; if (n > 3) { +; i: +; ret++; +; } else { +; ret *= 3; +; } +; +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization18(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ] + %cmp = icmp sgt i32 %n, 5 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %while.body + %rem = and i32 %i.0, 1 + %add = sub nsw i32 0, %rem + %cmp2 = icmp eq i32 %conv, %add + br i1 %cmp2, label %for.cond26, label %for.cond + +if.end: ; preds = %while.body + %inc = add nuw nsw i32 %i.0, 1 + %add5 = add nsw i32 %inc, %conv + %cmp6 = icmp sgt i32 %add5, 3 + br i1 %cmp6, label %g, label %while.body + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ %add14, %for.body ], [ 0, %if.then ] + %storemerge2 = phi i32 [ %inc15, %for.body ], [ 0, %if.then ] + %add11 = add nsw i32 %n, 5 + %cmp12 = icmp slt i32 %storemerge2, %add11 + br i1 %cmp12, label %for.body, label %g + +for.body: ; preds = %for.cond + %add14 = add nuw nsw i32 %ret.0, 2 + %inc15 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond + +g: ; preds = %for.cond, %if.end + %ret.1 = phi i32 [ 0, %if.end ], [ %ret.0, %for.cond ] + br label %for.cond17 + +for.cond17: ; preds = %for.body20, %g + %ret.2 = phi i32 [ %ret.1, %g ], [ %mul21, %for.body20 ] + %storemerge = phi i32 [ 1, %g ], [ %inc23, %for.body20 ] + %mul = shl nsw i32 %n, 1 + %cmp18 = icmp slt i32 %storemerge, %mul + br i1 %cmp18, label %for.body20, label %h + +for.body20: ; preds = %for.cond17 + %mul21 = mul nsw i32 %storemerge, %ret.2 + %inc23 = add nuw nsw i32 %storemerge, 1 + br label %for.cond17 + +for.cond26: ; preds = %for.body30, %if.then + %ret.3 = phi i32 [ %inc31, %for.body30 ], [ 0, %if.then ] + %storemerge3 = phi i32 [ %inc33, %for.body30 ], [ 0, %if.then ] + %add27 = add nsw i32 %n, 5 + %cmp28 = icmp slt i32 %storemerge3, %add27 + br i1 %cmp28, label %for.body30, label %i38 + +for.body30: ; preds = %for.cond26 + %inc31 = add nuw nsw i32 %ret.3, 1 + %inc33 
= add nuw nsw i32 %storemerge3, 1 + br label %for.cond26 + +h: ; preds = %for.cond17 + %cmp35 = icmp sgt i32 %n, 3 + br i1 %cmp35, label %i38, label %if.else40 + +i38: ; preds = %h, %for.cond26 + %ret.4 = phi i32 [ %ret.3, %for.cond26 ], [ %ret.2, %h ] + %inc39 = add nsw i32 %ret.4, 1 + br label %if.end42 + +if.else40: ; preds = %h + %mul41 = mul nsw i32 %ret.2, 3 + br label %if.end42 + +if.end42: ; preds = %if.else40, %i38 + %storemerge1 = phi i32 [ %mul41, %if.else40 ], [ %inc39, %i38 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge1, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization18, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization18 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[IFTHENELSE:.+]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[IFTHENSPLIT:.+]]: +; CHECK: br label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND26PREHEADER:.+]]: +; CHECK: br label %[[FORCOND26:.+]] + +; CHECK: [[IFEND]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[GLOOPEXIT2:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[GLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[GLOOPEXIT]]: +; CHECK: br label %[[FORCOND26PREHEADER]] + +; CHECK: [[GLOOPEXIT2]]: +; CHECK: br label %[[GLOOPEXIT2ELSE:.+]] + +; CHECK: [[GLOOPEXIT2ELSE]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHENELSE]], label %[[IFTHENSPLIT]] + +; CHECK: [[G]]: +; CHECK: br label %[[FORCOND17:.+]] + +; CHECK: 
[[FORCOND17]]:
+; CHECK: %[[CMP18:.+]] = icmp
+; CHECK: br i1 %[[CMP18]], label %[[FORBODY20:.+]], label %[[H:.+]]
+
+; CHECK: [[FORBODY20]]:
+; CHECK: br label %[[FORCOND17]]
+
+; CHECK: [[FORCOND26]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30:.+]], label %[[I38LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY30]]:
+; CHECK: br label %[[FORCOND26]]
+
+; CHECK: [[H]]:
+; CHECK: %[[CMP35:.+]] = icmp
+; CHECK: br i1 %[[CMP35]], label %[[I38:.+]], label %[[IFELSE40:.+]]
+
+; CHECK: [[I38LOOPEXIT]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[I38]]:
+; CHECK: br label %[[IFEND42:.+]]
+
+; CHECK: [[IFELSE40]]:
+; CHECK: br label %[[I38]]
+
+; CHECK: [[IFEND42]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
new file mode 100644
index 0000000000000..6810eb855c5f4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
@@ -0,0 +1,308 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k partial_linearization19 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;       |
+;       b <----.
+;      / \     |
+;     c   \    |
+;    / \   \   |
+;   d   e   f -'
+;   |   |   |
+;    \   \  g
+;     \   \ / \
+;      \   h   i <,
+;       \   \ /  /
+;        \   j  /
+;         \   /
+;          `-'
+;
+; * where nodes b, c, and g are uniform branches, and node f is a varying
+;   branch.
+; * where nodes g, h, i and j are divergent.
+;
+; With partial linearization, it can be transformed in the following way:
+;
+;       a
+;       |
+;       b <----.
+;      / \     |
+;     c   \    |
+;    / \   \   |
+;   d   e   f -'
+;   |   |   |
+;    \  |  /
+;     \ | /
+;      \|/
+;       g
+;       |
+;       i
+;       |
+;       h
+;       |
+;       j
+;
+; The uniform branch `g` has been linearized because both its successors are
+; divergent. Had `g` not been linearized, only one of its two successors
+; could ever execute on a given path, depending on how the uniform condition
+; evaluates, whereas what we want is for it to be possible to execute both
+; successors no matter what the uniform condition evaluates to.
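+;
+; (As a rough scalar sketch of the idea, not the pass's actual output:
+; instead of
+;
+;     if (uniform_cond) { h(); } else { i(); }
+;
+; the linearized path behaves more like
+;
+;     i();   // side effects guarded by a lane mask
+;     h();   // side effects guarded by a lane mask
+;
+; so lanes that must reach `h` and lanes that must reach `i` are both
+; serviced regardless of how the uniform condition evaluates.)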
+; +; __kernel void partial_linearization19(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (n > 5) { +; if (n == 6) { +; goto d; +; } else { +; goto e; +; } +; } +; if (++i + id > 3) { +; break; +; } +; } +; +; // g +; if (n == 3) { +; goto h; +; } else { +; goto i; +; } +; +; d: +; for (int i = 0; i < n + 5; i++) ret += 2; +; goto i; +; +; e: +; for (int i = 1; i < n * 2; i++) ret += i; +; goto h; +; +; i: +; for (int i = 0; i < n + 5; i++) ret++; +; goto j; +; +; h: +; for (int i = 0; i < n; i++) ret++; +; goto j; +; +; j: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization19(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ] + %cmp = icmp sgt i32 %n, 5 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %while.body + %cmp2 = icmp eq i32 %n, 6 + br i1 %cmp2, label %for.cond, label %for.cond20 + +if.end: ; preds = %while.body + %inc = add nuw nsw i32 %i.0, 1 + %add = add nsw i32 %inc, %conv + %cmp5 = icmp sgt i32 %add, 3 + br i1 %cmp5, label %while.end, label %while.body + +while.end: ; preds = %if.end + %cmp9 = icmp eq i32 %n, 3 + br i1 %cmp9, label %h, label %i28 + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ %add17, %for.body ], [ 0, %if.then ] + %storemerge3 = phi i32 [ %inc18, %for.body ], [ 0, %if.then ] + %add14 = add nsw i32 %n, 5 + %cmp15 = icmp slt i32 %storemerge3, %add14 + br i1 %cmp15, label %for.body, label %i28 + +for.body: ; preds = %for.cond + %add17 = add nuw nsw i32 %ret.0, 2 + %inc18 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond + +for.cond20: ; preds = %for.body23, %if.then + %ret.1 = phi i32 [ %add24, %for.body23 ], [ 0, %if.then ] + %storemerge2 = phi i32 [ %inc26, %for.body23 ], [ 1, %if.then ] + %mul = shl nsw i32 %n, 1 + %cmp21 = icmp slt i32 %storemerge2, %mul + br i1 %cmp21, label %for.body23, label %h + +for.body23: ; preds = %for.cond20 + %add24 = add nuw nsw i32 %storemerge2, %ret.1 + %inc26 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond20 + +i28: ; preds = %for.cond, %while.end + %ret.2 = phi i32 [ 0, %while.end ], [ %ret.0, %for.cond ] + br label %for.cond30 + +for.cond30: ; preds = %for.body34, %i28 + %ret.3 = phi i32 [ %ret.2, %i28 ], [ %inc35, %for.body34 ] + %storemerge = phi i32 [ 0, %i28 ], [ %inc37, %for.body34 ] + %add31 = add nsw i32 %n, 5 + %cmp32 = icmp slt i32 %storemerge, %add31 + br i1 %cmp32, label %for.body34, label %j + +for.body34: ; preds = %for.cond30 + %inc35 = add nuw nsw i32 %ret.3, 1 + %inc37 = add nuw nsw i32 %storemerge, 1 + br label %for.cond30 + +h: ; preds = %for.cond20, %while.end + %ret.4 = phi i32 [ 0, %while.end ], [ %ret.1, %for.cond20 ] + br label %for.cond40 + +for.cond40: ; preds = %for.body43, %h + %ret.5 = phi i32 [ %ret.4, %h ], [ %inc44, %for.body43 ] + %storemerge1 = phi i32 [ 0, %h ], [ %inc46, %for.body43 ] + %cmp41 = icmp slt i32 %storemerge1, %n + br i1 %cmp41, label %for.body43, label %j + +for.body43: ; preds = %for.cond40 + %inc44 = add nsw i32 %ret.5, 1 + %inc46 = add nuw nsw i32 %storemerge1, 1 + br label %for.cond40 + +j: ; preds = %for.cond40, %for.cond30 + %ret.6 = phi 
i32 [ %ret.3, %for.cond30 ], [ %ret.5, %for.cond40 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization19, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization19 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: %[[CMP2:.+]] = icmp +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[IFTHENELSE:.+]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[IFTHENSPLIT:.+]]: +; CHECK: br i1 %[[CMP2MERGE:.+]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND20PREHEADER:.+]] + +; CHECK: [[FORCOND20PREHEADER]]: +; CHECK: br label %[[FORCOND20:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[IFEND]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: %[[CMP2MERGE]] = phi i1 [ %[[CMP2]], %[[IFTHEN]] ], [ false, %[[IFEND]] ] +; CHECK: br label %[[WHILEEND:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: br label %[[WHILEENDELSE:.+]] + +; CHECK: [[WHILEENDELSE]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHENELSE]], label %[[IFTHENSPLIT]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[I28LOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND20]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY23:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY23]]: +; CHECK: br label %[[FORCOND20]] + +; CHECK: [[I28LOOPEXIT]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[I28:.+]]: +; CHECK: br label %[[FORCOND30:.+]] + +; CHECK: [[FORCOND30]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY34:.+]], label %[[JLOOPEXIT:.+]] + 
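+; (The {{(%[0-9A-Za-z\.]+)|(false)}} alternation used above accepts either a
+; named i1 condition or the literal constant false, presumably so the test
+; stays stable whether or not the preceding passes fold the loop-exit
+; condition to a constant.)
+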
+; CHECK: [[FORBODY34]]: +; CHECK: br label %[[FORCOND30]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[H]]: +; CHECK: br label %[[FORCOND40:.+]] + +; CHECK: [[FORCOND40]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY43:.+]], label %[[JLOOPEXIT2:.+]] + +; CHECK: [[FORBODY43]]: +; CHECK: br label %[[FORCOND40]] + +; CHECK: [[JLOOPEXIT]]: +; CHECK: br label %[[J:.+]] + +; CHECK: [[JLOOPEXIT2]]: +; CHECK: br label %[[I28]] + +; CHECK: [[J]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll new file mode 100644 index 0000000000000..9e59e6bf7092b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll @@ -0,0 +1,274 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization2 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; / \ +; / \ +; b c +; / \ / \ +; d e f g +; \ \ / / +; \ X / +; \ / \ / +; h i +; \ / +; j +; +; * where node a is a uniform branch, and nodes b and c are varying branches. +; * where nodes d, e, f, g are divergent. 
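+; (Each of d, e, f and g is selected by one of the varying branches b or c,
+; so within a single vector group some work-items may take d while others
+; take e, and likewise f or g on the other uniform side; this is why each
+; pair is chained onto one linearized path below.)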
+; +; With partial linearization we will have a CFG of the form: +; +; a +; / \ +; / \ +; / \ +; b c +; / \ +; e - d f - g +; \ / +; i +; | +; h +; | +; j +; +; __kernel void partial_linearization2(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (n < 10) { // uniform +; if (id % 3 == 0) { // varying +; for (int i = 0; i < n - 1; i++) { ret /= 2; } goto h; +; } else { // varying +; for (int i = 0; i < n / 3; i++) { ret -= 2; } goto i; +; } +; } else { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n * 2; i++) { ret += 1; } goto h; +; } else { // varying +; for (int i = 0; i < n + 5; i++) { ret *= 2; } goto i; +; } +; } +; +; h: +; ret += 5; +; goto end; +; +; i: +; ret *= 10; +; goto end; +; +; end: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization2(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 10 + br i1 %cmp, label %if.then, label %if.else17 + +if.then: ; preds = %entry + %rem = srem i32 %conv, 3 + %cmp2 = icmp eq i32 %rem, 0 + br i1 %cmp2, label %if.then4, label %if.else + +if.then4: ; preds = %if.then + br label %for.cond + +for.cond: ; preds = %for.body, %if.then4 + %ret.0 = phi i32 [ 0, %if.then4 ], [ %div, %for.body ] + %storemerge5 = phi i32 [ 0, %if.then4 ], [ %inc, %for.body ] + %sub = add nsw i32 %n, -1 + %cmp5 = icmp slt i32 %storemerge5, %sub + br i1 %cmp5, label %for.body, label %h + +for.body: ; preds = %for.cond + %div = sdiv i32 %ret.0, 2 + %inc = add nsw i32 %storemerge5, 1 + br label %for.cond + +if.else: ; preds = %if.then + br label %for.cond8 + +for.cond8: ; preds = %for.body12, %if.else + %ret.1 = phi i32 [ 0, %if.else ], [ %sub13, %for.body12 ] + %storemerge4 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div9 = sdiv i32 %n, 3 + %cmp10 = icmp slt i32 %storemerge4, %div9 + br i1 %cmp10, label %for.body12, label %i42 + +for.body12: ; preds = %for.cond8 + %sub13 = add nsw i32 %ret.1, -2 + %inc15 = add nsw i32 %storemerge4, 1 + br label %for.cond8 + +if.else17: ; preds = %entry + %rem181 = and i32 %conv, 1 + %cmp19 = icmp eq i32 %rem181, 0 + br i1 %cmp19, label %if.then21, label %if.else30 + +if.then21: ; preds = %if.else17 + br label %for.cond23 + +for.cond23: ; preds = %for.body26, %if.then21 + %ret.2 = phi i32 [ 0, %if.then21 ], [ %add, %for.body26 ] + %storemerge3 = phi i32 [ 0, %if.then21 ], [ %inc28, %for.body26 ] + %mul = shl nsw i32 %n, 1 + %cmp24 = icmp slt i32 %storemerge3, %mul + br i1 %cmp24, label %for.body26, label %h + +for.body26: ; preds = %for.cond23 + %add = add nsw i32 %ret.2, 1 + %inc28 = add nsw i32 %storemerge3, 1 + br label %for.cond23 + +if.else30: ; preds = %if.else17 + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.else30 + %ret.3 = phi i32 [ 0, %if.else30 ], [ %mul37, %for.body36 ] + %storemerge = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ] + %add33 = add nsw i32 %n, 5 + %cmp34 = icmp slt i32 %storemerge, %add33 + br i1 %cmp34, label %for.body36, label %i42 + +for.body36: ; preds = %for.cond32 + %mul37 = shl nsw i32 %ret.3, 1 + %inc39 = add nsw i32 %storemerge, 1 + br label %for.cond32 + +h: ; preds = %for.cond23, %for.cond + %ret.4 = phi i32 [ %ret.0, %for.cond ], [ %ret.2, %for.cond23 ] + %add41 = add nsw i32 %ret.4, 5 + br label 
%end + +i42: ; preds = %for.cond32, %for.cond8 + %ret.5 = phi i32 [ %ret.1, %for.cond8 ], [ %ret.3, %for.cond32 ] + %mul43 = mul nsw i32 %ret.5, 10 + br label %end + +end: ; preds = %i42, %h + %storemerge2 = phi i32 [ %mul43, %i42 ], [ %add41, %h ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization2, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization2 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE17:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br label %[[FORCOND8PREHEADER:.+]] + +; CHECK: [[FORCOND8PREHEADER:.+]]: +; CHECK: br label %[[FORCOND8:.+]] + +; CHECK: [[FORCONDPREHEADER:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND8]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[I42LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND8]] + +; CHECK: [[IFELSE17]]: +; CHECK: br label %[[FORCOND32PREHEADER:.+]] + +; CHECK: [[FORCOND32PREHEADER]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[FORCOND23PREHEADER:.+]]: +; CHECK: br label %[[FORCOND23:.+]] + +; CHECK: [[FORCOND23]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26:.+]], label %[[HLOOPEXIT3:.+]] + +; CHECK: [[FORBODY26]]: +; CHECK: br label %[[FORCOND23]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 false, label %[[FORBODY36:.+]], label %[[I42LOOPEXIT4:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[I42:.+]] + +; CHECK: [[HLOOPEXIT3]]: +; CHECK: br label %[[I42]] + +; CHECK: [[H:.+]]: +; CHECK: br label %[[END:.+]] + +; CHECK: [[I42LOOPEXIT]]: +; CHECK: br label %[[FORCONDPREHEADER]] + +; CHECK: [[I42LOOPEXIT4]]: +; CHECK: br label %[[FORCOND23PREHEADER]] + +; 
CHECK: [[I42]]: +; CHECK: br label %[[H]] + +; CHECK: [[END]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll new file mode 100644 index 0000000000000..56369b161964e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll @@ -0,0 +1,236 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization20 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <--------. +; / \ | +; | c | +; | / \ | +; | f h <--. | +; | | / \ | | +; | | | d -' | +; | | | | | +; | | | e ---' +; | | | / +; | | | / +; | | |/ +; | | / +; \|/ +; g +; +; * where nodes b, d, and e are uniform branches, and node h is a varying +; branch. +; * where nodes b, d and g are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; b <--. +; | | +; c | +; /| | +; f h <. 
| +; | | | | +; | d -' | +; | | | +; | e ---' +; \| +; g +; +; __kernel void partial_linearization20(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0 && n < 5) { +; goto g; +; } +; if (n == 6) { +; goto f; +; } +; while (1) { +; if (ret++ + id >= n) { +; goto d; +; } +; if (n & 1) { +; goto g; +; } +; +; d: +; if (n > 3) { +; goto e; +; } +; } +; e: +; if (n & 1) { +; goto g; +; } +; } +; +; f: +; for (int i = 0; i < n + 1; i++) ret++; +; g: +; out[id] = ret; +; } + +; ModuleID = 'kernel.opencl' +source_filename = "kernel.opencl" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization20(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %e, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %inc, %e ] + %n.off = add i32 %n, -1 + %0 = icmp ult i32 %n.off, 4 + br i1 %0, label %g, label %if.end + +if.end: ; preds = %while.body + %cmp4 = icmp eq i32 %n, 6 + br i1 %cmp4, label %for.cond, label %while.body9 + +while.body9: ; preds = %d, %if.end + %ret.1 = phi i32 [ %ret.0, %if.end ], [ %inc, %d ] + %inc = add nsw i32 %ret.1, 1 + %add = add nsw i32 %ret.1, %conv + %cmp10 = icmp sge i32 %add, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + %or.cond1 = or i1 %tobool, %cmp10 + br i1 %or.cond1, label %d, label %g + +d: ; preds = %while.body9 + %cmp16 = icmp sgt i32 %n, 3 + br i1 %cmp16, label %e, label %while.body9 + +e: ; preds = %d + %and20 = and i32 %n, 1 + %tobool21 = icmp eq i32 %and20, 0 + br i1 %tobool21, label %while.body, label %g + +for.cond: ; preds = %for.body, %if.end + %ret.2 = phi i32 [ %inc27, %for.body ], [ %ret.0, %if.end ] + %storemerge = phi i32 [ %inc28, %for.body ], [ 0, %if.end ] + %cmp25 = icmp sgt i32 %storemerge, %n + br i1 %cmp25, label %g, label %for.body + +for.body: ; preds = %for.cond + %inc27 = add nsw i32 %ret.2, 1 + %inc28 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +g: ; preds = %for.cond, %e, %while.body9, %while.body + %ret.3 = phi i32 [ %ret.0, %while.body ], [ %inc, %e ], [ %ret.2, %for.cond ], [ %inc, %while.body9 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" 
"use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization20, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization20 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[IFEND:.+]] + +; CHECK: [[IFEND]]: +; CHECK: %[[CMP4:.+]] = icmp +; CHECK: br i1 %[[CMP4]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODY9PREHEADER:.+]] + +; CHECK: [[WHILEBODY9PREHEADER]]: +; CHECK: br label %[[WHILEBODY9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[FORCONDPREHEADERELSE:.+]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[FORCONDPREHEADERSPLIT:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[WHILEBODY9]]: +; CHECK: br label %[[D:.+]] + +; CHECK: [[D]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY9]], label %[[WHILEBODY9PUREEXIT:.+]] + +; CHECK: [[WHILEBODY9PUREEXIT]]: +; CHECK: br label %[[E:.+]] + +; CHECK: [[E]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[GLOOPEXIT1:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[GLOOPEXIT:.+]], label %[[FORBODY:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[GLOOPEXIT]]: +; CHECK: br label %[[G]] + +; CHECK: [[GLOOPEXIT1]]: +; CHECK: br label %[[GLOOPEXIT1ELSE:.+]] + +; CHECK: [[GLOOPEXIT1ELSE]]: +; CHECK: br label %[[GLOOPEXIT2:.+]] + +; CHECK: [[GLOOPEXIT2]]: +; CHECK: br label %[[GLOOPEXIT2ELSE:.+]] + +; CHECK: [[GLOOPEXIT2ELSE]]: +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERELSE]], label %[[FORCONDPREHEADERSPLIT]] + +; CHECK: [[G]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll new file mode 100644 index 0000000000000..bc11225496785 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll @@ -0,0 +1,197 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization21 -vecz-passes=vecz-loop-rotate,cfg-convert -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <------. +; / \ | +; | c <--. 
| +; | / \ | | +; | | d -' | +; | | / \ | +; | | | e -' +; | | | / +; | | | / +; | | |/ +; | | / +; \|/ +; f +; +; * where nodes b, d, and e are uniform branches, and node c is a varying +; branch. +; * where nodes b, d, e and f are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; b <--. +; | | +; c <. | +; | | | +; d -' | +; | | +; e ---' +; | +; f +; +; __kernel void partial_linearization21(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0 && n < 5) { +; goto f; +; } +; while (1) { +; if (n <= 2) { +; goto f; +; } else { +; if (ret + id >= n) { +; goto d; +; } +; } +; if (n & 1) { +; goto f; +; } +; +; d: +; if (n > 3) { +; goto e; +; } +; } +; +; e: +; if (n & 1) { +; goto f; +; } +; } +; +; f: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization21(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %e, %entry + %n.off = add i32 %n, -1 + %0 = icmp ult i32 %n.off, 4 + %cmp6 = icmp slt i32 %n, 3 + %or.cond1 = or i1 %cmp6, %0 + br i1 %or.cond1, label %f, label %if.else + +while.body5: ; preds = %d + %cmp6.old = icmp eq i32 %n, 3 + br i1 %cmp6.old, label %if.else, label %f + +if.else: ; preds = %while.body5, %while.body + %cmp9 = icmp sge i32 %conv, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + %or.cond2 = or i1 %tobool, %cmp9 + br i1 %or.cond2, label %d, label %f + +d: ; preds = %if.else + %cmp16 = icmp sgt i32 %n, 3 + br i1 %cmp16, label %e, label %while.body5 + +e: ; preds = %d + %and20 = and i32 %n, 1 + %tobool21 = icmp eq i32 %and20, 0 + br i1 %tobool21, label %while.body, label %f + +f: ; preds = %e, %if.else, %while.body5, %while.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 0, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization21, !3, !4, !5, 
!6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization21 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[IFELSEPREHEADER:.+]] + +; CHECK: [[IFELSEPREHEADER]]: +; CHECK: br label %[[IFELSE:.+]] + +; CHECK: [[WHILEBODY5:.+]]: + +; CHECK: br i1 %{{.+}}, label %[[IFELSE]], label %[[IFELSEPUREEXIT:.+]] + +; CHECK: [[IFELSEPUREEXIT]]: +; CHECK: br label %[[E:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[D:.+]] + +; CHECK: [[D]]: +; CHECK: br label %[[WHILEBODY5]] + +; CHECK: [[E]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[FLOOPEXIT:.+]] + +; CHECK: [[FLOOPEXIT]]: +; CHECK: br label %[[FLOOPEXITELSE:.+]] + +; CHECK: [[FLOOPEXITELSE]]: +; CHECK: br label %[[FLOOPEXIT1:.+]] + +; CHECK: [[FLOOPEXIT1]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[F]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll new file mode 100644 index 0000000000000..7be8b4bbc187d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll @@ -0,0 +1,263 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization22 -vecz-passes="function(lower-switch),vecz-loop-rotate,indvars,cfg-convert" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <------. +; / \ | +; f c <--. | +; |\ / \ | | +; | | d -' | +; | |\ / \ | +; | | | e -' +; | | |\ / +; | | | g +; | | |/ +; | | / +; \|/ +; h +; +; * where nodes b, d, and e are uniform branches, and node c is a varying +; branch. +; * where nodes b, d, e and f are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; b <--. +; /| | +; f c <. 
| +; | | | | +; | d -' | +; | | | +; | e ---' +; \| +; g +; | +; h +; +; __kernel void partial_linearization22(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0 && n < 5) { +; goto f; +; } +; while (1) { +; if (n <= 2) { +; goto f; +; } else { +; if (ret + id >= n) { +; goto d; +; } +; } +; if (n & 1) { +; goto h; +; } +; +; d: +; if (n > 3) { +; goto e; +; } +; } +; +; e: +; if (n & 1) { +; goto g; +; } +; } +; +; f: +; if (n == 2) { +; goto h; +; } +; +; g: +; for (int i = 0; i < n + 1; i++) ret++; +; goto h; +; +; h: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization22(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %e, %entry + %n.off = add i32 %n, -1 + %0 = icmp ult i32 %n.off, 4 + %cmp6 = icmp slt i32 %n, 3 + %or.cond1 = or i1 %cmp6, %0 + br i1 %or.cond1, label %f, label %if.else + +while.body5: ; preds = %d + switch i32 %n, label %g [ + i32 3, label %if.else + i32 2, label %h + ] + +if.else: ; preds = %while.body5, %while.body + %cmp9 = icmp sge i32 %conv, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + %or.cond2 = or i1 %tobool, %cmp9 + br i1 %or.cond2, label %d, label %h + +d: ; preds = %if.else + %cmp16 = icmp sgt i32 %n, 3 + br i1 %cmp16, label %e, label %while.body5 + +e: ; preds = %d + %and20 = and i32 %n, 1 + %tobool21 = icmp eq i32 %and20, 0 + br i1 %tobool21, label %while.body, label %g + +f: ; preds = %while.body + %cmp24 = icmp eq i32 %n, 2 + br i1 %cmp24, label %h, label %g + +g: ; preds = %f, %e, %while.body5 + br label %for.cond + +for.cond: ; preds = %for.body, %g + %ret.0 = phi i32 [ 0, %g ], [ %inc, %for.body ] + %storemerge = phi i32 [ 0, %g ], [ %inc31, %for.body ] + %cmp29 = icmp sgt i32 %storemerge, %n + br i1 %cmp29, label %h, label %for.body + +for.body: ; preds = %for.cond + %inc = add nuw nsw i32 %ret.0, 1 + %inc31 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +h: ; preds = %for.cond, %f, %if.else, %while.body5 + %ret.1 = phi i32 [ 0, %f ], [ %ret.0, %for.cond ], [ 0, %if.else ], [ 0, %while.body5 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.1, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" 
"stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization22, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization22 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP6:.+]] = icmp slt +; CHECK: %[[ORCOND1:.+]] = or i1 %[[CMP6]] +; CHECK: %[[F_EXIT_MASK:.+]] = select i1 +; CHECK: %[[ORCOND2:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[ORCOND1]]) +; CHECK: br i1 %[[ORCOND2]], label %[[F:.+]], label %[[IFELSEPREHEADER:.+]] + +; CHECK: [[IFELSEPREHEADER]]: +; CHECK: br label %[[IFELSE:.+]] + +; CHECK: [[LEAFBLOCK1:.*]]: +; CHECK: %[[SWITCHLEAF:.+]] = icmp eq i32 %n, 3 +; CHECK: br i1 %{{.+}}, label %[[IFELSE]], label %[[IFELSEPUREEXIT:.+]] + +; CHECK: [[IFELSEPUREEXIT]]: +; CHECK: br label %[[E:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[D:.+]] + +; CHECK: [[D]]: +; CHECK: br label %[[LEAFBLOCK1]] + +; CHECK: [[E]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: %[[CMP24MERGE:.+]] = phi i1 [ %[[G_EXIT_MASK:.+]], %[[F]] ], [ false, %[[E]] ] +; CHECK: br label %[[HLOOPEXIT1:.+]] + +; CHECK: [[F]]: +; CHECK: %[[CMP24:.+]] = icmp eq i32 %n, 2 +; CHECK: %[[G_EXIT_MASK]] = select i1 %[[CMP24]], i1 false, i1 %[[F_EXIT_MASK]] +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[FELSE:.+]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[FSPLIT:.+]]: +; CHECK: %[[CMP24_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %cmp24.merge) +; CHECK: br i1 %[[CMP24_ANY]], label %[[H:.+]], label %[[G]] + +; CHECK: [[GLOOPEXIT:.+]]: +; CHECK: br label %[[GLOOPEXITELSE:.+]] + +; CHECK: [[GLOOPEXITELSE]]: +; CHECK: br i1 %{{.+}}, label %[[FELSE]], label %[[FSPLIT]] + +; CHECK: [[G]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 true, label %[[HLOOPEXIT:.+]], label %[[FORBODY:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + + + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[HLOOPEXIT1]]: +; CHECK: br label %[[HLOOPEXIT1ELSE:.+]] + +; CHECK: [[HLOOPEXIT1ELSE]]: +; CHECK: br label %[[GLOOPEXIT]] + +;; CHECK: [[H]]: +;; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll new file mode 100644 index 0000000000000..58a1f2548f38e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll @@ -0,0 +1,247 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization23 -vecz-passes=cfg-convert -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; / \ +; / \ +; b c +; / \ / \ +; d e f g +; \ \ / / +; \ X / +; \ / \ / +; h i +; \ / +; j +; +; * where node a is a uniform branch, and nodes b and c are varying branches. +; * where nodes d, e, f and g are divergent. +; +; With partial linearization we will have a CFG of the form: +; +; a +; / \ +; / \ +; / \ +; b c +; / \ +; e - d f - g +; \ / +; i +; | +; h +; | +; j +; +; The purpose of this test is to make sure we correctly handle blending in `i`, +; which cannot be considered a blend block since it is not the join point of +; either div-causing block. +; We want to make sure the incoming values of the phi nodes in `i` are correctly +; translated into select instructions for the predecessors which get linearized. +; +; __kernel void partial_linearization23(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (n > 10) { +; if (id % 3 == 0) { +; ret = n - 1; goto h; +; } else { +; for (int i = 0; i < n / 3; i++) { ret += 2; } goto i; +; } +; } else { +; if (id % 2 == 0) { +; ret = n * 2; goto h; +; } else { +; for (int i = 0; i < n + 5; i++) { ret *= 2; } goto i; +; } +; } +; +; h: +; ret += 5; +; goto end; +; +; i: +; ret *= 10; +; goto end; +; +; end: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization23(i32 addrspace(1)* %out, i32 %n) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %n, 10 + br i1 %cmp, label %if.then, label %if.else7 + +if.then: ; preds = %entry + %rem = srem i32 %conv, 3 + %cmp2 = icmp eq i32 %rem, 0 + br i1 %cmp2, label %if.then4, label %for.cond.preheader + +for.cond.preheader: ; preds = %if.then + %div = sdiv i32 %n, 3 + %cmp52 = icmp sgt i32 %n, 2 + br i1 %cmp52, label %for.body.lr.ph, label %i24 + +for.body.lr.ph: ; preds = %for.cond.preheader + %min.iters.check = icmp ult i32 %div, 8 + br i1 %min.iters.check, label %scalar.ph, label %vector.ph + +vector.ph: ; preds = %for.body.lr.ph + %n.vec = and i32 %div, -8 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi6 = phi i32 [ 0, %vector.ph ], [ %0, %vector.body ] + %vec.phi11 = phi i32 [ 0, %vector.ph ], [ %1, %vector.body ] + %vec.phi17 = phi i32 [ 0, %vector.ph ], [ %2, %vector.body ] + %vec.phi22 = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ] + %vec.phi104 = phi i32 [ 0, %vector.ph ], [ %4, %vector.body ] + %vec.phi109 = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ] + %vec.phi1015 = phi i32 [ 0, %vector.ph ], [ %6, %vector.body ] + %vec.phi1020 = phi i32 [ 0, %vector.ph ], [ %7, %vector.body ] + %0 = add nuw nsw i32 %vec.phi6, 
2 + %1 = add nuw nsw i32 %vec.phi11, 2 + %2 = add nuw nsw i32 %vec.phi17, 2 + %3 = add nuw nsw i32 %vec.phi22, 2 + %4 = add nuw nsw i32 %vec.phi104, 2 + %5 = add nuw nsw i32 %vec.phi109, 2 + %6 = add nuw nsw i32 %vec.phi1015, 2 + %7 = add nuw nsw i32 %vec.phi1020, 2 + %index.next = add i32 %index, 8 + %8 = icmp eq i32 %index.next, %n.vec + br i1 %8, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %.lcssa25 = phi i32 [ %0, %vector.body ] + %.lcssa210 = phi i32 [ %1, %vector.body ] + %.lcssa216 = phi i32 [ %2, %vector.body ] + %.lcssa221 = phi i32 [ %3, %vector.body ] + %.lcssa3 = phi i32 [ %4, %vector.body ] + %.lcssa8 = phi i32 [ %5, %vector.body ] + %.lcssa14 = phi i32 [ %6, %vector.body ] + %.lcssa19 = phi i32 [ %7, %vector.body ] + %bin.rdx7 = add nuw i32 %.lcssa3, %.lcssa25 + %bin.rdx12 = add nuw i32 %.lcssa8, %.lcssa210 + %bin.rdx18 = add nuw i32 %.lcssa14, %.lcssa216 + %bin.rdx23 = add nuw i32 %.lcssa19, %.lcssa221 + %bin.rdx1113 = add i32 %bin.rdx7, %bin.rdx12 + %bin.rdx1124 = add i32 %bin.rdx18, %bin.rdx23 + %bin.rdx1325 = add i32 %bin.rdx1113, %bin.rdx1124 + %cmp.n = icmp eq i32 %div, %n.vec + br i1 %cmp.n, label %i24, label %scalar.ph + +scalar.ph: ; preds = %middle.block, %for.body.lr.ph + %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %for.body.lr.ph ] + %bc.merge.rdx = phi i32 [ %bin.rdx1325, %middle.block ], [ 0, %for.body.lr.ph ] + %9 = add i32 %bc.resume.val, 1 + %10 = icmp sgt i32 %div, %9 + %smax = select i1 %10, i32 %div, i32 %9 + %11 = shl i32 %smax, 1 + %12 = shl i32 %bc.resume.val, 1 + br label %for.body + +if.then4: ; preds = %if.then + %sub = add nsw i32 %n, -1 + br label %h + +for.body: ; preds = %for.body, %scalar.ph + %storemerge44 = phi i32 [ %bc.resume.val, %scalar.ph ], [ %inc, %for.body ] + %inc = add nuw nsw i32 %storemerge44, 1 + %cmp5 = icmp slt i32 %inc, %div + br i1 %cmp5, label %for.body, label %i24.loopexit + +if.else7: ; preds = %entry + %rem81 = and i32 %conv, 1 + %cmp9 = icmp eq i32 %rem81, 0 + br i1 %cmp9, label %if.then11, label %for.cond14.preheader + +for.cond14.preheader: ; preds = %if.else7 + %add15 = add nsw i32 %n, 5 + %cmp165 = icmp sgt i32 %add15, 0 + br i1 %cmp165, label %for.body18.preheader, label %i24 + +for.body18.preheader: ; preds = %for.cond14.preheader + %13 = add i32 %n, 5 + br label %for.body18 + +if.then11: ; preds = %if.else7 + %mul = shl nsw i32 %n, 1 + br label %h + +for.body18: ; preds = %for.body18.preheader, %for.body18 + %storemerge7 = phi i32 [ %inc21, %for.body18 ], [ 0, %for.body18.preheader ] + %ret.16 = phi i32 [ %mul19, %for.body18 ], [ 0, %for.body18.preheader ] + %mul19 = shl nsw i32 %ret.16, 1 + %inc21 = add nuw nsw i32 %storemerge7, 1 + %exitcond = icmp ne i32 %inc21, %13 + br i1 %exitcond, label %for.body18, label %i24.loopexit1 + +h: ; preds = %if.then11, %if.then4 + %storemerge3 = phi i32 [ %mul, %if.then11 ], [ %sub, %if.then4 ] + %add23 = add nsw i32 %storemerge3, 5 + br label %end + +i24.loopexit: ; preds = %for.body + %14 = add i32 %bc.merge.rdx, %11 + %15 = sub i32 %14, %12 + br label %i24 + +i24.loopexit1: ; preds = %for.body18 + %mul19.lcssa = phi i32 [ %mul19, %for.body18 ] + br label %i24 + +i24: ; preds = %i24.loopexit1, %i24.loopexit, %for.cond14.preheader, %middle.block, %for.cond.preheader + %ret.2 = phi i32 [ 0, %for.cond.preheader ], [ %bin.rdx1325, %middle.block ], [ 0, %for.cond14.preheader ], [ %15, %i24.loopexit ], [ %mul19.lcssa, %i24.loopexit1 ] + %mul25 = mul nsw i32 %ret.2, 10 + br label %end + +end: ; preds = %i24, %h + %storemerge2 = phi 
i32 [ %mul25, %i24 ], [ %add23, %h ] + %sext = shl i64 %call, 32 + %idxprom = ashr exact i64 %sext, 32 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization23 +; CHECK: i24: +; CHECK: %i24.entry_mask{{.+}} = or i1 +; CHECK: %i24.entry_mask{{.+}} = or i1 +; CHECK: %i24.entry_mask{{.+}} = or i1 +; CHECK: %i24.entry_mask{{.+}} = or i1 + diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll new file mode 100644 index 0000000000000..ffabf74a42b22 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll @@ -0,0 +1,269 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; / \ +; / \ +; b c +; / \ / \ +; d e f g +; \ \ / / +; \ h / +; \ \ / +; \ i +; \ / +; j +; +; * where node a is a uniform branch, and nodes b and c are varying branches. +; * where nodes d, e, f, g, i and j are divergent. 
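+;
+; As an illustrative sketch of what the cfg-convert pass does with the varying
+; branch in node b (`br i1 %cmp2, label %if.then4, label %if.else` below), it
+; computes a per-lane entry mask for each successor, conceptually:
+;
+;   %if.then4.entry_mask = and i1 %if.then.entry_mask, %cmp2
+;   %if.else.entry_mask = and i1 %if.then.entry_mask, %cmp2.not
+;
+; so that both successors can execute unconditionally once the CFG is
+; linearized, with results blended back via selects. The mask names above are
+; hypothetical and are not part of the CHECK-ed output.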
+; +; With partial linearization we will have a CFG of the form: +; +; a +; / \ +; b c +; | | +; e g +; | | +; d f +; \ / +; h +; | +; i +; | +; j +; +; __kernel void partial_linearization3(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (n < 10) { // uniform +; if (id % 3 == 0) { // varying +; for (int i = 0; i < n - 1; i++) { ret /= 2; } goto end; +; } else { // varying +; for (int i = 0; i < n / 3; i++) { ret -= 2; } goto h; +; } +; } else { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n * 2; i++) { ret += 1; } goto h; +; } else { // varying +; for (int i = 0; i < n + 5; i++) { ret *= 2; } goto i; +; } +; } +; +; h: +; ret += 5; +; +; i: +; ret *= 10; +; +; end: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization3(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 10 + br i1 %cmp, label %if.then, label %if.else17 + +if.then: ; preds = %entry + %rem = srem i32 %conv, 3 + %cmp2 = icmp eq i32 %rem, 0 + br i1 %cmp2, label %if.then4, label %if.else + +if.then4: ; preds = %if.then + br label %for.cond + +for.cond: ; preds = %for.body, %if.then4 + %ret.0 = phi i32 [ 0, %if.then4 ], [ %div, %for.body ] + %storemerge4 = phi i32 [ 0, %if.then4 ], [ %inc, %for.body ] + %sub = add nsw i32 %n, -1 + %cmp5 = icmp slt i32 %storemerge4, %sub + br i1 %cmp5, label %for.body, label %end + +for.body: ; preds = %for.cond + %div = sdiv i32 %ret.0, 2 + %inc = add nsw i32 %storemerge4, 1 + br label %for.cond + +if.else: ; preds = %if.then + br label %for.cond8 + +for.cond8: ; preds = %for.body12, %if.else + %ret.1 = phi i32 [ 0, %if.else ], [ %sub13, %for.body12 ] + %storemerge3 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div9 = sdiv i32 %n, 3 + %cmp10 = icmp slt i32 %storemerge3, %div9 + br i1 %cmp10, label %for.body12, label %h + +for.body12: ; preds = %for.cond8 + %sub13 = add nsw i32 %ret.1, -2 + %inc15 = add nsw i32 %storemerge3, 1 + br label %for.cond8 + +if.else17: ; preds = %entry + %rem181 = and i32 %conv, 1 + %cmp19 = icmp eq i32 %rem181, 0 + br i1 %cmp19, label %if.then21, label %if.else30 + +if.then21: ; preds = %if.else17 + br label %for.cond23 + +for.cond23: ; preds = %for.body26, %if.then21 + %ret.2 = phi i32 [ 0, %if.then21 ], [ %add, %for.body26 ] + %storemerge2 = phi i32 [ 0, %if.then21 ], [ %inc28, %for.body26 ] + %mul = shl nsw i32 %n, 1 + %cmp24 = icmp slt i32 %storemerge2, %mul + br i1 %cmp24, label %for.body26, label %h + +for.body26: ; preds = %for.cond23 + %add = add nsw i32 %ret.2, 1 + %inc28 = add nsw i32 %storemerge2, 1 + br label %for.cond23 + +if.else30: ; preds = %if.else17 + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.else30 + %ret.3 = phi i32 [ 0, %if.else30 ], [ %mul37, %for.body36 ] + %storemerge = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ] + %add33 = add nsw i32 %n, 5 + %cmp34 = icmp slt i32 %storemerge, %add33 + br i1 %cmp34, label %for.body36, label %i42 + +for.body36: ; preds = %for.cond32 + %mul37 = shl nsw i32 %ret.3, 1 + %inc39 = add nsw i32 %storemerge, 1 + br label %for.cond32 + +h: ; preds = %for.cond23, %for.cond8 + %ret.4 = phi i32 [ %ret.1, %for.cond8 ], [ %ret.2, %for.cond23 ] + %add41 = add nsw i32 %ret.4, 5 + br label %i42 + +i42: ; preds = %h, 
%for.cond32 + %ret.5 = phi i32 [ %add41, %h ], [ %ret.3, %for.cond32 ] + %mul43 = mul nsw i32 %ret.5, 10 + br label %end + +end: ; preds = %i42, %for.cond + %ret.6 = phi i32 [ %mul43, %i42 ], [ %ret.0, %for.cond ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization3, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization3 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE17:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br label %[[FORCOND8PREHEADER:.+]] + +; CHECK: [[FORCOND8PREHEADER:.+]]: +; CHECK: br label %[[FORCOND8:.+]] + +; CHECK: [[FORCONDPREHEADER:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[FORBODY:.+]], label %[[ENDLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND8]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND8]] + +; CHECK: [[IFELSE17]]: +; CHECK: br label %[[FORCOND32PREHEADER:.+]] + +; CHECK: [[FORCOND32PREHEADER]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[FORCOND23PREHEADER:.+]]: +; CHECK: br label %[[FORCOND23:.+]] + +; CHECK: [[FORCOND23]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26:.+]], label %[[HLOOPEXIT2:.+]] + +; CHECK: [[FORBODY26]]: +; CHECK: br label %[[FORCOND23]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 false, label %[[FORBODY36:.+]], label %[[ENDLOOPEXIT2:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[FORCONDPREHEADER]] + +; CHECK: [[HLOOPEXIT2]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[H]]: +; CHECK: br label %[[END:.+]] + +; CHECK: [[ENDLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[ENDLOOPEXIT2]]: +; CHECK: br label %[[FORCOND23PREHEADER]] + +; CHECK: [[END]]: +; CHECK: ret void diff --git 
a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll new file mode 100644 index 0000000000000..a9158f7ff59c7 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll @@ -0,0 +1,195 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization4 -vecz-passes=cfg-convert,cleanup-divergence -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-. +; / \ | +; e c | +; | / \| +; | f d +; |/ +; g +; +; * where node b is a uniform branch, and node c is a varying branch. +; * where nodes f, d and g are divergent. +; +; With partial linearization we will have a CFG of the form: +; +; a +; | +; b <--. +; / \ | +; e c | +; | | | +; | d -' +; \ / +; f +; | +; g +; +; __kernel void partial_linearization4(__global int *out, int n) { +; int id = get_global_id(0); +; +; int x = id / n; +; int y = id % n; +; int i = 0; +; for (;;) { +; if (n > 20) goto e; +; if (x + y > n) goto f; +; y++; +; x++; +; i++; +; } +; +; goto g; +; +; e: +; i *= 2 + n; +; goto g; +; +; f: +; i /= i + n; +; +; g: +; out[id] = x + y + i; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization4(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %0 = icmp eq i32 %conv, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %conv, %5 + %6 = icmp eq i32 %conv, -2147483648 + %7 = icmp eq i32 %n, -1 + %8 = and i1 %7, %6 + %9 = icmp eq i32 %n, 0 + %10 = or i1 %9, %8 + %11 = select i1 %10, i32 1, i32 %n + %rem = srem i32 %conv, %11 + br label %for.cond + +for.cond: ; preds = %if.end5, %entry + %x.0 = phi i32 [ %div, %entry ], [ %inc6, %if.end5 ] + %y.0 = phi i32 [ %rem, %entry ], [ %inc, %if.end5 ] + %storemerge = phi i32 [ 0, %entry ], [ %inc7, %if.end5 ] + %cmp = icmp sgt i32 %n, 20 + br i1 %cmp, label %e, label %if.end + +if.end: ; preds = %for.cond + %add = add nsw i32 %y.0, %x.0 + %cmp2 = icmp sgt i32 %add, %n + br i1 %cmp2, label %f, label %if.end5 + +if.end5: ; preds = %if.end + %inc = add nsw i32 %y.0, 1 + %inc6 = add nsw i32 %x.0, 1 + %inc7 = add nsw i32 %storemerge, 1 + br label %for.cond + +e: ; preds = %for.cond + %add8 = add nsw i32 %n, 2 + %mul = mul nsw i32 %storemerge, %add8 + br label %g + +f: ; preds = %if.end + %add9 = add nsw i32 %storemerge, %n + %12 = icmp eq i32 %add9, 0 + %13 = select i1 %12, i32 1, i32 %add9 + %div10 = sdiv i32 %storemerge, %13 + br 
label %g + +g: ; preds = %f, %e + %storemerge1 = phi i32 [ %div10, %f ], [ %mul, %e ] + %add11 = add i32 %y.0, %x.0 + %add12 = add i32 %add11, %storemerge1 + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %add12, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization4, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization4 +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[E:.+]], label %[[IFEND:.+]] + +; CHECK: [[IFEND]]: +; CHECK: br label %[[IFEND5:.+]] + +; CHECK: [[IFEND5]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND:.+]], label %[[FORCONDPUREEXIT:.+]] + +; CHECK: [[FORCONDPUREEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[E]]: +; CHECK: br label %[[FORCONDPUREEXIT]] + +; CHECK: [[EELSE:.+]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[ESPLIT:.+]]: +; CHECK: br label %[[G]] + +; CHECK: [[F]]: +; CHECK: br label %[[FELSE:.+]] + +; CHECK: [[FELSE]]: +; CHECK: br i1 %{{.+}}, label %[[EELSE]], label %[[ESPLIT]] + +; CHECK: [[G]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll new file mode 100644 index 0000000000000..a65b8bad7dd25 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll @@ -0,0 +1,221 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization5 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; |\ / \ +; | d e +; | \ / +; | f +; \ / +; g +; +; * where node c is a uniform branch, and nodes a and b are varying branches. +; * where nodes b, c, d, f, g are divergent. +; +; With partial linearization we will have a CFG of the form: +; +; a +; | +; c +; / \ +; | e +; \ / +; b +; | +; d +; | +; f +; | +; g +; +; __kernel void partial_linearization5(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (id % 2 == 0) { // a +; if (id == 4) { // b +; goto g; +; } else { +; goto d; +; } +; } else { // c +; if (n % 2 == 0) { +; goto d; +; } else { +; goto e; +; } +; } +; +; d: +; for (int i = 0; i < n / 4; i++) { ret += i - 2; } +; goto f; +; +; e: +; for (int i = 0; i < n + 5; i++) { ret += i + 5; } +; +; f: +; ret *= ret % n; +; ret *= ret + 4; +; +; g: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization5(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %rem1 = and i32 %conv, 1 + %cmp = icmp eq i32 %rem1, 0 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + %cmp2 = icmp eq i32 %conv, 4 + br i1 %cmp2, label %g, label %d + +if.else5: ; preds = %entry + %rem62 = and i32 %n, 1 + %cmp7 = icmp eq i32 %rem62, 0 + br i1 %cmp7, label %d, label %e + +d: ; preds = %if.else5, %if.then + br label %for.cond + +for.cond: ; preds = %for.body, %d + %ret.0 = phi i32 [ 0, %d ], [ %add, %for.body ] + %storemerge3 = phi i32 [ 0, %d ], [ %inc, %for.body ] + %div = sdiv i32 %n, 4 + %cmp11 = icmp slt i32 %storemerge3, %div + br i1 %cmp11, label %for.body, label %f + +for.body: ; preds = %for.cond + %sub = add i32 %ret.0, -2 + %add = add i32 %sub, %storemerge3 + %inc = add nsw i32 %storemerge3, 1 + br label %for.cond + +e: ; preds = %if.else5 + br label %for.cond14 + +for.cond14: ; preds = %for.body18, %e + %ret.1 = phi i32 [ 0, %e ], [ %add20, %for.body18 ] + %storemerge = phi i32 [ 0, %e ], [ %inc22, %for.body18 ] + %add15 = add nsw i32 %n, 5 + %cmp16 = icmp slt i32 %storemerge, %add15 + br i1 %cmp16, label %for.body18, label %f + +for.body18: ; preds = %for.cond14 + %add19 = add i32 %ret.1, 5 + %add20 = add i32 %add19, %storemerge + %inc22 = add nsw i32 %storemerge, 1 + br label %for.cond14 + +f: ; preds = %for.cond14, %for.cond + %ret.2 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond14 ] + %0 = icmp eq i32 %ret.2, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %rem24 = srem i32 %ret.2, %5 + %mul = mul nsw i32 %rem24, %ret.2 + %add25 = add nsw i32 %mul, 4 + %mul26 = mul nsw i32 %add25, %mul + br label %g + +g: ; preds = %f, %if.then + %ret.3 = phi i32 [ %mul26, %f ], [ 0, %if.then ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; 
Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization5, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization5 +; CHECK: br label %[[IFELSE5:.+]] + +; CHECK: [[IFTHEN:.+]]: +; CHECK: br label %[[D:.+]] + +; CHECK: [[IFELSE5]]: +; CHECK: %[[CMP7:.+]] = icmp +; CHECK: br i1 %[[CMP7]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]] + +; CHECK: [[FORCOND14PREHEADER]]: +; CHECK: br label %[[FORCOND14:.+]] + +; CHECK: [[D]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[CMP11:.+]] = icmp +; CHECK: br i1 %[[CMP11]], label %[[FORBODY:.+]], label %[[FLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND14]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY18:.+]], label %[[FLOOPEXIT2:.+]] + +; CHECK: [[FORBODY18]]: +; CHECK: br label %[[FORCOND14]] + +; CHECK: [[FLOOPEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[FLOOPEXIT2]]: +; CHECK: br label %[[IFTHEN]] + +; CHECK: [[F]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[G]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll new file mode 100644 index 0000000000000..5425139b5d888 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll @@ -0,0 +1,200 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization6 -vecz-passes=cfg-convert,cleanup-divergence -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-. +; / \ | +; c d | +; / \ / | +; e f --' +; \ | +; \ g +; \| +; h +; +; * where nodes b and c are uniform branches, and node f is a varying +; branch. +; * where nodes g and h are divergent. +; +; With partial linearization, it can be transformed in the following way: +; +; a +; | +; b <-. +; / \ | +; c d | +; / \ / | +; e f --' +; \ | +; \ | +; \| +; g +; | +; h +; +; __kernel void partial_linearization6(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n % 2 == 0) { +; if (n > 2) { +; goto e; +; } +; } else { +; ret += n + 1; +; } +; if (id == n) break; +; } +; +; ret += n * 2; +; ret /= n; +; goto early; +; +; e: +; ret += n * 4; +; ret -= n; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization6(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end10, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %ret.1, %if.end10 ] + %rem1 = and i32 %n, 1 + %cmp = icmp eq i32 %rem1, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + %cmp2 = icmp sgt i32 %n, 2 + br i1 %cmp2, label %e, label %if.end6 + +if.else: ; preds = %while.body + %add = add nsw i32 %n, 1 + %add5 = add nsw i32 %add, %ret.0 + br label %if.end6 + +if.end6: ; preds = %if.else, %if.then + %ret.1 = phi i32 [ %add5, %if.else ], [ %ret.0, %if.then ] + %cmp7 = icmp eq i32 %conv, %n + br i1 %cmp7, label %while.end, label %if.end10 + +if.end10: ; preds = %if.end6 + br label %while.body + +while.end: ; preds = %if.end6 + %mul = shl nsw i32 %n, 1 + %add11 = add nsw i32 %ret.1, %mul + %0 = icmp eq i32 %add11, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %add11, %5 + br label %early + +e: ; preds = %if.then + %mul12 = mul i32 %n, 4 + %n.neg = sub i32 0, %n + %add13 = add i32 %mul12, %n.neg + %sub = add i32 %add13, %ret.0 + br label %early + +early: ; preds = %e, %while.end + %storemerge = phi i32 [ %div, %while.end ], [ %sub, %e ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" 
"no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization6, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization6 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: %[[CMP2:.+]] = icmp +; CHECK: br i1 %[[CMP2]], label %[[E:.+]], label %[[IFEND6:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[IFEND6]] + +; CHECK: [[IFEND6]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[WHILEEND:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: br label %[[WHILEENDELSE:.+]] + +; CHECK: [[WHILEENDELSE]]: +; CHECK: br i1 %{{.+}}, label %[[EELSE:.+]], label %[[ESPLIT:.+]] + +; CHECK: [[E]]: +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[EELSE]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[ESPLIT]]: +; CHECK: br label %[[EARLY]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll new file mode 100644 index 0000000000000..1c59a75ab15d8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll @@ -0,0 +1,228 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization7 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; / \ / \ +; d e f +; \ / \ / +; g h +; \ / +; i +; +; * where nodes a, c and e are uniform branches, and node b is a varying +; branch. +; * where nodes d, e, g and i are divergent. 
+; +; With partial linearization, it can be transformed in the following way: +; +; a +; / \ +; b c +; | /| +; d / | +; |/ | +; e f +; |\ | +; | \ | +; | \| +; g - h +; | +; i +; +; __kernel void partial_linearization7(__global int *out, int n) { +; int id = get_global_id(0); +; int i = 0; +; +; if (n > 10) { // a +; if (n + id > 10) { // b +; i = n * 10; // d +; goto g; +; } else { +; goto e; +; } +; } else { +; if (n < 5) { // c +; goto e; +; } else { +; for (int j = 0; j < n; j++) { i++; } +; goto h; +; } +; } +; +; e: +; if (n > 5) { +; goto g; +; } else { +; i = n * 3 / 5; +; goto h; +; } +; +; g: +; for (int j = 0; j < n; j++) { i++; } +; goto i; +; +; h: +; i = n + id / 3; +; +; i: +; out[id] = i; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization7(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %n, 10 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + %add = add nsw i32 %conv, %n + %cmp2 = icmp sgt i32 %add, 10 + br i1 %cmp2, label %if.then4, label %e + +if.then4: ; preds = %if.then + %mul = mul nsw i32 %n, 10 + br label %g + +if.else5: ; preds = %entry + %cmp6 = icmp slt i32 %n, 5 + br i1 %cmp6, label %e, label %if.else9 + +if.else9: ; preds = %if.else5 + br label %for.cond + +for.cond: ; preds = %for.body, %if.else9 + %storemerge = phi i32 [ 0, %if.else9 ], [ %inc12, %for.body ] + %cmp10 = icmp slt i32 %storemerge, %n + br i1 %cmp10, label %for.body, label %h + +for.body: ; preds = %for.cond + %inc12 = add nsw i32 %storemerge, 1 + br label %for.cond + +e: ; preds = %if.else5, %if.then + %cmp13 = icmp sgt i32 %n, 5 + br i1 %cmp13, label %g, label %h + +g: ; preds = %e, %if.then4 + %i.1 = phi i32 [ %mul, %if.then4 ], [ 0, %e ] + br label %for.cond19 + +for.cond19: ; preds = %for.body22, %g + %i.2 = phi i32 [ %i.1, %g ], [ %inc23, %for.body22 ] + %storemerge1 = phi i32 [ 0, %g ], [ %inc25, %for.body22 ] + %cmp20 = icmp slt i32 %storemerge1, %n + br i1 %cmp20, label %for.body22, label %i29 + +for.body22: ; preds = %for.cond19 + %inc23 = add nsw i32 %i.2, 1 + %inc25 = add nsw i32 %storemerge1, 1 + br label %for.cond19 + +h: ; preds = %e, %for.cond + %div27 = sdiv i32 %conv, 3 + %add28 = add nsw i32 %div27, %n + br label %i29 + +i29: ; preds = %h, %for.cond19 + %i.3 = phi i32 [ %add28, %h ], [ %i.2, %for.cond19 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %i.3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" 
"no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization7, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization7 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE5:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br label %[[IFTHEN4:.+]] + +; CHECK: [[IFTHEN4]]: +; CHECK: br label %[[E:.+]] + +; CHECK: [[IFELSE5]]: +; CHECK: %[[CMP6:.+]] = icmp +; CHECK: br i1 %[[CMP6]], label %[[E]], label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[E]]: +; CHECK: %[[CMP13:.+]] = icmp +; CHECK: br i1 %[[CMP13]], label %[[G:.+]], label %[[H:.+]] + +; CHECK: [[G]]: +; CHECK: br label %[[FORCOND19:.+]] + +; CHECK: [[FORCOND19]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22:.+]], label %[[I29LOOPEXIT:.+]] + +; CHECK: [[FORBODY22]]: +; CHECK: br label %[[FORCOND19]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[H]]: +; CHECK: br label %[[G]] + +; CHECK: [[I29LOOPEXIT]]: +; CHECK: br label %[[I29:.+]] + +; CHECK: [[I29]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll new file mode 100644 index 0000000000000..b5c22f6b5c588 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll @@ -0,0 +1,191 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization8 -vecz-passes=cfg-convert -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-. +; / \ | +; e c | +; | / \| +; | f d +; |/ +; g +; +; * where nodes b and c varying branches. +; * where nodes e, f, d and g are divergent. +; +; With partial linearization we will have a CFG of the form: +; +; a +; | +; b <. 
+; | | +; c | +; | | +; d -' +; | +; f +; | +; e +; | +; g +; +; __kernel void partial_linearization8(__global int *out, int n) { +; int id = get_global_id(0); +; +; int x = id / n; +; int y = id % n; +; int i = 0; +; for (;;) { +; if (i + id > n) goto e; +; if (x + y > n) goto f; +; y++; +; x++; +; i++; +; } +; +; goto g; +; +; e: +; i *= 2 + n; +; goto g; +; +; f: +; i /= i + n; +; +; g: +; out[id] = x + y + i; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization8(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %0 = icmp eq i32 %conv, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %conv, %5 + %6 = icmp eq i32 %conv, -2147483648 + %7 = icmp eq i32 %n, -1 + %8 = and i1 %7, %6 + %9 = icmp eq i32 %n, 0 + %10 = or i1 %9, %8 + %11 = select i1 %10, i32 1, i32 %n + %rem = srem i32 %conv, %11 + br label %for.cond + +for.cond: ; preds = %if.end6, %entry + %x.0 = phi i32 [ %div, %entry ], [ %inc7, %if.end6 ] + %y.0 = phi i32 [ %rem, %entry ], [ %inc, %if.end6 ] + %storemerge = phi i32 [ 0, %entry ], [ %inc8, %if.end6 ] + %add = add nsw i32 %storemerge, %conv + %cmp = icmp sgt i32 %add, %n + br i1 %cmp, label %e, label %if.end + +if.end: ; preds = %for.cond + %add2 = add nsw i32 %y.0, %x.0 + %cmp3 = icmp sgt i32 %add2, %n + br i1 %cmp3, label %f, label %if.end6 + +if.end6: ; preds = %if.end + %inc = add nsw i32 %y.0, 1 + %inc7 = add nsw i32 %x.0, 1 + %inc8 = add nsw i32 %storemerge, 1 + br label %for.cond + +e: ; preds = %for.cond + %add9 = add nsw i32 %n, 2 + %mul = mul nsw i32 %storemerge, %add9 + br label %g + +f: ; preds = %if.end + %add10 = add nsw i32 %storemerge, %n + %12 = icmp eq i32 %add10, 0 + %13 = select i1 %12, i32 1, i32 %add10 + %div11 = sdiv i32 %storemerge, %13 + br label %g + +g: ; preds = %f, %e + %storemerge1 = phi i32 [ %div11, %f ], [ %mul, %e ] + %add12 = add i32 %y.0, %x.0 + %add13 = add i32 %add12, %storemerge1 + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %add13, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void 
(i32 addrspace(1)*, i32)* @partial_linearization8, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization8 +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br label %[[IFEND:.+]] + +; CHECK: [[IFEND]]: +; CHECK: br label %[[IFEND6:.+]] + +; CHECK: [[IFEND6]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND]], label %[[FORCONDPUREEXIT:.+]] + +; CHECK: [[FORCONDPUREEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[E:.+]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[F]]: +; CHECK: br label %[[FELSE:.+]] + +; CHECK: [[FELSE]]: +; CHECK: br label %[[E]] + +; CHECK: [[G]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll new file mode 100644 index 0000000000000..12ff83e3ac98d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll @@ -0,0 +1,148 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization9 -vecz-passes=cfg-convert,cleanup-divergence -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <--. +; | | +; c <. | +; | | | +; d -' | +; | | +; e ---' +; | +; f +; +; * where node e is a varying branch. +; * where node f is divergent. +; +; With partial linearization we will have a CFG of the form: +; +; a +; | +; b <--. +; | | +; c <. 
| +; | | | +; d -' | +; | | +; e ---' +; | +; f +; +; __kernel void partial_linearization9(__global int *out, int n) { +; int id = get_global_id(0); +; int i = 0; +; +; while (1) { +; int j = 0; +; for (; ; i++) { +; if (j++ > n) break; +; } +; if (i++ + id > n) break; +; } +; +; out[id] = i; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization9(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end7, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc3, %if.end7 ] + br label %for.cond + +for.cond: ; preds = %for.inc, %while.body + %i.1 = phi i32 [ %i.0, %while.body ], [ %inc3, %for.inc ] + %j.0 = phi i32 [ 0, %while.body ], [ %inc, %for.inc ] + %cmp = icmp sgt i32 %j.0, %n + %inc3 = add nsw i32 %i.1, 1 + br i1 %cmp, label %for.end, label %for.inc + +for.inc: ; preds = %for.cond + %inc = add nsw i32 %j.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %add = add nsw i32 %i.1, %conv + %cmp4 = icmp sgt i32 %add, %n + br i1 %cmp4, label %while.end, label %if.end7 + +if.end7: ; preds = %for.end + br label %while.body + +while.end: ; preds = %for.end + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %inc3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization9, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization9 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[FOREND:.+]], label %[[FORINC:.+]] + +; CHECK: [[FORINC]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label 
%[[WHILEEND:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll new file mode 100644 index 0000000000000..2f8b137532493 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll @@ -0,0 +1,65 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test < %s + +; This test ensures that VECZ does not crash during control flow conversion due +; to a missing exit mask. As such, we need only verify that the return code from +; veczc is 0, and FileCheck is not required. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(1)* %out, i32 %n) { +entry: + %call = tail call i32 @__mux_get_global_id(i32 0) + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %for.body.preheader, label %if.end.thread + +for.body.preheader: + %cmp2 = icmp sgt i32 %n, 1 + %0 = and i32 %call, 1 + %cmp3 = icmp eq i32 %0, 0 + br i1 %cmp2, label %if.end2, label %if.else + +if.end.thread: + %cmp4 = icmp eq i32 %call, 0 + br i1 %cmp4, label %if.end, label %for.cond.preheader + +if.else: + br i1 %cmp3, label %if.end, label %for.body + +for.cond.preheader: + %cmp5 = icmp sgt i32 %n, 1 + br i1 %cmp5, label %for.body, label %if.end + +for.body: + br i1 0, label %if.end, label %for.body + +if.end: + %div = sdiv i32 %call, 2 + br label %if.end2 + +if.end2: + %ret = phi i32 [ 0, %for.body.preheader ], [ %div, %if.end ] + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 0 + store i32 %ret, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +declare i32 @__mux_get_global_id(i32) + +declare spir_func i32 @_Z3maxii(i32, i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll new file mode 100644 index 0000000000000..08f72b45bf6de --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll @@ -0,0 +1,48 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k foo -w 2 -debug-vecz-pipeline -S < %s 2>&1 | FileCheck %s +; RUN: veczc -k foo -w 2 -vecz-passes scalarize -debug-vecz-pipeline -S < %s 2>&1 | FileCheck %s --check-prefix=PASSES1 +; RUN: veczc -k foo -w 2 -vecz-passes scalarize,packetizer -debug-vecz-pipeline -S < %s 2>&1 | FileCheck %s --check-prefix=PASSES2 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Don't check specific passes, but assume that *some* analyses and passes are run. +; CHECK: Running analysis: {{.*}}> on __vecz_v2_foo +; CHECK: Running pass: {{.*}} on __vecz_v2_foo + +; PASSES1: Running pass: RequireAnalysisPass<{{(class )?}}compiler::utils::DeviceInfoAnalysis, +; PASSES1-NOT: Running pass: +; PASSES1: Running pass: Function scalarization on __vecz_v2_foo +; PASSES1-NOT: Running pass: +; PASSES1-NOT: Running pass: + +; PASSES2: Running pass: RequireAnalysisPass<{{(class )?}}compiler::utils::DeviceInfoAnalysis, +; PASSES2-NOT: Running pass: +; PASSES2: Running pass: Function scalarization on __vecz_v2_foo +; PASSES2: Running pass: Function packetization on __vecz_v2_foo +; PASSES2-NOT: Running pass: +; PASSES2-NOT: Running pass: + +define spir_kernel void @foo(i32 addrspace(1)* %out) { + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idx + store i32 0, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll new file mode 100644 index 0000000000000..cad700234785a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll @@ -0,0 +1,44 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k foo -w 2 -vecz-passes scalarize,mask-memops,packetizer -print-after mask-memops -S < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +; CHECK: IR Dump After Simplify masked memory operations{{( on __vecz_v2_foo)?}} +; CHECK-NEXT: define spir_kernel void @__vecz_v2_foo(ptr addrspace(1) %out) #0 { +; CHECK-NEXT: %idx = call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: %arrayidx = getelementptr i32, ptr addrspace(1) %out, i64 %idx +; CHECK-NEXT: store i32 0, ptr addrspace(1) %arrayidx, align 4 +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define spir_kernel void @__vecz_v2_foo(ptr addrspace(1) %out) {{.*}} { +; CHECK-NEXT: %idx = call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: %arrayidx = getelementptr i32, ptr addrspace(1) %out, i64 %idx +; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(1) %arrayidx, align 4 +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +define spir_kernel void @foo(i32 addrspace(1)* %out) { + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idx + store i32 0, i32 addrspace(1)* %arrayidx, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll new file mode 100644 index 0000000000000..4b289100a3ffb --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll @@ -0,0 +1,89 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k codegen_2 -vecz-simd-width 16 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @codegen_2(i32 addrspace(1)* nocapture readonly %in, i32 addrspace(1)* nocapture %out, i32 %size, i32 %reps) local_unnamed_addr { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = sext i32 %reps to i64 + %mul = mul i64 %call, %conv + %add = add i64 %call, 1 + %mul2 = mul i64 %add, %conv + %cmp19 = icmp ult i64 %mul, %mul2 + br i1 %cmp19, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %conv4 = sext i32 %size to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.inc, %entry + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %sum.1, %for.inc ] + %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %sum.0.lcssa, i32 addrspace(1)* %arrayidx8, align 4, !tbaa !9 + ret void + +for.body: ; preds = %for.inc, %for.body.lr.ph + %i.021 = phi i64 [ %mul, %for.body.lr.ph ], [ %inc, %for.inc ] + %sum.020 = phi i32 [ 0, %for.body.lr.ph ], [ %sum.1, %for.inc ] + %cmp5 = icmp ult i64 %i.021, %conv4 + br i1 %cmp5, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %i.021 + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4, !tbaa !9 + %add7 = add nsw i32 %0, %sum.020 + br label %for.inc + +for.inc: ; preds = %if.then, %for.body + %sum.1 = phi i32 [ %add7, %if.then ], [ %sum.020, %for.body ] + %inc = add nuw i64 %i.021, 1 + %cmp = icmp ult i64 %inc, %mul2 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +declare i64 @__mux_get_global_id(i32) local_unnamed_addr + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} +!host.build_options = !{!8} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32, i32)* @codegen_2, !3, !4, !5, !6, !7} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 0, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int*", !"int", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int", !"int"} +!7 = !{!"kernel_arg_type_qual", !"const", !"", !"", !""} +!8 = !{!""} +!9 = !{!10, !10, i64 0} +!10 = !{!"int", !11, i64 0} +!11 = !{!"omnipotent char", !12, i64 0} +!12 = !{!"Simple C/C++ TBAA"} + + +; It checks that the PHI node did not prevent the interleave factor from being determined +; CHECK: define spir_kernel void @__vecz_v16_codegen_2 +; CHECK-NOT: call <16 x i32> @__vecz_b_masked_gather_load4_4_Dv16_jDv16_u3ptrU3AS1Dv16_b +; CHECK: call <16 x i32> @__vecz_b_masked_interleaved_load4_V_Dv16_ju3ptrU3AS1Dv16_b diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll new file mode 100644 index 0000000000000..aabbd65bf1059 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll @@ -0,0 +1,138 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance 
with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Check that debug info intrinsics are correctly placed after +; phi nodes. + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +; CHECK: define spir_kernel void @__vecz_v4_loop_phi( +define spir_kernel void @loop_phi(i32 addrspace(3)* %a, i32 addrspace(3)* %b) #0 !dbg !4 { +entry: + %a.addr = alloca i32 addrspace(3)*, align 8 + %b.addr = alloca i32 addrspace(3)*, align 8 + %tid = alloca i64, align 8 + %i = alloca i32, align 4 + store i32 addrspace(3)* %a, i32 addrspace(3)** %a.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(3)** %a.addr, metadata !12, metadata !30), !dbg !31 + store i32 addrspace(3)* %b, i32 addrspace(3)** %b.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(3)** %b.addr, metadata !13, metadata !30), !dbg !31 + call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !30), !dbg !32 + %call = call i64 @__mux_get_local_id(i32 0) #3, !dbg !32 + store i64 %call, i64* %tid, align 8, !dbg !32 + call void @llvm.dbg.declare(metadata i32* %i, metadata !19, metadata !30), !dbg !33 + %0 = load i64, i64* %tid, align 8, !dbg !33 + %conv = trunc i64 %0 to i32, !dbg !33 + store i32 %conv, i32* %i, align 4, !dbg !33 + br label %for.cond, !dbg !33 + + +; CHECK: for.cond: +; CHECK: %[[PHI1:.+]] = phi {{i[0-9]+}} [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ] +; CHECK: #dbg_value(i64 %[[PHI1]], !{{[0-9]+}}, +; CHECK-SAME: !DIExpression({{.*}}), +; CHECK-SAME: !{{[0-9]+}} +; Check we haven't inserted a llvm.dbg.value intrinsic before the last of the PHIs. 
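+; (Illustrative note, not part of the original test: LLVM requires all PHI
+; nodes to be grouped at the top of their basic block, so a debug record for
+; a vectorized PHI must be emitted after the final PHI, roughly
+;     %a = phi i32 [ 0, %entry ], [ %x, %latch ]
+;     %b = phi i64 [ 1, %entry ], [ %y, %latch ]
+;       #dbg_value(i64 %b, ...)
+; and never between %a and %b; the names above are hypothetical.)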
+; CHECK-NOT: phi +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4, !dbg !34 + %cmp = icmp slt i32 %1, 128, !dbg !34 + br i1 %cmp, label %for.body, label %for.end, !dbg !33 + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4, !dbg !36 + %idxprom = sext i32 %2 to i64, !dbg !36 + %3 = load i32 addrspace(3)*, i32 addrspace(3)** %b.addr, align 8, !dbg !36 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %3, i64 %idxprom, !dbg !36 + %4 = load i32, i32 addrspace(3)* %arrayidx, align 4, !dbg !36 + %5 = load i32, i32* %i, align 4, !dbg !36 + %idxprom2 = sext i32 %5 to i64, !dbg !36 + %6 = load i32 addrspace(3)*, i32 addrspace(3)** %a.addr, align 8, !dbg !36 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(3)* %6, i64 %idxprom2, !dbg !36 + store i32 %4, i32 addrspace(3)* %arrayidx3, align 4, !dbg !36 + br label %for.inc, !dbg !38 + +for.inc: ; preds = %for.body + %7 = load i32, i32* %i, align 4, !dbg !34 + %add = add nsw i32 %7, 32, !dbg !34 + store i32 %add, i32* %i, align 4, !dbg !34 + br label %for.cond, !dbg !34 + +for.end: ; preds = %for.cond +; CHECK: ret void + ret void, !dbg !39 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +declare i64 @__mux_get_local_id(i32) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nobuiltin } + +!llvm.dbg.cu = !{!0} +!opencl.kernels = !{!21} +!llvm.module.flags = !{!28} +!llvm.ident = !{!29} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.1 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2) +!1 = !DIFile(filename: "kernel.opencl", directory: "/home/Aorta/build") +!2 = !{} +!3 = !{!4} +!4 = distinct !DISubprogram(name: "loop_phi", scope: !1, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11) +!5 = !DISubroutineType(types: !6) +!6 = !{null, !7, !9} +!7 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64, align: 64) +!8 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64, align: 64) +!10 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !8) +!11 = !{!12, !13, !14, !19} +!12 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 2, type: !7) +!13 = !DILocalVariable(name: "b", arg: 2, scope: !4, file: !1, line: 2, type: !9) +!14 = !DILocalVariable(name: "tid", scope: !4, file: !1, line: 3, type: !15) +!15 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !16, line: 33, baseType: !17) +!16 = !DIFile(filename: "/home/Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "/home/Aorta/build") +!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !16, line: 31, baseType: !18) +!18 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) +!19 = !DILocalVariable(name: "i", 
scope: !20, file: !1, line: 4, type: !8) +!20 = distinct !DILexicalBlock(scope: !4, file: !1, line: 4) +!21 = !{void (i32 addrspace(3)*, i32 addrspace(3)*)* @loop_phi, !22, !23, !24, !25, !26, !27} +!22 = !{!"kernel_arg_addr_space", i32 3, i32 3} +!23 = !{!"kernel_arg_access_qual", !"none", !"none"} +!24 = !{!"kernel_arg_type", !"int*", !"int*"} +!25 = !{!"kernel_arg_base_type", !"int*", !"int*"} +!26 = !{!"kernel_arg_type_qual", !"", !"const"} +!27 = !{!"reqd_work_group_size", i32 32, i32 1, i32 1} +!28 = !{i32 2, !"Debug Info Version", i32 3} +!29 = !{!"clang version 3.8.1 "} +!30 = !DIExpression() +!31 = !DILocation(line: 2, scope: !4) +!32 = !DILocation(line: 3, scope: !4) +!33 = !DILocation(line: 4, scope: !20) +!34 = !DILocation(line: 4, scope: !35) +!35 = distinct !DILexicalBlock(scope: !20, file: !1, line: 4) +!36 = !DILocation(line: 5, scope: !37) +!37 = distinct !DILexicalBlock(scope: !35, file: !1, line: 4) +!38 = !DILocation(line: 6, scope: !37) +!39 = !DILocation(line: 7, scope: !4) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll new file mode 100644 index 0000000000000..0885f8a058592 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll @@ -0,0 +1,65 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k phi_memory -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @phi_memory(i32 addrspace(1)* %input, i32 addrspace(1)* %output, i32 %size) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %idx.ext = sext i32 %conv to i64 + %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %idx.ext + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %output.addr.0 = phi i32 addrspace(1)* [ %add.ptr, %entry ], [ %add.ptr2, %for.body ] + %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %storemerge, %size + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %add = add nsw i32 %storemerge, %conv + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %idxprom + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + store i32 %0, i32 addrspace(1)* %output.addr.0, align 4 + %add.ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %output.addr.0, i64 1 + %inc = add nsw i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind } + +; It checks that the contiguity of the load and store is identified through the +; loop-incrementing pointer PHI node +; +; CHECK: void @__vecz_v4_phi_memory +; CHECK: %[[LD:.+]] = load <4 x i32> +; CHECK: store <4 x i32> %[[LD]] +; CHECK-NOT: scatter_store +; CHECK-NOT: gather_load diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll new file mode 100644 index 0000000000000..e13dc4ed88a66 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll @@ -0,0 +1,60 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k phi_memory -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @phi_memory(i32 addrspace(1)* %input, i32 addrspace(1)* %output, i64 %size) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %call + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %output.addr.0 = phi i32 addrspace(1)* [ %add.ptr, %entry ], [ %add.ptr2, %for.body ] + %storemerge = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i64 %storemerge, %size + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %add = add nsw i64 %storemerge, %call + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %add + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + store i32 %0, i32 addrspace(1)* %output.addr.0, align 4 + %add.ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %output.addr.0, i64 %call + %inc = add nsw i64 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind } + +; It checks that the NON-contiguity of the store is identified through the +; loop-incrementing pointer PHI node +; +; CHECK: void @__vecz_v4_phi_memory +; CHECK: %[[LD:.+]] = load <4 x i32> +; CHECK: call void @__vecz_b_scatter_store4_Dv4_jDv4_u3ptrU3AS1(<4 x i32> %[[LD]] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/poison_ub.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/poison_ub.ll new file mode 100644 index 0000000000000..027a688a8614a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/poison_ub.ll @@ -0,0 +1,43 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -w 4 -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i32:32-f80:128-n8:16:32:64-S128" +target triple = "spir-unknown-unknown" + +; Function Attrs: convergent nounwind readonly +declare i32 @__mux_get_local_id(i32) #2 + +; Function Attrs: convergent nounwind +define spir_kernel void @test() #0 { +entry: + %call8 = call i32 @__mux_get_local_id(i32 0) #3 + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* poison, i32 %call8 + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %conv9 = uitofp i8 %0 to float + %phitmp = fptoui float %conv9 to i8 + %arrayidx16 = getelementptr inbounds i8, i8 addrspace(1)* poison, i32 %call8 + store i8 %phitmp, i8 addrspace(1)* %arrayidx16, align 1 + ret void +} + +; The "poison"s in the above IR should "optimize" to a trap call and an unreachable +; terminator instruction. +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: unreachable diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll new file mode 100644 index 0000000000000..7d52fda58e6f3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll @@ -0,0 +1,61 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k predicate_with_switch -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) + +declare i64 @__mux_get_global_id(i32) + +@predicate_with_switch.tmpIn = internal addrspace(3) global [16 x i32] poison, align 4 + +define spir_kernel void @predicate_with_switch(i32 addrspace(1)* %A, i32 addrspace(1)* %B) #0 { +entry: + %call = call i64 @__mux_get_local_id(i32 0) #2 + %call1 = call i64 @__mux_get_global_id(i32 0) #2 + switch i64 %call, label %if.end [ + i64 0, label %return + i64 200, label %return + ] + +if.end: + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %A, i64 %call1 + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @predicate_with_switch.tmpIn, i64 0, i64 %call + store i32 %0, i32 addrspace(3)* %arrayidx3, align 4 + %sub = add i64 %call, -1 + %arrayidx4 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @predicate_with_switch.tmpIn, i64 0, i64 %sub + %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %B, i64 %call1 + store i32 %1, i32 addrspace(1)* %arrayidx5, align 4 + br label %return + +return: + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_predicate_with_switch + +; We should use masked stores +; CHECK: vecz_b_masked_store4 +; CHECK: vecz_b_masked_store4 + +; We should *not* have unconditional stores +; CHECK-NOT: store <4 x i32> +; CHECK-NOT: store <4 x i32> diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll new file mode 100644 index 0000000000000..34c892ca5dea6 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll @@ -0,0 +1,35 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -S -vecz-passes=packetizer < %s | FileCheck %s
+
+; CHECK: %{{.*}} = fcmp nnan ninf olt <4 x float> %{{.*}}, %{{.*}}
+
+define spir_kernel void @fast_nan(float addrspace(1)* %src1, float addrspace(1)* %src2, i16 addrspace(1)* %dst, i32 %width) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %src1, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %src2, i64 %call
+  %1 = load float, float addrspace(1)* %arrayidx2, align 4
+  %cmp = fcmp nnan ninf olt float %0, %1
+  %conv4 = zext i1 %cmp to i16
+  %arrayidx6 = getelementptr inbounds i16, i16 addrspace(1)* %dst, i64 %call
+  store i16 %conv4, i16 addrspace(1)* %arrayidx6, align 2
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
new file mode 100644
index 0000000000000..695e6d0a39696
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
@@ -0,0 +1,88 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test_float -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+@.str = private unnamed_addr addrspace(2) constant [8 x i8] c"blah %d\00", align 1
+@.strf = private unnamed_addr addrspace(2) constant [7 x i8] c"%#16A\0A\00", align 1
+
+; Function Attrs: nounwind
+define spir_kernel void @printf_kernel(i32 addrspace(1)* %in, i32 addrspace(1)* %stridesX, i32 addrspace(1)* %dst, i32 %width, i32 %height) #0 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0) #3
+  %cmp = icmp eq i32 %width, 13
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(2)* @.str, i64 0, i64 0), i32 %0) #3
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define spir_kernel void @test_float(float* %in) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds float, float* %in, i64 %call
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, %0
+  %conv = fpext float %mul to double
+  %call8 = call spir_func i32 (i8 addrspace(2)*, ...) 
@printf(i8 addrspace(2)* getelementptr inbounds ([7 x i8], [7 x i8] addrspace(2)* @.strf, i64 0, i64 0), double %conv) + ret void +} + + + +declare i64 @__mux_get_global_id(i32) #1 + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*, i32, i32)* @printf_kernel, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 0, i32 0} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"int*", !"int*", !"int*", !"int", !"int"} +!4 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*", !"int", !"int"} +!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !""} +!6 = !{!"clang version 3.8.0 "} + +; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [7 x i8] c"%#16A\0A\00", align 1 + +; CHECK: define spir_kernel void @__vecz_v4_test_float +; CHECK: %[[CONV2:.+]] = fpext <4 x float> %{{.+}} to <4 x double> +; CHECK: %[[V2:[0-9]+]] = extractelement <4 x double> %[[CONV2]], {{(i32|i64)}} 0 +; CHECK: %[[V3:[0-9]+]] = extractelement <4 x double> %[[CONV2]], {{(i32|i64)}} 1 +; CHECK: %[[V4:[0-9]+]] = extractelement <4 x double> %[[CONV2]], {{(i32|i64)}} 2 +; CHECK: %[[V5:[0-9]+]] = extractelement <4 x double> %[[CONV2]], {{(i32|i64)}} 3 +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V2]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V3]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V4]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V5]]) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll new file mode 100644 index 0000000000000..533f710b34a01 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll @@ -0,0 +1,126 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+
+; RUN: veczc -k regression_by_all -vecz-passes=vecz-loop-rotate,cfg-convert -S < %s | FileCheck %s
+
+; The purpose of this test is to make sure that block `c` is not treated as
+; by_all simply because one of its predecessors is by_all. Because `c` also
+; has a divergence-causing block (b) as one of its predecessors, it cannot be
+; considered by_all.
+
+; The CFG of the following kernel is:
+;
+;   a
+;   |\
+;   | b
+;   |/ \
+;   c   d
+;    \ /
+;     e
+;
+; * where node a is a uniform branch, and node b is a varying branch.
+; * where nodes c, d and e are divergent.
+;
+; With partial linearization we will have a CFG of the form:
+;
+;    a
+;   /|
+;  | b
+;  | |
+;  | d
+;   \|
+;    c
+;    |
+;    e
+;
+; __kernel void regression_by_all(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   if (n % 2 == 0) {
+;     goto d;
+;   } else {
+;     ret = 1;
+;     if (id % 2 != 0) {
+;       goto d;
+;     } else {
+;       for (int i = 0; i < n; ++i) { ret++; }
+;       goto e;
+;     }
+;   }
+;
+; d:
+;   ret += id;
+;   ret *= n;
+;
+; e:
+;   out[id] = ret;
+; }

; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @regression_by_all(i32 addrspace(1)* %out, i32 %n) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %conv = trunc i64 %call to i32
+  %rem1 = and i32 %n, 1
+  %cmp = icmp eq i32 %rem1, 0
+  br i1 %cmp, label %d, label %if.else
+
+if.else:                                          ; preds = %entry
+  %rem22 = and i32 %conv, 1
+  %cmp3 = icmp eq i32 %rem22, 0
+  br i1 %cmp3, label %for.cond, label %d
+
+for.cond:                                         ; preds = %if.else, %for.body
+  %ret.0 = phi i32 [ %inc, %for.body ], [ 1, %if.else ]
+  %storemerge = phi i32 [ %inc9, %for.body ], [ 0, %if.else ]
+  %cmp7 = icmp slt i32 %storemerge, %n
+  br i1 %cmp7, label %for.body, label %e
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nuw nsw i32 %ret.0, 1
+  %inc9 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond
+
+d:                                                ; preds = %if.else, %entry
+  %ret.1 = phi i32 [ 0, %entry ], [ 1, %if.else ]
+  %add = add nsw i32 %ret.1, %conv
+  %mul = mul nsw i32 %add, %n
+  br label %e
+
+e:                                                ; preds = %for.cond, %d
+  %ret.2 = phi i32 [ %mul, %d ], [ %ret.0, %for.cond ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.2, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK: spir_kernel void @__vecz_v4_regression_by_all
+; CHECK: br i1 %[[CMP:.+]], label %[[D:.+]], label %[[IFELSE:.+]]
+
+; CHECK: [[D]]:
+; CHECK-NOT: %d.entry_mask = and i1 true, true
+; CHECK: %d.entry_mask = phi i1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
new file mode 100644
index 0000000000000..cc64e2641a2b0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
@@ -0,0 +1,52 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=remove-int-ptr -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_intptr_cast_i8( +; CHECK: %shl = shl i64 %call, 2 +; CHECK: %remove_intptr = getelementptr i8, ptr addrspace(1) %in, i64 %shl +; CHECK: %remove_intptr1 = ptrtoint ptr addrspace(1) %remove_intptr to i64 +; CHECK: store i64 %remove_intptr1, ptr addrspace(1) %out, align 8 +define spir_kernel void @intptr_cast_i8(i8 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = ptrtoint i8 addrspace(1)* %in to i64 + %shl = shl i64 %call, 2 + %add = add i64 %shl, %0 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_intptr_cast_i16( +; CHECK: %shl = shl i64 %call, 2 +; CHECK: %remove_intptr = getelementptr i8, ptr addrspace(1) %in, i64 %shl +; CHECK: %remove_intptr1 = ptrtoint ptr addrspace(1) %remove_intptr to i64 +; CHECK: store i64 %remove_intptr1, ptr addrspace(1) %out, align 8 +define spir_kernel void @intptr_cast_i16(i16 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = ptrtoint i16 addrspace(1)* %in to i64 + %shl = shl i64 %call, 2 + %add = add i64 %shl, %0 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll new file mode 100644 index 0000000000000..8dd706f51977b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll @@ -0,0 +1,42 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @remove_intptr(i8 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = ptrtoint i8 addrspace(1)* %in to i64 + %shl = shl nuw nsw i64 %call, 2 + %add = add i64 %shl, %0 + %1 = inttoptr i64 %add to i32 addrspace(1)* + %2 = load i32, i32 addrspace(1)* %1, align 4 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @__vecz_v4_remove_intptr +; CHECK-NOT: ptrtoint +; CHECK-NOT: inttoptr +; CHECK: %remove_intptr = getelementptr i8, ptr addrspace(1) %in +; CHECK: %[[LOAD:.+]] = load <4 x i32>, ptr addrspace(1) %remove_intptr, align 4 +; CHECK: store <4 x i32> %[[LOAD]] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll new file mode 100644 index 0000000000000..64234b9019781 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll @@ -0,0 +1,52 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @remove_intptr(i8 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = ptrtoint i8 addrspace(1)* %in to i64 + %shl = shl nuw nsw i64 %call, 2 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %shl + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %x.07 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %intin.06 = phi i64 [ %0, %entry ], [ %add, %for.body ] + %add = add i64 %intin.06, 4 + %1 = inttoptr i64 %add to i32 addrspace(1)* + %2 = load i32, i32 addrspace(1)* %1, align 4 + store i32 %2, i32 addrspace(1)* %arrayidx, align 4 + %inc = add nuw nsw i32 %x.07, 1 + %exitcond.not = icmp eq i32 %inc, 4 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @__vecz_v4_remove_intptr +; CHECK-NOT: ptrtoint +; CHECK-NOT: inttoptr +; CHECK: %[[RPHI:.+]] = phi ptr addrspace(1) [ %in, %entry ], [ %[[RGEP:.+]], %for.body ] +; CHECK: %[[RGEP]] = getelementptr i8, ptr addrspace(1) %[[RPHI]], i{{32|64}} 4 +; CHECK: load i32, ptr addrspace(1) %[[RGEP]], align 4 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/reqd-sg-size-auto.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/reqd-sg-size-auto.ll new file mode 100644 index 0000000000000..7f4a881552699 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/reqd-sg-size-auto.ll @@ -0,0 +1,55 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Let vecz pick the right vectorization factor for this kernel +; RUN: veczc --vecz-auto -k bar_sg8 -k foo_sg13 -S < %s | FileCheck %s +; RUN: veczc --vecz-auto -k bar_sg8:4 -k foo_sg13:8 -S < %s | FileCheck %s + +; Check we auto-vectorize to 8, despite any other options telling us a +; different vectorization factor. +; CHECK: define void @__vecz_v8_bar_sg8 +define void @bar_sg8(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 !intel_reqd_sub_group_size !0 { + %id = call i64 @__mux_get_global_id(i32 0) + %in.addr = getelementptr i32, ptr addrspace(1) %in, i64 %id + %x = load i32, ptr addrspace(1) %in.addr +; CHECK: = add <8 x i32> + %y = add i32 %x, 1 + %out.addr = getelementptr i32, ptr addrspace(1) %out, i64 %id + store i32 %y, ptr addrspace(1) %out.addr + ret void +} + +; Check we auto-vectorize to 13, despite any other options telling us a +; different vectorization factor. This is a silly number, but if we're told +; to do it we must obey. 
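+; The required factor is taken from the !intel_reqd_sub_group_size metadata +; attached to each kernel; for this one the relevant pieces are: +; define void @foo_sg13(...) !intel_reqd_sub_group_size !1 { ... } +; !1 = !{i32 13}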
+; CHECK: define void @__vecz_v13_foo_sg13 +define void @foo_sg13(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 !intel_reqd_sub_group_size !1 { + %id = call i64 @__mux_get_global_id(i32 0) + %in.addr = getelementptr i32, ptr addrspace(1) %in, i64 %id + %x = load i32, ptr addrspace(1) %in.addr +; CHECK: = add <13 x i32> + %y = add i32 %x, 1 + %out.addr = getelementptr i32, ptr addrspace(1) %out, i64 %id + store i32 %y, ptr addrspace(1) %out.addr + ret void +} + +declare i64 @__mux_get_global_id(i32) + +attributes #0 = { "mux-kernel"="entry-point" } + +!0 = !{i32 8} +!1 = !{i32 13} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll new file mode 100644 index 0000000000000..dcf78d89930d2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll @@ -0,0 +1,53 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s -w 16 | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out, i64 addrspace(1)* %N) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i64, i64 addrspace(1)* %N, align 8 + %cmp = icmp ult i64 %call, %0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in2, i64 %call + %2 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %add = add nsw i32 %2, %1 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %add, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @__vecz_v16_add +; CHECK: entry: +; CHECK: br i1 %{{.+}}, label %[[END:.+]], label %[[THEN:.+]] +; CHECK-EMPTY: +; CHECK-NEXT: [[THEN]]: +; CHECK: br label %[[END]] +; CHECK-EMPTY: +; CHECK-NEXT: [[END]]: +; CHECK-NEXT: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll new file mode 100644 index 0000000000000..1c9a90a942684 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll @@ -0,0 +1,51 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with 
the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare spir_func i32 @__mux_get_local_id(i32); +declare spir_func i32 @__mux_get_global_id(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %lid = call i32 @__mux_get_local_id(i32 0) + %cmp = icmp eq i32 %lid, 0 + br i1 %cmp, label %if, label %merge + +if: + %single_load = load i32, i32 addrspace(1)* %in + %single_add = add i32 %single_load, 42 + store i32 %single_add, i32 addrspace(1)* %in + br label %merge + +merge: + %multi_load = load i32, i32 addrspace(1)* %in + %multi_add = add i32 %multi_load, 42 + %gid = call i32 @__mux_get_global_id(i32 0) + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid + store i32 %multi_add, i32 addrspace(1)* %slot + + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %[[BITCAST:[0-9]+]] = bitcast <4 x i1> %cmp3 to i4 +; CHECK: %[[MASK:.+]] = icmp ne i4 %[[BITCAST]], 0 +; CHECK: %single_load{{[0-9]*}} = call i32 @__vecz_b_masked_load4_ju3ptrU3AS1b(ptr addrspace(1) %in, i1 %[[MASK]]) +; CHECK: %multi_load = load i32, ptr addrspace(1) %in diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll new file mode 100644 index 0000000000000..fb2b8e8076f5f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll @@ -0,0 +1,36 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_global_id(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %load = load i32, i32 addrspace(1)* %in + %gid = call i32 @__mux_get_global_id(i32 0) + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid + store i32 %load, i32 addrspace(1)* %slot + + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: entry: +; CHECK: %load = load i32, ptr addrspace(1) %in diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll new file mode 100644 index 0000000000000..c563b79b6917e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll @@ -0,0 +1,47 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_local_id(i32); +declare i32 @__mux_get_global_id(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %lid = call i32 @__mux_get_local_id(i32 0) + %cmp = icmp eq i32 %lid, 0 + br i1 %cmp, label %if, label %merge + +if: + %secretly_scalar_load = load i32, i32 addrspace(1)* %in + %add = add i32 %secretly_scalar_load, 42 + store i32 %add, i32 addrspace(1)* %in + br label %merge + +merge: + %load = load i32, i32 addrspace(1)* %in + %gid = call i32 @__mux_get_global_id(i32 0) + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid + store i32 %load, i32 addrspace(1)* %slot + + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %load = load i32, ptr addrspace(1) %in diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll new file mode 100644 index 0000000000000..62ea24d8e2c5e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll @@ -0,0 +1,44 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_local_id(i32); +declare i32 @__mux_get_global_id(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %lid = call i32 @__mux_get_local_id(i32 0) + %cmp = icmp eq i32 %lid, 0 + br i1 %cmp, label %if, label %merge + +if: + br label %merge + +merge: + %load = load i32, i32 addrspace(1)* %in + %gid = call i32 @__mux_get_global_id(i32 0) + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid + store i32 %load, i32 addrspace(1)* %slot + + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %load = load i32, ptr addrspace(1) %in diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll new file mode 100644 index 0000000000000..e7b76a778e784 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll @@ -0,0 +1,55 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_local_id(i32); +declare i32 @__mux_get_global_id(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %lid = call i32 @__mux_get_local_id(i32 0) + %and = and i32 %lid, 1 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if, label %merge + +if: + %lid1 = call i32 @__mux_get_local_id(i32 1) + %cmp1 = icmp eq i32 %lid1, 0 + br i1 %cmp1, label %deeper_if, label %deeper_merge + +deeper_if: + br label %deeper_merge + +deeper_merge: + %load = load i32, i32 addrspace(1)* %in + %gid = call i32 @__mux_get_global_id(i32 0) + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid + store i32 %load, i32 addrspace(1)* %slot + br label %merge + +merge: + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %[[LOAD:.+]] = load i32, ptr addrspace(1) %in +; CHECK: %[[SPLAT_IN:.+]] = insertelement <4 x i32> poison, i32 %[[LOAD]], {{(i32|i64)}} 0 +; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLAT_IN]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: call void @__vecz_b_masked_store4_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32> %[[SPLAT]], ptr addrspace(1){{( nonnull)? %.*}}, <4 x i1> %{{.+}}) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll new file mode 100644 index 0000000000000..e5fe580b0ac22 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll @@ -0,0 +1,78 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k scalar_vector_user -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind readnone +declare i64 @__mux_get_local_id(i32) #0 + +; Function Attrs: nounwind readnone +declare spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float>, <4 x float>, <4 x float>) #0 + +declare spir_func void @_Z7vstore4Dv4_fmPU3AS1f(<4 x float>, i64, float addrspace(1)*) + +declare spir_func <4 x float> @_Z6vload4mPU3AS3Kf(i64, float addrspace(1)*) +; Function Attrs: inlinehint norecurse nounwind readnone +declare spir_func float @_Z3madfff(float, float, float) local_unnamed_addr #2 + +define spir_kernel void @scalar_vector_user(float addrspace(1)* %inout, i64 %n) { +entry: + %lid = tail call i64 @__mux_get_local_id(i32 0) #0 + %inout.address = getelementptr inbounds float, float addrspace(1)* %inout, i64 %lid + br label %loop + +loop: ; preds = %entry, %loop + %madv4.prev = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4, %loop ] + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ] + %i.inc = add nuw nsw i64 %i, 1 + %cmp = icmp slt i64 %i.inc, %n + %inout.vload = tail call spir_func <4 x float> @_Z6vload4mPU3AS3Kf(i64 0, float addrspace(1)* %inout.address) + %inout.vec0 = shufflevector <4 x float> %inout.vload, <4 x float> poison, <4 x i32> zeroinitializer + %madv4 = tail call spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float> %inout.vload, <4 x float> %inout.vec0, <4 x float> %madv4.prev) #0 + br i1 %cmp, label %loop, label %end + +end: ; preds = %loop + %mad.vec0 = extractelement <4 x float> %madv4, i32 0 + store float %mad.vec0, float addrspace(1)* %inout.address, align 4 + tail call spir_func void @_Z7vstore4Dv4_fmPU3AS1f(<4 x float> %madv4, i64 0, float addrspace(1)* %inout.address) + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { noduplicate } +attributes #2 = { inlinehint norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } + +; The purpose of this test is to make sure we correctly scalarize an instruction +; used by both a scalar and a vector instruction. We would previously try to +; scalarize its users twice, resulting in invalid IR. 
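+; In this kernel the dual-use value is %madv4: lane 0 is extracted into +; %mad.vec0 (a scalar user) while the whole vector is passed to vstore4 (a +; vector user), so the scalarizer must rewrite each of its users exactly once.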
+ +; CHECK: define spir_kernel void @__vecz_v4_scalar_vector_user +; CHECK: loop: +; CHECK: %madv4.prev{{.*}} = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4[[S0:[0-9]+]], %loop ]{{$}} +; CHECK: %madv4.prev{{.*}} = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4[[S1:[0-9]+]], %loop ]{{$}} +; CHECK: %madv4.prev{{.*}} = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4[[S2:[0-9]+]], %loop ]{{$}} +; CHECK: %madv4.prev{{.*}} = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4[[S3:[0-9]+]], %loop ]{{$}} + +; make sure the above PHI incomings are unique by looking for their definitions +; CHECK: %madv4[[S0]] = +; CHECK: %madv4[[S1]] = +; CHECK: %madv4[[S2]] = +; CHECK: %madv4[[S3]] = diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll new file mode 100644 index 0000000000000..97ccb3494c1ac --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll @@ -0,0 +1,86 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=scalarize -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x i32>* %pc, <4 x float>* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx + %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx + %c = getelementptr <4 x i32>, <4 x i32>* %pc, i64 %idx + %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx + %0 = load <4 x float>, <4 x float>* %a, align 16 + %1 = load <4 x float>, <4 x float>* %b, align 16 + %2 = load <4 x i32>, <4 x i32>* %c, align 16 + %call = call spir_func <4 x float> @_Z14convert_float4Dv4_i(<4 x i32> %2) + %3 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %call) + store <4 x float> %3, <4 x float>* %d, align 16 + ret void +} + +declare spir_func <4 x float> @_Z14convert_float4Dv4_i(<4 x i32>) +declare spir_func float @_Z13convert_floati(i32) +declare <4x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; CHECK: define spir_kernel void @__vecz_v4_test_calls(ptr %pa, ptr %pb, ptr %pc, ptr %pd) +; CHECK: entry: +; CHECK: %[[A_0:.+]] = getelementptr float, ptr %a, i32 0 +; CHECK: %[[A_1:.+]] = getelementptr float, ptr %a, i32 1 +; CHECK: %[[A_2:.+]] = getelementptr float, ptr %a, i32 2 +; CHECK: %[[A_3:.+]] = getelementptr float, ptr %a, i32 3 +; CHECK: %[[LA_0:.+]] = load float, ptr %[[A_0]] +; CHECK: %[[LA_1:.+]] = load float, ptr %[[A_1]] +; CHECK: %[[LA_2:.+]] = load float, ptr %[[A_2]] +; 
CHECK: %[[LA_3:.+]] = load float, ptr %[[A_3]] +; CHECK: %[[B_0:.+]] = getelementptr float, ptr %b, i32 0 +; CHECK: %[[B_1:.+]] = getelementptr float, ptr %b, i32 1 +; CHECK: %[[B_2:.+]] = getelementptr float, ptr %b, i32 2 +; CHECK: %[[B_3:.+]] = getelementptr float, ptr %b, i32 3 +; CHECK: %[[LB_0:.+]] = load float, ptr %[[B_0]] +; CHECK: %[[LB_1:.+]] = load float, ptr %[[B_1]] +; CHECK: %[[LB_2:.+]] = load float, ptr %[[B_2]] +; CHECK: %[[LB_3:.+]] = load float, ptr %[[B_3]] +; CHECK: %[[C_0:.+]] = getelementptr i32, ptr %c, i32 0 +; CHECK: %[[C_1:.+]] = getelementptr i32, ptr %c, i32 1 +; CHECK: %[[C_2:.+]] = getelementptr i32, ptr %c, i32 2 +; CHECK: %[[C_3:.+]] = getelementptr i32, ptr %c, i32 3 +; CHECK: %[[LC_0:.+]] = load i32, ptr %[[C_0]] +; CHECK: %[[LC_1:.+]] = load i32, ptr %[[C_1]] +; CHECK: %[[LC_2:.+]] = load i32, ptr %[[C_2]] +; CHECK: %[[LC_3:.+]] = load i32, ptr %[[C_3]] +; CHECK: %[[CALL1:.+]] = call spir_func float @_Z13convert_floati(i32 %[[LC_0]]) +; CHECK: %[[CALL2:.+]] = call spir_func float @_Z13convert_floati(i32 %[[LC_1]]) +; CHECK: %[[CALL3:.+]] = call spir_func float @_Z13convert_floati(i32 %[[LC_2]]) +; CHECK: %[[CALL4:.+]] = call spir_func float @_Z13convert_floati(i32 %[[LC_3]]) +; CHECK: %[[FMAD_0:.+]] = call float @llvm.fmuladd.f32(float %[[LA_0]], float %[[LB_0]], float %[[CALL1]]) +; CHECK: %[[FMAD_1:.+]] = call float @llvm.fmuladd.f32(float %[[LA_1]], float %[[LB_1]], float %[[CALL2]]) +; CHECK: %[[FMAD_2:.+]] = call float @llvm.fmuladd.f32(float %[[LA_2]], float %[[LB_2]], float %[[CALL3]]) +; CHECK: %[[FMAD_3:.+]] = call float @llvm.fmuladd.f32(float %[[LA_3]], float %[[LB_3]], float %[[CALL4]]) +; CHECK: %[[D_0:.+]] = getelementptr float, ptr %d, i32 0 +; CHECK: %[[D_1:.+]] = getelementptr float, ptr %d, i32 1 +; CHECK: %[[D_2:.+]] = getelementptr float, ptr %d, i32 2 +; CHECK: %[[D_3:.+]] = getelementptr float, ptr %d, i32 3 +; CHECK: store float %[[FMAD_0]], ptr %[[D_0]] +; CHECK: store float %[[FMAD_1]], ptr %[[D_1]] +; CHECK: store float %[[FMAD_2]], ptr %[[D_2]] +; CHECK: store float %[[FMAD_3]], ptr %[[D_3]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll new file mode 100644 index 0000000000000..f016562ea54ef --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll @@ -0,0 +1,47 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=scalarize -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_calls(<4 x float>* %a, <4 x float>* %b, <4 x i32>* %c, <4 x float>* %d) { +entry: + %0 = load <4 x float>, <4 x float>* %a, align 16 + %1 = load <4 x float>, <4 x float>* %b, align 16 + %2 = load <4 x i32>, <4 x i32>* %c, align 16 + %call = call spir_func <4 x float> @_Z14convert_float4Dv4_i(<4 x i32> %2) + %3 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %call) + store <4 x float> %3, <4 x float>* %d, align 16 + ret void +} + +declare spir_func <4 x float> @_Z14convert_float4Dv4_i(<4 x i32>) +declare spir_func float @_Z13convert_floati(i32) +declare <4x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; Checks that this function gets vectorized; because every instruction is +; uniform, the process of vectorization makes no actual changes whatsoever! +; CHECK: define spir_kernel void @__vecz_v4_test_calls(ptr %a, ptr %b, ptr %c, ptr %d) +; CHECK: entry: +; CHECK: %[[LA:.+]] = load <4 x float>, ptr %a, align 16 +; CHECK: %[[LB:.+]] = load <4 x float>, ptr %b, align 16 +; CHECK: %[[LC:.+]] = load <4 x i32>, ptr %c, align 16 +; CHECK: %[[CALL:.+]] = call spir_func <4 x float> @_Z14convert_float4Dv4_i(<4 x i32> %[[LC]]) +; CHECK: %[[FMAD:.+]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %[[LA]], <4 x float> %[[LB]], <4 x float> %[[CALL]]) +; CHECK: store <4 x float> %[[FMAD]], ptr %d, align 16 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll new file mode 100644 index 0000000000000..2e7c1a2202c71 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll @@ -0,0 +1,183 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Check that debug info is preserved in the vectorized kernel. +; Specifically, that the scalarization pass doesn't destroy DI +; intrinsics attached to the vector instructions it scalarizes. 
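+; For example, a declare such as +; call void @llvm.dbg.declare(metadata <2 x i32>* %a, metadata !23, metadata !34) +; must still describe the source variable "a" after its <2 x i32> operations +; have been split into scalar i32 operations.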
+ +; RUN: veczc -k mul2 -vecz-passes="scalarize,function(mem2reg)" -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + + +; Function Attrs: nounwind +define spir_kernel void @mul2(<2 x i32> addrspace(1)* %in1, <2 x i32> addrspace(1)* %in2, <2 x i32> addrspace(1)* %out) #0 !dbg !4 { +entry: + %in1.addr = alloca <2 x i32> addrspace(1)*, align 8 + %in2.addr = alloca <2 x i32> addrspace(1)*, align 8 + %out.addr = alloca <2 x i32> addrspace(1)*, align 8 + %tid = alloca i64, align 8 + %a = alloca <2 x i32>, align 8 + %b = alloca <2 x i32>, align 8 + %tmp = alloca <2 x i32>, align 8 + store <2 x i32> addrspace(1)* %in1, <2 x i32> addrspace(1)** %in1.addr, align 8 + call void @llvm.dbg.declare(metadata <2 x i32> addrspace(1)** %in1.addr, metadata !16, metadata !34), !dbg !35 + store <2 x i32> addrspace(1)* %in2, <2 x i32> addrspace(1)** %in2.addr, align 8 + call void @llvm.dbg.declare(metadata <2 x i32> addrspace(1)** %in2.addr, metadata !17, metadata !34), !dbg !35 + store <2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)** %out.addr, align 8 + call void @llvm.dbg.declare(metadata <2 x i32> addrspace(1)** %out.addr, metadata !18, metadata !34), !dbg !35 + call void @llvm.dbg.declare(metadata i64* %tid, metadata !19, metadata !34), !dbg !36 + %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !36 + store i64 %call, i64* %tid, align 8, !dbg !36 + call void @llvm.dbg.declare(metadata <2 x i32>* %a, metadata !23, metadata !34), !dbg !37 + %0 = load i64, i64* %tid, align 8, !dbg !37 + %1 = load <2 x i32> addrspace(1)*, <2 x i32> addrspace(1)** %in1.addr, align 8, !dbg !37 + %arrayidx = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %1, i64 %0, !dbg !37 + %2 = load <2 x i32>, <2 x i32> addrspace(1)* %arrayidx, align 8, !dbg !37 + store <2 x i32> %2, <2 x i32>* %a, align 8, !dbg !37 + call void @llvm.dbg.declare(metadata <2 x i32>* %b, metadata !24, metadata !34), !dbg !38 + %3 = load i64, i64* %tid, align 8, !dbg !38 + %4 = load <2 x i32> addrspace(1)*, <2 x i32> addrspace(1)** %in2.addr, align 8, !dbg !38 + %arrayidx1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %4, i64 %3, !dbg !38 + %5 = load <2 x i32>, <2 x i32> addrspace(1)* %arrayidx1, align 8, !dbg !38 + store <2 x i32> %5, <2 x i32>* %b, align 8, !dbg !38 + call void @llvm.dbg.declare(metadata <2 x i32>* %tmp, metadata !25, metadata !34), !dbg !39 + %6 = load <2 x i32>, <2 x i32>* %a, align 8, !dbg !39 + %7 = load <2 x i32>, <2 x i32>* %b, align 8, !dbg !39 + %mul = mul <2 x i32> %6, %7, !dbg !39 + store <2 x i32> %mul, <2 x i32>* %tmp, align 8, !dbg !39 + %8 = load <2 x i32>, <2 x i32>* %tmp, align 8, !dbg !40 + %9 = load i64, i64* %tid, align 8, !dbg !40 + %10 = load <2 x i32> addrspace(1)*, <2 x i32> addrspace(1)** %out.addr, align 8, !dbg !40 + %arrayidx2 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %10, i64 %9, !dbg !40 + store <2 x i32> %8, <2 x i32> addrspace(1)* %arrayidx2, align 8, !dbg !40 + ret void, !dbg !41 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +declare i64 @__mux_get_global_id(i32) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { 
nounwind readnone } +attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nobuiltin } + +!llvm.dbg.cu = !{!0} +!opencl.kernels = !{!26} +!llvm.module.flags = !{!32} +!llvm.ident = !{!33} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2) +!1 = !DIFile(filename: "", directory: "Aorta/vecz_build") +!2 = !{} +!3 = !{!4} +!4 = distinct !DISubprogram(name: "mul2", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !15) +!5 = !DIFile(filename: "kernel.opencl", directory: "Aorta/vecz_build") +!6 = !DISubroutineType(types: !7) +!7 = !{null, !8, !8, !8} +!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, align: 64) +!9 = !DIDerivedType(tag: DW_TAG_typedef, name: "int2", file: !10, line: 63, baseType: !11) +!10 = !DIFile(filename: "Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "Aorta/vecz_build") +!11 = !DICompositeType(tag: DW_TAG_array_type, baseType: !12, size: 64, align: 64, flags: DIFlagVector, elements: !13) +!12 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!13 = !{!14} +!14 = !DISubrange(count: 2) +!15 = !{!16, !17, !18, !19, !23, !24, !25} +!16 = !DILocalVariable(name: "in1", arg: 1, scope: !4, file: !5, line: 1, type: !8) +!17 = !DILocalVariable(name: "in2", arg: 2, scope: !4, file: !5, line: 1, type: !8) +!18 = !DILocalVariable(name: "out", arg: 3, scope: !4, file: !5, line: 1, type: !8) +!19 = !DILocalVariable(name: "tid", scope: !4, file: !5, line: 3, type: !20) +!20 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !10, line: 33, baseType: !21) +!21 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !10, line: 31, baseType: !22) +!22 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) +!23 = !DILocalVariable(name: "a", scope: !4, file: !5, line: 4, type: !9) +!24 = !DILocalVariable(name: "b", scope: !4, file: !5, line: 5, type: !9) +!25 = !DILocalVariable(name: "tmp", scope: !4, file: !5, line: 6, type: !9) +!26 = !{void (<2 x i32> addrspace(1)*, <2 x i32> addrspace(1)*, <2 x i32> addrspace(1)*)* @mul2, !27, !28, !29, !30, !31} +!27 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1} +!28 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!29 = !{!"kernel_arg_type", !"int2*", !"int2*", !"int2*"} +!30 = !{!"kernel_arg_base_type", !"int __attribute__((ext_vector_type(2)))*", !"int __attribute__((ext_vector_type(2)))*", !"int __attribute__((ext_vector_type(2)))*"} +!31 = !{!"kernel_arg_type_qual", !"", !"", !""} +!32 = !{i32 2, !"Debug Info Version", i32 3} +!33 = !{!"clang version 3.8.0 "} +!34 = !DIExpression() +!35 = !DILocation(line: 1, scope: !4) +!36 = !DILocation(line: 3, scope: !4) +!37 = !DILocation(line: 4, scope: !4) +!38 = !DILocation(line: 5, scope: !4) +!39 = !DILocation(line: 6, scope: !4) +!40 = !DILocation(line: 7, scope: !4) +!41 = !DILocation(line: 8, scope: !4) + +; Vectorized kernel function +; CHECK: @__vecz_v[[WIDTH:[0-9]+]]_mul2({{.*}} !dbg [[VECZ_SUBPROG:![0-9]+]] + +; Check that intrinsics for user variable locations are still present +; CHECK: #dbg_value({{.*}} %in1, [[DI_IN1:![0-9]+]], 
[[EXPR:!DIExpression()]] +; CHECK-SAME: [[PARAM_LOC:![0-9]+]] + +; CHECK: #dbg_value({{.*}} %in2, [[DI_IN2:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[PARAM_LOC]] + +; CHECK: #dbg_value({{.*}} %out, [[DI_OUT:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[PARAM_LOC]] + +; CHECK: #dbg_value(i64 %call, [[DI_TID:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[TID_LOC:![0-9]+]] + +; CHECK: #dbg_declare(ptr %a, [[DI_A:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[A_LOC:![0-9]+]] + +; CHECK: #dbg_declare(ptr %b, [[DI_B:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[B_LOC:![0-9]+]] + +; CHECK: #dbg_declare(ptr %tmp, [[DI_TMP:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[TMP_LOC:![0-9]+]] + +; Debug info metadata entries +; CHECK:[[PTR_TYPE:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[DI_INT2:![0-9]+]], size: 64, align: 64) +; CHECK:[[DI_INT2]] = !DIDerivedType(tag: DW_TAG_typedef, name: "int2" + +; CHECK: [[VECZ_SUBPROG]] = distinct !DISubprogram(name: "mul2" +; CHECK-SAME: retainedNodes: [[VECZ_VARS:![0-9]+]] + +; CHECK: [[VECZ_VARS]] = !{[[DI_IN1]], [[DI_IN2]], [[DI_OUT]], [[DI_TID]], [[DI_A]], [[DI_B]], [[DI_TMP]]} + +; CHECK: [[DI_IN1]] = !DILocalVariable(name: "in1", arg: 1, scope: [[VECZ_SUBPROG]], +; CHECK-SAME:line: 1, type: [[PTR_TYPE]] + +; CHECK: [[DI_IN2]] = !DILocalVariable(name: "in2", arg: 2, scope: [[VECZ_SUBPROG]], +; CHECK-SAME:line: 1, type: [[PTR_TYPE]] + +; CHECK: [[DI_OUT]] = !DILocalVariable(name: "out", arg: 3, scope: [[VECZ_SUBPROG]], +; CHECK-SAME: line: 1, type: [[PTR_TYPE]] + +; CHECK: [[DI_TID]] = !DILocalVariable(name: "tid", scope: [[VECZ_SUBPROG]] +; CHECK-SAME:line: 3 + +; CHECK: [[DI_A]] = !DILocalVariable(name: "a", scope: [[VECZ_SUBPROG]], +; CHECK-SAME:line: 4 + +; CHECK: [[DI_B]] = !DILocalVariable(name: "b", scope: [[VECZ_SUBPROG]], +; CHECK-SAME: line: 5 + +; CHECK: [[DI_TMP]] = !DILocalVariable(name: "tmp", scope: [[VECZ_SUBPROG]], +; CHECK-SAME: line: 6 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll new file mode 100644 index 0000000000000..3e4db8b32697c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll @@ -0,0 +1,142 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_instructions -vecz-passes=scalarize -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_instructions(<4 x i32>* %pa, <4 x i32>* %pb, <4 x i32>* %pc) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <4 x i32>, <4 x i32>* %pa, i64 %idx + %b = getelementptr <4 x i32>, <4 x i32>* %pb, i64 %idx + %c = getelementptr <4 x i32>, <4 x i32>* %pc, i64 %idx + %0 = load <4 x i32>, <4 x i32>* %a, align 16 + %1 = load <4 x i32>, <4 x i32>* %b, align 16 + %add = add <4 x i32> %1, %0 + store <4 x i32> %add, <4 x i32>* %c, align 16 + %arrayidx3 = getelementptr inbounds <4 x i32>, <4 x i32>* %a, i64 1 + %2 = load <4 x i32>, <4 x i32>* %arrayidx3, align 16 + %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32>* %b, i64 1 + %3 = load <4 x i32>, <4 x i32>* %arrayidx4, align 16 + %cmp = icmp sgt <4 x i32> %2, %3 + %sext = sext <4 x i1> %cmp to <4 x i32> + %arrayidx5 = getelementptr inbounds <4 x i32>, <4 x i32>* %c, i64 1 + store <4 x i32> %sext, <4 x i32>* %arrayidx5, align 16 + %arrayidx6 = getelementptr inbounds <4 x i32>, <4 x i32>* %a, i64 2 + %4 = load <4 x i32>, <4 x i32>* %arrayidx6, align 16 + %cmp7 = icmp slt <4 x i32> %4, <i32 11, i32 12, i32 13, i32 14> + %sext8 = sext <4 x i1> %cmp7 to <4 x i32> + %arrayidx9 = getelementptr inbounds <4 x i32>, <4 x i32>* %c, i64 2 + store <4 x i32> %sext8, <4 x i32>* %arrayidx9, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_instructions(ptr %pa, ptr %pb, ptr %pc) +; CHECK: entry: +; CHECK: %[[A_0:.+]] = getelementptr i32, ptr %a, i32 0 +; CHECK: %[[A_1:.+]] = getelementptr i32, ptr %a, i32 1 +; CHECK: %[[A_2:.+]] = getelementptr i32, ptr %a, i32 2 +; CHECK: %[[A_3:.+]] = getelementptr i32, ptr %a, i32 3 +; CHECK: %[[LA_0:.+]] = load i32, ptr %[[A_0]] +; CHECK: %[[LA_1:.+]] = load i32, ptr %[[A_1]] +; CHECK: %[[LA_2:.+]] = load i32, ptr %[[A_2]] +; CHECK: %[[LA_3:.+]] = load i32, ptr %[[A_3]] +; CHECK: %[[B_0:.+]] = getelementptr i32, ptr %b, i32 0 +; CHECK: %[[B_1:.+]] = getelementptr i32, ptr %b, i32 1 +; CHECK: %[[B_2:.+]] = getelementptr i32, ptr %b, i32 2 +; CHECK: %[[B_3:.+]] = getelementptr i32, ptr %b, i32 3 +; CHECK: %[[LB_0:.+]] = load i32, ptr %[[B_0]] +; CHECK: %[[LB_1:.+]] = load i32, ptr %[[B_1]] +; CHECK: %[[LB_2:.+]] = load i32, ptr %[[B_2]] +; CHECK: %[[LB_3:.+]] = load i32, ptr %[[B_3]] +; CHECK: %[[ADD1:.+]] = add i32 %[[LB_0]], %[[LA_0]] +; CHECK: %[[ADD2:.+]] = add i32 %[[LB_1]], %[[LA_1]] +; CHECK: %[[ADD3:.+]] = add i32 %[[LB_2]], %[[LA_2]] +; CHECK: %[[ADD4:.+]] = add i32 %[[LB_3]], %[[LA_3]] +; CHECK: %[[C_0:.+]] = getelementptr i32, ptr %c, i32 0 +; CHECK: %[[C_1:.+]] = getelementptr i32, ptr %c, i32 1 +; CHECK: %[[C_2:.+]] = getelementptr i32, ptr %c, i32 2 +; CHECK: %[[C_3:.+]] = getelementptr i32, ptr %c, i32 3 +; CHECK: store i32 %[[ADD1]], ptr %[[C_0]] +; CHECK: store i32 %[[ADD2]], ptr %[[C_1]] +; CHECK: store i32 %[[ADD3]], ptr %[[C_2]] +; CHECK: store i32 %[[ADD4]], ptr %[[C_3]] +; CHECK: %arrayidx3 = getelementptr <4 x i32>, ptr %a, i64 1 +; CHECK: %[[A1_0:.+]] = getelementptr i32, ptr %arrayidx3, i32 0 +; CHECK: %[[A1_1:.+]] = getelementptr i32, ptr %arrayidx3, i32 1 +; CHECK: %[[A1_2:.+]] = getelementptr i32, ptr %arrayidx3, i32 2 +; CHECK: %[[A1_3:.+]] = getelementptr i32, ptr %arrayidx3, i32 3 +; CHECK: %[[LA1_0:.+]] = 
load i32, ptr %[[A1_0]] +; CHECK: %[[LA1_1:.+]] = load i32, ptr %[[A1_1]] +; CHECK: %[[LA1_2:.+]] = load i32, ptr %[[A1_2]] +; CHECK: %[[LA1_3:.+]] = load i32, ptr %[[A1_3]] +; CHECK: %arrayidx4 = getelementptr <4 x i32>, ptr %b, i64 1 +; CHECK: %[[B1_0:.+]] = getelementptr i32, ptr %arrayidx4, i32 0 +; CHECK: %[[B1_1:.+]] = getelementptr i32, ptr %arrayidx4, i32 1 +; CHECK: %[[B1_2:.+]] = getelementptr i32, ptr %arrayidx4, i32 2 +; CHECK: %[[B1_3:.+]] = getelementptr i32, ptr %arrayidx4, i32 3 +; CHECK: %[[LB1_0:.+]] = load i32, ptr %[[B1_0]] +; CHECK: %[[LB1_1:.+]] = load i32, ptr %[[B1_1]] +; CHECK: %[[LB1_2:.+]] = load i32, ptr %[[B1_2]] +; CHECK: %[[LB1_3:.+]] = load i32, ptr %[[B1_3]] +; CHECK: %[[CMP5:.+]] = icmp sgt i32 %[[LA1_0]], %[[LB1_0]] +; CHECK: %[[CMP6:.+]] = icmp sgt i32 %[[LA1_1]], %[[LB1_1]] +; CHECK: %[[CMP8:.+]] = icmp sgt i32 %[[LA1_2]], %[[LB1_2]] +; CHECK: %[[CMP9:.+]] = icmp sgt i32 %[[LA1_3]], %[[LB1_3]] +; CHECK: %[[SEXT10:.+]] = sext i1 %[[CMP5]] to i32 +; CHECK: %[[SEXT11:.+]] = sext i1 %[[CMP6]] to i32 +; CHECK: %[[SEXT12:.+]] = sext i1 %[[CMP8]] to i32 +; CHECK: %[[SEXT13:.+]] = sext i1 %[[CMP9]] to i32 +; CHECK: %arrayidx5 = getelementptr <4 x i32>, ptr %c, i64 1 +; CHECK: %[[C1_0:.+]] = getelementptr i32, ptr %arrayidx5, i32 0 +; CHECK: %[[C1_1:.+]] = getelementptr i32, ptr %arrayidx5, i32 1 +; CHECK: %[[C1_2:.+]] = getelementptr i32, ptr %arrayidx5, i32 2 +; CHECK: %[[C1_3:.+]] = getelementptr i32, ptr %arrayidx5, i32 3 +; CHECK: store i32 %[[SEXT10]], ptr %[[C1_0]] +; CHECK: store i32 %[[SEXT11]], ptr %[[C1_1]] +; CHECK: store i32 %[[SEXT12]], ptr %[[C1_2]] +; CHECK: store i32 %[[SEXT13]], ptr %[[C1_3]] +; CHECK: %arrayidx6 = getelementptr <4 x i32>, ptr %a, i64 2 +; CHECK: %[[A2_0:.+]] = getelementptr i32, ptr %arrayidx6, i32 0 +; CHECK: %[[A2_1:.+]] = getelementptr i32, ptr %arrayidx6, i32 1 +; CHECK: %[[A2_2:.+]] = getelementptr i32, ptr %arrayidx6, i32 2 +; CHECK: %[[A2_3:.+]] = getelementptr i32, ptr %arrayidx6, i32 3 +; CHECK: %[[LA2_0:.+]] = load i32, ptr %[[A2_0]] +; CHECK: %[[LA2_1:.+]] = load i32, ptr %[[A2_1]] +; CHECK: %[[LA2_2:.+]] = load i32, ptr %[[A2_2]] +; CHECK: %[[LA2_3:.+]] = load i32, ptr %[[A2_3]] +; CHECK: %[[CMP714:.+]] = icmp slt i32 %[[LA2_0]], 11 +; CHECK: %[[CMP715:.+]] = icmp slt i32 %[[LA2_1]], 12 +; CHECK: %[[CMP716:.+]] = icmp slt i32 %[[LA2_2]], 13 +; CHECK: %[[CMP717:.+]] = icmp slt i32 %[[LA2_3]], 14 +; CHECK: %[[SEXT818:.+]] = sext i1 %[[CMP714]] to i32 +; CHECK: %[[SEXT819:.+]] = sext i1 %[[CMP715]] to i32 +; CHECK: %[[SEXT820:.+]] = sext i1 %[[CMP716]] to i32 +; CHECK: %[[SEXT821:.+]] = sext i1 %[[CMP717]] to i32 +; CHECK: %arrayidx9 = getelementptr <4 x i32>, ptr %c, i64 2 +; CHECK: %[[C2_0:.+]] = getelementptr i32, ptr %arrayidx9, i32 0 +; CHECK: %[[C2_1:.+]] = getelementptr i32, ptr %arrayidx9, i32 1 +; CHECK: %[[C2_2:.+]] = getelementptr i32, ptr %arrayidx9, i32 2 +; CHECK: %[[C2_3:.+]] = getelementptr i32, ptr %arrayidx9, i32 3 +; CHECK: store i32 %[[SEXT818]], ptr %[[C2_0]] +; CHECK: store i32 %[[SEXT819]], ptr %[[C2_1]] +; CHECK: store i32 %[[SEXT820]], ptr %[[C2_2]] +; CHECK: store i32 %[[SEXT821]], ptr %[[C2_3]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll new file mode 100644 index 0000000000000..74bc119bb130c --- /dev/null +++ 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll @@ -0,0 +1,67 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_instructions -vecz-passes=scalarize -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_instructions(<4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c) { +entry: + %0 = load <4 x i32>, <4 x i32>* %a, align 16 + %1 = load <4 x i32>, <4 x i32>* %b, align 16 + %add = add <4 x i32> %1, %0 + store <4 x i32> %add, <4 x i32>* %c, align 16 + %arrayidx3 = getelementptr inbounds <4 x i32>, <4 x i32>* %a, i64 1 + %2 = load <4 x i32>, <4 x i32>* %arrayidx3, align 16 + %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32>* %b, i64 1 + %3 = load <4 x i32>, <4 x i32>* %arrayidx4, align 16 + %cmp = icmp sgt <4 x i32> %2, %3 + %sext = sext <4 x i1> %cmp to <4 x i32> + %arrayidx5 = getelementptr inbounds <4 x i32>, <4 x i32>* %c, i64 1 + store <4 x i32> %sext, <4 x i32>* %arrayidx5, align 16 + %arrayidx6 = getelementptr inbounds <4 x i32>, <4 x i32>* %a, i64 2 + %4 = load <4 x i32>, <4 x i32>* %arrayidx6, align 16 + %cmp7 = icmp slt <4 x i32> %4, <i32 11, i32 12, i32 13, i32 14> + %sext8 = sext <4 x i1> %cmp7 to <4 x i32> + %arrayidx9 = getelementptr inbounds <4 x i32>, <4 x i32>* %c, i64 2 + store <4 x i32> %sext8, <4 x i32>* %arrayidx9, align 16 + ret void +} + +; Checks that this function gets vectorized; because every instruction is +; uniform, the process of vectorization makes no actual changes whatsoever! 
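+; Unlike scalarization_instructions.ll above, this kernel never calls +; __mux_get_global_id, so every value is uniform across work-items and the +; scalarize pass has nothing to split.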
+; CHECK: define spir_kernel void @__vecz_v4_test_instructions(ptr %a, ptr %b, ptr %c) +; CHECK: entry: +; CHECK: %[[LA:.+]] = load <4 x i32>, ptr %a, align 16 +; CHECK: %[[LB:.+]] = load <4 x i32>, ptr %b, align 16 +; CHECK: %[[ADD:.+]] = add <4 x i32> %[[LB]], %[[LA]] +; CHECK: store <4 x i32> %[[ADD]], ptr %c, align 16 +; CHECK: %[[A1:.+]] = getelementptr inbounds <4 x i32>, ptr %a, i64 1 +; CHECK: %[[LA1:.+]] = load <4 x i32>, ptr %[[A1]], align 16 +; CHECK: %[[B1:.+]] = getelementptr inbounds <4 x i32>, ptr %b, i64 1 +; CHECK: %[[LB1:.+]] = load <4 x i32>, ptr %[[B1]], align 16 +; CHECK: %[[CMP:.+]] = icmp sgt <4 x i32> %[[LA1]], %[[LB1]] +; CHECK: %[[SEXT:.+]] = sext <4 x i1> %[[CMP]] to <4 x i32> +; CHECK: %[[C1:.+]] = getelementptr inbounds <4 x i32>, ptr %c, i64 1 +; CHECK: store <4 x i32> %[[SEXT]], ptr %[[C1]], align 16 +; CHECK: %[[A2:.+]] = getelementptr inbounds <4 x i32>, ptr %a, i64 2 +; CHECK: %[[LA2:.+]] = load <4 x i32>, ptr %[[A2]], align 16 +; CHECK: %[[CMP7:.+]] = icmp slt <4 x i32> %[[LA2]], +; CHECK: %[[SEXT8:.+]] = sext <4 x i1> %[[CMP7]] to <4 x i32> +; CHECK: %[[C2:.+]] = getelementptr inbounds <4 x i32>, ptr %c, i64 2 +; CHECK: store <4 x i32> %[[SEXT8]], ptr %[[C2]], align 16 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll new file mode 100644 index 0000000000000..712271d2b12b3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -vecz-passes=scalarize -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s

+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+declare i64 @__mux_get_global_id(i32)

+declare <2 x float> @__vecz_b_masked_load4_Dv2_fPDv2_fDv2_b(<2 x float>*, <2 x i1>)
+declare void @__vecz_b_masked_store4_Dv2_fPDv2_fDv2_b(<2 x float>, <2 x float>*, <2 x i1>)

+define spir_kernel void @scalarize_masked_memops(<2 x float>* %pa, <2 x float>* %pz) {
+entry:
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %head = insertelement <2 x i64> poison, i64 %idx, i64 0
+  %splat = shufflevector <2 x i64> %head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %idxs = add <2 x i64> %splat, <i64 0, i64 1>
+  %mask = icmp slt <2 x i64> %idxs, <i64 8, i64 8>
+  %aptr = getelementptr <2 x float>, <2 x float>* %pa, i64 %idx
+  %ld = call <2 x float> @__vecz_b_masked_load4_Dv2_fPDv2_fDv2_b(<2 x float>* %aptr, <2 x i1> %mask)
+  %zptr = getelementptr <2 x float>, <2 x float>* %pz, i64 %idx
+  call void @__vecz_b_masked_store4_Dv2_fPDv2_fDv2_b(<2 x float> %ld, <2 x float>* %zptr, <2 x i1> %mask)
+  ret void
+  ; CHECK: %idx = call i64 @__mux_get_global_id(i32 0)
+  ; CHECK: %[[IDXS0:.*]] = add i64 %idx, 0
+  ; CHECK: %[[IDXS1:.*]] = add i64 %idx, 1
+  ; CHECK: %[[MASK0:.*]] = icmp slt i64 %[[IDXS0]], 8
+  ; CHECK: %[[MASK1:.*]] = icmp slt i64 %[[IDXS1]], 8
+  ; CHECK: %aptr = getelementptr <2 x float>, ptr %pa, i64 %idx
+  ; CHECK: %[[TMP1:.*]] = getelementptr float, ptr %aptr, i32 0
+  ; CHECK: %[[TMP2:.*]] = getelementptr float, ptr %aptr, i32 1
+  ; CHECK: %[[TMP3:.*]] = call float @__vecz_b_masked_load4_fu3ptrb(ptr %[[TMP1]], i1 %[[MASK0]])
+  ; CHECK: %[[TMP4:.*]] = call float @__vecz_b_masked_load4_fu3ptrb(ptr %[[TMP2]], i1 %[[MASK1]])
+  ; CHECK: %zptr = getelementptr <2 x float>, ptr %pz, i64 %idx
+  ; CHECK: %[[TMP6:.*]] = getelementptr float, ptr %zptr, i32 0
+  ; CHECK: %[[TMP7:.*]] = getelementptr float, ptr %zptr, i32 1
+  ; CHECK: call void @__vecz_b_masked_store4_fu3ptrb(float %[[TMP3]], ptr %[[TMP6]], i1 %[[MASK0]])
+  ; CHECK: call void @__vecz_b_masked_store4_fu3ptrb(float %[[TMP4]], ptr %[[TMP7]], i1 %[[MASK1]])
+  ; CHECK: ret void

+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-bitcast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-bitcast.ll
new file mode 100644
index 0000000000000..443104d84af75
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-bitcast.ll
@@ -0,0 +1,135 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define dso_local spir_kernel void @bitcast1(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %gid = tail call i64 @__mux_get_global_id(i32 noundef 0) + %pin = getelementptr inbounds <2 x float>, ptr addrspace(1) %in, i64 %gid + %pout = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %gid + %0 = load <2 x float>, ptr addrspace(1) %pin, align 4 + %1 = bitcast <2 x float> %0 to <4 x half> + store <4 x half> %1, ptr addrspace(1) %pout, align 4 + ret void +} + +; CHECK-LABEL: define{{.*}}spir_kernel void @__vecz_v4_bitcast1 +; CHECK: [[A0:%.+]] = load float, +; CHECK-NEXT: [[C0:%.+]] = load float, +; CHECK-NEXT: [[A1:%.+]] = bitcast float [[A0]] to i32 +; CHECK-NEXT: [[A2:%.+]] = trunc i32 [[A1]] to i16 +; CHECK-NEXT: [[A3:%.+]] = bitcast i16 [[A2]] to half +; CHECK-NEXT: [[B1:%.+]] = bitcast float [[A0]] to i32 +; CHECK-NEXT: [[B2:%.+]] = lshr i32 [[B1]], 16 +; CHECK-NEXT: [[B3:%.+]] = trunc i32 [[B2]] to i16 +; CHECK-NEXT: [[B4:%.+]] = bitcast i16 [[B3]] to half +; CHECK-NEXT: [[C1:%.+]] = bitcast float [[C0]] to i32 +; CHECK-NEXT: [[C2:%.+]] = trunc i32 [[C1]] to i16 +; CHECK-NEXT: [[C3:%.+]] = bitcast i16 [[C2]] to half +; CHECK-NEXT: [[D1:%.+]] = bitcast float [[C0]] to i32 +; CHECK-NEXT: [[D2:%.+]] = lshr i32 [[D1]], 16 +; CHECK-NEXT: [[D3:%.+]] = trunc i32 [[D2]] to i16 +; CHECK-NEXT: [[D4:%.+]] = bitcast i16 [[D3]] to half +; CHECK: store half [[A3]], +; CHECK-NEXT: store half [[B4]], +; CHECK-NEXT: store half [[C3]], +; CHECK-NEXT: store half [[D4]], +; CHECK-NEXT: ret void + +define dso_local spir_kernel void @bitcast2(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %gid = tail call i64 @__mux_get_global_id(i32 noundef 0) + %pin = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %gid + %pout = getelementptr inbounds <2 x float>, ptr addrspace(1) %out, i64 %gid + %0 = load <4 x half>, ptr addrspace(1) %pin, align 4 + %1 = bitcast <4 x half> %0 to <2 x float> + store <2 x float> %1, ptr addrspace(1) %pout, align 4 + ret void +} + +; CHECK-LABEL: define{{.*}}spir_kernel void @__vecz_v4_bitcast2 +; CHECK: [[A0:%.+]] = load half, +; CHECK-NEXT: [[B0:%.+]] = load half, +; CHECK-NEXT: [[C0:%.+]] = load half, +; CHECK-NEXT: [[D0:%.+]] = load half, +; CHECK-NEXT: [[A1:%.+]] = bitcast half [[A0]] to i16 +; CHECK-NEXT: [[A2:%.+]] = zext i16 [[A1]] to i32 +; CHECK-NEXT: [[B1:%.+]] = bitcast half [[B0]] to i16 +; CHECK-NEXT: [[B2:%.+]] = zext i16 [[B1]] to i32 +; CHECK-NEXT: [[B3:%.+]] = shl i32 [[B2]], 16 +; CHECK-NEXT: [[AB4:%.+]] = or i32 [[A2]], [[B3]] +; CHECK-NEXT: [[AB5:%.+]] = bitcast i32 [[AB4]] to float +; CHECK-NEXT: [[C1:%.+]] = bitcast half [[C0]] to i16 +; CHECK-NEXT: [[C2:%.+]] = zext i16 [[C1]] to i32 +; CHECK-NEXT: [[D1:%.+]] = bitcast half [[D0]] to i16 +; CHECK-NEXT: [[D2:%.+]] = zext i16 [[D1]] to i32 +; CHECK-NEXT: [[D3:%.+]] = shl i32 [[D2]], 16 +; CHECK-NEXT: [[CD4:%.+]] = or i32 [[C2]], [[D3]] +; CHECK-NEXT: [[CD5:%.+]] = bitcast i32 [[CD4]] to float +; CHECK: store float [[AB5]], +; CHECK-NEXT: store float [[CD5]], +; CHECK-NEXT: ret void + +define dso_local spir_kernel void @bitcast3(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %gid = tail call i64 @__mux_get_global_id(i32 noundef 0) + %pin = getelementptr 
inbounds <2 x i32>, ptr addrspace(1) %in, i64 %gid
+  %pout = getelementptr inbounds <2 x float>, ptr addrspace(1) %out, i64 %gid
+  %0 = load <2 x i32>, ptr addrspace(1) %pin, align 4
+  %1 = bitcast <2 x i32> %0 to <2 x float>
+  store <2 x float> %1, ptr addrspace(1) %pout, align 4
+  ret void
+}

+; CHECK-LABEL: define{{.*}}spir_kernel void @__vecz_v4_bitcast3
+; CHECK: [[A0:%.+]] = load i32,
+; CHECK-NEXT: [[B0:%.+]] = load i32,
+; CHECK-NEXT: [[A1:%.+]] = bitcast i32 [[A0]] to float
+; CHECK-NEXT: [[B1:%.+]] = bitcast i32 [[B0]] to float
+; CHECK: store float [[A1]],
+; CHECK-NEXT: store float [[B1]],
+; CHECK-NEXT: ret void

+define dso_local spir_kernel void @bitcast4(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call i64 @__mux_get_global_id(i32 noundef 0)
+  %pin = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %pout = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %gid
+  %0 = load i32, ptr addrspace(1) %pin, align 4
+  %1 = insertelement <2 x i32> poison, i32 %0, i32 0
+  %2 = bitcast <2 x i32> %1 to <4 x i16>
+  %3 = shufflevector <4 x i16> %2, <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  store <4 x i16> %3, ptr addrspace(1) %pout, align 4
+  ret void
+}

+; CHECK-LABEL: define{{.*}}spir_kernel void @__vecz_v4_bitcast4
+; CHECK: [[A0:%.+]] = load i32,
+; CHECK-NEXT: [[A1:%.+]] = trunc i32 [[A0]] to i16
+; CHECK-NEXT: [[B0:%.+]] = lshr i32 %0, 16
+; CHECK-NEXT: [[B1:%.+]] = trunc i32 [[B0]] to i16
+; CHECK: store i16 [[A1]],
+; CHECK-NEXT: store i16 [[B1]],
+; CHECK-NEXT: store i16 [[A1]],
+; CHECK-NEXT: store i16 [[B1]],
+; CHECK-NEXT: ret void

+declare i64 @__mux_get_global_id(i32 noundef)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
new file mode 100644
index 0000000000000..7d361eaa47399
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
@@ -0,0 +1,55 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k splat -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define dso_local spir_kernel void @splat(i32 addrspace(1)* %data, i32 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 noundef 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %data, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %splat.splatinsert = insertelement <4 x i32> poison, i32 %0, i64 0 + %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + %add = add <4 x i32> %splat.splat, + %call1 = tail call spir_func i32 @not_scalarizable(<4 x i32> noundef %add) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32 noundef) +declare spir_func i32 @not_scalarizable(<4 x i32> noundef) + +; It checks that the scalarizer scalarizes the add and reconstructs the vector +; using insert element instructions to be consumed by the unscalarizable +; function. +; CHECK: void @__vecz_v4_splat({{.*}}) +; CHECK: entry: +; CHECK: %[[LD:.*]] = load i32 +; CHECK: %[[ADD0:.*]] = add i32 %[[LD]] +; CHECK: %[[ADD1:.*]] = add i32 %[[LD]] +; CHECK: %[[ADD2:.*]] = add i32 %[[LD]] +; CHECK: %[[ADD3:.*]] = add i32 %[[LD]] +; CHECK: %[[INS0:.*]] = insertelement <4 x i32> poison, i32 %[[ADD0]], i32 0 +; CHECK: %[[INS1:.+]] = insertelement <4 x i32> %[[INS0]], i32 %[[ADD1]], i32 1 +; CHECK: %[[INS2:.+]] = insertelement <4 x i32> %[[INS1]], i32 %[[ADD2]], i32 2 +; CHECK: %[[INS3:.+]] = insertelement <4 x i32> %[[INS2]], i32 %[[ADD3]], i32 3 +; CHECK-NOT: shufflevector <4 x i32> +; CHECK: %{{.*}} = tail call spir_func i32 @not_scalarizable(<4 x i32> noundef %[[INS3]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll new file mode 100644 index 0000000000000..b40ac87870871 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll @@ -0,0 +1,72 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k gep -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s

+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+define dso_local spir_kernel void @gep(ptr addrspace(1) %data, ptr addrspace(1) %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 noundef 0)
+  %ptrdata = getelementptr inbounds <2 x ptr>, ptr addrspace(1) %data, i64 %call
+  %ptrdatavec = load <2 x ptr addrspace(1)>, ptr addrspace(1) %ptrdata
+  %ptrdatavec.gep = getelementptr inbounds i32, <2 x ptr addrspace(1)> %ptrdatavec, i64 1
+  %vec1 = call <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)> %ptrdatavec, i32 16, <2 x i1> zeroinitializer, <2 x i32> zeroinitializer)
+  %vec2 = call <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)> %ptrdatavec.gep, i32 16, <2 x i1> zeroinitializer, <2 x i32> zeroinitializer)
+  %vec.add = add <2 x i32> %vec1, %vec2
+  %ptrout = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i64 %call
+  store <2 x i32> %vec.add, ptr addrspace(1) %ptrout
+  ret void
+}

+declare i64 @__mux_get_global_id(i32 noundef)

+declare <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)>, i32, <2 x i1>, <2 x i32>)

+; Full scalarization has not completely removed the vectors: ideally the gather
+; operations would have been replaced by non-vector loads. For now, check that
+; at least we do not crash.

+; CHECK: void @__vecz_v4_gep({{.*}})
+; CHECK: entry:
+; CHECK: %call = tail call i64 @__mux_get_global_id(i32 noundef 0)
+; CHECK: %ptrdata = getelementptr <2 x ptr>, ptr addrspace(1) %data, i64 %call
+; CHECK: %0 = getelementptr ptr addrspace(1), ptr addrspace(1) %ptrdata, i32 0
+; CHECK: %1 = getelementptr ptr addrspace(1), ptr addrspace(1) %ptrdata, i32 1
+; CHECK: %ptrdatavec1 = load ptr addrspace(1), ptr addrspace(1) %0, align 1
+; CHECK: %ptrdatavec2 = load ptr addrspace(1), ptr addrspace(1) %1, align 1
+; CHECK: %2 = insertelement <2 x ptr addrspace(1)> poison, ptr addrspace(1) %ptrdatavec1, i32 0
+; CHECK: %3 = insertelement <2 x ptr addrspace(1)> %2, ptr addrspace(1) %ptrdatavec2, i32 1
+; CHECK: %ptrdatavec.gep3 = getelementptr i32, ptr addrspace(1) %ptrdatavec1, i64 1
+; CHECK: %ptrdatavec.gep4 = getelementptr i32, ptr addrspace(1) %ptrdatavec2, i64 1
+; CHECK: %4 = insertelement <2 x ptr addrspace(1)> poison, ptr addrspace(1) %ptrdatavec.gep3, i32 0
+; CHECK: %5 = insertelement <2 x ptr addrspace(1)> %4, ptr addrspace(1) %ptrdatavec.gep4, i32 1
+; CHECK: %vec1 = call <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)> %3, i32 16, <2 x i1> zeroinitializer, <2 x i32> zeroinitializer)
+; CHECK: %6 = extractelement <2 x i32> %vec1, i32 0
+; CHECK: %7 = extractelement <2 x i32> %vec1, i32 1
+; CHECK: %vec2 = call <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)> %5, i32 16, <2 x i1> zeroinitializer, <2 x i32> zeroinitializer)
+; CHECK: %8 = extractelement <2 x i32> %vec2, i32 0
+; CHECK: %9 = extractelement <2 x i32> %vec2, i32 1
+; CHECK: %vec.add5 = add i32 %6, %8
+; CHECK: %vec.add6 = add i32 %7, %9
+; CHECK: %ptrout = getelementptr <2 x i32>, ptr addrspace(1) %out, i64 %call
+; CHECK: %10 = getelementptr i32, ptr addrspace(1) %ptrout, i32 0
+; CHECK: %11 = getelementptr i32, ptr addrspace(1) %ptrout, i32 1
+; CHECK: store i32 %vec.add5, ptr addrspace(1) %10, align 4
+; CHECK: store i32 %vec.add6, ptr addrspace(1) %11, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll new file mode 100644 index 0000000000000..4492b16c1c978 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k splat -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define dso_local spir_kernel void @splat(float addrspace(1)* %data, float addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 noundef 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %data, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %splat.splatinsert = insertelement <4 x float> poison, float %0, i64 0 + %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %call1 = tail call spir_func float @not_scalarizable(<4 x float> noundef %splat.splat) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32 noundef) +declare spir_func float @not_scalarizable(<4 x float> noundef) + +; It checks that the scalarizer turns the original vector splat back into a vector splat, +; instead of a series of insertelement instructions. 
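+;
+; For reference (a sketch; value names illustrative, not matched by the CHECK
+; lines): the canonical splat idiom the pass is expected to rebuild is
+;
+;   %ins = insertelement <4 x float> poison, float %ld, i64 0
+;   %splat = shufflevector <4 x float> %ins, <4 x float> poison, <4 x i32> zeroinitializer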
+; CHECK: void @__vecz_v4_splat({{.*}})
+; CHECK: entry:
+; CHECK: %[[LD:.*]] = load float
+; CHECK: %[[INS0:.*]] = insertelement <4 x float> poison, float %[[LD]], {{i32|i64}} 0
+; CHECK-NOT: %{{.*}} = insertelement <4 x float> %{{.*}}, float %[[LD]], {{i32|i64}} 1
+; CHECK-NOT: %{{.*}} = insertelement <4 x float> %{{.*}}, float %[[LD]], {{i32|i64}} 2
+; CHECK-NOT: %{{.*}} = insertelement <4 x float> %{{.*}}, float %[[LD]], {{i32|i64}} 3
+; CHECK: %[[SPLAT:.*]] = shufflevector <4 x float> %[[INS0]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK: %{{.*}} = tail call spir_func float @not_scalarizable(<4 x float> noundef %[[SPLAT]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
new file mode 100644
index 0000000000000..d7bbd4a2d9ed8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
@@ -0,0 +1,46 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+; RUN: veczc -k bar -vecz-simd-width=4 -S -o - %s | FileCheck %s

+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

+declare i64 @__mux_get_global_id(i32)

+define void @bar(i64** %ptrptrs, i64 %val) {
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %arrayidxa = getelementptr inbounds i64*, i64** %ptrptrs, i64 %idx
+  %ptrs = load i64*, i64** %arrayidxa, align 4
+  %addr = getelementptr inbounds i64, i64* %ptrs, <4 x i32> <i32 2, i32 2, i32 2, i32 2>

+  %elt0 = extractelement <4 x i64*> %addr, i32 0
+  %elt1 = extractelement <4 x i64*> %addr, i32 1
+  %elt2 = extractelement <4 x i64*> %addr, i32 2
+  %elt3 = extractelement <4 x i64*> %addr, i32 3

+  store i64 %val, i64* %elt0
+  store i64 %val, i64* %elt1
+  store i64 %val, i64* %elt2
+  store i64 %val, i64* %elt3
+  ret void
+}

+; Checks that the GEP with mixed scalar/vector operands in the kernel
+; gets scalarized/re-packetized correctly.

+; CHECK: define void @__vecz_v4_bar
+; CHECK: %[[ADDR:.+]] = getelementptr {{i64|i8}}, <4 x ptr> %{{.+}}, {{i64 2|i64 16}}
+; CHECK: call void @__vecz_b_scatter_store8_Dv4_mDv4_u3ptr(<4 x i64> %.splat{{.*}}, <4 x ptr> %[[ADDR]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
new file mode 100644
index 0000000000000..530b01b7a0d88
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
@@ -0,0 +1,197 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k scan_fact -vecz-passes=cfg-convert -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@scan_fact.temp = internal addrspace(3) global [16 x i32] poison, align 4 + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #0 + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_local_id(i32) #0 + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_local_size(i32) #0 + +; Function Attrs: convergent nounwind +define spir_kernel void @scan_fact(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +entry: + %call = call i64 @__mux_get_local_id(i32 0) #3 + %call1 = call i64 @__mux_get_global_id(i32 0) #3 + %call2 = call i64 @__mux_get_local_size(i32 0) #3 + %mul = shl i64 %call1, 1 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %mul + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %mul3 = shl i64 %call, 1 + %arrayidx4 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %mul3 + store i32 %0, i32 addrspace(3)* %arrayidx4, align 4 + %mul5 = shl i64 %call1, 1 + %add = or i64 %mul5, 1 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %add + %1 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %mul7 = shl i64 %call, 1 + %add8 = or i64 %mul7, 1 + %arrayidx9 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %add8 + store i32 %1, i32 addrspace(3)* %arrayidx9, align 4 + %mul10 = shl i64 %call, 1 + %add11 = or i64 %mul10, 1 + %arrayidx12 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %add11 + %2 = load i32, i32 addrspace(3)* %arrayidx12, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i64 [ 1, %entry ], [ %mul29, %for.inc ] + %mul13 = shl i64 %call2, 1 + %cmp = icmp ult i64 %storemerge, %mul13 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + call void @__mux_work_group_barrier(i32 1, i32 1, i32 272) #4 + %mul14 = shl i64 %call, 1 + %mul15 = mul i64 %storemerge, %mul14 + %mul16 = shl i64 %call2, 1 + %cmp17 = icmp ult i64 %mul15, %mul16 + br i1 %cmp17, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %mul18 = mul i64 %storemerge, 2 + %add19 = add i64 %mul15, -1 + %sub = add i64 %add19, %mul18 + %arrayidx20 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub + %3 = load i32, i32 addrspace(3)* %arrayidx20, align 4 + %add21 = add i64 %mul15, -1 + %sub22 = add i64 %add21, %storemerge + %arrayidx23 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub22 + %4 = load i32, i32 addrspace(3)* %arrayidx23, align 4 + %mul24 = mul nsw i32 %4, %3 + %mul25 = mul i64 %storemerge, 2 + %add26 = add i64 %mul15, -1 + %sub27 = add i64 %add26, %mul25 + 
%arrayidx28 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub27 + store i32 %mul24, i32 addrspace(3)* %arrayidx28, align 4 + br label %for.inc + +for.inc: ; preds = %if.then, %for.body + %mul29 = shl i64 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp30 = icmp eq i64 %call, 0 + br i1 %cmp30, label %if.then31, label %if.end35 + +if.then31: ; preds = %for.end + %mul32 = mul i64 %call2, 2 + %sub33 = add i64 %mul32, -1 + %arrayidx34 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub33 + store i32 1, i32 addrspace(3)* %arrayidx34, align 4 + br label %if.end35 + +if.end35: ; preds = %if.then31, %for.end + br label %for.cond37 + +for.cond37: ; preds = %for.inc62, %if.end35 + %storemerge1 = phi i64 [ %call2, %if.end35 ], [ %shr, %for.inc62 ] + %cmp38 = icmp eq i64 %storemerge1, 0 + call void @__mux_work_group_barrier(i32 1, i32 1, i32 272) #4 + %mul64 = shl i64 %call, 1 + br i1 %cmp38, label %for.end63, label %for.body39 + +for.body39: ; preds = %for.cond37 + %mul42 = mul i64 %storemerge1, %mul64 + %mul43 = shl i64 %call2, 1 + %cmp44 = icmp ult i64 %mul42, %mul43 + br i1 %cmp44, label %if.then45, label %for.inc62 + +if.then45: ; preds = %for.body39 + %add46 = add i64 %mul42, -1 + %sub47 = add i64 %add46, %storemerge1 + %arrayidx48 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub47 + %5 = load i32, i32 addrspace(3)* %arrayidx48, align 4 + %mul49 = mul i64 %storemerge1, 2 + %add50 = add i64 %mul42, -1 + %sub51 = add i64 %add50, %mul49 + %arrayidx52 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub51 + %6 = load i32, i32 addrspace(3)* %arrayidx52, align 4 + %add53 = add i64 %mul42, -1 + %sub54 = add i64 %add53, %storemerge1 + %arrayidx55 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub54 + store i32 %6, i32 addrspace(3)* %arrayidx55, align 4 + %mul56 = mul nsw i32 %6, %5 + %mul57 = mul i64 %storemerge1, 2 + %add58 = add i64 %mul42, -1 + %sub59 = add i64 %add58, %mul57 + %arrayidx60 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub59 + store i32 %mul56, i32 addrspace(3)* %arrayidx60, align 4 + br label %for.inc62 + +for.inc62: ; preds = %if.then45, %for.body39 + %shr = lshr i64 %storemerge1, 1 + br label %for.cond37 + +for.end63: ; preds = %for.cond37 + %add65 = or i64 %mul64, 1 + %arrayidx66 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %add65 + %7 = load i32, i32 addrspace(3)* %arrayidx66, align 4 + %mul67 = shl i64 %call1, 1 + %arrayidx68 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %mul67 + store i32 %7, i32 addrspace(1)* %arrayidx68, align 4 + %sub69 = add i64 %call2, -1 + %cmp70 = icmp eq i64 %call, %sub69 + br i1 %cmp70, label %if.then71, label %if.else + +if.then71: ; preds = %for.end63 + %mul72 = shl i64 %call, 1 + %add73 = or i64 %mul72, 1 + %arrayidx74 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %add73 + %8 = load i32, i32 addrspace(3)* %arrayidx74, align 4 + %mul75 = mul nsw i32 %8, %2 + %mul76 = shl i64 %call1, 1 + %add77 = or i64 %mul76, 1 + %arrayidx78 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %add77 + store i32 %mul75, i32 addrspace(1)* %arrayidx78, align 4 + br label %if.end85 + +if.else: ; preds = %for.end63 + %mul79 = mul i64 %call, 2 + %add80 = add i64 %mul79, 2 + 
%arrayidx81 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %add80
+  %9 = load i32, i32 addrspace(3)* %arrayidx81, align 4
+  %mul82 = shl i64 %call1, 1
+  %add83 = or i64 %mul82, 1
+  %arrayidx84 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %add83
+  store i32 %9, i32 addrspace(1)* %arrayidx84, align 4
+  br label %if.end85

+if.end85:                                         ; preds = %if.else, %if.then71
+  ret void
+}

+declare void @__mux_work_group_barrier(i32, i32, i32)

+; The purpose of this test is simply to make sure we manage to vectorize it.
+; Previously we would not, because a phi node of a uniform loop had an
+; incoming value from a divergent block, even though all the incoming values
+; of the phi node were the same. We would thus consider the phi node varying,
+; which made the loop divergent, with a barrier in it.

+; CHECK: spir_kernel void @__vecz_v4_scan_fact
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
new file mode 100644
index 0000000000000..e82e58b6ac662
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
@@ -0,0 +1,49 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_global_id(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %gid = call i32 @__mux_get_global_id(i32 0) + %and = and i32 %gid, 1 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if, label %early_ret + +early_ret: +; just to prevent ROSCC from sticking its oar in + %gid1 = call i32 @__mux_get_global_id(i32 1) + ret void + +if: + %single_load = load i32, i32 addrspace(1)* %in + %single_add = add i32 %single_load, 42 + store i32 %single_add, i32 addrspace(1)* %in + + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %[[BITCAST:.*]] = bitcast <4 x i1> %cmp{{[0-9]*}} to i4 +; CHECK: %[[MASK:.*]] = icmp ne i4 %[[BITCAST]], 0 +; CHECK: %[[single_load:single_load[0-9]*]] = call i32 @__vecz_b_masked_load4_ju3ptrU3AS1b(ptr addrspace(1) %in, i1 %[[MASK]]) +; CHECK: %[[single_add:single_add[0-9]*]] = add i32 %[[single_load]], 42 +; CHECK: call void @__vecz_b_masked_store4_ju3ptrU3AS1b(i32 %[[single_add]], ptr addrspace(1) %in, i1 %[[MASK]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll new file mode 100644 index 0000000000000..0b4377802877b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll @@ -0,0 +1,93 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-fail-quietly -k test -vecz-passes="cfg-convert" -S < %s + +; This tests only that the kernel does not crash the vectorizer. 
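+; Note that the RUN line does not pipe the output into FileCheck, and
+; -vecz-fail-quietly presumably allows vectorization to fail without a hard
+; error, so the test passes as long as veczc exits cleanly.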
+ +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @test(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %e, %entry + %n.off = add i32 %n, -1 + %0 = icmp ult i32 %n.off, 4 + %cmp6 = icmp slt i32 %n, 3 + %or.cond1 = or i1 %cmp6, %0 + br i1 %or.cond1, label %f, label %if.else + +while.body5: ; preds = %d + switch i32 %n, label %g [ + i32 3, label %if.else + i32 2, label %h + ] + +if.else: ; preds = %while.body5, %while.body + %cmp9 = icmp sge i32 %conv, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + %or.cond2 = or i1 %tobool, %cmp9 + br i1 %or.cond2, label %d, label %h + +d: ; preds = %if.else + %cmp16 = icmp sgt i32 %n, 3 + br i1 %cmp16, label %e, label %while.body5 + +e: ; preds = %d + %and20 = and i32 %n, 1 + %tobool21 = icmp eq i32 %and20, 0 + br i1 %tobool21, label %while.body, label %g + +f: ; preds = %while.body + %cmp24 = icmp eq i32 %n, 2 + br i1 %cmp24, label %h, label %g + +g: ; preds = %f, %e, %while.body5 + br label %for.cond + +for.cond: ; preds = %for.body, %g + %ret.0 = phi i32 [ 0, %g ], [ %inc, %for.body ] + %storemerge = phi i32 [ 0, %g ], [ %inc31, %for.body ] + %cmp29 = icmp sgt i32 %storemerge, %n + br i1 %cmp29, label %h, label %for.body + +for.body: ; preds = %for.cond + %inc = add nuw nsw i32 %ret.0, 1 + %inc31 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +h: ; preds = %for.cond, %f, %if.else, %while.body5 + %ret.1 = phi i32 [ 0, %f ], [ %ret.0, %for.cond ], [ 0, %if.else ], [ 0, %while.body5 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.1, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll new file mode 100644 index 0000000000000..2728251ca02b3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll @@ -0,0 +1,54 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not 
use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %idx.ext = sext i32 %mul3 to i64 + %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idx.ext + %0 = load i8, i8 addrspace(1)* %add.ptr, align 1 + %arrayidx4 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr, i64 1 + %1 = load i8, i8 addrspace(1)* %arrayidx4, align 1 + %add7 = add i8 %1, %0 + %idxprom = sext i32 %add to i64 + %arrayidx11 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom + store i8 %add7, i8 addrspace(1)* %arrayidx11, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @load16 +; CHECK: load <4 x i8> +; CHECK: load <4 x i8> +; CHECK-NOT: load <4 x i8> +; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load +; CHECK: shufflevector <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK-NOT: shufflevector <4 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll new file mode 100644 index 0000000000000..b1082899dce4d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll @@ -0,0 +1,55 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %conv4 = sext i32 %mul3 to i64 + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %conv4 + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %add5 = or i64 %conv4, 1 + %arrayidx6 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %add5 + %1 = load i8, i8 addrspace(1)* %arrayidx6, align 1 + %add9 = add i8 %1, %0 + %idxprom = sext i32 %add to i64 + %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom + store i8 %add9, i8 addrspace(1)* %arrayidx13, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @load16 +; CHECK: load <4 x i8> +; CHECK: load <4 x i8> +; CHECK-NOT: load <4 x i8> +; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load +; CHECK: shufflevector <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK-NOT: shufflevector <4 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll new file mode 100644 index 0000000000000..373c37fb20114 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %idxprom = sext i32 %mul3 to i64 + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %add7 = or i32 %mul3, 1 + %idxprom8 = sext i32 %add7 to i64 + %arrayidx9 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom8 + %1 = load i8, i8 addrspace(1)* %arrayidx9, align 1 + %add12 = add i8 %1, %0 + %idxprom16 = sext i32 %add to i64 + %arrayidx17 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom16 + store i8 %add12, i8 addrspace(1)* %arrayidx17, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @load16 +; CHECK: load <4 x i8> +; CHECK: load <4 x i8> +; CHECK-NOT: load <4 x i8> +; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load +; CHECK: shufflevector <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK-NOT: shufflevector <4 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll new file mode 100644 index 0000000000000..240d52a220cda --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %add4 = or i32 %mul3, 1 + %idxprom = sext i32 %add4 to i64 + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %idxprom8 = sext i32 %mul3 to i64 + %arrayidx9 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom8 + %1 = load i8, i8 addrspace(1)* %arrayidx9, align 1 + %sub = sub i8 %0, %1 + %idxprom15 = sext i32 %add to i64 + %arrayidx16 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom15 + store i8 %sub, i8 addrspace(1)* %arrayidx16, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @load16 +; CHECK: load <4 x i8> +; CHECK: load <4 x i8> +; CHECK-NOT: load <4 x i8> +; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load +; CHECK: shufflevector <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK-NOT: shufflevector <4 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll new file mode 100644 index 0000000000000..23533d2130155 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll @@ -0,0 +1,70 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %idxprom = sext i32 %mul3 to i64 + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %add7 = or i32 %mul3, 1 + %idxprom8 = sext i32 %add7 to i64 + %arrayidx9 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom8 + %1 = load i8, i8 addrspace(1)* %arrayidx9, align 1 + %add13 = add nsw i32 %mul3, 2 + %idxprom14 = sext i32 %add13 to i64 + %arrayidx15 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom14 + %2 = load i8, i8 addrspace(1)* %arrayidx15, align 1 + %add19 = add nsw i32 %mul3, 3 + %idxprom20 = sext i32 %add19 to i64 + %arrayidx21 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom20 + %3 = load i8, i8 addrspace(1)* %arrayidx21, align 1 + %add24 = add i8 %1, %0 + %add26 = add i8 %add24, %2 + %add28 = add i8 %add26, %3 + %idxprom32 = sext i32 %add to i64 + %arrayidx33 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom32 + store i8 %add28, i8 addrspace(1)* %arrayidx33, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @load16 +; CHECK: load <4 x i8> +; CHECK: load <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK: load <4 x i8> +; CHECK: load <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK-NOT: load <4 x i8> +; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load +; CHECK-NOT: shufflevector <4 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll new file mode 100644 index 0000000000000..12b0cca975cd0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll @@ -0,0 +1,58 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -S < %s | FileCheck %s

+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"

+define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %mul, %conv
+  %mul3 = shl nsw i32 %add, 1
+  %add4 = add nsw i32 %mul3, 3
+  %idxprom = sext i32 %add4 to i64
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom
+  %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
+  %shl = shl i8 %0, 1
+  %add10 = add nsw i32 %mul3, 2
+  %idxprom11 = sext i32 %add10 to i64
+  %arrayidx12 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom11
+  %1 = load i8, i8 addrspace(1)* %arrayidx12, align 1
+  %sub = sub i8 %shl, %1
+  %idxprom18 = sext i32 %add to i64
+  %arrayidx19 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom18
+  store i8 %sub, i8 addrspace(1)* %arrayidx19, align 1
+  ret void
+}

+declare i64 @__mux_get_global_id(i32)

+; CHECK: spir_kernel void @load16
+; CHECK: load <4 x i8>
+; CHECK: load <4 x i8>
+; CHECK-NOT: load <4 x i8>
+; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load
+; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load
+; CHECK: shufflevector <4 x i8>
+; CHECK: shufflevector <4 x i8>
+; CHECK-NOT: shufflevector <4 x i8>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/simplify-masked-memops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/simplify-masked-memops.ll
new file mode 100644
index 0000000000000..b28b347ade826
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/simplify-masked-memops.ll
@@ -0,0 +1,42 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k foo -vecz-passes=mask-memops -S < %s | FileCheck %s

+define void @foo(i16 %x, i32 %y, ptr addrspace(1) %p) {
+entry:
+  call void @__vecz_b_masked_store2_tu3ptrU3AS1b(i16 %x, ptr addrspace(1) %p, i1 true)
+  call void @__vecz_b_masked_store2_ju3ptrU3AS1b(i32 %y, ptr addrspace(1) %p, i1 true)
+  %f = call float @__vecz_b_masked_load2_fu3ptrU3AS1b(ptr addrspace(1) %p, i1 true)
+  %v4f = call <4 x float> @__vecz_b_masked_load2_Dv4_fu3ptrU3AS1Dv4_b(ptr addrspace(1) %p, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}

+; Check we correctly set the alignment on the optimized loads and stores. The
+; alignment must come from the builtin, not from the natural/preferred
+; alignment for that type.
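+;
+; For example, %v4f above comes from __vecz_b_masked_load2_Dv4_fu3ptrU3AS1Dv4_b:
+; judging by the builtin names, the digit after "load"/"store" encodes the
+; access alignment in bytes (2 here), which is why the CHECK lines below expect
+; every optimized access to carry "align 2", even though a <4 x float> load
+; would naturally prefer a 16-byte alignment.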
+; CHECK: define void @__vecz_v4_foo(i16 %x, i32 %y, ptr addrspace(1) %p) +; CHECK: entry: +; CHECK: store i16 %x, ptr addrspace(1) %p, align 2 +; CHECK-NEXT: store i32 %y, ptr addrspace(1) %p, align 2 +; CHECK-NEXT: %f = load float, ptr addrspace(1) %p, align 2 +; CHECK-NEXT: %v4f = load <4 x float>, ptr addrspace(1) %p, align 2 +; CHECK-NEXT: ret void + +declare void @__vecz_b_masked_store2_tu3ptrU3AS1b(i16, ptr addrspace(1), i1) +declare void @__vecz_b_masked_store2_ju3ptrU3AS1b(i32, ptr addrspace(1), i1) +declare float @__vecz_b_masked_load2_fu3ptrU3AS1b(ptr addrspace(1), i1) +declare <4 x float> @__vecz_b_masked_load2_Dv4_fu3ptrU3AS1Dv4_b(ptr addrspace(1), <4 x i1>) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll new file mode 100644 index 0000000000000..dc7c2fed68520 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll @@ -0,0 +1,69 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @squash(<4 x i8> addrspace(1)* %data, i32 addrspace(1)* %output) #0 { +entry: + %gid = call i64 @__mux_get_global_id(i64 0) #2 + %data.ptr = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %data, i64 %gid + %data.ld = load <4 x i8>, <4 x i8> addrspace(1)* %data.ptr, align 8 + %ele0 = extractelement <4 x i8> %data.ld, i32 0 + %ele1 = extractelement <4 x i8> %data.ld, i32 1 + %ele2 = extractelement <4 x i8> %data.ld, i32 2 + %ele3 = extractelement <4 x i8> %data.ld, i32 3 + %zext0 = sext i8 %ele0 to i32 + %zext1 = sext i8 %ele1 to i32 + %zext2 = sext i8 %ele2 to i32 + %zext3 = sext i8 %ele3 to i32 + %sum1 = add i32 %zext0, %zext1 + %sum2 = xor i32 %sum1, %zext2 + %sum3 = and i32 %sum2, %zext3 + %output.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %gid + store i32 %sum3, i32 addrspace(1)* %output.ptr, align 8 + ret void +} + +declare i64 @__mux_get_global_id(i64) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" 
"stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind } + +; It checks that the <4 x i8> is converted into a i32 and uses shifts +; to implement the extract elements and sexts. +; +; CHECK: void @__vecz_v4_squash +; CHECK: %[[DATA:.+]] = load <16 x i8> +; CHECK-NOT: shufflevector +; CHECK: %[[FREEZE:.+]] = freeze <16 x i8> %[[DATA]] +; CHECK: %[[SQUASH:.+]] = bitcast <16 x i8> %[[FREEZE]] to <4 x i32> +; CHECK: %[[EXTR0:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 24(, )?)+>|splat \(i32 24\)}} +; CHECK: %[[SEXT0:.+]] = ashr <4 x i32> %[[EXTR0]], {{<(i32 24(, )?)+>|splat \(i32 24\)}} +; CHECK: %[[EXTR1:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 16(, )?)+>|splat \(i32 16\)}} +; CHECK: %[[SEXT1:.+]] = ashr <4 x i32> %[[EXTR1]], {{<(i32 24(, )?)+>|splat \(i32 24\)}} +; CHECK: %[[EXTR2:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 8(, )?)+>|splat \(i32 8\)}} +; CHECK: %[[SEXT2:.+]] = ashr <4 x i32> %[[EXTR2]], {{<(i32 24(, )?)+>|splat \(i32 24\)}} +; CHECK: %[[SEXT3:.+]] = ashr <4 x i32> %[[SQUASH]], {{<(i32 24(, )?)+>|splat \(i32 24\)}} +; CHECK: %[[SUM1:.+]] = add <4 x i32> %[[SEXT0]], %[[SEXT1]] +; CHECK: %[[SUM2:.+]] = xor <4 x i32> %[[SUM1]], %[[SEXT2]] +; CHECK: %[[SUM3:.+]] = and <4 x i32> %[[SUM2]], %[[SEXT3]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll new file mode 100644 index 0000000000000..c329b342b5835 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll @@ -0,0 +1,69 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | FileCheck %s

+; ModuleID = 'kernel.opencl'
+target datalayout = "E-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+; Function Attrs: nounwind
+define spir_kernel void @squash(<4 x i8> addrspace(1)* %data, i32 addrspace(1)* %output) #0 {
+entry:
+  %gid = call i64 @__mux_get_global_id(i64 0) #2
+  %data.ptr = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %data, i64 %gid
+  %data.ld = load <4 x i8>, <4 x i8> addrspace(1)* %data.ptr, align 8
+  %ele0 = extractelement <4 x i8> %data.ld, i32 3
+  %ele1 = extractelement <4 x i8> %data.ld, i32 2
+  %ele2 = extractelement <4 x i8> %data.ld, i32 1
+  %ele3 = extractelement <4 x i8> %data.ld, i32 0
+  %zext0 = sext i8 %ele0 to i32
+  %zext1 = sext i8 %ele1 to i32
+  %zext2 = sext i8 %ele2 to i32
+  %zext3 = sext i8 %ele3 to i32
+  %sum1 = add i32 %zext0, %zext1
+  %sum2 = xor i32 %sum1, %zext2
+  %sum3 = and i32 %sum2, %zext3
+  %output.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %gid
+  store i32 %sum3, i32 addrspace(1)* %output.ptr, align 8
+  ret void
+}

+declare i64 @__mux_get_global_id(i64) #1

+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }

+; This test checks that the <4 x i8> is converted into an i32, using shifts to
+; implement the extractelement and sext operations.
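+;
+; An approximate OpenCL C source for the kernel above (a hypothetical
+; reconstruction for readability; only the IR is authoritative):
+;
+;   __kernel void squash(__global char4 *data, __global int *output) {
+;     size_t gid = get_global_id(0);
+;     char4 d = data[gid];
+;     output[gid] = (((int)d.s3 + (int)d.s2) ^ (int)d.s1) & (int)d.s0;
+;   }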
+;
+; CHECK: void @__vecz_v4_squash
+; CHECK: %[[DATA:.+]] = load <16 x i8>
+; CHECK-NOT: shufflevector
+; CHECK: %[[FREEZE:.+]] = freeze <16 x i8> %[[DATA]]
+; CHECK: %[[SQUASH:.+]] = bitcast <16 x i8> %[[FREEZE]] to <4 x i32>
+; CHECK: %[[EXTR0:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK: %[[SEXT0:.+]] = ashr <4 x i32> %[[EXTR0]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK: %[[EXTR1:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 16(, )?)+>|splat \(i32 16\)}}
+; CHECK: %[[SEXT1:.+]] = ashr <4 x i32> %[[EXTR1]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK: %[[EXTR2:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 8(, )?)+>|splat \(i32 8\)}}
+; CHECK: %[[SEXT2:.+]] = ashr <4 x i32> %[[EXTR2]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK: %[[SEXT3:.+]] = ashr <4 x i32> %[[SQUASH]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK: %[[SUM1:.+]] = add <4 x i32> %[[SEXT0]], %[[SEXT1]]
+; CHECK: %[[SUM2:.+]] = xor <4 x i32> %[[SUM1]], %[[SEXT2]]
+; CHECK: %[[SUM3:.+]] = and <4 x i32> %[[SUM2]], %[[SEXT3]]
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
new file mode 100644
index 0000000000000..94e72dc92e09f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
@@ -0,0 +1,122 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -vecz-passes=squash-small-vecs -S < %s | FileCheck %s

+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+; This test checks that the <4 x i8> is converted into an i32, using shifts and
+; masks to implement the extractelement and zext operations.
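+;
+; An approximate OpenCL C source for the first kernel below (again a
+; hypothetical reconstruction; only the IR is authoritative):
+;
+;   __kernel void squashv4i8(__global uchar4 *data, __global uint *output) {
+;     size_t gid = get_global_id(0);
+;     uchar4 d = data[gid];
+;     output[gid] = (((uint)d.s0 + (uint)d.s1) ^ (uint)d.s2) & (uint)d.s3;
+;   }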
+; CHECK: void @__vecz_v4_squashv4i8( +; CHECK: %[[DATA:.+]] = load <4 x i8> +; CHECK: %[[FREEZE:.+]] = freeze <4 x i8> %[[DATA]] +; CHECK: %[[SQUASH:.+]] = bitcast <4 x i8> %[[FREEZE]] to i32 +; CHECK: %[[ZEXT0:.+]] = and i32 %[[SQUASH]], 255 +; CHECK: %[[EXTR1:.+]] = lshr i32 %[[SQUASH]], 8 +; CHECK: %[[ZEXT1:.+]] = and i32 %[[EXTR1]], 255 +; CHECK: %[[EXTR2:.+]] = lshr i32 %[[SQUASH]], 16 +; CHECK: %[[ZEXT2:.+]] = and i32 %[[EXTR2]], 255 +; CHECK: %[[EXTR3:.+]] = lshr i32 %[[SQUASH]], 24 +; CHECK: %[[ZEXT3:.+]] = and i32 %[[EXTR3]], 255 +; CHECK: %[[SUM1:.+]] = add i32 %[[ZEXT0]], %[[ZEXT1]] +; CHECK: %[[SUM2:.+]] = xor i32 %[[SUM1]], %[[ZEXT2]] +; CHECK: %[[SUM3:.+]] = and i32 %[[SUM2]], %[[ZEXT3]] +; CHECK: ret void +define spir_kernel void @squashv4i8(ptr addrspace(1) %data, ptr addrspace(1) %output) #0 { +entry: + %gid = call i64 @__mux_get_global_id(i64 0) #1 + %data.ptr = getelementptr inbounds <4 x i8>, ptr addrspace(1) %data, i64 %gid + %data.ld = load <4 x i8>, ptr addrspace(1) %data.ptr, align 4 + %ele0 = extractelement <4 x i8> %data.ld, i32 0 + %ele1 = extractelement <4 x i8> %data.ld, i32 1 + %ele2 = extractelement <4 x i8> %data.ld, i32 2 + %ele3 = extractelement <4 x i8> %data.ld, i32 3 + %zext0 = zext i8 %ele0 to i32 + %zext1 = zext i8 %ele1 to i32 + %zext2 = zext i8 %ele2 to i32 + %zext3 = zext i8 %ele3 to i32 + %sum1 = add i32 %zext0, %zext1 + %sum2 = xor i32 %sum1, %zext2 + %sum3 = and i32 %sum2, %zext3 + %output.ptr = getelementptr inbounds i32, ptr addrspace(1) %output, i64 %gid + store i32 %sum3, ptr addrspace(1) %output.ptr, align 4 + ret void +} + +; CHECK: void @__vecz_v4_squashv2i32( +; CHECK: %[[DATA:.+]] = load <2 x i32> +; CHECK: %[[FREEZE:.+]] = freeze <2 x i32> %[[DATA]] +; CHECK: %[[SQUASH:.+]] = bitcast <2 x i32> %[[FREEZE]] to i64 +; CHECK: %[[ZEXT0:.+]] = and i64 %[[SQUASH]], 4294967295 +; CHECK: %[[EXTR1:.+]] = lshr i64 %[[SQUASH]], 32 +; CHECK: %[[ZEXT1:.+]] = and i64 %[[EXTR1]], 4294967295 +; CHECK: %[[SUM1:.+]] = add i64 %[[ZEXT0]], %[[ZEXT1]] +define spir_kernel void @squashv2i32(ptr addrspace(1) %data, ptr addrspace(1) %output) #0 { +entry: + %gid = call i64 @__mux_get_global_id(i64 0) #1 + %data.ptr = getelementptr inbounds <2 x i32>, ptr addrspace(1) %data, i64 %gid + %data.ld = load <2 x i32>, ptr addrspace(1) %data.ptr, align 4 + %ele0 = extractelement <2 x i32> %data.ld, i32 0 + %ele1 = extractelement <2 x i32> %data.ld, i32 1 + %zext0 = zext i32 %ele0 to i64 + %zext1 = zext i32 %ele1 to i64 + %sum = add i64 %zext0, %zext1 + %output.ptr = getelementptr inbounds i64, ptr addrspace(1) %output, i64 %gid + store i64 %sum, ptr addrspace(1) %output.ptr, align 4 + ret void +} + +; Check we don't squash vectors we consider too large. +; CHECK: void @__vecz_v4_squashv8i32( +; CHECK-NOT: bitcast +define spir_kernel void @squashv8i32(ptr addrspace(1) %data, ptr addrspace(1) %output) #0 { +entry: + %gid = call i64 @__mux_get_global_id(i64 0) #1 + %data.ptr = getelementptr inbounds <8 x i32>, ptr addrspace(1) %data, i64 %gid + %data.ld = load <8 x i32>, ptr addrspace(1) %data.ptr, align 32 + %ele0 = extractelement <8 x i32> %data.ld, i32 0 + %ele1 = extractelement <8 x i32> %data.ld, i32 1 + %zext0 = zext i32 %ele0 to i256 + %zext1 = zext i32 %ele1 to i256 + %sum = add i256 %zext0, %zext1 + %output.ptr = getelementptr inbounds i256, ptr addrspace(1) %output, i64 %gid + store i256 %sum, ptr addrspace(1) %output.ptr, align 32 + ret void +} + +; Check we don't squash vectors we consider too large. 
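+; (Like the <8 x i32> case above, <4 x i64> would squash to a 256-bit
+; integer; the kernels that do get squashed fit in an i32 or i64, so the
+; cut-off is presumably the width of a legal scalar integer.)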
+; CHECK: void @__vecz_v4_squashv4i64( +; CHECK-NOT: bitcast +define spir_kernel void @squashv4i64(ptr addrspace(1) %data, ptr addrspace(1) %output) #0 { +entry: + %gid = call i64 @__mux_get_global_id(i64 0) #1 + %data.ptr = getelementptr inbounds <4 x i64>, ptr addrspace(1) %data, i64 %gid + %data.ld = load <4 x i64>, ptr addrspace(1) %data.ptr, align 32 + %ele0 = extractelement <4 x i64> %data.ld, i32 0 + %ele1 = extractelement <4 x i64> %data.ld, i32 1 + %zext0 = zext i64 %ele0 to i256 + %zext1 = zext i64 %ele1 to i256 + %sum = add i256 %zext0, %zext1 + %output.ptr = getelementptr inbounds i256, ptr addrspace(1) %output, i64 %gid + store i256 %sum, ptr addrspace(1) %output.ptr, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i64) + +attributes #0 = { nounwind } +attributes #1 = { nobuiltin nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll new file mode 100644 index 0000000000000..e336a961b2576 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll @@ -0,0 +1,69 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | FileCheck %s

+; ModuleID = 'kernel.opencl'
+target datalayout = "E-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+; Function Attrs: nounwind
+define spir_kernel void @squash(<4 x i8> addrspace(1)* %data, i32 addrspace(1)* %output) #0 {
+entry:
+  %gid = call i64 @__mux_get_global_id(i64 0) #2
+  %data.ptr = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %data, i64 %gid
+  %data.ld = load <4 x i8>, <4 x i8> addrspace(1)* %data.ptr, align 8
+  %ele0 = extractelement <4 x i8> %data.ld, i32 3
+  %ele1 = extractelement <4 x i8> %data.ld, i32 2
+  %ele2 = extractelement <4 x i8> %data.ld, i32 1
+  %ele3 = extractelement <4 x i8> %data.ld, i32 0
+  %zext0 = zext i8 %ele0 to i32
+  %zext1 = zext i8 %ele1 to i32
+  %zext2 = zext i8 %ele2 to i32
+  %zext3 = zext i8 %ele3 to i32
+  %sum1 = add i32 %zext0, %zext1
+  %sum2 = xor i32 %sum1, %zext2
+  %sum3 = and i32 %sum2, %zext3
+  %output.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %gid
+  store i32 %sum3, i32 addrspace(1)* %output.ptr, align 8
+  ret void
+}

+declare i64 @__mux_get_global_id(i64) #1

+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }

+; This test checks that the <4 x i8> is converted into an i32, using shifts and
+; masks to implement the extractelement and zext operations.
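+;
+; Note that the kernel extracts elements 3, 2, 1, 0 and the datalayout is
+; big-endian ("E-..."), under which element 3 is the least-significant byte of
+; the bitcast value, so the expected shifts and masks below come out identical
+; to those in the little-endian squash_extract_zext.ll test.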
+; +; CHECK: void @__vecz_v4_squash +; CHECK: %[[DATA:.+]] = load <16 x i8> +; CHECK-NOT: shufflevector +; CHECK: %[[FREEZE:.+]] = freeze <16 x i8> %[[DATA]] +; CHECK: %[[SQUASH:.+]] = bitcast <16 x i8> %[[FREEZE]] to <4 x i32> +; CHECK: %[[ZEXT0:.+]] = and <4 x i32> %[[SQUASH]], {{<(i32 255(, )?)+>|splat \(i32 255\)}} +; CHECK: %[[EXTR1:.+]] = lshr <4 x i32> %[[SQUASH]], {{<(i32 8(, )?)+>|splat \(i32 8\)}} +; CHECK: %[[ZEXT1:.+]] = and <4 x i32> %[[EXTR1]], {{<(i32 255(, )?)+>|splat \(i32 255\)}} +; CHECK: %[[EXTR2:.+]] = lshr <4 x i32> %[[SQUASH]], {{<(i32 16(, )?)+>|splat \(i32 16\)}} +; CHECK: %[[ZEXT2:.+]] = and <4 x i32> %[[EXTR2]], {{<(i32 255(, )?)+>|splat \(i32 255\)}} +; CHECK: %[[EXTR3:.+]] = lshr <4 x i32> %[[SQUASH]], {{<(i32 24(, )?)+>|splat \(i32 24\)}} +; CHECK: %[[ZEXT3:.+]] = and <4 x i32> %[[EXTR3]], {{<(i32 255(, )?)+>|splat \(i32 255\)}} +; CHECK: %[[SUM1:.+]] = add <4 x i32> %[[ZEXT0]], %[[ZEXT1]] +; CHECK: %[[SUM2:.+]] = xor <4 x i32> %[[SUM1]], %[[ZEXT2]] +; CHECK: %[[SUM3:.+]] = and <4 x i32> %[[SUM2]], %[[ZEXT3]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll new file mode 100644 index 0000000000000..5615f7107d892 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k squash -vecz-passes="squash-small-vecs,packetizer" -S < %s | FileCheck %s

+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+; Function Attrs: nounwind
+define spir_kernel void @squash(i64 addrspace(1)* %idx, <2 x float> addrspace(1)* %data, <2 x float> addrspace(1)* %output) #0 {
+entry:
+  %gid = call i64 @__mux_get_global_id(i64 0) #2
+  %idx.ptr = getelementptr inbounds i64, i64 addrspace(1)* %idx, i64 %gid
+  %idx.ld = load i64, i64 addrspace(1)* %idx.ptr, align 8
+  %data.ptr = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %data, i64 %idx.ld
+  %data.ld = load <2 x float>, <2 x float> addrspace(1)* %data.ptr, align 8
+  %output.ptr = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %output, i64 %gid
+  store <2 x float> %data.ld, <2 x float> addrspace(1)* %output.ptr, align 8
+  ret void
+}

+declare i64 @__mux_get_global_id(i64) #1

+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }

+; This test checks that the <2 x float> is converted into an i64 for the
+; purpose of the gather load.
+;
+; CHECK: void @__vecz_v4_squash
+; CHECK: %[[GID:.+]] = call i64 @__mux_get_global_id(i64 0) #[[ATTRS:[0-9]+]]
+; CHECK: %[[IDX_PTR:.+]] = getelementptr i64, ptr addrspace(1) %idx, i64 %[[GID]]
+; CHECK: %[[WIDE_LOAD:.+]] = load <4 x i64>, ptr addrspace(1) %[[IDX_PTR]], align 8
+; CHECK: %[[DATA_PTR:.+]] = getelementptr <2 x float>, ptr addrspace(1) %data, <4 x i64> %[[WIDE_LOAD]]
+; CHECK: %[[GATHER:.+]] = call <4 x i64> @__vecz_b_gather_load8_Dv4_mDv4_u3ptrU3AS1(<4 x ptr addrspace(1)> %[[DATA_PTR]])
+; CHECK: %[[UNSQUASH:.+]] = bitcast <4 x i64> %[[GATHER]] to <8 x float>
+; CHECK: %[[OUTPUT_PTR:.+]] = getelementptr <2 x float>, ptr addrspace(1) %output, i64 %[[GID]]
+; CHECK: store <8 x float> %[[UNSQUASH]], ptr addrspace(1) %[[OUTPUT_PTR]], align 8
+; CHECK: ret void

+; CHECK: attributes #[[ATTRS]] = { nobuiltin nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
new file mode 100644
index 0000000000000..73993e3c2883b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
@@ -0,0 +1,76 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +%struct.PerItemKernelInfo = type <{ <4 x i64>, i32, i32 }> + +; Function start +; CHECK: spir_kernel void @__vecz_v4_foo( + +; There should be exactly 4 vector stores +; CHECK: store <4 x i64> +; CHECK: store <4 x i64> +; CHECK: store <4 x i64> +; CHECK: store <4 x i64> +; CHECK-NOT: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{.*}}(<4 x i64> +; CHECK-NOT: call void @__vecz_b_interleaved_store1_5_Dv4_{{.*}}(<4 x i64> + +; There is one interleaved store from the scalar write +; CHECK: call void @__vecz_b_interleaved_store1_10_Dv4_j{{(u3ptrU3AS1|PU3AS1j)}}(<4 x i32> + +; There shouldn't be any other stores +; CHECK-NOT: call void @__vecz_b_{{.*}}_store + +; Function end +; CHECK: ret void + +define dso_local spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* nocapture noundef writeonly %info) !reqd_work_group_size !11 { +entry: + %call = tail call i64 @__mux_get_global_id(i32 noundef 0) + %call1 = tail call i64 @__mux_get_global_id(i32 noundef 1) + %call2 = tail call i64 @__mux_get_global_id(i32 noundef 2) + %call3 = tail call i64 @__mux_get_global_size(i32 noundef 0) + %call5 = tail call i64 @__mux_get_global_size(i32 noundef 1) + %mul7 = mul nuw nsw i64 %call5, %call2 + %reass.add = add nuw nsw i64 %mul7, %call1 + %reass.mul = mul nuw nsw i64 %reass.add, %call3 + %add8 = add nuw nsw i64 %reass.mul, %call + %vecinit = insertelement <4 x i64> poison, i64 %call3, i64 0 + %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1 + %call12 = tail call i64 @__mux_get_global_size(i32 noundef 2) + %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2 + %call14 = tail call i64 @__mux_get_global_size(i32 noundef 3) + %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3 + %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0 + store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1 + %call16 = tail call i32 @__mux_get_work_dim() + %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1 + store i32 %call16, i32 addrspace(1)* %work_dim, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare i64 @__mux_get_global_size(i32) + +declare i32 @__mux_get_work_dim() + +!11 = !{i32 4, i32 1, i32 1} + diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll new file mode 100644 index 0000000000000..dfba183808512 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll @@ -0,0 +1,63 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s -vecz-choices=FullScalarization | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%struct.PerItemKernelInfo = type <{ <4 x i64>, i32, i32 }> + +define spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* %info) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %call2 = tail call i64 @__mux_get_global_id(i32 2) + %call3 = tail call i64 @__mux_get_global_size(i32 0) + %call5 = tail call i64 @__mux_get_global_size(i32 1) + %mul7 = mul nuw nsw i64 %call5, %call2 + %reass.add = add nuw nsw i64 %mul7, %call1 + %reass.mul = mul nuw nsw i64 %reass.add, %call3 + %add8 = add nuw nsw i64 %reass.mul, %call + %vecinit = insertelement <4 x i64> poison, i64 %call3, i64 0 + %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1 + %call12 = tail call i64 @__mux_get_global_size(i32 2) + %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2 + %call14 = tail call i64 @__mux_get_global_size(i32 3) + %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3 + %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0 + store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1 + %call16 = tail call i32 @__mux_get_work_dim() + %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1 + store i32 %call16, i32 addrspace(1)* %work_dim, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare i64 @__mux_get_global_size(i32) + +declare i32 @__mux_get_work_dim() + +; CHECK: spir_kernel void @foo +; CHECK: call void @__vecz_b_interleaved_store1_5_Dv4_m{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK: call void @__vecz_b_interleaved_store1_5_Dv4_m{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK: call void @__vecz_b_interleaved_store1_5_Dv4_m{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK: call void @__vecz_b_interleaved_store1_5_Dv4_m{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK-NOT: call void @__vecz_b_interleaved_store1_5_Dv4_m{{.*}}(<4 x i64> +; CHECK: call void @__vecz_b_interleaved_store1_10_Dv4_j{{(u3ptrU3AS1|PU3AS1j)}}(<4 x i32> +; CHECK-NOT: call void @__vecz_b_{{.*}}_store +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll new file mode 100644 index 0000000000000..95dfb9f4ef732 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll @@ -0,0 +1,178 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes="print" -S < %s -o /dev/null 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK-LABEL: StrideAnalysis for function '__vecz_v4_foo': +define spir_kernel void @foo(ptr addrspace(1) align 1 %input) { +entry: + %localid0 = tail call i64 @__mux_get_local_id(i32 0) + %localsize0 = tail call i64 @__mux_get_local_size(i32 0) + %groupid0 = tail call i64 @__mux_get_group_id(i32 0) + %globalid0 = tail call i64 @__mux_get_global_id(i32 0) + +; CHECK: Stride for ptr addrspace(1) %input +; CHECK-NEXT: uniform + %lduniform = load i8, ptr addrspace(1) %input, align 1 + +; CHECK: Stride for %arrayidx0 = getelementptr i8, ptr addrspace(1) %input, i64 %globalid0 +; CHECK-NEXT: linear stride of 1 + %arrayidx0 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0 + %ld0 = load i8, ptr addrspace(1) %arrayidx0, align 1 + + %truncglobalid0 = trunc i64 %globalid0 to i32 + +; CHECK: Stride for %arrayidx1 = getelementptr i8, ptr addrspace(1) %input, i64 %sexttruncglobalid0 +; CHECK-NEXT: linear stride of 1 + %sexttruncglobalid0 = sext i32 %truncglobalid0 to i64 + %arrayidx1 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %sexttruncglobalid0 + %ld1 = load i8, ptr addrspace(1) %arrayidx1, align 1 + +; CHECK: Stride for %arrayidx2 = getelementptr i8, ptr addrspace(1) %input, i64 %zexttruncglobalid0 +; CHECK-NEXT: divergent + %zexttruncglobalid0 = zext i32 %truncglobalid0 to i64 + %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %zexttruncglobalid0 + %ld2 = load i8, ptr addrspace(1) %arrayidx2, align 1 + +; CHECK: Stride for %arrayidx3 = getelementptr i32, ptr addrspace(1) %input, i64 %globalid0 +; CHECK-NEXT: linear stride of 4 + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %globalid0 + %ld3 = load i8, ptr addrspace(1) %arrayidx3, align 1 + +; CHECK: Stride for %arrayidx4 = getelementptr i8, ptr addrspace(1) %input, i64 %globalid0mul8 +; CHECK-NEXT: linear stride of 8 + %globalid0mul8 = mul i64 %globalid0, 8 + %arrayidx4 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0mul8 + %ld4 = load i8, ptr addrspace(1) %arrayidx4, align 1 + +; CHECK: Stride for %arrayidx5 = getelementptr i8, ptr addrspace(1) %input, i64 %globalid0mul16 +; CHECK-NEXT: linear stride of 16 + %globalid0mul16 = mul i64 %globalid0mul8, 2 + %arrayidx5 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0mul16 + %ld5 = load i8, ptr addrspace(1) %arrayidx5, align 1 + +; CHECK: Stride for %arrayidx6 = getelementptr i32, ptr addrspace(1) %input, i64 %globalid0mul8 +; CHECK-NEXT: linear stride of 32 + %arrayidx6 = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %globalid0mul8 + %ld6 = load i32, ptr addrspace(1) %arrayidx6, align 1 + +; CHECK: Stride for %arrayidx7 = getelementptr i16, ptr addrspace(1) %input, i64 %idxprom7 +; CHECK-NEXT: linear stride of 2 + %mul7 = mul i64 %localsize0, %groupid0 + %add7 = add i64 %mul7, %localid0 + %trunc7 = trunc i64 %add7 to i32 + 
%conv7 = add i32 %trunc7, -1 + %idxprom7 = sext i32 %conv7 to i64 + %arrayidx7 = getelementptr inbounds i16, ptr addrspace(1) %input, i64 %idxprom7 + %ld7 = load i16, ptr addrspace(1) %arrayidx7, align 1 + +; CHECK: Stride for %arrayidx8 = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom8 +; CHECK-NEXT: divergent + %mul8 = mul i64 %localsize0, %groupid0 + %add8 = add i64 %mul8, %localid0 + %trunc8 = trunc i64 %add8 to i32 + %conv8 = add i32 %trunc8, -1 + %idxprom8 = zext i32 %conv8 to i64 + %arrayidx8 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom8 + %ld8 = load i8, ptr addrspace(1) %arrayidx8, align 1 + +; CHECK: Stride for %arrayidx9 = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom9 +; CHECK-NEXT: divergent + %mul9 = mul i64 %groupid0, %localsize0 + %add9 = add nuw nsw i64 %localid0, 4294967295 + %conv9 = add i64 %add9, %mul9 + %idxprom9 = and i64 %conv9, 4294967295 + %arrayidx9 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom9 + %ld9 = load i8, ptr addrspace(1) %arrayidx9, align 1 + + ret void +} + +; CHECK-LABEL: StrideAnalysis for function '__vecz_v4_canny_regression': +define spir_kernel void @canny_regression(ptr addrspace(1) align 1 %input) { +entry: + %groupid0 = tail call i64 @__mux_get_group_id(i32 0) + %localid0 = tail call i64 @__mux_get_local_id(i32 0) + %localsize0 = tail call i64 @__mux_get_local_size(i32 0) + %mul = mul i64 %groupid0, %localsize0 + %add = add i64 %mul, %localid0 + %0 = trunc i64 %add to i32 + %conv = add i32 %0, -1 + %trunclocalsize0 = trunc i64 %localsize0 to i32 + +; CHECK: Stride for %arrayidx_pre = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom_pre +; CHECK-NEXT: divergent + %idxprom_pre = zext i32 %conv to i64 + %arrayidx_pre = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom_pre + %ld_pre = load i8, ptr addrspace(1) %arrayidx_pre, align 1 + + br label %for.body + +for.body: +; The below is fundamentally the same stride calculation as %arrayidx_pre - +; make sure the loop and the PHI don't throw off the analysis. 
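+;
+; Roughly, in OpenCL C terms (a hypothetical reconstruction; only the IR is
+; authoritative):
+;
+;   int gx2 = (int)(get_group_id(0) * get_local_size(0) + get_local_id(0)) - 1;
+;   for (int i = 0; i < 2; i++) {
+;     uchar v = input[(size_t)(uint)gx2];  // zext of a possibly-negative i32
+;     gx2 += (int)get_local_size(0);       // index, hence a divergent stride
+;   }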
+; CHECK: Stride for %arrayidx_loop = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom_loop +; CHECK-NEXT: divergent + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gx2.050.us = phi i32 [ %conv, %entry ], [ %conv26.us, %for.body ] + %idxprom_loop = zext i32 %gx2.050.us to i64 + %arrayidx_loop = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom_loop + + %ld_loop = load i8, ptr addrspace(1) %arrayidx_loop, align 1 + + %conv26.us = add i32 %gx2.050.us, %trunclocalsize0 + %iv.next = add nuw nsw i64 %iv, 1 + %exit_cond = icmp ult i64 %iv.next, 2 + br i1 %exit_cond, label %for.body, label %exit + +exit: + ret void +} + +; CHECK-LABEL: StrideAnalysis for function '__vecz_v4_select_regression': +define spir_kernel void @select_regression(ptr addrspace(1) align 1 %input, i1 %cmp) { +entry: + %groupid0 = tail call i64 @__mux_get_group_id(i32 0) + %localid0 = tail call i64 @__mux_get_local_id(i32 0) + %localsize0 = tail call i64 @__mux_get_local_size(i32 0) + %mul = mul i64 %groupid0, %localsize0 + %add = add i64 %mul, %localid0 + %addtrunc = trunc i64 %add to i32 + +; CHECK: Stride for %arrayidx0 = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom0 +; CHECK-NEXT: divergent + %idxprom0 = zext i32 %addtrunc to i64 + %arrayidx0 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom0 + %ld0 = load i8, ptr addrspace(1) %arrayidx0, align 1 + +; The below is fundamentally the same stride calculation as %arrayidx0 - make +; sure the select doesn't throw off the analysis. +; CHECK: Stride for %arrayidx1 = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom1 +; CHECK-NEXT: divergent + %sel1 = select i1 %cmp, i32 %addtrunc, i32 %addtrunc + %idxprom1 = zext i32 %sel1 to i64 + %arrayidx1 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom1 + %ld1 = load i8, ptr addrspace(1) %arrayidx1, align 1 + + ret void +} + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_local_size(i32) +declare i64 @__mux_get_group_id(i32) +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll new file mode 100644 index 0000000000000..0b51e0f078b05 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll @@ -0,0 +1,64 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%struct.PerItemKernelInfo = type <{ <4 x i64>, i32 }> + +define spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* %info) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %call2 = tail call i64 @__mux_get_global_id(i32 2) + %call3 = tail call i64 @__mux_get_global_size(i32 0) + %call5 = tail call i64 @__mux_get_global_size(i32 1) + %mul7 = mul nuw nsw i64 %call5, %call2 + %reass.add = add nuw nsw i64 %mul7, %call1 + %reass.mul = mul nuw nsw i64 %reass.add, %call3 + %add8 = add nuw nsw i64 %reass.mul, %call + %vecinit = insertelement <4 x i64> poison, i64 %call3, i64 0 + %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1 + %call12 = tail call i64 @__mux_get_global_size(i32 2) + %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2 + %call14 = tail call i64 @__mux_get_global_size(i32 3) + %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3 + %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0 + store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1 + %call16 = tail call i32 @__mux_get_work_dim() + %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1 + store i32 %call16, i32 addrspace(1)* %work_dim, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare i64 @__mux_get_global_size(i32) + +declare i32 @__mux_get_work_dim() + +; CHECK: spir_kernel void @foo +; CHECK: store <4 x i64> +; CHECK: store <4 x i64> +; CHECK: store <4 x i64> +; CHECK: store <4 x i64> +; CHECK-NOT: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{.*}}(<4 x i64> +; CHECK-NOT: call void @__vecz_b_interleaved_store1_5_Dv4_{{.*}}(<4 x i64> +; CHECK: call void @__vecz_b_interleaved_store1_9_Dv4_j{{(u3ptrU3AS1|PU3AS1j)}}(<4 x i32> +; CHECK-NOT: call void @__vecz_b_{{.*}}_store +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll new file mode 100644 index 0000000000000..ffdb64718d8b8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll @@ -0,0 +1,63 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s -vecz-choices=FullScalarization | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%struct.PerItemKernelInfo = type <{ <4 x i64>, i32 }> + +define spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* %info) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %call2 = tail call i64 @__mux_get_global_id(i32 2) + %call3 = tail call i64 @__mux_get_global_size(i32 0) + %call5 = tail call i64 @__mux_get_global_size(i32 1) + %mul7 = mul nuw nsw i64 %call5, %call2 + %reass.add = add nuw nsw i64 %mul7, %call1 + %reass.mul = mul nuw nsw i64 %reass.add, %call3 + %add8 = add nuw nsw i64 %reass.mul, %call + %vecinit = insertelement <4 x i64> poison, i64 %call3, i64 0 + %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1 + %call12 = tail call i64 @__mux_get_global_size(i32 2) + %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2 + %call14 = tail call i64 @__mux_get_global_size(i32 3) + %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3 + %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0 + store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1 + %call16 = tail call i32 @__mux_get_work_dim() + %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1 + store i32 %call16, i32 addrspace(1)* %work_dim, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare i64 @__mux_get_global_size(i32) + +declare i32 @__mux_get_work_dim() + +; CHECK: spir_kernel void @foo +; CHECK: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK-NOT: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{.*}}(<4 x i64> +; CHECK: call void @__vecz_b_interleaved_store1_9_Dv4_j{{(u3ptrU3AS1|PU3AS1j)}}(<4 x i32> +; CHECK-NOT: call void @__vecz_b_{{.*}}_store +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll new file mode 100644 index 0000000000000..ee78646485d04 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll @@ -0,0 +1,107 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +%struct_type = type { i32, i32 } + +define spir_kernel void @test(i32* %in, i32* %out, %struct_type* %sin) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %inp = getelementptr inbounds i32, i32* %in, i64 %call + %oup = getelementptr inbounds i32, i32* %out, i64 %call + %o = load i32, i32* %oup + ; do this little compare + phi to throw off the InstCombine pass and ensure + ; we end up with a phi %struct_type that must be instantiated + %s = insertvalue %struct_type poison, i32 %o, 1 + %cmpcall = icmp ult i64 16, %call + br i1 %cmpcall, label %lower, label %higher + +lower: + %lowers = insertvalue %struct_type %s, i32 0, 0 + br label %lower.higher.phi + +higher: + %highers = insertvalue %struct_type %s, i32 1, 0 + br label %lower.higher.phi + +lower.higher.phi: + %lowerhigherstruct = phi %struct_type [%lowers, %lower], [%highers, %higher] + br label %for.cond + +for.cond: + %storemerge = phi %struct_type [ %incv, %for.inc ], [ %lowerhigherstruct, %lower.higher.phi ] + %s1 = extractvalue %struct_type %storemerge, 1 + %s1ext = zext i32 %s1 to i64 + %cmp = icmp ult i64 %s1ext, %call + br i1 %cmp, label %for.body, label %for.end + +for.body: + %l = load i32, i32* %inp, align 4 + store i32 %l, i32* %oup, align 4 + br label %for.inc + +for.inc: + %toadd = extractvalue %struct_type %storemerge, 1 + %toadd64 = zext i32 %toadd to i64 + %ca = add i64 %toadd64, %call + %sinp = getelementptr inbounds %struct_type, %struct_type* %sin, i64 %ca + %sinv = load %struct_type, %struct_type* %sinp + %sinintv = extractvalue %struct_type %sinv, 1 + %incv = insertvalue %struct_type %storemerge, i32 %sinintv, 1 + br label %for.cond + +for.end: + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare void @llvm.memset.p0i8.i32(i8*,i8,i32,i32,i1) + +; CHECK: define spir_kernel void @__vecz_v4_test + +; Check if the struct creation has been instantiated +; CHECK: %[[V2:[0-9]+]] = load <4 x i32>, ptr %oup, align 4 +; CHECK: %[[V3:[0-9]+]] = extractelement <4 x i32> %[[V2]], {{(i32|i64)}} 0 +; CHECK: %[[V4:[0-9]+]] = extractelement <4 x i32> %[[V2]], {{(i32|i64)}} 1 +; CHECK: %[[V5:[0-9]+]] = extractelement <4 x i32> %[[V2]], {{(i32|i64)}} 2 +; CHECK: %[[V6:[0-9]+]] = extractelement <4 x i32> %[[V2]], {{(i32|i64)}} 3 +; CHECK: %[[S24:.+]] = insertvalue %struct_type poison, i32 %[[V3]], 1 +; CHECK: %[[S25:.+]] = insertvalue %struct_type poison, i32 %[[V4]], 1 +; CHECK: %[[S26:.+]] = insertvalue %struct_type poison, i32 %[[V5]], 1 +; CHECK: %[[S27:.+]] = insertvalue %struct_type poison, i32 %[[V6]], 1 + +; Check if the phi node has been instantiated +; CHECK: phi %struct_type [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ] +; CHECK: phi %struct_type [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ] +; CHECK: phi %struct_type [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ] +; CHECK: phi %struct_type [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ] +; CHECK: extractvalue %struct_type %{{.+}}, 1 +; CHECK: extractvalue %struct_type %{{.+}}, 1 +; CHECK: extractvalue %struct_type %{{.+}}, 1 +; CHECK: extractvalue %struct_type %{{.+}}, 1 + +; Check if the operations that use integer types are vectorized +; CHECK: zext <4 x i32> +; CHECK: icmp ugt <4 x i64> +; CHECK: select <4 x i1> +; CHECK: %[[L423:.+]] = call <4 x i32> @__vecz_b_masked_load4_Dv4_ju3ptrDv4_b(ptr %{{.*}}, <4 x i1> +; CHECK: call 
void @__vecz_b_masked_store4_Dv4_ju3ptrDv4_b(<4 x i32> %[[L423]], ptr{{( nonnull)? %.*}}, <4 x i1> + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll new file mode 100644 index 0000000000000..a4b88856af96a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +%struct_type = type { i32, i64 } + +define spir_kernel void @test(%struct_type* %in1, %struct_type* %in2, %struct_type* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %in1p = getelementptr inbounds %struct_type, %struct_type* %in1, i64 %call + %in2p = getelementptr inbounds %struct_type, %struct_type* %in2, i64 %call + %outp = getelementptr inbounds %struct_type, %struct_type* %out, i64 %call + %in1v = load %struct_type, %struct_type* %in1p + %in2v = load %struct_type, %struct_type* %in2p + %mod = urem i64 %call, 3 + %cmp = icmp eq i64 %mod, 0 + %res = select i1 %cmp, %struct_type %in1v, %struct_type %in2v + store %struct_type %res, %struct_type* %outp + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare void @llvm.memset.p0i8.i32(i8*,i8,i32,i32,i1) + +; CHECK: define spir_kernel void @__vecz_v4_test + +; CHECK: select i1 %{{.+}}, %struct_type %{{.+}}, %struct_type %{{.+}} +; CHECK: select i1 %{{.+}}, %struct_type %{{.+}}, %struct_type %{{.+}} +; CHECK: select i1 %{{.+}}, %struct_type %{{.+}}, %struct_type %{{.+}} +; CHECK: select i1 %{{.+}}, %struct_type %{{.+}}, %struct_type %{{.+}} + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll new file mode 100644 index 0000000000000..bf1f2b19b178b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll @@ -0,0 +1,45 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i32 @__mux_get_sub_group_id() +declare i32 @__mux_get_sub_group_local_id() +declare i32 @__mux_sub_group_broadcast_i32(i32, i32) + +; It makes sure broadcast still works when its source operand is uniform +define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + %call = tail call i32 @__mux_get_sub_group_id() + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call + %v = load i32, i32 addrspace(1)* %arrayidx, align 4 + %broadcast = call i32 @__mux_sub_group_broadcast_i32(i32 %v, i32 0) + %idx = tail call i32 @__mux_get_sub_group_local_id() + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %idx + store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_sub_group_broadcast( +; CHECK: [[LD:%.+]] = load i32, ptr addrspace(1) %{{.+}}, align 4 +; CHECK: [[INS:%.+]] = insertelement <4 x i32> poison, i32 [[LD]], i64 0 +; CHECK: [[BCAST:%.+]] = shufflevector <4 x i32> [[INS]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: %idx = tail call i32 @__mux_get_sub_group_local_id() +; CHECK: [[EXT:%.*]] = sext i32 %idx to i64 +; CHECK: %arrayidx2 = getelementptr i32, ptr addrspace(1) %out, i64 [[EXT]] +; CHECK: store <4 x i32> [[BCAST]], ptr addrspace(1) %arrayidx2, align 4 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll new file mode 100644 index 0000000000000..d6b074d1d266f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll @@ -0,0 +1,115 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i32 @__mux_get_sub_group_id() +declare spir_func i32 @__mux_get_sub_group_size() +declare spir_func i32 @__mux_get_sub_group_local_id() +declare spir_func i32 @__mux_sub_group_broadcast_i32(i32, i32) +declare spir_func i64 @__mux_get_global_id(i32) +declare spir_func i1 @__mux_sub_group_any_i1(i1) + +define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + %call.i = tail call spir_func i32 @__mux_get_sub_group_id() + %conv = zext i32 %call.i to i64 + %call2 = tail call spir_func i32 @__mux_get_sub_group_size() + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx, align 4 + ret void +; CHECK-LABEL: define spir_kernel void @__vecz_v4_get_sub_group_size( +; CHECK: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 4) +; CHECK: store i32 [[RED]], ptr addrspace(1) {{.*}} +} + +define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + %call = tail call spir_func i32 @__mux_get_sub_group_local_id() + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call + store i32 %call, i32 addrspace(1)* %arrayidx, align 4 + ret void +; CHECK-LABEL: define spir_kernel void @__vecz_v4_get_sub_group_local_id( +; CHECK: %call = tail call spir_func i32 @__mux_get_sub_group_local_id() +; CHECK: [[MUL:%.*]] = shl i32 %call, 2 +; CHECK: [[SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[ID:%.*]] = or {{(disjoint )?}}<4 x i32> [[SPLAT]], +; CHECK: [[EXT:%.*]] = sext i32 %call to i64 +; CHECK: %arrayidx = getelementptr i32, ptr addrspace(1) %out, i64 [[EXT]] +; CHECK: store <4 x i32> [[ID]], ptr addrspace(1) %arrayidx +} + +define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + %call = tail call spir_func i32 @__mux_get_sub_group_local_id() + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call + %v = load i32, i32 addrspace(1)* %arrayidx, align 4 + %broadcast = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 %v, i32 0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call + store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: define spir_kernel void @__vecz_v4_sub_group_broadcast( +; CHECK: [[LD:%.*]] = load <4 x i32>, ptr addrspace(1) {{%.*}}, align 4 +; CHECK: [[EXT:%.*]] = extractelement <4 x i32> [[LD]], i64 0 +; CHECK: [[BDCAST:%.*]] = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 [[EXT]], i32 0) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[BDCAST]], i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x i32> [[SPLAT]], ptr addrspace(1) +} + +define spir_kernel void @sub_group_broadcast_wider_than_vf(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + %call = tail call spir_func i32 @__mux_get_sub_group_local_id() + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call + %v = load i32, i32 addrspace(1)* %arrayidx, align 4 + %broadcast = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 %v, i32 6) + %arrayidx2 = 
getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call + store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: define spir_kernel void @__vecz_v4_sub_group_broadcast_wider_than_vf( +; CHECK: [[LD:%.*]] = load <4 x i32>, ptr addrspace(1) {{%.*}}, align 4 +; The sixth sub-group member is the (6 % 4 ==) 2nd vector group member +; CHECK: [[EXT:%.*]] = extractelement <4 x i32> [[LD]], i64 2 +; CHECK: [[BDCAST:%.*]] = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 [[EXT]], i32 1) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[BDCAST]], i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x i32> [[SPLAT]], ptr addrspace(1) +} + +; This used to crash as packetizing get_sub_group_local_id produces a Constant, which we weren't expecting. +define spir_kernel void @regression_sub_group_local_id(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %xy, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_local_id() + %0 = shl i64 %call, 32 + %idxprom = ashr exact i64 %0, 32 + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %xy, i64 %idxprom + %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, align 16 + %2 = insertelement <4 x i32> %1, i32 %call1, i64 0 + %3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, i64 0, i64 0 + store i32 %call1, i32 addrspace(1)* %3, align 16 + %call2 = tail call spir_func i32 @__mux_get_sub_group_id() + %4 = insertelement <4 x i32> %2, i32 %call2, i64 1 + store <4 x i32> %4, <4 x i32> addrspace(1)* %arrayidx, align 16 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %6 = icmp ne i32 %5, 0 + %call7 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 %6) + %7 = sext i1 %call7 to i32 + %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %7, i32 addrspace(1)* %arrayidx9, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll new file mode 100644 index 0000000000000..c69d993acdd18 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll @@ -0,0 +1,260 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) +declare spir_func i32 @__mux_get_sub_group_id() + +declare spir_func i1 @__mux_sub_group_all_i1(i1) +declare spir_func i1 @__mux_sub_group_any_i1(i1) + +declare spir_func i32 @__mux_sub_group_reduce_add_i32(i32) +declare spir_func i64 @__mux_sub_group_reduce_add_i64(i64) +declare spir_func float @__mux_sub_group_reduce_fadd_f32(float) +declare spir_func i32 @__mux_sub_group_reduce_smin_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_umin_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_smax_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_umax_i32(i32) +declare spir_func float @__mux_sub_group_reduce_fmin_f32(float) +declare spir_func float @__mux_sub_group_reduce_fmax_f32(float) + +define spir_kernel void @reduce_all_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %1 = icmp ne i32 %0, 0 + %call2 = tail call spir_func i1 @__mux_sub_group_all_i1(i1 %1) + %2 = sext i1 %call2 to i32 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_all_i32( +; CHECK: [[T2:%.*]] = icmp eq <4 x i32> %{{.*}}, zeroinitializer + +; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[T2]] to i4 +; CHECK: [[R:%.*]] = icmp eq i4 [[T3]], 0 +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_all_i1(i1 [[R]]) +; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32 +; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_any_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %1 = icmp ne i32 %0, 0 + %call2 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 %1) + %2 = sext i1 %call2 to i32 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_any_i32( +; CHECK: [[T2:%.*]] = icmp ne <4 x i32> %{{.*}}, zeroinitializer + +; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[T2]] to i4 +; CHECK: [[R:%.*]] = icmp ne i4 [[T3]], 0 +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 [[R]]) +; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32 +; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 %0) + %arrayidx3 = getelementptr 
inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_add_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +; Given we've checked a full reduction sequence above, reduce duplicate CHECKs +; below by assuming all reductions work orthogonally. + +define spir_kernel void @reduce_add_i32_uniform(i32 addrspace(1)* %out, i32 %n) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 %n) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_add_i32_uniform( +; LLVM is clever enough to fold this uniform reduction to a shift-left, but +; not when it's expressed as a reduction intrinsic. +; CHECK: [[CALL:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> {{%.*}}) +; CHECK: %call1 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 [[CALL]]) +; CHECK: [[INS:%.*]] = insertelement <4 x i32> poison, i32 %call1, {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[INS]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x i32> [[SPLAT]], +} + +define spir_kernel void @reduce_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call + %0 = load i64, i64 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i64 @__mux_sub_group_reduce_add_i64(i64 %0) + %arrayidx3 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %conv + store i64 %call2, i64 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_add_i64( +; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %{{.*}}) +; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_add_i64(i64 [[R]]) +; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fadd_f32(float %0) + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv + store float %call2, float addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_add_f32( +; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %{{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fadd_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func 
i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smin_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_smin_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smin_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umin_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_umin_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umin_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smax_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_smax_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smax_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umax_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_umax_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umax_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call 
spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float %0) + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv + store float %call2, float addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_fmin_f32( +; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %{{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fmax_f32(float %0) + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv + store float %call2, float addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_fmax_f32( +; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %{{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmax_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +!opencl.ocl.version = !{!0} + +!0 = !{i32 3, i32 0} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll new file mode 100644 index 0000000000000..4719739ded72b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll @@ -0,0 +1,197 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) +declare spir_func i32 @__mux_get_sub_group_id() + +declare spir_func i32 @__mux_sub_group_reduce_mul_i32(i32) +declare spir_func i64 @__mux_sub_group_reduce_mul_i64(i64) +declare spir_func float @__mux_sub_group_reduce_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_reduce_and_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_or_i32(i32) +declare spir_func i64 @__mux_sub_group_reduce_xor_i64(i64) + +declare spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1) +declare spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1) +declare spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1) + +; CHECK-LABEL: @__vecz_v4_reduce_mul_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_mul_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_mul_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + store i32 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_mul_i64( +; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %{{.*}}) +; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_mul_i64(i64 [[R]]) +; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_mul_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call + %0 = load i64, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i64 @__mux_sub_group_reduce_mul_i64(i64 %0) + %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv + store i64 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_mul_f32( +; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> %{{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmul_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fmul_f32(float %0) + %arrayidx3 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %conv + store float %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + 
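+; As the CHECKs below verify, the bitwise reductions follow the same pattern +; as the arithmetic reductions above: the packetized operand is reduced with +; the matching llvm.vector.reduce.* intrinsic and the scalar result is fed to +; the original __mux builtin. The i1 "logical" variants have no reduction +; intrinsic; the <4 x i1> mask is bitcast to i4 and tested directly (eq -1 for +; and, ne 0 for or, a ctpop-based parity check for xor). +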
+; CHECK-LABEL: @__vecz_v4_reduce_and_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_and_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_and_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + store i32 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_or_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_or_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_or_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + store i32 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_xor_i32( +; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %{{.*}}) +; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 [[R]]) +; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call + %0 = load i64, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 %0) + %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv + store i64 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_logical_and( +; This doesn't generate a reduction intrinsic... 
+; CHECK: [[T:%.*]] = icmp eq i4 {{%.*}}, -1 +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 [[T]]) +; CHECK: [[E:%.*]] = zext i1 %call2 to i32 +; CHECK: store i32 [[E]], ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 %1) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + %zext = zext i1 %call2 to i32 + store i32 %zext, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_logical_or( +; CHECK: [[T:%.*]] = icmp ne i4 {{%.*}}, 0 +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 [[T]]) +; CHECK: [[E:%.*]] = zext i1 %call2 to i32 +; CHECK: store i32 [[E]], ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 %1) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + %zext = zext i1 %call2 to i32 + store i32 %zext, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_logical_xor( +; CHECK: [[X:%.*]] = call {{.*}}i4 @llvm.ctpop.i4(i4 {{%.*}}) +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[T:%.*]]) +; CHECK: [[E:%.*]] = zext i1 %call2 to i32 +; CHECK: store i32 [[E]], ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 %1) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + %zext = zext i1 %call2 to i32 + store i32 %zext, ptr addrspace(1) %arrayidx3, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll new file mode 100644 index 0000000000000..ad98dbfe5f788 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll @@ -0,0 +1,204 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S -vecz-passes=packetizer < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32) +declare spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64) +declare spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32) +declare spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float) +declare spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float) + +define spir_kernel void @reduce_scan_incl_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_add_i32(i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = add <4 x i32> [[SCAN]], [[SPLAT]] +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call + %0 = load i64, i64 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64 %0) + %arrayidx2 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call + store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i64( +; CHECK: [[SCAN:%.*]] = call <4 x i64> @__vecz_b_sub_group_scan_inclusive_add_Dv4_m(<4 x i64> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i64 @__mux_sub_group_scan_exclusive_add_i64(i64 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i64> poison, i64 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> 
[[HEAD]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = add <4 x i64> [[SCAN]], [[SPLAT]] +; CHECK: store <4 x i64> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_f32( +; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_Dv4_f(<4 x float> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.0{{.*}}, <4 x float> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fadd_f32(float [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = fadd <4 x float> [[SCAN]], [[SPLAT]] +; CHECK: store <4 x float> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smin_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_Dv4_i(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_smin_i32(i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]]) +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umin_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_Dv4_j(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_umin_i32(i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: 
[[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]]) +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smax_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_Dv4_i(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_smax_i32(i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]]) +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umax_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_Dv4_j(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_umax_i32(i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]]) +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmin_f32( +; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_Dv4_f(<4 x float> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fmin_f32(float 
[[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[SCAN]], <4 x float> [[SPLAT]]) +; CHECK: store <4 x float> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmax_f32( +; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_Dv4_f(<4 x float> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fmax_f32(float [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[SCAN]], <4 x float> [[SPLAT]]) +; CHECK: store <4 x float> [[FINAL]], +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll new file mode 100644 index 0000000000000..691b7aba7100f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll @@ -0,0 +1,171 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S -vecz-passes=packetizer < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1) + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_mul_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_excl_mul_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_mul_f32( +; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_f(<4 x float> %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float %0) + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_excl_mul_f32( +; CHECK: call <4 x float> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_f(<4 x float> %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float %0) 
+ %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_and_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_and_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_or_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_or_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_xor_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_logical_and( +; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_and_Dv4_b(<4 x i1> %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_logical_or( +; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_or_Dv4_b(<4 x i1> %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store 
i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_logical_xor( +; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_b(<4 x i1> %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll new file mode 100644 index 0000000000000..b5783f2c9e55c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll @@ -0,0 +1,190 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S \ +; RUN: --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; See @kernel_varying_idx, below +; CHECK: Could not packetize sub-group shuffle %shuffle9 + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel(ptr %in, ptr %out) +; CHECK: [[VECIDX:%.*]] = urem i32 %size_minus_1, 4 +; CHECK: [[MUXIDX:%.*]] = udiv i32 %size_minus_1, 4 +; CHECK: [[VEC:%.*]] = extractelement <4 x i64> {{%.*}}, i32 [[VECIDX]] +; CHECK: [[SHUFFLE:%.*]] = call i64 @__mux_sub_group_shuffle_i64(i64 [[VEC]], i32 [[MUXIDX]]) +; CHECK: [[SPLATINS:%.*]] = insertelement <4 x i64> poison, i64 [[SHUFFLE]], i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[SPLATINS]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x i64> [[SPLAT]] +define spir_kernel void @kernel(ptr %in, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %size = call i32 @__mux_get_sub_group_size() + %size_minus_1 = sub i32 %size, 1 + %arrayidx.in = getelementptr inbounds i64, ptr %in, i64 %gid + %val = load i64, ptr %arrayidx.in, align 8 + %shuffle1 = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %size_minus_1) + %arrayidx.out = getelementptr inbounds i64, ptr %out, i64 %gid + store i64 %shuffle1, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_vec_data(ptr %in, ptr %out) +; CHECK: [[VECIDX:%.*]] = urem i32 %size_minus_1, 4 +; CHECK: [[MUXIDX:%.*]] = udiv i32 %size_minus_1, 4 +; CHECK: [[BASE:%.*]] = mul i32 %2, 2 +; CHECK: [[IDX0:%.*]] = add i32 [[BASE]], 0 +; CHECK: [[ELT0:%.*]] = extractelement <8 x float> %1, i32 [[IDX0]] +; CHECK: [[TVEC:%.*]] = insertelement <2 x float> poison, float [[ELT0]], i32 0 +; CHECK: [[IDX1:%.*]] = add i32 [[BASE]], 1 +; CHECK: [[ELT1:%.*]] = extractelement <8 x float> %1, i32 [[IDX1]] +; CHECK: [[VEC:%.*]] = insertelement <2 x float> [[TVEC]], float [[ELT1]], i32 1 +; CHECK: [[SHUFFLE:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[VEC]], i32 [[MUXIDX]]) +; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> [[SHUFFLE]], <2 x float> poison, +; CHECK-SAME: <8 x i32> +define spir_kernel void @kernel_vec_data(ptr %in, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %size = call i32 @__mux_get_sub_group_size() + %size_minus_1 = sub i32 %size, 1 + %arrayidx.in = getelementptr inbounds <2 x float>, ptr %in, i64 %gid + %val = load <2 x float>, ptr %arrayidx.in, align 8 + %shuffle2 = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> %val, i32 %size_minus_1) + %arrayidx.out = getelementptr inbounds <2 x float>, ptr %out, i64 %gid + store <2 x float> %shuffle2, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_const_idx(ptr %in, ptr %out) +; CHECK: [[VEC:%.*]] = extractelement <4 x i64> {{%.*}}, i32 1 +; CHECK: [[SHUFFLE:%.*]] = call i64 @__mux_sub_group_shuffle_i64(i64 [[VEC]], i32 0) +; CHECK: [[SPLATINS:%.*]] = insertelement <4 x i64> poison, i64 [[SHUFFLE]], i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[SPLATINS]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x i64> [[SPLAT]] +define spir_kernel void @kernel_const_idx(ptr %in, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.in = getelementptr inbounds i64, ptr %in, i64 %gid + %val = load i64, ptr 
%arrayidx.in, align 8 + %shuffle3 = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 1) + %arrayidx.out = getelementptr inbounds i64, ptr %out, i64 %gid + store i64 %shuffle3, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_vec_data_const_idx(ptr %in, ptr %out) +; We want sub-group member 1, whose 2-element subvector starts at element +; index 2 of the packetized <8 x float> value +; CHECK: [[VEC:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v8f32(<8 x float> {{%.*}}, i64 2) +; CHECK: [[SHUFFLE:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[VEC]], i32 0) +; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> [[SHUFFLE]], <2 x float> poison, +; CHECK-SAME: <8 x i32> +; CHECK: store <8 x float> [[SPLAT]] +define spir_kernel void @kernel_vec_data_const_idx(ptr %in, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.in = getelementptr inbounds <2 x float>, ptr %in, i64 %gid + %val = load <2 x float>, ptr %arrayidx.in, align 8 + %shuffle4 = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> %val, i32 1) + %arrayidx.out = getelementptr inbounds <2 x float>, ptr %out, i64 %gid + store <2 x float> %shuffle4, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_data(i64 %val, ptr %out) +; It doesn't matter what sub-group index we choose because the data is uniform. +; Just splat it. +; CHECK: [[SPLATINS:%.*]] = insertelement <4 x i64> poison, i64 %val, i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[SPLATINS]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x i64> [[SPLAT]] +define spir_kernel void @kernel_uniform_data(i64 %val, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %size = call i32 @__mux_get_sub_group_size() + %size_minus_1 = sub i32 %size, 1 + %shuffle5 = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %size_minus_1) + %arrayidx.out = getelementptr inbounds i64, ptr %out, i64 %gid + store i64 %shuffle5, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_data_varying_idx(i64 %val, ptr %idxs, ptr %out) +; It doesn't matter what sub-group index we choose because the data is uniform. +; Just splat it. +; CHECK: [[SPLATINS:%.*]] = insertelement <4 x i64> poison, i64 %val, i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[SPLATINS]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x i64> [[SPLAT]] +define spir_kernel void @kernel_uniform_data_varying_idx(i64 %val, ptr %idxs, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.idxs = getelementptr inbounds i32, ptr %idxs, i64 %gid + %idx = load i32, ptr %arrayidx.idxs, align 4 + %shuffle6 = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %idx) + %arrayidx.out = getelementptr inbounds i64, ptr %out, i64 %gid + store i64 %shuffle6, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_vec_data(<2 x float> %val, ptr %out) +; It doesn't matter what sub-group index we choose because the data is uniform. +; Just splat it. 
+; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> %val, <2 x float> poison, +; CHECK-SAME: <8 x i32> +; CHECK: store <8 x float> [[SPLAT]] +define spir_kernel void @kernel_uniform_vec_data(<2 x float> %val, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %size = call i32 @__mux_get_sub_group_size() + %size_minus_1 = sub i32 %size, 1 + %shuffle7 = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> %val, i32 %size_minus_1) + %arrayidx.out = getelementptr inbounds <2 x float>, ptr %out, i64 %gid + store <2 x float> %shuffle7, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_vec_data_varying_idx(<2 x float> %val, ptr %idxs, ptr %out) +; It doesn't matter what sub-group index we choose because the data is uniform. +; Just splat it. +; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> %val, <2 x float> poison, +; CHECK-SAME: <8 x i32> +; CHECK: store <8 x float> [[SPLAT]] +define spir_kernel void @kernel_uniform_vec_data_varying_idx(<2 x float> %val, ptr %idxs, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.idxs = getelementptr inbounds i32, ptr %idxs, i64 %gid + %idx = load i32, ptr %arrayidx.idxs, align 4 + %shuffle8 = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> %val, i32 %idx) + %arrayidx.out = getelementptr inbounds <2 x float>, ptr %out, i64 %gid + store <2 x float> %shuffle8, ptr %arrayidx.out, align 8 + ret void +} + +; We don't support vectorization of varying indices (for now) - see the check +; above (which is printed before the final IR) +define spir_kernel void @kernel_varying_idx(ptr %in, ptr %idxs, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %size = call i32 @__mux_get_sub_group_size() + %size_minus_1 = sub i32 %size, 1 + %arrayidx.in = getelementptr inbounds i64, ptr %in, i64 %gid + %val = load i64, ptr %arrayidx.in, align 8 + %arrayidx.idxs = getelementptr inbounds i32, ptr %idxs, i64 %gid + %idx = load i32, ptr %arrayidx.idxs, align 4 + %shuffle9 = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %idx) + %arrayidx.out = getelementptr inbounds i64, ptr %out, i64 %gid + store i64 %shuffle9, ptr %arrayidx.out, align 8 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare i32 @__mux_get_sub_group_size() + +declare i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %lid) +declare <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> %val, i32 %lid) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll new file mode 100644 index 0000000000000..5a7c4b4e7f8fb --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll @@ -0,0 +1,206 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S \ +; RUN: --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) +; CHECK: [[LHS:%.*]] = load <4 x float>, ptr %arrayidx.lhs, align 4 +; CHECK: [[RHS:%.*]] = load <4 x float>, ptr %arrayidx.rhs, align 4 + +; CHECK: [[DELTAS:%.*]] = add <4 x i32> {{%.*}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}} +; CHECK: [[MUXIDS:%.*]] = udiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[VECELTS:%.*]] = urem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> [[MUXIDS]], {{%.*}} + +; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0 +; CHECK: [[SHUFF0:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA0]]) +; CHECK: [[VECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0 +; CHECK: [[ELT0:%.*]] = extractelement <4 x float> [[SHUFF0]], i32 [[VECIDX0]] + +; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1 +; CHECK: [[SHUFF1:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA1]]) +; CHECK: [[VECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1 +; CHECK: [[ELT1:%.*]] = extractelement <4 x float> [[SHUFF1]], i32 [[VECIDX1]] + +; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2 +; CHECK: [[SHUFF2:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA2]]) +; CHECK: [[VECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2 +; CHECK: [[ELT2:%.*]] = extractelement <4 x float> [[SHUFF2]], i32 [[VECIDX2]] + +; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3 +; CHECK: [[SHUFF3:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA3]]) +; CHECK: [[VECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3 +; CHECK: [[ELT3:%.*]] = extractelement <4 x float> [[SHUFF3]], i32 [[VECIDX3]] +define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.lhs = getelementptr inbounds float, ptr %lhsptr, i64 %gid + %lhs = load float, ptr %arrayidx.lhs, align 4 + %arrayidx.rhs = getelementptr inbounds float, ptr %rhsptr, i64 %gid + %rhs = load float, ptr %arrayidx.rhs, align 4 + %shuffle_up = call float @__mux_sub_group_shuffle_down_f32(float %lhs, float %rhs, i32 1) + %arrayidx.out = getelementptr inbounds float, ptr %out, i64 %gid + store float %shuffle_up, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out) +; CHECK: [[DELTAS:%.*]] = add <4 x i32> {{%.*}}, {{<(i32 2(, )?)+>|splat \(i32 2\)}} +; CHECK: [[MUXIDS:%.*]] = udiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[VECELTS:%.*]] = urem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> [[MUXIDS]], {{%.*}} + +; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0 +; CHECK: [[SHUFF0:%.*]] = call <16 x i8> 
@__mux_sub_group_shuffle_down_v16i8( +; CHECK-SAME: <16 x i8> [[LHS:%.*]], <16 x i8> [[RHS:%.*]], i32 [[DELTA0]]) +; CHECK: [[SUBVECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0 +; CHECK: [[ELTBASE0:%.*]] = mul i32 [[SUBVECIDX0]], 4 +; CHECK: [[VECIDX00:%.*]] = add i32 [[ELTBASE0]], 0 +; CHECK: [[ELT00:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX00]] +; CHECK: [[VEC00:%.*]] = insertelement <4 x i8> poison, i8 [[ELT00]], i32 0 +; CHECK: [[VECIDX01:%.*]] = add i32 [[ELTBASE0]], 1 +; CHECK: [[ELT01:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX01]] +; CHECK: [[VEC01:%.*]] = insertelement <4 x i8> [[VEC00]], i8 [[ELT01]], i32 1 +; CHECK: [[VECIDX02:%.*]] = add i32 [[ELTBASE0]], 2 +; CHECK: [[ELT02:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX02]] +; CHECK: [[VEC02:%.*]] = insertelement <4 x i8> [[VEC01]], i8 [[ELT02]], i32 2 +; CHECK: [[VECIDX03:%.*]] = add i32 [[ELTBASE0]], 3 +; CHECK: [[ELT03:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX03]] +; CHECK: [[VEC03:%.*]] = insertelement <4 x i8> [[VEC02]], i8 [[ELT03]], i32 3 + +; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1 +; CHECK: [[SHUFF1:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_down_v16i8( +; CHECK-SAME: <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA1]]) +; CHECK: [[SUBVECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1 +; CHECK: [[ELTBASE1:%.*]] = mul i32 [[SUBVECIDX1]], 4 +; CHECK: [[VECIDX10:%.*]] = add i32 [[ELTBASE1]], 0 +; CHECK: [[ELT10:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX10]] +; CHECK: [[VEC10:%.*]] = insertelement <4 x i8> poison, i8 [[ELT10]], i32 0 +; CHECK: [[VECIDX11:%.*]] = add i32 [[ELTBASE1]], 1 +; CHECK: [[ELT11:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX11]] +; CHECK: [[VEC11:%.*]] = insertelement <4 x i8> [[VEC10]], i8 [[ELT11]], i32 1 +; CHECK: [[VECIDX12:%.*]] = add i32 [[ELTBASE1]], 2 +; CHECK: [[ELT12:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX12]] +; CHECK: [[VEC12:%.*]] = insertelement <4 x i8> [[VEC11]], i8 [[ELT12]], i32 2 +; CHECK: [[VECIDX13:%.*]] = add i32 [[ELTBASE1]], 3 +; CHECK: [[ELT13:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX13]] +; CHECK: [[VEC13:%.*]] = insertelement <4 x i8> [[VEC12]], i8 [[ELT13]], i32 3 + +; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2 +; CHECK: [[SHUFF2:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_down_v16i8( +; CHECK-SAME: <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA2]]) +; CHECK: [[SUBVECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2 +; CHECK: [[ELTBASE2:%.*]] = mul i32 [[SUBVECIDX2]], 4 +; CHECK: [[VECIDX20:%.*]] = add i32 [[ELTBASE2]], 0 +; CHECK: [[ELT20:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX20]] +; CHECK: [[VEC20:%.*]] = insertelement <4 x i8> poison, i8 [[ELT20]], i32 0 +; CHECK: [[VECIDX21:%.*]] = add i32 [[ELTBASE2]], 1 +; CHECK: [[ELT21:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX21]] +; CHECK: [[VEC21:%.*]] = insertelement <4 x i8> [[VEC20]], i8 [[ELT21]], i32 1 +; CHECK: [[VECIDX22:%.*]] = add i32 [[ELTBASE2]], 2 +; CHECK: [[ELT22:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX22]] +; CHECK: [[VEC22:%.*]] = insertelement <4 x i8> [[VEC21]], i8 [[ELT22]], i32 2 +; CHECK: [[VECIDX23:%.*]] = add i32 [[ELTBASE2]], 3 +; CHECK: [[ELT23:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX23]] +; CHECK: [[VEC23:%.*]] = insertelement <4 x i8> [[VEC22]], i8 [[ELT23]], i32 3 + +; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> 
[[MUXDELTAS]], i32 3 +; CHECK: [[SHUFF3:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_down_v16i8( +; CHECK-SAME: <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA3]]) +; CHECK: [[SUBVECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3 +; CHECK: [[ELTBASE3:%.*]] = mul i32 [[SUBVECIDX3]], 4 +; CHECK: [[VECIDX30:%.*]] = add i32 [[ELTBASE3]], 0 +; CHECK: [[ELT30:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX30]] +; CHECK: [[VEC30:%.*]] = insertelement <4 x i8> poison, i8 [[ELT30]], i32 0 +; CHECK: [[VECIDX31:%.*]] = add i32 [[ELTBASE3]], 1 +; CHECK: [[ELT31:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX31]] +; CHECK: [[VEC31:%.*]] = insertelement <4 x i8> [[VEC30]], i8 [[ELT31]], i32 1 +; CHECK: [[VECIDX32:%.*]] = add i32 [[ELTBASE3]], 2 +; CHECK: [[ELT32:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX32]] +; CHECK: [[VEC32:%.*]] = insertelement <4 x i8> [[VEC31]], i8 [[ELT32]], i32 2 +; CHECK: [[VECIDX33:%.*]] = add i32 [[ELTBASE3]], 3 +; CHECK: [[ELT33:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX33]] +; CHECK: [[VEC33:%.*]] = insertelement <4 x i8> [[VEC32]], i8 [[ELT33]], i32 3 +define spir_kernel void @kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.lhs = getelementptr inbounds <4 x i8>, ptr %lhsptr, i64 %gid + %lhs = load <4 x i8>, ptr %arrayidx.lhs, align 4 + %arrayidx.rhs = getelementptr inbounds <4 x i8>, ptr %rhsptr, i64 %gid + %rhs = load <4 x i8>, ptr %arrayidx.rhs, align 4 + %shuffle_up = call <4 x i8> @__mux_sub_group_shuffle_down_v4i8(<4 x i8> %lhs, <4 x i8> %rhs, i32 2) + %arrayidx.out = getelementptr inbounds <4 x i8>, ptr %out, i64 %gid + store <4 x i8> %shuffle_up, ptr %arrayidx.out, align 4 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_delta(ptr %lhsptr, ptr %rhsptr, ptr %deltaptr, ptr %out) +; CHECK: [[LHS:%.*]] = load <4 x float>, ptr %arrayidx.lhs, align 4 +; CHECK: [[RHS:%.*]] = load <4 x float>, ptr %arrayidx.rhs, align 4 +; CHECK: [[DELTALD:%.*]] = load <4 x i32>, ptr %arrayidx.deltas, align 4 + +; CHECK: [[DELTAS:%.*]] = add <4 x i32> {{%.*}}, [[DELTALD]] +; CHECK: [[MUXIDS:%.*]] = udiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[VECELTS:%.*]] = urem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> [[MUXIDS]], {{%.*}} + +; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0 +; CHECK: [[SHUFF0:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA0]]) +; CHECK: [[VECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0 +; CHECK: [[ELT0:%.*]] = extractelement <4 x float> [[SHUFF0]], i32 [[VECIDX0]] + +; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1 +; CHECK: [[SHUFF1:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA1]]) +; CHECK: [[VECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1 +; CHECK: [[ELT1:%.*]] = extractelement <4 x float> [[SHUFF1]], i32 [[VECIDX1]] + +; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2 +; CHECK: [[SHUFF2:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA2]]) +; CHECK: [[VECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2 +; CHECK: [[ELT2:%.*]] = extractelement <4 x float> [[SHUFF2]], i32 
[[VECIDX2]] + +; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3 +; CHECK: [[SHUFF3:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA3]]) +; CHECK: [[VECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3 +; CHECK: [[ELT3:%.*]] = extractelement <4 x float> [[SHUFF3]], i32 [[VECIDX3]] +define spir_kernel void @kernel_varying_delta(ptr %lhsptr, ptr %rhsptr, ptr %deltaptr, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.lhs = getelementptr inbounds float, ptr %lhsptr, i64 %gid + %lhs = load float, ptr %arrayidx.lhs, align 4 + %arrayidx.rhs = getelementptr inbounds float, ptr %rhsptr, i64 %gid + %rhs = load float, ptr %arrayidx.rhs, align 4 + %arrayidx.deltas = getelementptr inbounds i32, ptr %deltaptr, i64 %gid + %delta = load i32, ptr %arrayidx.deltas, align 4 + %shuffle_up = call float @__mux_sub_group_shuffle_down_f32(float %lhs, float %rhs, i32 %delta) + %arrayidx.out = getelementptr inbounds float, ptr %out, i64 %gid + store float %shuffle_up, ptr %arrayidx.out, align 8 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare float @__mux_sub_group_shuffle_down_f32(float %prev, float %curr, i32 %delta) +declare <4 x i8> @__mux_sub_group_shuffle_down_v4i8(<4 x i8> %prev, <4 x i8> %curr, i32 %delta) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll new file mode 100644 index 0000000000000..779596da58a14 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll @@ -0,0 +1,242 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S \ +; RUN: --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) +; CHECK: [[LHS:%.*]] = load <4 x float>, ptr %arrayidx.lhs, align 4 +; CHECK: [[RHS:%.*]] = load <4 x float>, ptr %arrayidx.rhs, align 4 + +; CHECK: [[DELTAS:%.*]] = sub <4 x i32> {{%.*}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}} +; CHECK: [[QUOTIENT:%.*]] = sdiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[REMAINDER:%.*]] = srem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; CHECK: [[ARGXOR:%.*]] = xor <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[SIGNDIFF:%.*]] = icmp slt <4 x i32> [[ARGXOR]], zeroinitializer +; CHECK: [[REMNONZERO:%.*]] = icmp ne <4 x i32> [[REMAINDER]], zeroinitializer +; CHECK: [[CONDITION:%.*]] = and <4 x i1> [[REMNONZERO]], [[SIGNDIFF]] + +; CHECK: [[MIN1:%.*]] = sub <4 x i32> [[QUOTIENT]], {{<(i32 1(, )?)+>|splat \(i32 1\)}} +; CHECK: [[PLUSR:%.*]] = add <4 x i32> [[REMAINDER]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; CHECK: [[MUXIDS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[MIN1]], <4 x i32> [[QUOTIENT]] +; CHECK: [[VECELTS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[PLUSR]], <4 x i32> [[REMAINDER]] + +; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> {{%.*}}, [[MUXIDS]] + +; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0 +; CHECK: [[SHUFF0:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA0]]) +; CHECK: [[VECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0 +; CHECK: [[ELT0:%.*]] = extractelement <4 x float> [[SHUFF0]], i32 [[VECIDX0]] + +; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1 +; CHECK: [[SHUFF1:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA1]]) +; CHECK: [[VECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1 +; CHECK: [[ELT1:%.*]] = extractelement <4 x float> [[SHUFF1]], i32 [[VECIDX1]] + +; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2 +; CHECK: [[SHUFF2:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA2]]) +; CHECK: [[VECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2 +; CHECK: [[ELT2:%.*]] = extractelement <4 x float> [[SHUFF2]], i32 [[VECIDX2]] + +; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3 +; CHECK: [[SHUFF3:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA3]]) +; CHECK: [[VECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3 +; CHECK: [[ELT3:%.*]] = extractelement <4 x float> [[SHUFF3]], i32 [[VECIDX3]] +define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.lhs = getelementptr inbounds float, ptr %lhsptr, i64 %gid + %lhs = load float, ptr %arrayidx.lhs, align 4 + %arrayidx.rhs = getelementptr inbounds float, ptr %rhsptr, i64 %gid + %rhs = load float, ptr %arrayidx.rhs, align 4 + %shuffle_up = call float @__mux_sub_group_shuffle_up_f32(float %lhs, float %rhs, i32 1) + 
%arrayidx.out = getelementptr inbounds float, ptr %out, i64 %gid + store float %shuffle_up, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out) +; CHECK: [[DELTAS:%.*]] = sub <4 x i32> {{%.*}}, {{<(i32 2(, )?)+>|splat \(i32 2\)}} +; CHECK: [[QUOTIENT:%.*]] = sdiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[REMAINDER:%.*]] = srem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; CHECK: [[ARGXOR:%.*]] = xor <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[SIGNDIFF:%.*]] = icmp slt <4 x i32> [[ARGXOR]], zeroinitializer +; CHECK: [[REMNONZERO:%.*]] = icmp ne <4 x i32> [[REMAINDER]], zeroinitializer +; CHECK: [[CONDITION:%.*]] = and <4 x i1> [[REMNONZERO]], [[SIGNDIFF]] + +; CHECK: [[MIN1:%.*]] = sub <4 x i32> [[QUOTIENT]], {{<(i32 1(, )?)+>|splat \(i32 1\)}} +; CHECK: [[PLUSR:%.*]] = add <4 x i32> [[REMAINDER]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; CHECK: [[MUXIDS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[MIN1]], <4 x i32> [[QUOTIENT]] +; CHECK: [[VECELTS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[PLUSR]], <4 x i32> [[REMAINDER]] + +; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> {{%.*}}, [[MUXIDS]] + +; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0 +; CHECK: [[SHUFF0:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_up_v16i8( +; CHECK-SAME: <16 x i8> [[LHS:%.*]], <16 x i8> [[RHS:%.*]], i32 [[DELTA0]]) +; CHECK: [[SUBVECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0 +; CHECK: [[ELTBASE0:%.*]] = mul i32 [[SUBVECIDX0]], 4 +; CHECK: [[VECIDX00:%.*]] = add i32 [[ELTBASE0]], 0 +; CHECK: [[ELT00:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX00]] +; CHECK: [[VEC00:%.*]] = insertelement <4 x i8> poison, i8 [[ELT00]], i32 0 +; CHECK: [[VECIDX01:%.*]] = add i32 [[ELTBASE0]], 1 +; CHECK: [[ELT01:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX01]] +; CHECK: [[VEC01:%.*]] = insertelement <4 x i8> [[VEC00]], i8 [[ELT01]], i32 1 +; CHECK: [[VECIDX02:%.*]] = add i32 [[ELTBASE0]], 2 +; CHECK: [[ELT02:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX02]] +; CHECK: [[VEC02:%.*]] = insertelement <4 x i8> [[VEC01]], i8 [[ELT02]], i32 2 +; CHECK: [[VECIDX03:%.*]] = add i32 [[ELTBASE0]], 3 +; CHECK: [[ELT03:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX03]] +; CHECK: [[VEC03:%.*]] = insertelement <4 x i8> [[VEC02]], i8 [[ELT03]], i32 3 + +; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1 +; CHECK: [[SHUFF1:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_up_v16i8( +; CHECK-SAME: <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA1]]) +; CHECK: [[SUBVECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1 +; CHECK: [[ELTBASE1:%.*]] = mul i32 [[SUBVECIDX1]], 4 +; CHECK: [[VECIDX10:%.*]] = add i32 [[ELTBASE1]], 0 +; CHECK: [[ELT10:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX10]] +; CHECK: [[VEC10:%.*]] = insertelement <4 x i8> poison, i8 [[ELT10]], i32 0 +; CHECK: [[VECIDX11:%.*]] = add i32 [[ELTBASE1]], 1 +; CHECK: [[ELT11:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX11]] +; CHECK: [[VEC11:%.*]] = insertelement <4 x i8> [[VEC10]], i8 [[ELT11]], i32 1 +; CHECK: [[VECIDX12:%.*]] = add i32 [[ELTBASE1]], 2 +; CHECK: [[ELT12:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX12]] +; CHECK: [[VEC12:%.*]] = insertelement <4 x i8> [[VEC11]], i8 [[ELT12]], i32 2 +; CHECK: [[VECIDX13:%.*]] = add i32 [[ELTBASE1]], 3 +; CHECK: [[ELT13:%.*]] 
= extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX13]] +; CHECK: [[VEC13:%.*]] = insertelement <4 x i8> [[VEC12]], i8 [[ELT13]], i32 3 + +; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2 +; CHECK: [[SHUFF2:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_up_v16i8( +; CHECK-SAME: <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA2]]) +; CHECK: [[SUBVECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2 +; CHECK: [[ELTBASE2:%.*]] = mul i32 [[SUBVECIDX2]], 4 +; CHECK: [[VECIDX20:%.*]] = add i32 [[ELTBASE2]], 0 +; CHECK: [[ELT20:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX20]] +; CHECK: [[VEC20:%.*]] = insertelement <4 x i8> poison, i8 [[ELT20]], i32 0 +; CHECK: [[VECIDX21:%.*]] = add i32 [[ELTBASE2]], 1 +; CHECK: [[ELT21:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX21]] +; CHECK: [[VEC21:%.*]] = insertelement <4 x i8> [[VEC20]], i8 [[ELT21]], i32 1 +; CHECK: [[VECIDX22:%.*]] = add i32 [[ELTBASE2]], 2 +; CHECK: [[ELT22:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX22]] +; CHECK: [[VEC22:%.*]] = insertelement <4 x i8> [[VEC21]], i8 [[ELT22]], i32 2 +; CHECK: [[VECIDX23:%.*]] = add i32 [[ELTBASE2]], 3 +; CHECK: [[ELT23:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX23]] +; CHECK: [[VEC23:%.*]] = insertelement <4 x i8> [[VEC22]], i8 [[ELT23]], i32 3 + +; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3 +; CHECK: [[SHUFF3:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_up_v16i8( +; CHECK-SAME: <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA3]]) +; CHECK: [[SUBVECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3 +; CHECK: [[ELTBASE3:%.*]] = mul i32 [[SUBVECIDX3]], 4 +; CHECK: [[VECIDX30:%.*]] = add i32 [[ELTBASE3]], 0 +; CHECK: [[ELT30:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX30]] +; CHECK: [[VEC30:%.*]] = insertelement <4 x i8> poison, i8 [[ELT30]], i32 0 +; CHECK: [[VECIDX31:%.*]] = add i32 [[ELTBASE3]], 1 +; CHECK: [[ELT31:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX31]] +; CHECK: [[VEC31:%.*]] = insertelement <4 x i8> [[VEC30]], i8 [[ELT31]], i32 1 +; CHECK: [[VECIDX32:%.*]] = add i32 [[ELTBASE3]], 2 +; CHECK: [[ELT32:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX32]] +; CHECK: [[VEC32:%.*]] = insertelement <4 x i8> [[VEC31]], i8 [[ELT32]], i32 2 +; CHECK: [[VECIDX33:%.*]] = add i32 [[ELTBASE3]], 3 +; CHECK: [[ELT33:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX33]] +; CHECK: [[VEC33:%.*]] = insertelement <4 x i8> [[VEC32]], i8 [[ELT33]], i32 3 +define spir_kernel void @kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.lhs = getelementptr inbounds <4 x i8>, ptr %lhsptr, i64 %gid + %lhs = load <4 x i8>, ptr %arrayidx.lhs, align 4 + %arrayidx.rhs = getelementptr inbounds <4 x i8>, ptr %rhsptr, i64 %gid + %rhs = load <4 x i8>, ptr %arrayidx.rhs, align 4 + %shuffle_up = call <4 x i8> @__mux_sub_group_shuffle_up_v4i8(<4 x i8> %lhs, <4 x i8> %rhs, i32 2) + %arrayidx.out = getelementptr inbounds <4 x i8>, ptr %out, i64 %gid + store <4 x i8> %shuffle_up, ptr %arrayidx.out, align 4 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_delta(ptr %lhsptr, ptr %rhsptr, ptr %deltaptr, ptr %out) +; CHECK: [[LHS:%.*]] = load <4 x float>, ptr %arrayidx.lhs, align 4 +; CHECK: [[RHS:%.*]] = load <4 x float>, ptr %arrayidx.rhs, align 4 +; CHECK: [[DELTALD:%.*]] = load <4 x i32>, ptr %arrayidx.deltas, align 4 + +; CHECK: [[DELTAS:%.*]] = sub <4 x i32> {{%.*}}, 
[[DELTALD]] +; CHECK: [[QUOTIENT:%.*]] = sdiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[REMAINDER:%.*]] = srem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; CHECK: [[ARGXOR:%.*]] = xor <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[SIGNDIFF:%.*]] = icmp slt <4 x i32> [[ARGXOR]], zeroinitializer +; CHECK: [[REMNONZERO:%.*]] = icmp ne <4 x i32> [[REMAINDER]], zeroinitializer +; CHECK: [[CONDITION:%.*]] = and <4 x i1> [[REMNONZERO]], [[SIGNDIFF]] + +; CHECK: [[MIN1:%.*]] = sub <4 x i32> [[QUOTIENT]], {{<(i32 1(, )?)+>|splat \(i32 1\)}} +; CHECK: [[PLUSR:%.*]] = add <4 x i32> [[REMAINDER]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; CHECK: [[MUXIDS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[MIN1]], <4 x i32> [[QUOTIENT]] +; CHECK: [[VECELTS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[PLUSR]], <4 x i32> [[REMAINDER]] + +; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> {{%.*}}, [[MUXIDS]] + +; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0 +; CHECK: [[SHUFF0:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA0]]) +; CHECK: [[VECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0 +; CHECK: [[ELT0:%.*]] = extractelement <4 x float> [[SHUFF0]], i32 [[VECIDX0]] + +; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1 +; CHECK: [[SHUFF1:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA1]]) +; CHECK: [[VECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1 +; CHECK: [[ELT1:%.*]] = extractelement <4 x float> [[SHUFF1]], i32 [[VECIDX1]] + +; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2 +; CHECK: [[SHUFF2:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA2]]) +; CHECK: [[VECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2 +; CHECK: [[ELT2:%.*]] = extractelement <4 x float> [[SHUFF2]], i32 [[VECIDX2]] + +; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3 +; CHECK: [[SHUFF3:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA3]]) +; CHECK: [[VECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3 +; CHECK: [[ELT3:%.*]] = extractelement <4 x float> [[SHUFF3]], i32 [[VECIDX3]] +define spir_kernel void @kernel_varying_delta(ptr %lhsptr, ptr %rhsptr, ptr %deltaptr, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.lhs = getelementptr inbounds float, ptr %lhsptr, i64 %gid + %lhs = load float, ptr %arrayidx.lhs, align 4 + %arrayidx.rhs = getelementptr inbounds float, ptr %rhsptr, i64 %gid + %rhs = load float, ptr %arrayidx.rhs, align 4 + %arrayidx.deltas = getelementptr inbounds i32, ptr %deltaptr, i64 %gid + %delta = load i32, ptr %arrayidx.deltas, align 4 + %shuffle_up = call float @__mux_sub_group_shuffle_up_f32(float %lhs, float %rhs, i32 %delta) + %arrayidx.out = getelementptr inbounds float, ptr %out, i64 %gid + store float %shuffle_up, ptr %arrayidx.out, align 8 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare float @__mux_sub_group_shuffle_up_f32(float %prev, float %curr, i32 %delta) +declare <4 x i8> @__mux_sub_group_shuffle_up_v4i8(<4 x i8> %prev, <4 x i8> %curr, i32 %delta) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll new file mode 100644 index 0000000000000..c1aaca731d2cd --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll @@ -0,0 +1,231 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S \ +; RUN: --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_data_const_value(ptr %in, ptr %out) +; The XOR'd sub-group local IDs +; CHECK: [[XORIDS:%.*]] = xor <4 x i32> +; Which mux sub-group each of the XOR'd sub-group local IDs correspond to +; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; Which vector group element each of the XOR'd sub-group local IDs correspond to +; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; Extract the first XOR'd vector-local sub-group local ID from the vector of vector indices +; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0 +; Extract the data element that this XOR'd local ID corresponds to +; CHECK: [[ELT0:%.*]] = extractelement <4 x half> [[DATA:%.*]], i32 [[IDXELT0]] +; Extract the first XOR'd mux-local sub-group local ID from the vector of mux indices +; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0 +; Shuffle across any hardware sub-group +; CHECK: [[SHUFF_ELT0:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT0]], i32 [[ID0]]) +; Put that result into the final vector +; CHECK: [[SHUFF_VEC0:%.*]] = insertelement <4 x half> poison, half [[SHUFF_ELT0]], i32 0 + +; And so on for the other shuffle values +; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1 +; CHECK: [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT1]] +; CHECK: [[ID1:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 1 +; CHECK: [[SHUFF_ELT1:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT1]], i32 [[ID1]]) +; CHECK: [[SHUFF_VEC1:%.*]] = insertelement <4 x half> [[SHUFF_VEC0]], half [[SHUFF_ELT1]], i32 1 + +; CHECK: [[IDXELT2:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 2 +; CHECK: [[ELT2:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT2]] +; CHECK: [[ID2:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 2 +; CHECK: [[SHUFF_ELT2:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT2]], i32 [[ID2]]) +; CHECK: [[SHUFF_VEC2:%.*]] = insertelement <4 x half> [[SHUFF_VEC1]], half [[SHUFF_ELT2]], i32 2 + +; CHECK: [[IDXELT3:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 3 +; CHECK: [[ELT3:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT3]] +; CHECK: [[ID3:%.*]] = 
extractelement <4 x i32> [[MUXXORIDS]], i32 3 +; CHECK: [[SHUFF_ELT3:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT3]], i32 [[ID3]]) +; CHECK: [[SHUFF_VEC3:%.*]] = insertelement <4 x half> [[SHUFF_VEC2]], half [[SHUFF_ELT3]], i32 3 + +; CHECK: store <4 x half> [[SHUFF_VEC3]], +define spir_kernel void @kernel_varying_data_const_value(ptr %in, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.in = getelementptr inbounds half, ptr %in, i64 %gid + %data = load half, ptr %arrayidx.in, align 2 + %shuffle1 = call half @__mux_sub_group_shuffle_xor_f16(half %data, i32 4) + %arrayidx.out = getelementptr inbounds half, ptr %out, i64 %gid + store half %shuffle1, ptr %arrayidx.out, align 2 + ret void +} + +; This should just be the same as the previous kernel. The uniform value doesn't change anything. +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_data_uniform_value(ptr %in, i32 %val, ptr %out) +; CHECK: [[XORIDS:%.*]] = xor <4 x i32> +; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0 +; CHECK: [[ELT0:%.*]] = extractelement <4 x half> [[DATA:%.*]], i32 [[IDXELT0]] +; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0 +; CHECK: [[SHUFF_ELT0:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT0]], i32 [[ID0]]) +; CHECK: [[SHUFF_VEC0:%.*]] = insertelement <4 x half> poison, half [[SHUFF_ELT0]], i32 0 +; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1 +; CHECK: [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT1]] +; CHECK: [[ID1:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 1 +; CHECK: [[SHUFF_ELT1:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT1]], i32 [[ID1]]) +; CHECK: [[SHUFF_VEC1:%.*]] = insertelement <4 x half> [[SHUFF_VEC0]], half [[SHUFF_ELT1]], i32 1 +; CHECK: [[IDXELT2:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 2 +; CHECK: [[ELT2:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT2]] +; CHECK: [[ID2:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 2 +; CHECK: [[SHUFF_ELT2:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT2]], i32 [[ID2]]) +; CHECK: [[SHUFF_VEC2:%.*]] = insertelement <4 x half> [[SHUFF_VEC1]], half [[SHUFF_ELT2]], i32 2 +; CHECK: [[IDXELT3:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 3 +; CHECK: [[ELT3:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT3]] +; CHECK: [[ID3:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 3 +; CHECK: [[SHUFF_ELT3:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT3]], i32 [[ID3]]) +; CHECK: [[SHUFF_VEC3:%.*]] = insertelement <4 x half> [[SHUFF_VEC2]], half [[SHUFF_ELT3]], i32 3 +; CHECK: store <4 x half> [[SHUFF_VEC3]], +define spir_kernel void @kernel_varying_data_uniform_value(ptr %in, i32 %val, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.in = getelementptr inbounds half, ptr %in, i64 %gid + %data = load half, ptr %arrayidx.in, align 2 + %shuffle2 = call half @__mux_sub_group_shuffle_xor_f16(half %data, i32 %val) + %arrayidx.out = getelementptr inbounds half, ptr %out, i64 %gid + store half %shuffle2, ptr %arrayidx.out, align 2 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_data_uniform_value(half %data, i32 %val, ptr %out) +; CHECK: [[SPLATINS:%.*]] = insertelement <4 x half> poison, half %data, 
i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x half> [[SPLATINS]], <4 x half> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x half> [[SPLAT]] +define spir_kernel void @kernel_uniform_data_uniform_value(half %data, i32 %val, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %shuffle3 = call half @__mux_sub_group_shuffle_xor_f16(half %data, i32 %val) + %arrayidx.out = getelementptr inbounds half, ptr %out, i64 %gid + store half %shuffle3, ptr %arrayidx.out, align 2 + ret void +} + +; This should just be the same as the previous kernel. The varying value doesn't change anything. +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_data_varying_value(ptr %in, ptr %vals, ptr %out) +; CHECK: [[XORIDS:%.*]] = xor <4 x i32> +; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0 +; CHECK: [[ELT0:%.*]] = extractelement <4 x half> [[DATA:%.*]], i32 [[IDXELT0]] +; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0 +; CHECK: [[SHUFF_ELT0:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT0]], i32 [[ID0]]) +; CHECK: [[SHUFF_VEC0:%.*]] = insertelement <4 x half> poison, half [[SHUFF_ELT0]], i32 0 +; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1 +; CHECK: [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT1]] +; CHECK: [[ID1:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 1 +; CHECK: [[SHUFF_ELT1:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT1]], i32 [[ID1]]) +; CHECK: [[SHUFF_VEC1:%.*]] = insertelement <4 x half> [[SHUFF_VEC0]], half [[SHUFF_ELT1]], i32 1 +; CHECK: [[IDXELT2:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 2 +; CHECK: [[ELT2:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT2]] +; CHECK: [[ID2:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 2 +; CHECK: [[SHUFF_ELT2:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT2]], i32 [[ID2]]) +; CHECK: [[SHUFF_VEC2:%.*]] = insertelement <4 x half> [[SHUFF_VEC1]], half [[SHUFF_ELT2]], i32 2 +; CHECK: [[IDXELT3:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 3 +; CHECK: [[ELT3:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT3]] +; CHECK: [[ID3:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 3 +; CHECK: [[SHUFF_ELT3:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT3]], i32 [[ID3]]) +; CHECK: [[SHUFF_VEC3:%.*]] = insertelement <4 x half> [[SHUFF_VEC2]], half [[SHUFF_ELT3]], i32 3 +; CHECK: store <4 x half> [[SHUFF_VEC3]], +define spir_kernel void @kernel_varying_data_varying_value(ptr %in, ptr %vals, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.in = getelementptr inbounds half, ptr %in, i64 %gid + %data = load half, ptr %arrayidx.in, align 2 + %arrayidx.vals = getelementptr inbounds i32, ptr %in, i64 %gid + %val = load i32, ptr %arrayidx.vals, align 4 + %shuffle4 = call half @__mux_sub_group_shuffle_xor_f16(half %data, i32 %val) + %arrayidx.out = getelementptr inbounds half, ptr %out, i64 %gid + store half %shuffle4, ptr %arrayidx.out, align 2 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_vec_data_varying_value(ptr %in, ptr %vals, ptr %out) +; CHECK: [[XORIDS:%.*]] = xor <4 x i32> +; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> 
[[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0 +; CHECK: [[MULIDXELT0:%.*]] = mul i32 [[IDXELT0]], 2 +; CHECK: [[MADIDXELT00:%.*]] = add i32 [[MULIDXELT0]], 0 +; CHECK: [[ELT00:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT00]] +; CHECK: [[DATAELT00:%.*]] = insertelement <2 x float> poison, float [[ELT00]], i32 0 +; CHECK: [[MADIDXELT01:%.*]] = add i32 [[MULIDXELT0]], 1 +; CHECK: [[ELT01:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT01]] +; CHECK: [[DATAELT01:%.*]] = insertelement <2 x float> [[DATAELT00]], float [[ELT01]], i32 1 +; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0 +; CHECK: [[SHUFF_ELT0:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[DATAELT01]], i32 [[ID0]]) +; CHECK: [[SHUFF_RES0:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32( +; CHECK-SAME: <8 x float> poison, <2 x float> [[SHUFF_ELT0]], i64 0) + +; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1 +; CHECK: [[MULIDXELT1:%.*]] = mul i32 [[IDXELT1]], 2 +; CHECK: [[MADIDXELT10:%.*]] = add i32 [[MULIDXELT1]], 0 +; CHECK: [[ELT10:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT10]] +; CHECK: [[DATAELT10:%.*]] = insertelement <2 x float> poison, float [[ELT10]], i32 0 +; CHECK: [[MADIDXELT11:%.*]] = add i32 [[MULIDXELT1]], 1 +; CHECK: [[ELT11:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT11]] +; CHECK: [[DATAELT11:%.*]] = insertelement <2 x float> [[DATAELT10]], float [[ELT11]], i32 1 +; CHECK: [[ID1:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 1 +; CHECK: [[SHUFF_ELT1:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[DATAELT11]], i32 [[ID1]]) +; CHECK: [[SHUFF_RES1:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32( +; CHECK-SAME: <8 x float> [[SHUFF_RES0]], <2 x float> [[SHUFF_ELT1]], i64 2) + +; CHECK: [[IDXELT2:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 2 +; CHECK: [[MULIDXELT2:%.*]] = mul i32 [[IDXELT2]], 2 +; CHECK: [[MADIDXELT20:%.*]] = add i32 [[MULIDXELT2]], 0 +; CHECK: [[ELT20:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT20]] +; CHECK: [[DATAELT20:%.*]] = insertelement <2 x float> poison, float [[ELT20]], i32 0 +; CHECK: [[MADIDXELT21:%.*]] = add i32 [[MULIDXELT2]], 1 +; CHECK: [[ELT21:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT21]] +; CHECK: [[DATAELT21:%.*]] = insertelement <2 x float> [[DATAELT20]], float [[ELT21]], i32 1 +; CHECK: [[ID2:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 2 +; CHECK: [[SHUFF_ELT2:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[DATAELT21]], i32 [[ID2]]) +; CHECK: [[SHUFF_RES2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32( +; CHECK-SAME: <8 x float> [[SHUFF_RES1]], <2 x float> [[SHUFF_ELT2]], i64 4) + +; CHECK: [[IDXELT3:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 3 +; CHECK: [[MULIDXELT3:%.*]] = mul i32 [[IDXELT3]], 2 +; CHECK: [[MADIDXELT30:%.*]] = add i32 [[MULIDXELT3]], 0 +; CHECK: [[ELT30:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT30]] +; CHECK: [[DATAELT30:%.*]] = insertelement <2 x float> poison, float [[ELT30]], i32 0 +; CHECK: [[MADIDXELT31:%.*]] = add i32 [[MULIDXELT3]], 1 +; CHECK: [[ELT31:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT31]] +; CHECK: [[DATAELT31:%.*]] = insertelement <2 x float> [[DATAELT30]], float [[ELT31]], i32 1 +; CHECK: [[ID3:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], 
i32 3 +; CHECK: [[SHUFF_ELT3:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[DATAELT31]], i32 [[ID3]]) +; CHECK: [[SHUFF_RES3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32( +; CHECK-SAME: <8 x float> [[SHUFF_RES2]], <2 x float> [[SHUFF_ELT3]], i64 6) + +; CHECK: store <8 x float> [[SHUFF_RES3]] +define spir_kernel void @kernel_varying_vec_data_varying_value(ptr %in, ptr %vals, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.in = getelementptr inbounds <2 x float>, ptr %in, i64 %gid + %data = load <2 x float>, ptr %arrayidx.in, align 8 + %arrayidx.vals = getelementptr inbounds i32, ptr %in, i64 %gid + %val = load i32, ptr %arrayidx.vals, align 4 + %shuffle5 = call <2 x float> @__mux_sub_group_shuffle_xor_v2f32(<2 x float> %data, i32 %val) + %arrayidx.out = getelementptr inbounds <2 x float>, ptr %out, i64 %gid + store <2 x float> %shuffle5, ptr %arrayidx.out, align 8 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare half @__mux_sub_group_shuffle_xor_f16(half, i32) +declare <2 x float> @__mux_sub_group_shuffle_xor_v2f32(<2 x float>, i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll new file mode 100644 index 0000000000000..2a8464528d01d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll @@ -0,0 +1,118 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes=ternary-transform,verify -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_positive(i64 %a, i64 %b, i64* %c) {
+entry:
+  %gid = call i64 @__mux_get_global_id(i32 0)
+  %cond = icmp eq i64 %a, %gid
+  %c0 = getelementptr i64, i64* %c, i64 %gid
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 0
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1
+  %c3 = getelementptr i64, i64* %c2, i64 %gid
+  store i64 1, i64* %c3, align 4
+  ret void
+}
+
+define spir_kernel void @test_positive_gep_different_type(i64 %a, i64 %b, i8* %c) {
+entry:
+  %gid = call i64 @__mux_get_global_id(i32 0)
+  %cond = icmp eq i64 %a, %gid
+  %c0 = getelementptr i64, i64* %c, i64 %gid
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 0
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1
+  %c3 = getelementptr i8, i8* %c2, i64 %gid
+  store i8 1, i8* %c3, align 4
+  ret void
+}
+
+define spir_kernel void @test_negative(i64 %a, i64 %b, i64* %c) {
+entry:
+  %gid = call i64 @__mux_get_global_id(i32 0)
+  %cond = icmp eq i64 %a, %gid
+  %c0 = getelementptr i64, i64* %c, i64 %gid
+  %c1 = getelementptr i64, i64* %c, i64 0
+  %c2 = select i1 %cond, i64* %c0, i64* %c1
+  store i64 %b, i64* %c2, align 4
+  ret void
+}
+
+define spir_kernel void @test_vector_scalar_cond(i64 %a, <2 x i32> %b, <2 x i32>* %c) {
+entry:
+  %gid = call i64 @__mux_get_global_id(i32 0)
+  %cond = icmp eq i64 %a, %gid
+  %c0 = getelementptr <2 x i32>, <2 x i32>* %c, i64 %gid
+  %c1 = getelementptr <2 x i32>, <2 x i32>* %c, i64 0
+  %c2 = select i1 %cond, <2 x i32>* %c0, <2 x i32>* %c1
+  %c3 = getelementptr <2 x i32>, <2 x i32>* %c2, i64 %gid
+  store <2 x i32> <i32 0, i32 1>, <2 x i32>* %c3, align 4
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_positive(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: store i64 %b, ptr %c0, align 4
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 0
+; CHECK: store i64 0, ptr %c1, align 4
+; CHECK: %[[XOR:.+]] = xor i1 %cond, true
+; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid
+; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond)
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]])
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_positive_gep_different_type(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: store i64 %b, ptr %c0, align 4
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 0
+; CHECK: store i64 0, ptr %c1, align 4
+; CHECK: %[[XOR:.+]] = xor i1 %cond, true
+; CHECK: %[[GEP1:.+]] = getelementptr i8, ptr %c0, i64 %gid
+; CHECK: %[[GEP2:.+]] = getelementptr i8, ptr %c1, i64 %gid
+; CHECK: call void @__vecz_b_masked_store4_hu3ptrb(i8 1, ptr %[[GEP1]], i1 %cond)
+; CHECK: call void @__vecz_b_masked_store4_hu3ptrb(i8 1, ptr %[[GEP2]], i1 %[[XOR]])
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_negative(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 0
+; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1
+; CHECK: store i64 %b, ptr %c2, align 4
+
+; Note: we don't perform this transform on vector accesses.
+; CHECK: define spir_kernel void @__vecz_v4_test_vector_scalar_cond(i64 %a, <2 x i32> %b, ptr %c)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr <2 x i32>, ptr %c, i64 %gid
+; CHECK: %c1 = getelementptr <2 x i32>, ptr %c, i64 0
+; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1
+; CHECK: %c3 = getelementptr <2 x i32>, ptr %c2, i64 %gid
+; CHECK: store <2 x i32> <i32 0, i32 1>, ptr %c3, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
new file mode 100644
index 0000000000000..69756d0886cc3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
@@ -0,0 +1,54 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) {
+entry:
+  %gid = call i64 @__mux_get_global_id(i32 0)
+  %gid_shift = shl i64 %gid, 1
+  %cond = icmp eq i64 %a, %gid
+  %c0 = getelementptr i64, i64* %c, i64 %gid
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 %gid_shift
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1
+  %c3 = getelementptr i64, i64* %c2, i64 %gid
+  store i64 1, i64* %c3, align 4
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; This checks that the ternary transform is applied when the source GEPs have
+; constant strides, even though they are different.
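+;
+; The transform handles each arm of the select separately, so each arm only
+; needs a constant stride of its own: %c0 advances by one element per
+; work-item and %c1 by two. Roughly (the register names below are
+; illustrative, not the literal pass output), the tail of the kernel becomes:
+;
+;   %not = xor i1 %cond, true
+;   %t = getelementptr i64, ptr %c0, i64 %gid
+;   %f = getelementptr i64, ptr %c1, i64 %gid
+;   call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %t, i1 %cond)
+;   call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %f, i1 %not)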
+ +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %gid_shift = shl i64 %gid, 1 +; CHECK: %cond = icmp eq i64 %a, %gid +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_shift +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %[[XOR:.+]] = xor i1 %cond, true +; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid +; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond) +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll new file mode 100644 index 0000000000000..7636e5411a171 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll @@ -0,0 +1,54 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %gid_offset = add i64 %gid, 16 + %gid_mashed = xor i64 %gid, 12462 + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 %gid_offset + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid_mashed + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform pass is not applied when the GEP index +; is divergent, which would result in a scatter store regardless. 
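+;
+; Concretely, for global IDs 0..3 the final index %gid_mashed = %gid xor 12462
+; takes the values 12462, 12463, 12460 and 12461, so the lanes do not advance
+; by a constant stride and per-arm masked stores would not help.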
+ +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid_offset = add i64 %gid, 16 +; CHECK: %gid_mashed = xor i64 %gid, 12462 +; CHECK: %cond = icmp eq i64 %a, %gid +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_offset +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1 +; CHECK: %c3 = getelementptr i64, ptr %c2, i64 %gid_mashed +; CHECK: store i64 1, ptr %c3, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll new file mode 100644 index 0000000000000..02573c3ce0b59 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll @@ -0,0 +1,54 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %gid_offset = add i64 %gid, 16 + %gid_mashed = xor i64 %gid, 12462 + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 %gid_mashed + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform pass is not applied when a source GEP +; is divergent, which would result in a scatter store regardless. 
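+;
+; Here the trailing GEP index (%gid) is linear, but the false arm %c1 is
+; itself indexed by the non-linear %gid_mashed, so a masked store through
+; that arm would still have to scatter; the transform therefore bails out.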
+ +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid_offset = add i64 %gid, 16 +; CHECK: %gid_mashed = xor i64 %gid, 12462 +; CHECK: %cond = icmp eq i64 %a, %gid +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_mashed +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1 +; CHECK: %c3 = getelementptr i64, ptr %c2, i64 %gid +; CHECK: store i64 1, ptr %c3, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll new file mode 100644 index 0000000000000..fe73640be0612 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll @@ -0,0 +1,44 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_negative -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_negative(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 %gid + %c1 = getelementptr i64, i64* %c, i64 0 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + store i64 %b, i64* %c2, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform is not applied when the select is not +; accessed through an additional GEP. + +; CHECK: define spir_kernel void @__vecz_v4_test_negative(i64 %a, i64 %b, ptr %c) +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %cond = icmp eq i64 %a, %gid +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: %c1 = getelementptr i64, ptr %c, i64 0 +; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1 +; CHECK: store i64 %b, ptr %c2, align 4 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll new file mode 100644 index 0000000000000..6eff9b6ad58e4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll @@ -0,0 +1,54 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %gid_offset = add i64 %gid, 16 + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 %gid_offset + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform is applied when the source GEPs have +; equal constant strides. + +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %gid_offset = add i64 %gid, 16 +; CHECK: %cond = icmp eq i64 %a, %gid +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_offset +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %[[XOR:.+]] = xor i1 %cond, true +; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid +; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond) +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll new file mode 100644 index 0000000000000..8e88963b75871 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll @@ -0,0 +1,54 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %gid_shift = shl i64 %gid, 1 + %cond = icmp eq i64 %a, 0 + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 %gid_shift + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform is applied when the condition is +; uniform, and the source GEPs have different constant strides. + +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %gid_shift = shl i64 %gid, 1 +; CHECK: %cond = icmp eq i64 %a, 0 +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_shift +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %[[XOR:.+]] = xor i1 %cond, true +; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid +; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond) +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll new file mode 100644 index 0000000000000..3cee1ff3eb4b4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll @@ -0,0 +1,52 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %gid_offset = add i64 %gid, 16 + %cond = icmp eq i64 %a, 0 + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 %gid_offset + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform is not applied when the condition is +; uniform, and the two strides are the same. + +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid_offset = add i64 %gid, 16 +; CHECK: %cond = icmp eq i64 %a, 0 +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_offset +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1 +; CHECK: %c3 = getelementptr i64, ptr %c2, i64 %gid +; CHECK: store i64 1, ptr %c3, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll new file mode 100644 index 0000000000000..1f2b59b23456d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll @@ -0,0 +1,46 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform,packetizer -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %gid_offset = add i64 %gid, 16 + %cond = icmp eq i64 %a, 0 + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 %gid_offset + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 0 + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform is not applied when the condition is +; uniform and the two strides are equal, and that the result is a contiguous +; vector store. 
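+;
+; An illustrative sketch, not part of the original test: because %cond is
+; uniform and both arms of the select stride identically, the select resolves
+; to a single lane-invariant base pointer, so no per-lane masking is needed
+; and the packetizer can emit one contiguous vector store, roughly:
+;
+;   %p = select i1 %cond, ptr %c0, ptr %c1
+;   store <4 x i64> splat (i64 1), ptr %p, align 4
+;
+; The CHECK line below accepts both the <i64 1, ...> and the splat spellings
+; of the vector constant, since the printed form differs across LLVM versions.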
+ +; CHECK: %[[SELECT:.+]] = select i1 %cond, ptr %c0, ptr %c1 +; CHECK: %[[BASE:.+]] = getelementptr i64, ptr %[[SELECT]], i64 0 +; CHECK: store <4 x i64> {{<(i64 1(, )?)+>|splat \(i64 1\)}}, ptr %[[BASE]], align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll new file mode 100644 index 0000000000000..a9d1a37b305b8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll @@ -0,0 +1,52 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 0 + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform is applied when one of the source GEPs +; is uniform + +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %cond = icmp eq i64 %a, %gid +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 0 +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %[[XOR:.+]] = xor i1 %cond, true +; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid +; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond) +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll new file mode 100644 index 0000000000000..b577f149f82e3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll @@ -0,0 +1,52 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 1 + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 0 + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform is applied when the source GEPs are +; both uniform. + +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %cond = icmp eq i64 %a, %gid +; CHECK: %c0 = getelementptr i64, ptr %c, i64 1 +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 0 +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %[[XOR:.+]] = xor i1 %cond, true +; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid +; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond) +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll new file mode 100644 index 0000000000000..8b5d83c3b6835 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll @@ -0,0 +1,118 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=128 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +; CHECK-LABEL: define spir_kernel void @__vecz_v128_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) +; CHECK: = load <128 x i32>, ptr addrspace(1) +; CHECK: = load <128 x i32>, ptr addrspace(1) +; CHECK: = add nsw <128 x i32> +; CHECK: store <128 x i32> +define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) #0 !dbg !4 { +entry: + %in1.addr = alloca i32 addrspace(1)*, align 8 + %in2.addr = alloca i32 addrspace(1)*, align 8 + %out.addr = alloca i32 addrspace(1)*, align 8 + %tid = alloca i64, align 8 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + store i32 addrspace(1)* %in1, i32 addrspace(1)** %in1.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in1.addr, metadata !11, metadata !29), !dbg !30 + store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in2.addr, metadata !12, metadata !29), !dbg !30 + store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30 + call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31 + %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31 + store i64 %call, i64* %tid, align 8, !dbg !31 + call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32 + %0 = load i64, i64* %tid, align 8, !dbg !32 + %1 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8, !dbg !32 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 %0, !dbg !32 + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !32 + store i32 %2, i32* %a, align 4, !dbg !32 + call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !29), !dbg !33 + %3 = load i64, i64* %tid, align 8, !dbg !33 + %4 = load i32 addrspace(1)*, i32 addrspace(1)** %in2.addr, align 8, !dbg !33 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %4, i64 %3, !dbg !33 + %5 = load i32, i32 addrspace(1)* %arrayidx1, align 4, !dbg !33 + store i32 %5, i32* %b, align 4, !dbg !33 + %6 = load i32, i32* %a, align 4, !dbg !34 + %7 = load i32, i32* %b, align 4, !dbg !34 + %add = add nsw i32 %6, %7, !dbg !34 + %8 = load i64, i64* %tid, align 8, !dbg !34 + %9 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !34 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %8, !dbg !34 + store i32 %add, i32 addrspace(1)* %arrayidx2, align 4, !dbg !34 + ret void, !dbg !35 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +declare i64 @__mux_get_global_id(i32) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" 
"unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nobuiltin } + +!llvm.dbg.cu = !{!0} +!opencl.kernels = !{!21} +!llvm.module.flags = !{!27} +!llvm.ident = !{!28} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2) +!1 = !DIFile(filename: "", directory: "/tmp") +!2 = !{} +!3 = !{!4} +!4 = distinct !DISubprogram(name: "add", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !10) +!5 = !DIFile(filename: "kernel.opencl", directory: "/tmp") +!6 = !DISubroutineType(types: !7) +!7 = !{null, !8, !8, !8} +!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, align: 64) +!9 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!10 = !{!11, !12, !13, !14, !19, !20} +!11 = !DILocalVariable(name: "in1", arg: 1, scope: !4, file: !5, line: 1, type: !8) +!12 = !DILocalVariable(name: "in2", arg: 2, scope: !4, file: !5, line: 1, type: !8) +!13 = !DILocalVariable(name: "out", arg: 3, scope: !4, file: !5, line: 1, type: !8) +!14 = !DILocalVariable(name: "tid", scope: !4, file: !5, line: 3, type: !15) +!15 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !16, line: 33, baseType: !17) +!16 = !DIFile(filename: "/Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "/tmp") +!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !16, line: 31, baseType: !18) +!18 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) +!19 = !DILocalVariable(name: "a", scope: !4, file: !5, line: 5, type: !9) +!20 = !DILocalVariable(name: "b", scope: !4, file: !5, line: 6, type: !9) +!21 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*)* @add, !22, !23, !24, !25, !26} +!22 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1} +!23 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!24 = !{!"kernel_arg_type", !"int*", !"int*", !"int*"} +!25 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*"} +!26 = !{!"kernel_arg_type_qual", !"", !"", !""} +!27 = !{i32 2, !"Debug Info Version", i32 3} +!28 = !{!"clang version 3.8.0 "} +!29 = !DIExpression() +!30 = !DILocation(line: 1, scope: !4) +!31 = !DILocation(line: 3, scope: !4) +!32 = !DILocation(line: 5, scope: !4) +!33 = !DILocation(line: 6, scope: !4) +!34 = !DILocation(line: 7, scope: !4) +!35 = !DILocation(line: 8, scope: !4) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll new file mode 100644 index 0000000000000..0f667a71134e5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll @@ -0,0 +1,40 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=scalarizer -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +; CHECK-LABEL: define spir_kernel void @__vecz_v4_add(ptr %in1, ptr %in2, ptr %out) +; CHECK-COUNT-128: = extractelement <128 x i32> %in1v, +; CHECK-COUNT-128: insertelement <128 x i32> +define spir_kernel void @add(<128 x i32>* %in1, <128 x i32>* %in2, <128 x i32>* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %in1p = getelementptr inbounds <128 x i32>, <128 x i32>* %in1, i64 %call + %in1v = load <128 x i32>, <128 x i32>* %in1p, align 4 + %in2p = getelementptr inbounds <128 x i32>, <128 x i32>* %in2, i64 %call + %in2v = load <128 x i32>, <128 x i32>* %in2p, align 4 + %add = add nsw <128 x i32> %in1v, %in2v + %outp = getelementptr inbounds <128 x i32>, <128 x i32>* %out, i64 %call + store <128 x i32> %add, <128 x i32>* %outp, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) #2 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll new file mode 100644 index 0000000000000..d7b37641357b8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k uniform_address_index -w 4 -S < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+define spir_kernel void @uniform_address_index(i32 addrspace(1)* nocapture readonly %in, i32 addrspace(1)* nocapture %out, i32 %a, i32 %b) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #2
+  %0 = icmp eq i32 %a, -2147483648
+  %1 = icmp eq i32 %b, -1
+  %2 = and i1 %0, %1
+  %3 = icmp eq i32 %b, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %b
+  %div = sdiv i32 %a, %5
+  %6 = trunc i64 %call to i32
+  %conv1 = add i32 %div, %6
+  %idxprom = sext i32 %conv1 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %7 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %7, i32 addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr #1
+
+; This test ensures that the array index is correctly identified as having a
+; uniform stride, so that plain vector loads and stores are generated rather
+; than gather/scatter builtin calls.
+
+; CHECK: define spir_kernel void @__vecz_v4_uniform_address_index
+; CHECK: entry:
+; CHECK: call i64 @__mux_get_global_id(i32 0)
+; CHECK-DAG: %[[INA:.+]] = getelementptr i32, ptr addrspace(1) %in, i32 %[[X:.+]]
+; CHECK-DAG: %[[LOAD:.+]] = load <4 x i32>, ptr addrspace(1) %[[INA]]
+; CHECK-DAG: %[[OUTA:.+]] = getelementptr i32, ptr addrspace(1) %out, i32 %[[X:.+]]
+; CHECK-DAG: store <4 x i32> %[[LOAD]], ptr addrspace(1) %[[OUTA]]
+; CHECK-NOT: call <4 x i32>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
new file mode 100644
index 0000000000000..d7b37641357b8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
@@ -0,0 +1,56 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k uniform_address_index -w 4 -S < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+define spir_kernel void @uniform_address_index(i32 addrspace(1)* nocapture readonly %in, i32 addrspace(1)* nocapture %out, i32 %a, i32 %b) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #2
+  %0 = icmp eq i32 %a, -2147483648
+  %1 = icmp eq i32 %b, -1
+  %2 = and i1 %0, %1
+  %3 = icmp eq i32 %b, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %b
+  %div = sdiv i32 %a, %5
+  %6 = trunc i64 %call to i32
+  %conv1 = add i32 %div, %6
+  %idxprom = sext i32 %conv1 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %7 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %7, i32 addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr #1
+
+; This test ensures that the array index is correctly identified as having a
+; uniform stride, so that plain vector loads and stores are generated rather
+; than gather/scatter builtin calls.
+
+; CHECK: define spir_kernel void @__vecz_v4_uniform_address_index
+; CHECK: entry:
+; CHECK: call i64 @__mux_get_global_id(i32 0)
+; CHECK-DAG: %[[INA:.+]] = getelementptr i32, ptr addrspace(1) %in, i32 %[[X:.+]]
+; CHECK-DAG: %[[LOAD:.+]] = load <4 x i32>, ptr addrspace(1) %[[INA]]
+; CHECK-DAG: %[[OUTA:.+]] = getelementptr i32, ptr addrspace(1) %out, i32 %[[X:.+]]
+; CHECK-DAG: store <4 x i32> %[[LOAD]], ptr addrspace(1) %[[OUTA]]
+; CHECK-NOT: call <4 x i32>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
new file mode 100644
index 0000000000000..86e3d6145c4c3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
@@ -0,0 +1,45 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_local_size(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %size = call i32 @__mux_get_local_size(i32 0) + br label %loop + +loop: + %index = phi i32 [0, %entry], [%inc, %loop] + %load = load i32, i32 addrspace(1)* %in + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %index + store i32 %load, i32 addrspace(1)* %slot + %inc = add i32 %index, 1 + %cmp = icmp ne i32 %inc, %size + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK-NOT: define spir_kernel void @test +; CHECK: %[[LOAD:load.*]] = load i32, ptr addrspace(1) %in +; CHECK: store i32 %[[LOAD]], ptr addrspace(1) %slot diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll new file mode 100644 index 0000000000000..f6e12b7d83615 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %id = call i64 @__mux_get_global_id(i64 0) #2 + %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id + %load = load i32, i32 addrspace(1)* %init_addr + br label %loop + +loop: + %index = phi i64 [0, %entry], [%inc, %loop] + %slot = phi i32 addrspace(1)* [%init_addr, %entry], [%inc_addr, %loop] + store i32 %load, i32 addrspace(1)* %slot + %inc_addr = getelementptr inbounds i32, i32 addrspace(1)* %slot, i64 16 + %inc = add i64 %index, 1 + %cmp = icmp ne i64 %inc, 16 + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +declare i64 @__mux_get_global_id(i64) + +; It checks that the stride analysis can tell the store is contiguous through the PHI node. 
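+;
+; A hedged sketch of that reasoning, not part of the original test: the
+; per-work-item pointer recurrence is
+;
+;   slot(0)   = in + gid      ; adjacent work-items differ by one element
+;   slot(n+1) = slot(n) + 16  ; uniform step, identical across lanes
+;
+; so on every iteration the lanes still differ by exactly one i32, which is
+; why a plain <4 x i32> store is expected below rather than a scatter.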
+ +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %[[LD:.+]] = load <4 x i32>, ptr addrspace(1) %init_addr +; CHECK: loop: +; CHECK: store <4 x i32> %[[LD]], ptr addrspace(1) %slot diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll new file mode 100644 index 0000000000000..bc6dc059cb554 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll @@ -0,0 +1,50 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %id = call i64 @__mux_get_global_id(i64 0) #2 + %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id + %load = load i32, i32 addrspace(1)* %init_addr + br label %loop + +loop: + %index = phi i64 [0, %entry], [%inc, %loop] + %slot = phi i32 addrspace(1)* [%inc_addr, %loop], [%init_addr, %entry] + store i32 %load, i32 addrspace(1)* %slot + %inc_addr = getelementptr inbounds i32, i32 addrspace(1)* %slot, i64 16 + %inc = add i64 %index, 1 + %cmp = icmp ne i64 %inc, 16 + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +declare i64 @__mux_get_global_id(i64) + +; It checks that the stride analysis can tell the store is contiguous through the PHI node. +; Same as uniform_loop_contiguous_phi1.ll except with the PHI node incoming values reversed. + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %[[LD:.+]] = load <4 x i32>, ptr addrspace(1) %init_addr +; CHECK: loop: +; CHECK: store <4 x i32> %[[LD]], ptr addrspace(1) %slot diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll new file mode 100644 index 0000000000000..4baf7d5791f7b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll @@ -0,0 +1,51 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %id = call i64 @__mux_get_global_id(i64 0) #2 + %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id + %load = load i32, i32 addrspace(1)* %init_addr + br label %loop + +loop: + %count = phi i64 [0, %entry], [%inc, %loop] + %index = phi i64 [%id, %entry], [%inc_index, %loop] + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %index + store i32 %load, i32 addrspace(1)* %slot + %inc_index = add i64 %index, 16 + %inc = add i64 %count, 1 + %cmp = icmp ne i64 %inc, 16 + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +declare i64 @__mux_get_global_id(i64) + +; It checks that the stride analysis can tell the store is contiguous through the PHI node. +; Same as uniform_loop_contiguous_phi1.ll except with the index GEP inside the loop. + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %[[LD:.+]] = load <4 x i32>, ptr addrspace(1) %init_addr +; CHECK: loop: +; CHECK: store <4 x i32> %[[LD]], ptr addrspace(1) %slot diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll new file mode 100644 index 0000000000000..33033bd0d9518 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll @@ -0,0 +1,51 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %id = call i64 @__mux_get_global_id(i64 0) #2 + %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id + %load = load i32, i32 addrspace(1)* %init_addr + br label %loop + +loop: + %count = phi i64 [0, %entry], [%inc, %loop] + %index = phi i64 [%inc_index, %loop], [%id, %entry] + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %index + store i32 %load, i32 addrspace(1)* %slot + %inc_index = add i64 %index, 16 + %inc = add i64 %count, 1 + %cmp = icmp ne i64 %inc, 16 + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +declare i64 @__mux_get_global_id(i64) + +; It checks that the stride analysis can tell the store is contiguous through the PHI node. +; Same as uniform_loop_contiguous_phi3.ll except with the PHI node incoming values reversed. + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %[[LD:.+]] = load <4 x i32>, ptr addrspace(1) %init_addr +; CHECK: loop: +; CHECK: store <4 x i32> %[[LD]], ptr addrspace(1) %slot diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll new file mode 100644 index 0000000000000..ac8cb69ee5fc5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll @@ -0,0 +1,50 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare spir_func i32 @__mux_get_local_size(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %size = call i32 @__mux_get_local_size(i32 0) + br label %loop + +loop: + %index = phi i32 [0, %entry], [%inc, %loop] + %load = load i32, i32 addrspace(1)* %in + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %index + store i32 %load, i32 addrspace(1)* %slot + %inc = add i32 %index, 1 + %cmp = icmp ne i32 %inc, %size + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +; CHECK: define spir_kernel void @test(ptr addrspace(1) %in) !codeplay_ca_vecz.base !0 +; CHECK: entry: +; CHECK: loop: +; CHECK: define spir_kernel void @__vecz_v4_test(ptr addrspace(1) %in) #0 !codeplay_ca_vecz.derived !2 +; CHECK: entry: +; CHECK: loop: +; CHECK: !0 = !{!1, ptr @__vecz_v4_test} +; CHECK: !1 = !{i32 4, i32 0, i32 0, i32 0} +; CHECK: !2 = !{!1, ptr @test} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll new file mode 100644 index 0000000000000..6ce5f1cfc7ce4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll @@ -0,0 +1,58 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k uniform_reassociation -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @uniform_reassociation(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 { +entry: + %x = call i64 @__mux_get_global_id(i32 0) #2 + %y = call i64 @__mux_get_global_id(i32 1) #2 + %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x + %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %y + %c_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %y + %varying = load i32, i32 addrspace(1)* %a_gep + %uniform1 = load i32, i32 addrspace(1)* %b_gep + %uniform2 = load i32, i32 addrspace(1)* %c_gep + %vu = add i32 %varying, %uniform1 + %vuu = add i32 %vu, %uniform2 + %d_gep = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %x + store i32 %vuu, i32 addrspace(1)* %d_gep + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This test checks that a sum of a varying value with two uniform values +; gets re-associated from (Varying + Uniform) + Uniform +; to Varying + (Uniform + Uniform) +; CHECK: define spir_kernel void @__vecz_v4_uniform_reassociation +; CHECK: load + +; Ensure the two uniforms are added together directly +; CHECK: %[[REASSOC:.+]] = add i32 %uniform1, %uniform2 + +; Ensure there is only one vector splat +; CHECK: %[[SPLATINS:.+]] = insertelement <4 x i32> poison, i32 %[[REASSOC]], {{(i32|i64)}} 0 +; CHECK-NOT: insertelement <4 x i32> poison, i32 %{{.+}}, {{(i32|i64)}} 0 + +; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLATINS]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: %[[RESULT:.+]] = add <4 x i32> %{{.*}}, %[[SPLAT]] +; CHECK: store <4 x i32> %vuu{{.*}}, ptr addrspace(1) %{{.+}} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll new file mode 100644 index 0000000000000..1315a92a7a9d3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll @@ -0,0 +1,59 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k uniform_reassociation -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'Unknown buffer'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @uniform_reassociation(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 {
+entry:
+  %x = call i64 @__mux_get_global_id(i32 0) #2
+  %y = call i64 @__mux_get_global_id(i32 1) #2
+  %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x
+  %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %x
+  %c_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %y
+  %varying1 = load i32, i32 addrspace(1)* %a_gep
+  %varying2 = load i32, i32 addrspace(1)* %b_gep
+  %uniform = load i32, i32 addrspace(1)* %c_gep
+  %vu = add i32 %varying1, %uniform
+  %vvu = add i32 %vu, %varying2
+  %d_gep = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %x
+  store i32 %vvu, i32 addrspace(1)* %d_gep
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)

+; This test checks that a sum of two varying values with a uniform value
+; gets re-associated from (Varying + Uniform) + Varying
+; to (Varying + Varying) + Uniform
+; CHECK: define spir_kernel void @__vecz_v4_uniform_reassociation
+
+; CHECK: %[[VARYING1:.+]] = load <4 x i32>
+; CHECK: %[[VARYING2:.+]] = load <4 x i32>
+
+; The splat of the uniform value
+; CHECK: %uniform = load
+; CHECK: %[[SPLATINS:.+]] = insertelement <4 x i32> poison, i32 %uniform, {{(i32|i64)}} 0
+; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLATINS]], <4 x i32> poison, <4 x i32> zeroinitializer
+
+; Ensure the two varyings are added together directly
+; CHECK: %[[REASSOC:.+]] = add <4 x i32> %[[VARYING1]], %[[VARYING2]]
+; CHECK: %[[VVU:.+]] = add <4 x i32> %{{.*}}, %[[SPLAT]]
+; CHECK: store <4 x i32> %[[VVU]], ptr addrspace(1) %{{.+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
new file mode 100644
index 0000000000000..10dab1c06440e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
@@ -0,0 +1,59 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k uniform_reassociation -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'Unknown buffer'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @uniform_reassociation(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 {
+entry:
+  %x = call i64 @__mux_get_global_id(i32 0) #2
+  %y = call i64 @__mux_get_global_id(i32 1) #2
+  %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x
+  %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %x
+  %c_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %y
+  %varying1 = load i32, i32 addrspace(1)* %a_gep
+  %varying2 = load i32, i32 addrspace(1)* %b_gep
+  %uniform = load i32, i32 addrspace(1)* %c_gep
+  %vu = add i32 %varying1, %uniform
+  %vvu = add i32 %varying2, %vu
+  %d_gep = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %x
+  store i32 %vvu, i32 addrspace(1)* %d_gep
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)

+; This test checks that a sum of two varying values with a uniform value
+; gets re-associated from Varying + (Varying + Uniform)
+; to (Varying + Varying) + Uniform
+; CHECK: define spir_kernel void @__vecz_v4_uniform_reassociation
+
+; CHECK: %[[VARYING1:.+]] = load <4 x i32>
+; CHECK: %[[VARYING2:.+]] = load <4 x i32>
+
+; The splat of the uniform value
+; CHECK: %uniform = load
+; CHECK: %[[SPLATINS:.+]] = insertelement <4 x i32> poison, i32 %uniform, {{(i32|i64)}} 0
+; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLATINS]], <4 x i32> poison, <4 x i32> zeroinitializer
+
+; Ensure the two varyings are added together directly
+; CHECK: %[[REASSOC:.+]] = add <4 x i32> %[[VARYING1]], %[[VARYING2]]
+; CHECK: %[[VVU:.+]] = add <4 x i32> %{{.*}}, %[[SPLAT]]
+; CHECK: store <4 x i32> %[[VVU]], ptr addrspace(1) %{{.+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
new file mode 100644
index 0000000000000..e698f17df7339
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
@@ -0,0 +1,67 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k k_controlflow_loop_if -S < %s | FileCheck %s + +; ModuleID = 'test.cl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind uwtable +define void @k_controlflow_loop_if(float* nocapture %out, float* nocapture readonly %in1, i32* nocapture readnone %in2) #0 { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #2 + %sext = shl i64 %call, 32 + %idxprom = ashr exact i64 %sext, 32 + %arrayidx = getelementptr inbounds float, float* %in1, i64 %idxprom + %0 = bitcast float* %arrayidx to i32* + %1 = load i32, i32* %0, align 4, !tbaa !7 + %arrayidx2 = getelementptr inbounds float, float* %out, i64 %idxprom + %2 = bitcast float* %arrayidx2 to i32* + store i32 %1, i32* %2, align 4, !tbaa !7 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (float*, float*, i32*)* @k_controlflow_loop_if, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 0, i32 0, i32 0} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"float*", !"float*", !"int*"} +!4 = !{!"kernel_arg_base_type", !"float*", !"float*", !"int*"} +!5 = !{!"kernel_arg_type_qual", !"", !"", !""} +!6 = !{!"clang version 3.8.0 "} +!7 = !{!8, !8, i64 0} +!8 = !{!"float", !9, i64 0} +!9 = !{!"omnipotent char", !10, i64 0} +!10 = !{!"Simple C/C++ TBAA"} + +; The vectorized function +; CHECK: define void @__vecz_v[[WIDTH:[0-9]+]]_k_controlflow_loop_if( + +; The unmangled __mux_get_global_id call +; CHECK: tail call i64 @__mux_get_global_id(i32 0) + +; The vectorized loads and stores +; CHECK: load <4 x i32>, ptr %arrayidx, align 4 +; CHECK: store <4 x i32> %0, ptr %arrayidx2, align 4 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll new file mode 100644 index 0000000000000..ccc581108605a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll @@ -0,0 +1,113 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k entry -w 2 -vecz-handle-declaration-only-calls -vecz-passes=cfg-convert,packetizer -S < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir64-unknown-unknown" + +@.str.1 = private unnamed_addr addrspace(2) constant [10 x i8] c"Test %ld\0A\00", align 1 +@.str.2 = private unnamed_addr addrspace(2) constant [6 x i8] c"Test\0A\00", align 1 + +define spir_kernel void @entry(i64* %input, i64* %output) { +entry: + %gid = call i64 @__mux_get_local_id(i32 0) + %i1ptr = getelementptr i64, i64* %output, i64 %gid + call void @__mux_mem_barrier(i32 2, i32 264) + %ii = call i64 @functionD(i64* %input) + %ib = trunc i64 %ii to i1 + call void @functionA(i64* %i1ptr, i1 %ib) + %i1 = load i64, i64* %i1ptr + %i2ptr = getelementptr i64, i64* %input, i64 %gid + %i2 = load i64, i64* %i2ptr + %cond = icmp eq i64 %i1, %i2 + br i1 %cond, label %middle, label %end + +middle: + %ci3ptr = getelementptr i64, i64* %output, i64 %gid + %ci3 = load i64, i64* %ci3ptr + %fc = call i64 @functionB(i64* %ci3ptr, i64 %ci3, i32 16, i1 false) + %call2 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str.1, i64 0, i64 0), i64 %ci3) + br label %end + +end: + %rr = phi i64 [42, %entry], [%fc, %middle] + call void @functionC(i64 %rr) + %nah = call i64 @functionB(i64* %i2ptr, i64 %rr, i32 8, i1 true) + %call3 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(2)* @.str.2, i64 0, i64 0)) + ret void +} + +declare void @functionA(i64*, i1) + +declare i64 @functionB(i64*, i64, i32, i1) + +declare void @functionC(i64) + +define i64 @functionD(i64* %input) { +entry: + %r = load i64, i64* %input + ret i64 %r +} + +declare void @__mux_mem_barrier(i32, i32) + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) 
+
+declare i64 @__mux_get_local_id(i32)
+
+; CHECK: define spir_kernel void @__vecz_v[[WIDTH:[0-9]+]]_entry
+; CHECK: entry:
+; Check that we didn't mask the __mux_get_local_id call
+; CHECK: %gid = call i64 @__mux_get_local_id(i32 0)
+; Check that we didn't mask the mem_fence call
+; CHECK: call void @__mux_mem_barrier(i32 2, i32 264)
+; Check that we instantiated functionA without a mask
+; CHECK: call void @functionA(ptr {{.+}}, i1 %ib)
+; CHECK: call void @functionA(ptr {{.+}}, i1 %ib)
+
+; Get the condition -- this also works as a sanity check for the test
+; CHECK: [[COND:%cond.*]] = icmp eq <[[WIDTH]] x i64>
+
+; Check that we instantiated functionB with a mask
+; CHECK: [[COND1:%[0-9]+]] = extractelement <[[WIDTH]] x i1> [[COND]], {{(i32|i64)}} 0
+; CHECK: [[COND2:%[0-9]+]] = extractelement <[[WIDTH]] x i1> [[COND]], {{(i32|i64)}} 1
+; CHECK: {{.+}} = call i64 @__vecz_b_masked_functionB(ptr {{(nonnull )?}}{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 16, i1 false, i1 [[COND1]])
+; CHECK: {{.+}} = call i64 @__vecz_b_masked_functionB(ptr {{(nonnull )?}}{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 16, i1 false, i1 [[COND2]])
+; CHECK: call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2mb(ptr addrspace(2) @.str.1, i64 {{%[0-9]+}}, i1 [[COND1]])
+; CHECK: call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2mb(ptr addrspace(2) @.str.1, i64 {{%[0-9]+}}, i1 [[COND2]])
+
+; The following lines check the generated masked version of functionB
+; CHECK: define private i64 @__vecz_b_masked_functionB(ptr{{( %0)?}}, i64{{( %1)?}}, i32{{( %2)?}}, i1{{( %3)?}}, i1{{( %4)?}}) {
+; CHECK: entry:
+; CHECK: br i1 %4, label %active, label %exit
+; CHECK: active:
+; CHECK: [[RES:%[0-9]+]] = call i64 @functionB(ptr {{(nonnull )?}}%0, i64 %1, i32 %2, i1 %3)
+; CHECK: br label %exit
+; CHECK: exit:
+; CHECK: [[RET:%[0-9]+]] = phi i64 [ [[RES]], %active ], [ 0, %entry ]
+; CHECK: ret i64 [[RET]]
+
+; The following lines check the generated masked version of printf
+; CHECK: define private spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2mb(ptr addrspace(2){{( %0)?}}, i64{{( %1)?}}, i1{{( %2)?}}) {
+; CHECK: entry:
+; CHECK: br i1 %2, label %active, label %exit
+; CHECK: active:
+; CHECK: [[RES:%[0-9]+]] = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) %0, i64 %1)
+; CHECK: br label %exit
+; CHECK: exit:
+; CHECK: [[RET:%[0-9]+]] = phi i32 [ [[RES]], %active ], [ 0, %entry ]
+; CHECK: ret i32 [[RET]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
new file mode 100644
index 0000000000000..2f68a9297f6b5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
@@ -0,0 +1,86 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k varying_load1 -vecz-passes=cfg-convert -S < %s | FileCheck %s

+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"

+; Function Attrs: convergent nounwind
+define spir_kernel void @varying_load1(i32 addrspace(1)* %out, i32 %n, i32 addrspace(1)* %meta) #0 {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0) #2
+ %conv = trunc i64 %call to i32
+ %cmp = icmp slt i32 %conv, 11
+ br i1 %cmp, label %if.then, label %if.end16

+if.then: ; preds = %entry
+ %0 = load i32, i32 addrspace(1)* %meta, align 4
+ %cmp2 = icmp eq i32 %0, 0
+ br i1 %cmp2, label %if.then4, label %if.end

+if.then4: ; preds = %if.then
+ %mul5 = mul nsw i32 %conv, %n
+ %1 = icmp eq i32 %mul5, -2147483648
+ %2 = icmp eq i32 %n, -1
+ %3 = and i1 %2, %1
+ %4 = icmp eq i32 %n, 0
+ %5 = or i1 %4, %3
+ %6 = select i1 %5, i32 1, i32 %n
+ %div6 = sdiv i32 %mul5, %6
+ %add = add nsw i32 %div6, %conv
+ %shl7 = mul i32 %add, 8
+ %add8 = add nsw i32 %shl7, %mul5
+ %shl9 = shl i32 %add8, 3
+ br label %if.end

+if.end: ; preds = %if.then4, %if.then
+ %sum.0 = phi i32 [ %shl9, %if.then4 ], [ %n, %if.then ]
+ %rem1 = and i32 %conv, 1
+ %cmp10 = icmp eq i32 %rem1, 0
+ br i1 %cmp10, label %if.then12, label %if.end16

+if.then12: ; preds = %if.end
+ %7 = load i32, i32 addrspace(1)* %meta, align 4
+ %add13 = add nsw i32 %7, %n
+ %mul14 = mul nsw i32 %add13, %sum.0
+ br label %if.end16

+if.end16: ; preds = %if.end, %if.then12, %entry
+ %ret.1 = phi i32 [ 0, %entry ], [ %mul14, %if.then12 ], [ 0, %if.end ]
+ %idxprom = sext i32 %conv to i64
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+ store i32 %ret.1, i32 addrspace(1)* %arrayidx, align 4
+ ret void
+}

+; Function Attrs: convergent nounwind readonly
+declare i64 @__mux_get_global_id(i32) #1

+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }

+; The purpose of this test is to make sure that if a condition is a use of a
+; uniform load that is control dependent on a varying path, then the load will
+; be considered "mask varying" and so the condition is still uniform.
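+
+; Illustrative OpenCL-C sketch of the control flow above (hypothetical source,
+; not the original kernel):
+;
+;   if (gid < 11) {                  // varying condition
+;     if (meta[0] == 0) { ... }      // uniform load, reached on a varying path
+;     if ((gid & 1) == 0) { ... }
+;   }
+;
+; Not every work-item reaches the load of meta[0], so it must become a masked
+; load, but the loaded value is identical for all active work-items, so the
+; condition computed from it can remain uniform.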
+ +; CHECK: spir_kernel void @__vecz_v4_varying_load1 +; CHECK: if.then: +; CHECK: %{{.+}} = call i32 @__vecz_b_masked_load4 +; CHECK: br i1 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll new file mode 100644 index 0000000000000..5a90f9cdf0b55 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll @@ -0,0 +1,89 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k varying_load2 -vecz-passes=cfg-convert -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +source_filename = "kernel.opencl" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @varying_load2(i32 addrspace(1)* %input, i32 addrspace(1)* %out) #0 { +entry: + %call1 = call i64 @__mux_get_local_size(i32 0) #3 + %call2 = call i64 @__mux_get_local_id(i32 0) #3 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %call2 + %cmp = icmp ne i64 %call2, 0 + br i1 %cmp, label %for.cond.preheader, label %if.end14 + +for.cond.preheader: ; preds = %entry + br label %for.cond + +for.cond: ; preds = %for.cond.preheader, %for.inc + %max.0 = phi i32 [ %max.1, %for.inc ], [ 0, %for.cond.preheader ] + %storemerge = phi i64 [ %inc, %for.inc ], [ 0, %for.cond.preheader ] + %call6 = call i64 @__mux_get_local_size(i32 0) #3 + %cmp7 = icmp ult i64 %storemerge, %call6 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %load1 = load i32, i32 addrspace(1)* %input, align 4 + %cmp9 = icmp ugt i32 %load1, %max.0 + br i1 %cmp9, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %load2 = load i32, i32 addrspace(1)* %input, align 4 + br label %for.inc + +for.inc: ; preds = %if.then, %for.body + %max.1 = phi i32 [ %load2, %if.then ], [ %max.0, %for.body ] + %inc = add i64 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %max.0.lcssa = phi i32 [ %max.0, %for.cond ] + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call1 + store i32 %max.0.lcssa, i32 addrspace(1)* %arrayidx13, align 4 + br label %if.end14 + +if.end14: ; preds = %for.end, %entry + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_local_id(i32) #1 +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_local_size(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" 
"no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent noduplicate "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { convergent nobuiltin nounwind readonly } +attributes #4 = { nounwind } + +; The purpose of this test is to make sure that if a condition is a use of a +; uniform load that is control dependent of a varying path, then the load will +; be considered "mask varying" and so the condition is still uniform. + +; CHECK: spir_kernel void @__vecz_v4_varying_load2 +; CHECK: for.body: +; CHECK: %{{.+}} = call i32 @__vecz_b_masked_load4 +; CHECK: br i1 +; CHECK: if.then: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll new file mode 100644 index 0000000000000..7755913a779a8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll @@ -0,0 +1,80 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; Test that calls to vector intrinsics are fully scalarized
+; RUN: veczc -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s

+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+define spir_kernel void @fmuladd(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+ %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+ %arrayidx1 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+ %1 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx1, align 32
+ %arrayidx2 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+ %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx2, align 32
+ %arrayidx3 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+ %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx3, align 32
+ %div = fdiv <4 x double> %2, %3
+ %4 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %div)
+ %arrayidx4 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+ %5 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx4, align 32
+ %sub = fsub <4 x double> %5, %4
+ store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx4, align 32
+ ret void
+}

+; CHECK: define spir_kernel void @__vecz_v[[WIDTH:[0-9]+]]_fmuladd(
+; Check that the scalar fmuladd exists
+; CHECK: call double @llvm.fmuladd.f64(
+; Check that the vector fmuladd doesn't exist
+; CHECK-NOT: call double @llvm.fmuladd.v4f64(
+; CHECK: ret void

+define spir_kernel void @fma(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+ %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+ %arrayidx1 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+ %1 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx1, align 32
+ %arrayidx2 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+ %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx2, align 32
+ %arrayidx3 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+ %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx3, align 32
+ %div = fdiv <4 x double> %2, %3
+ %4 = call <4 x double> @llvm.fma.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %div)
+ %arrayidx4 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+ %5 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx4, align 32
+ %sub = fsub <4 x double> %5, %4
+ store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx4, align 32
+ ret void
+}

+; CHECK: define spir_kernel void @__vecz_v[[WIDTH:[0-9]+]]_fma(
+; Check that the scalar fma exists
+; CHECK: call double @llvm.fma.f64(
+; Check that the vector fma doesn't exist
+; CHECK-NOT: call double @llvm.fma.v4f64(
+; CHECK: ret void

+declare i64 @__mux_get_global_id(i32)

+declare <4 x double> @llvm.fma.v4f64(<4
x double>, <4 x double>, <4 x double>) +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll new file mode 100644 index 0000000000000..7d9b0385dbb90 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll @@ -0,0 +1,87 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k vector_loop -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %call, 0 + br i1 %cmp, label %for.end, label %for.cond + +for.cond: ; preds = %entry, %for.body + %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ] + %call1 = call i64 @__mux_get_global_size(i32 0) + %conv = trunc i64 %call1 to i32 + %splat.splatinsert = insertelement <4 x i32> poison, i32 %conv, i32 0 + %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat + %0 = extractelement <4 x i1> %cmp2, i64 0 + br i1 %0, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %3 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom3 = sext i32 %3 to i64 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom3 + store i32 %2, i32 addrspace(1)* %arrayidx4, align 4 + %4 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom5 = sext i32 %4 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom5 + %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %6 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom7 = sext i32 %6 to i64 + %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom7 + store i32 %5, i32 addrspace(1)* %arrayidx8, align 4 + %7 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom9 = sext i32 %7 to i64 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9 + %8 = load i32, i32 addrspace(1)* %arrayidx10, align 4 + %9 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom11 = sext i32 %9 to i64 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 %8, i32 addrspace(1)* %arrayidx12, align 4 + %10 = extractelement 
<4 x i32> %storemerge, i64 3
+ %idxprom13 = sext i32 %10 to i64
+ %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom13
+ %11 = load i32, i32 addrspace(1)* %arrayidx14, align 4
+ %12 = extractelement <4 x i32> %storemerge, i64 3
+ %idxprom15 = sext i32 %12 to i64
+ %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom15
+ store i32 %11, i32 addrspace(1)* %arrayidx16, align 4
+ %inc = add <4 x i32> %storemerge, <i32 1, i32 1, i32 1, i32 1>
+ br label %for.cond

+for.end: ; preds = %entry, %for.cond
+ ret void
+}

+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_global_size(i32)

+; This test checks that a uniform <4 x i32> phi is not scalarized
+; CHECK: define spir_kernel void @__vecz_v4_vector_loop
+; CHECK: %[[STOREMERGE:.+]] = phi <4 x i32> [ %[[INC:.+]], %for.body ], [ zeroinitializer, %entry.ROSCC ]
+; CHECK: %[[INC]] = add <4 x i32> %storemerge, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
new file mode 100644
index 0000000000000..998c283a2f46f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
@@ -0,0 +1,97 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k vector_loop -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %call.trunc = trunc i64 %call to i32 + %call.splatinsert = insertelement <4 x i32> poison, i32 %call.trunc, i32 0 + %call.splat = shufflevector <4 x i32> %call.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + %cmp = icmp eq i64 %call, 0 + br i1 %cmp, label %for.end, label %for.cond + +for.cond: ; preds = %entry, %for.body + %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ] + %call1 = call i64 @__mux_get_global_size(i32 0) + %conv = trunc i64 %call1 to i32 + %splat.splatinsert = insertelement <4 x i32> poison, i32 %conv, i32 0 + %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat + %0 = extractelement <4 x i1> %cmp2, i64 0 + br i1 %0, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %3 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom3 = sext i32 %3 to i64 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom3 + store i32 %2, i32 addrspace(1)* %arrayidx4, align 4 + %4 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom5 = sext i32 %4 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom5 + %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %6 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom7 = sext i32 %6 to i64 + %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom7 + store i32 %5, i32 addrspace(1)* %arrayidx8, align 4 + %7 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom9 = sext i32 %7 to i64 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9 + %8 = load i32, i32 addrspace(1)* %arrayidx10, align 4 + %9 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom11 = sext i32 %9 to i64 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 %8, i32 addrspace(1)* %arrayidx12, align 4 + %10 = extractelement <4 x i32> %storemerge, i64 3 + %idxprom13 = sext i32 %10 to i64 + %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom13 + %11 = load i32, i32 addrspace(1)* %arrayidx14, align 4 + %12 = extractelement <4 x i32> %storemerge, i64 3 + %idxprom15 = sext i32 %12 to i64 + %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom15 + store i32 %11, i32 addrspace(1)* %arrayidx16, align 4 + %inc = add <4 x i32> %storemerge, %call.splat + br label %for.cond + +for.end: ; preds = %entry, %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_global_size(i32) + +; This test checks if a varying <4 x i32> phi is scalarized into 4 i32 phis +; and then re-packetized +; CHECK: define spir_kernel void @__vecz_v4_vector_loop +; CHECK: %[[STOREMERGE1:.+]] = phi <4 x i32> [ zeroinitializer, %entry.ROSCC ], [ %[[INC2:.+]], %for.cond ] +; CHECK: 
%[[STOREMERGE4:.+]] = phi <4 x i32> [ zeroinitializer, %entry.ROSCC ], [ %[[INC5:.+]], %for.cond ] +; CHECK: %[[STOREMERGE6:.+]] = phi <4 x i32> [ zeroinitializer, %entry.ROSCC ], [ %[[INC7:.+]], %for.cond ] +; CHECK: %[[STOREMERGE8:.+]] = phi <4 x i32> [ zeroinitializer, %entry.ROSCC ], [ %[[INC9:.+]], %for.cond ] +; CHECK: %[[INC2]] = add <4 x i32> %[[STOREMERGE1]], [[CALL:.+]] +; CHECK: %[[INC5]] = add <4 x i32> %[[STOREMERGE4]], [[CALL]] +; CHECK: %[[INC7]] = add <4 x i32> %[[STOREMERGE6]], [[CALL]] +; CHECK: %[[INC9]] = add <4 x i32> %[[STOREMERGE8]], [[CALL]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll new file mode 100644 index 0000000000000..5582091b8ccd5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll @@ -0,0 +1,92 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [10 x i8] c"%#4v4hho\0A\00", align 1 +@.str32 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v32hho\0A\00", align 1 +@.str64 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v64hho\0A\00", align 1 +@.strfv = private unnamed_addr addrspace(2) constant [11 x i8] c"%#16v2hlA\0A\00", align 1 + +; Function Attrs: nounwind +define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call + %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call + %1 = load <4 x i8>, <4 x i8>* %arrayidx1, align 4 + %add = add <4 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8>* %out, i64 %call + store <4 x i8> %add, <4 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) 
@printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str, i64 0, i64 0), <4 x i8> %add) + ret void +} + +define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call + %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call + %1 = load <32 x i8>, <32 x i8>* %arrayidx1, align 4 + %add = add <32 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <32 x i8>, <32 x i8>* %out, i64 %call + store <32 x i8> %add, <32 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str32, i64 0, i64 0), <32 x i8> %add) + ret void +} + +define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call + %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call + %1 = load <64 x i8>, <64 x i8>* %arrayidx1, align 4 + %add = add <64 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <64 x i8>, <64 x i8>* %out, i64 %call + store <64 x i8> %add, <64 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str64, i64 0, i64 0), <64 x i8> %add) + ret void +} + +define spir_kernel void @test_float_vectors(<2 x float>* %in) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call + %0 = load <2 x float>, <2 x float>* %arrayidx, align 8 + %mul = fmul <2 x float> %0, %0 + %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.strfv, i64 0, i64 0), <2 x float> %mul) + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) + +; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [29 x i8] c"%#4hho,%#4hho,%#4hho,%#4hho\0A\00" + +; CHECK: define spir_kernel void @__vecz_v4_test( +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll new file mode 100644 index 0000000000000..1e5257625ac75 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll @@ -0,0 +1,92 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test32 -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [10 x i8] c"%#4v4hho\0A\00", align 1 +@.str32 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v32hho\0A\00", align 1 +@.str64 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v64hho\0A\00", align 1 +@.strfv = private unnamed_addr addrspace(2) constant [11 x i8] c"%#16v2hlA\0A\00", align 1 + +; Function Attrs: nounwind +define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call + %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call + %1 = load <4 x i8>, <4 x i8>* %arrayidx1, align 4 + %add = add <4 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8>* %out, i64 %call + store <4 x i8> %add, <4 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str, i64 0, i64 0), <4 x i8> %add) + ret void +} + +define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call + %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call + %1 = load <32 x i8>, <32 x i8>* %arrayidx1, align 4 + %add = add <32 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <32 x i8>, <32 x i8>* %out, i64 %call + store <32 x i8> %add, <32 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str32, i64 0, i64 0), <32 x i8> %add) + ret void +} + +define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call + %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call + %1 = load <64 x i8>, <64 x i8>* %arrayidx1, align 4 + %add = add <64 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <64 x i8>, <64 x i8>* %out, i64 %call + store <64 x i8> %add, <64 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) 
@printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str64, i64 0, i64 0), <64 x i8> %add) + ret void +} + +define spir_kernel void @test_float_vectors(<2 x float>* %in) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call + %0 = load <2 x float>, <2 x float>* %arrayidx, align 8 + %mul = fmul <2 x float> %0, %0 + %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.strfv, i64 0, i64 0), <2 x float> %mul) + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) + +; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [225 x i8] c"%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho\0A\00" + +; CHECK: define spir_kernel void @__vecz_v4_test32( +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) 
@printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll new file mode 100644 index 0000000000000..d63db033b2971 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll @@ -0,0 +1,92 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test64 -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [10 x i8] c"%#4v4hho\0A\00", align 1 +@.str32 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v32hho\0A\00", align 1 +@.str64 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v64hho\0A\00", align 1 +@.strfv = private unnamed_addr addrspace(2) constant [11 x i8] c"%#16v2hlA\0A\00", align 1 + +; Function Attrs: nounwind +define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call + %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call + %1 = load <4 x i8>, <4 x i8>* %arrayidx1, align 4 + %add = add <4 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8>* %out, i64 %call + store <4 x i8> %add, <4 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str, i64 0, i64 0), <4 x i8> %add) + ret void +} + +define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call + %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call + %1 = load <32 x i8>, <32 x i8>* %arrayidx1, align 4 + %add = add <32 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <32 x i8>, <32 x i8>* %out, i64 %call + store <32 x i8> %add, <32 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) 
@printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str32, i64 0, i64 0), <32 x i8> %add) + ret void +} + +define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call + %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call + %1 = load <64 x i8>, <64 x i8>* %arrayidx1, align 4 + %add = add <64 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <64 x i8>, <64 x i8>* %out, i64 %call + store <64 x i8> %add, <64 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str64, i64 0, i64 0), <64 x i8> %add) + ret void +} + +define spir_kernel void @test_float_vectors(<2 x float>* %in) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call + %0 = load <2 x float>, <2 x float>* %arrayidx, align 8 + %mul = fmul <2 x float> %0, %0 + %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.strfv, i64 0, i64 0), <2 x float> %mul) + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) + +; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [449 x i8] c"%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho\0A\00" + +; CHECK: define spir_kernel void @__vecz_v4_test64(ptr %out, ptr %in1, ptr %in2) +; CHECK: %call465130 = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: %call465131 = call spir_func i32 (ptr addrspace(2), ...) 
@printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: %call465132 = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: %call465133 = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_def.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_def.ll new file mode 100644 index 0000000000000..a426c804c1fe1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_def.ll @@ -0,0 +1,43 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [4 x i8] c"%p\0A\00", align 1 + +define spir_kernel void @test() { +entry: + %gid = call spir_func i64 @__mux_get_global_id(i32 0) + %printf = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @.str, i64 %gid) + ret void +} + +declare spir_func i64 @__mux_get_global_id(i32) + +define spir_func i32 @printf(ptr, ...) { + ret i32 0 +} + +; CHECK: define spir_kernel void @__vecz_v4_test( +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @.str, i64 +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @.str, i64 +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @.str, i64 +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @.str, i64 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll new file mode 100644 index 0000000000000..82b5926f3d280 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll @@ -0,0 +1,102 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_float_vectors -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [10 x i8] c"%#4v4hho\0A\00", align 1 +@.str32 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v32hho\0A\00", align 1 +@.str64 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v64hho\0A\00", align 1 +@.strfv = private unnamed_addr addrspace(2) constant [11 x i8] c"%#16v2hlA\0A\00", align 1 + +; Function Attrs: nounwind +define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call + %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call + %1 = load <4 x i8>, <4 x i8>* %arrayidx1, align 4 + %add = add <4 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8>* %out, i64 %call + store <4 x i8> %add, <4 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) 
@printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str, i64 0, i64 0), <4 x i8> %add) + ret void +} + +define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call + %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call + %1 = load <32 x i8>, <32 x i8>* %arrayidx1, align 4 + %add = add <32 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <32 x i8>, <32 x i8>* %out, i64 %call + store <32 x i8> %add, <32 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str32, i64 0, i64 0), <32 x i8> %add) + ret void +} + +define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call + %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call + %1 = load <64 x i8>, <64 x i8>* %arrayidx1, align 4 + %add = add <64 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <64 x i8>, <64 x i8>* %out, i64 %call + store <64 x i8> %add, <64 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str64, i64 0, i64 0), <64 x i8> %add) + ret void +} + +define spir_kernel void @test_float_vectors(<2 x float>* %in) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call + %0 = load <2 x float>, <2 x float>* %arrayidx, align 8 + %mul = fmul <2 x float> %0, %0 + %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.strfv, i64 0, i64 0), <2 x float> %mul) + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) + +; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [13 x i8] c"%#16A,%#16A\0A\00", align 1 + +; CHECK: define spir_kernel void @__vecz_v4_test_float_vectors +; CHECK: %[[V4:[0-9]+]] = fpext <4 x float> %{{.+}} to <4 x double> +; CHECK: %[[V5:[0-9]+]] = extractelement <4 x double> %[[V4]], {{(i32|i64)}} 0 +; CHECK: %[[V6:[0-9]+]] = extractelement <4 x double> %[[V4]], {{(i32|i64)}} 1 +; CHECK: %[[V7:[0-9]+]] = extractelement <4 x double> %[[V4]], {{(i32|i64)}} 2 +; CHECK: %[[V8:[0-9]+]] = extractelement <4 x double> %[[V4]], {{(i32|i64)}} 3 +; CHECK: %[[V9:[0-9]+]] = fpext <4 x float> %{{.+}} to <4 x double> +; CHECK: %[[V10:[0-9]+]] = extractelement <4 x double> %[[V9]], {{(i32|i64)}} 0 +; CHECK: %[[V11:[0-9]+]] = extractelement <4 x double> %[[V9]], {{(i32|i64)}} 1 +; CHECK: %[[V12:[0-9]+]] = extractelement <4 x double> %[[V9]], {{(i32|i64)}} 2 +; CHECK: %[[V13:[0-9]+]] = extractelement <4 x double> %[[V9]], {{(i32|i64)}} 3 +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V5]], double %[[V10]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V6]], double %[[V11]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) 
@printf(ptr addrspace(2) @[[STR]], double %[[V7]], double %[[V12]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V8]], double %[[V13]]) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll new file mode 100644 index 0000000000000..d2010a9e95b6b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll @@ -0,0 +1,100 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_float_vectors -vecz-simd-width=4 -vecz-double-support=false -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [10 x i8] c"%#4v4hho\0A\00", align 1 +@.str32 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v32hho\0A\00", align 1 +@.str64 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v64hho\0A\00", align 1 +@.strfv = private unnamed_addr addrspace(2) constant [11 x i8] c"%#16v2hlA\0A\00", align 1 + +; Function Attrs: nounwind +define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call + %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call + %1 = load <4 x i8>, <4 x i8>* %arrayidx1, align 4 + %add = add <4 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8>* %out, i64 %call + store <4 x i8> %add, <4 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str, i64 0, i64 0), <4 x i8> %add) + ret void +} + +define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call + %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call + %1 = load <32 x i8>, <32 x i8>* %arrayidx1, align 4 + %add = add <32 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <32 x i8>, <32 x i8>* %out, i64 %call + store <32 x i8> %add, <32 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) 
@printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str32, i64 0, i64 0), <32 x i8> %add) + ret void +} + +define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call + %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call + %1 = load <64 x i8>, <64 x i8>* %arrayidx1, align 4 + %add = add <64 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <64 x i8>, <64 x i8>* %out, i64 %call + store <64 x i8> %add, <64 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str64, i64 0, i64 0), <64 x i8> %add) + ret void +} + +define spir_kernel void @test_float_vectors(<2 x float>* %in) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call + %0 = load <2 x float>, <2 x float>* %arrayidx, align 8 + %mul = fmul <2 x float> %0, %0 + %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.strfv, i64 0, i64 0), <2 x float> %mul) + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) + +; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [13 x i8] c"%#16A,%#16A\0A\00", align 1 + +; CHECK: define spir_kernel void @__vecz_v4_test_float_vectors +; CHECK: %[[V5:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 0 +; CHECK: %[[V6:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 1 +; CHECK: %[[V7:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 2 +; CHECK: %[[V8:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 3 +; CHECK: %[[V10:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 0 +; CHECK: %[[V11:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 1 +; CHECK: %[[V12:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 2 +; CHECK: %[[V13:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 3 +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], float %[[V5]], float %[[V10]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], float %[[V6]], float %[[V11]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], float %[[V7]], float %[[V12]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], float %[[V8]], float %[[V13]]) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_size_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_size_1.ll new file mode 100644 index 0000000000000..0a121a27a795d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_size_1.ll @@ -0,0 +1,38 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(ptr %src, ptr %dst) { +entry: + %lid = tail call i32 @__mux_get_sub_group_local_id() + %lid.i64 = zext i32 %lid to i64 + %src.i = getelementptr i64, ptr %src, i64 %lid.i64 + %val = load <1 x i64>, ptr %src.i, align 8 + %vec = shufflevector <1 x i64> %val, <1 x i64> zeroinitializer, <8 x i32> zeroinitializer + %dst.i = getelementptr <8 x i64>, ptr %dst, i64 %lid.i64 + store <8 x i64> %vec, ptr %dst.i, align 16 + ret void +} + +; CHECK-LABEL: define spir_kernel void @test +; CHECK-LABEL: define spir_kernel void @__vecz_v4_test + +declare i32 @__mux_get_sub_group_local_id() diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll new file mode 100644 index 0000000000000..813dcfe9cc94a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll @@ -0,0 +1,154 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k blend_div_loop -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @blend_div_loop(i8 addrspace(1)* %src1ptr, i32 %src1_step, i32 %src1_offset, i8 addrspace(1)* %dstptr, i32 %dst_step, i32 %dst_offset, i32 %dst_rows, i32 %dst_cols, i8 addrspace(1)* %src2ptr, i32 %src2_step, i32 %src2_offset, i8 addrspace(1)* %src3ptr, i32 %src3_step, i32 %src3_offset, i32 %rowsPerWI) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %call1 = call i64 @__mux_get_global_id(i32 1) #2 + %0 = trunc i64 %call1 to i32 + %conv3 = mul i32 %0, %rowsPerWI + %cmp = icmp slt i32 %conv, %dst_cols + br i1 %cmp, label %if.then, label %if.end62 + +if.then: ; preds = %entry + %call5 = call spir_func i32 @_Z5mad24iii(i32 %conv, i32 1, i32 %src1_offset) #2 + %call6 = call spir_func i32 @_Z5mad24iii(i32 %conv3, i32 %src1_step, i32 %call5) #2 + %call7 = call spir_func i32 @_Z5mad24iii(i32 %conv, i32 1, i32 %dst_offset) #2 + %call8 = call spir_func i32 @_Z5mad24iii(i32 %conv3, i32 %dst_step, i32 %call7) #2 + %call9 = call spir_func i32 @_Z5mad24iii(i32 %conv, i32 1, i32 %src2_offset) #2 + %call10 = call spir_func i32 @_Z5mad24iii(i32 %conv3, i32 %src2_step, i32 %call9) #2 + %call11 = call spir_func i32 @_Z5mad24iii(i32 %conv, i32 1, i32 %src3_offset) #2 + %call12 = call spir_func i32 @_Z5mad24iii(i32 %conv3, i32 %src3_step, i32 %call11) #2 + %add = add nsw i32 %conv3, %rowsPerWI + %call13 = call spir_func i32 @_Z3minii(i32 %dst_rows, i32 %add) #2 + br label %for.cond + +for.cond: ; preds = %for.end54, %if.then + %src1_index.0 = phi i32 [ %call6, %if.then ], [ %add59, %for.end54 ] + %dst_index.0 = phi i32 [ %call8, %if.then ], [ %add60, %for.end54 ] + %src2_index.0 = phi i32 [ %call10, %if.then ], [ %add55, %for.end54 ] + %src3_index.0 = phi i32 [ %call12, %if.then ], [ %add56, %for.end54 ] + %y.0 = phi i32 [ %conv3, %if.then ], [ %inc58, %for.end54 ] + %cmp14 = icmp slt i32 %y.0, %call13 + br i1 %cmp14, label %for.body, label %if.end62 + +for.body: ; preds = %for.cond + %idx.ext = sext i32 %src1_index.0 to i64 + %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %src1ptr, i64 %idx.ext + %idx.ext16 = sext i32 %dst_index.0 to i64 + %add.ptr17 = getelementptr inbounds i8, i8 addrspace(1)* %dstptr, i64 %idx.ext16 + %idx.ext18 = sext i32 %src2_index.0 to i64 + %add.ptr19 = getelementptr inbounds i8, i8 addrspace(1)* %src2ptr, i64 %idx.ext18 + %idx.ext20 = sext i32 %src3_index.0 to i64 + %add.ptr21 = getelementptr inbounds i8, i8 addrspace(1)* %src3ptr, i64 %idx.ext20 + br label %for.cond22 + +for.cond22: ; preds = %for.inc49, %for.body + %src1.0 = phi i8 addrspace(1)* [ %add.ptr, %for.body ], [ %add.ptr51, %for.inc49 ] + %src2.0 = phi i8 addrspace(1)* [ %add.ptr19, %for.body ], [ %add.ptr52, %for.inc49 ] + %src3.0 = phi i8 addrspace(1)* [ %add.ptr21, %for.body ], [ %add.ptr53, %for.inc49 ] + %px.0 = phi i32 [ 0, %for.body ], [ %inc50, %for.inc49 ] + %cmp23 = icmp eq i32 %px.0, 0 + br i1 %cmp23, label %for.body25, label %for.end54 + +for.body25: ; preds = %for.cond22 + %1 = zext i32 %px.0 to i64 + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr17, i64 %1 + store i8 -1, i8 addrspace(1)* 
%arrayidx, align 1 + br label %for.cond26 + +for.cond26: ; preds = %for.inc, %for.body25 + %storemerge = phi i32 [ 0, %for.body25 ], [ %inc, %for.inc ] + %cmp27 = icmp eq i32 %storemerge, 0 + br i1 %cmp27, label %for.body29, label %for.inc49 + +for.body29: ; preds = %for.cond26 + %2 = zext i32 %storemerge to i64 + %arrayidx31 = getelementptr inbounds i8, i8 addrspace(1)* %src2.0, i64 %2 + %3 = load i8, i8 addrspace(1)* %arrayidx31, align 1 + %4 = zext i32 %storemerge to i64 + %arrayidx34 = getelementptr inbounds i8, i8 addrspace(1)* %src1.0, i64 %4 + %5 = load i8, i8 addrspace(1)* %arrayidx34, align 1 + %cmp36 = icmp ugt i8 %3, %5 + br i1 %cmp36, label %if.then46, label %lor.lhs.false + +lor.lhs.false: ; preds = %for.body29 + %6 = zext i32 %storemerge to i64 + %arrayidx39 = getelementptr inbounds i8, i8 addrspace(1)* %src3.0, i64 %6 + %7 = load i8, i8 addrspace(1)* %arrayidx39, align 1 + %8 = zext i32 %storemerge to i64 + %arrayidx42 = getelementptr inbounds i8, i8 addrspace(1)* %src1.0, i64 %8 + %9 = load i8, i8 addrspace(1)* %arrayidx42, align 1 + %cmp44 = icmp ult i8 %7, %9 + br i1 %cmp44, label %if.then46, label %for.inc + +if.then46: ; preds = %lor.lhs.false, %for.body29 + %10 = zext i32 %px.0 to i64 + %arrayidx48 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr17, i64 %10 + store i8 0, i8 addrspace(1)* %arrayidx48, align 1 + br label %for.inc49 + +for.inc: ; preds = %lor.lhs.false + %inc = add nuw nsw i32 %storemerge, 1 + br label %for.cond26 + +for.inc49: ; preds = %if.then46, %for.cond26 + %inc50 = add nuw nsw i32 %px.0, 1 + %add.ptr51 = getelementptr inbounds i8, i8 addrspace(1)* %src1.0, i64 1 + %add.ptr52 = getelementptr inbounds i8, i8 addrspace(1)* %src2.0, i64 1 + %add.ptr53 = getelementptr inbounds i8, i8 addrspace(1)* %src3.0, i64 1 + br label %for.cond22 + +for.end54: ; preds = %for.cond22 + %add55 = add nsw i32 %src2_index.0, %src2_step + %add56 = add nsw i32 %src3_index.0, %src3_step + %inc58 = add nsw i32 %y.0, 1 + %add59 = add nsw i32 %src1_index.0, %src1_step + %add60 = add nsw i32 %dst_index.0, %dst_step + br label %for.cond + +if.end62: ; preds = %for.cond, %entry + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: convergent nounwind readonly +declare spir_func i32 @_Z5mad24iii(i32, i32, i32) #1 + +; Function Attrs: convergent nounwind readonly +declare spir_func i32 @_Z3minii(i32, i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +; The purpose of this test is to make sure we correctly replace the uses of +; divergent loop update masks outside the 
loop, even in the pure exit. + +; CHECK: spir_kernel void @__vecz_v4_blend_div_loop +; CHECK: for.cond26.pure_exit: +; CHECK: %if.then46.entry_mask{{[0-9]+}} = or i1 %if.then46.loop_exit_mask{{[0-9]+}}.blend, %if.then46.loop_exit_mask.blend diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll new file mode 100644 index 0000000000000..da33e218bbff8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll @@ -0,0 +1,111 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k vecz_scalar_gather_load -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_group_id(i32) + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_local_id(i32) + +; Function Attrs: convergent nounwind +define spir_kernel void @vecz_scalar_gather_load(i32 addrspace(1)* %row_indices, i32 addrspace(1)* %row_blocks, float addrspace(1)* %result) { +entry: + %call1 = call i64 @__mux_get_group_id(i32 0) + %call2 = call i64 @__mux_get_local_id(i32 0) + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %row_blocks, i64 %call1 + %load1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %add1 = add i64 %call1, 1 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %row_blocks, i64 %add1 + %load2 = load i32, i32 addrspace(1)* %arrayidx2, align 4 + br label %for.cond + +for.cond: ; preds = %entry, %for.inc + %storemerge = phi i32 [ %load1, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ult i32 %storemerge, %load2 + br i1 %cmp1, label %if.then1, label %for.end + +if.then1: ; preds = %for.cond + %storemerge.zext = zext i32 %storemerge to i64 + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %row_indices, i64 %storemerge.zext + %load3 = load i32, i32 addrspace(1)* %gep1, align 4 + %sub1 = sub i32 %load3, %load1 + %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %row_indices, i64 %storemerge.zext + %load4 = load i32, i32 addrspace(1)* %gep2, align 4 + %sub2 = sub i32 %load4, %load1 + %cmp2 = icmp ugt i32 %sub2, %sub1 + br i1 %cmp2, label %if.then2, label %if.else2 + +if.then2: ; preds = %if.then1 + %sub1.zext = zext i32 %sub1 to i64 + %gep3 = getelementptr inbounds float, float addrspace (1)* %result, i64 %sub1.zext + %load5 = load float, float addrspace(1)* %gep3, align 4 + br label %if.else2 + +if.else2: ; preds = %if.then1, %if.then2 + %ret = phi float [ %load5, %if.then2 ], [ 0.000000e+00, %if.then1 ] + %cmp3 = icmp eq i64 
%call2, 0
+  br i1 %cmp3, label %if.then3, label %for.inc
+
+if.then3: ; preds = %if.else2
+  %gep4 = getelementptr inbounds float, float addrspace(1)* %result, i64 %call2
+  store float %ret, float addrspace(1)* %gep4, align 4
+  br label %for.inc
+
+for.inc: ; preds = %if.then3, %if.else2
+  %inc = add i32 %storemerge, 1
+  br label %for.cond
+
+for.end: ; preds = %for.cond
+  ret void
+}
+
+; The purpose of this test is to ensure we don't generate a masked load for a
+; load from a uniform address, even where it is in a divergent control path.
+; It used to be the case that such a load would become a masked load during
+; control flow conversion, therefore causing it to become a varying load due
+; to the varying mask. However, since the introduction of the Mask Varying
+; attribute, it is possible to support a Uniform load with a Varying mask, so
+; it is no longer necessary to mark all loads in divergent paths as Varying.
+; The somewhat circuitous upshot of this is that the load no longer gets a
+; mask at all, since it was previously only considered to be in a divergent
+; path on account of another Mask Varying load!
+
+; CHECK: spir_kernel void @__vecz_v4_vecz_scalar_gather_load
+
+; This load depends only on the uniform loop iterator
+; CHECK: if.then1:
+; CHECK: %[[IND:.+]] = phi i32
+; CHECK: %[[ZIND:.+]] = zext i32 %[[IND]] to i64
+; CHECK: %[[GEP1:.+]] = getelementptr inbounds i32, ptr addrspace(1) %row_indices, i64 %[[ZIND]]
+; CHECK: %{{.+}} = load i32, ptr addrspace(1) %[[GEP1]]
+
+; This load depends only on other uniform loads
+; CHECK: if.then2:
+; CHECK-NOT: declare float @__vecz_b_masked_gather_load4_
+; CHECK-NOT: declare float @__vecz_b_masked_load4_
+; CHECK: %[[GEP2:.+]] = getelementptr inbounds float, ptr addrspace(1) %result
+; CHECK: %{{.+}} = load float, ptr addrspace(1) %[[GEP2]]
+
+; The store instruction is definitely in a divergent path, however, so it
+; needs a mask.
+; CHECK: if.then3:
+; CHECK: call void @__vecz_b_masked_store4_f
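+
+; For reference, a rough OpenCL C equivalent of the kernel above (an
+; illustrative reconstruction from the IR, not the original source;
+; row_indices[i] is deliberately read twice, producing the two uniform loads
+; the CHECK lines above refer to):
+;
+;   kernel void vecz_scalar_gather_load(global int *row_indices,
+;                                       global int *row_blocks,
+;                                       global float *result) {
+;     size_t group = get_group_id(0);
+;     size_t lid = get_local_id(0);
+;     for (uint i = row_blocks[group]; i < row_blocks[group + 1]; i++) {
+;       uint a = row_indices[i] - row_blocks[group];
+;       uint b = row_indices[i] - row_blocks[group];
+;       float r = (b > a) ? result[a] : 0.0f;
+;       if (lid == 0) result[lid] = r;
+;     }
+;   }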
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
new file mode 100644
index 0000000000000..d1085569a5207
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
@@ -0,0 +1,83 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k vecz_scalar_interleaved_load -vecz-passes=cfg-convert,packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind readnone
+declare i64 @__mux_get_global_id(i32) #0
+
+define spir_kernel void @vecz_scalar_interleaved_load(float addrspace(1)* %out, i64 %n, float %m) {
+entry:
+  %gid0 = tail call i64 @__mux_get_global_id(i32 0) #0
+  %gid1 = tail call i64 @__mux_get_global_id(i32 1) #0
+  %cmp1 = icmp slt i64 %gid0, %n
+  br i1 %cmp1, label %if.then1, label %end
+
+if.then1: ; preds = %entry
+  %gep1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %gid1
+  %cmp2 = fcmp une float %m, 0.000000e+00
+  br i1 %cmp2, label %if.then2, label %if.else2
+
+if.then2: ; preds = %if.then1
+  %mul1 = mul nsw i64 %gid0, %n
+  %gep2 = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %mul1
+  %cmp3 = icmp slt i64 %gid1, %n
+  %load1 = load float, float addrspace(1)* %gep2, align 4
+  %ie1 = insertelement <4 x float> poison, float %load1, i32 0
+  br i1 %cmp3, label %if.then3, label %if.else3
+
+if.then3: ; preds = %if.then2
+  %load2 = load float, float addrspace(1)* %gep2, align 4
+  br label %if.else3
+
+if.else3: ; preds = %if.then2, %if.then3
+  %phi_load2 = phi float [ %load2, %if.then3 ], [ 0.000000e+00, %if.then2 ]
+  %ie2 = insertelement <4 x float> %ie1, float %phi_load2, i32 1
+  %load3 = load float, float addrspace(1)* %gep2, align 4
+  %ie3 = insertelement <4 x float> %ie2, float %load3, i32 2
+  %load4 = load float, float addrspace(1)* %gep2, align 4
+  %ie4 = insertelement <4 x float> %ie3, float %load4, i32 3
+  br label %if.else2
+
+if.else2: ; preds = %if.else3, %if.then1
+  %ret_vec = phi <4 x float> [ %ie4, %if.else3 ], [ zeroinitializer, %if.then1 ]
+  %ret = extractelement <4 x float> %ret_vec, i32 0
+  %ret_gep = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %gid1
+  store float %ret, float addrspace(1)* %ret_gep, align 4
+  br label %end
+
+end: ; preds = %entry, %if.else2
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+
+; The purpose of this test is to ensure we correctly generate a scalar
+; masked load for a scalar load that has a strided pointer, instead of
+; generating an interleaved masked load for a non-vector load (which is
+; invalid).
+
+; The middle optimizations break this test because, after scalarization,
+; some of the vector elements become dead code and thus an interleaved
+; load is in fact generated (although correctly, in this case).
+
+; CHECK: spir_kernel void @__vecz_v4_vecz_scalar_interleaved_load
+; CHECK: declare float @__vecz_b_masked_load4_fu3ptrU3AS1b
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workgroup_scans.ll
new file mode 100644
index 0000000000000..2496b1b1d675d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workgroup_scans.ll
@@ -0,0 +1,204 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S -vecz-passes=packetizer < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +declare i32 @__mux_work_group_scan_inclusive_add_i32(i32, i32) +declare i64 @__mux_work_group_scan_inclusive_add_i64(i32, i64) +declare float @__mux_work_group_scan_inclusive_fadd_f32(i32, float) + +declare i32 @__mux_work_group_scan_inclusive_smin_i32(i32, i32) +declare i32 @__mux_work_group_scan_inclusive_umin_i32(i32, i32) +declare i32 @__mux_work_group_scan_inclusive_smax_i32(i32, i32) +declare i32 @__mux_work_group_scan_inclusive_umax_i32(i32, i32) +declare float @__mux_work_group_scan_inclusive_fmin_f32(i32, float) +declare float @__mux_work_group_scan_inclusive_fmax_f32(i32, float) + +define spir_kernel void @reduce_scan_incl_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call i32 @__mux_work_group_scan_inclusive_add_i32(i32 0, i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_work_group_scan_exclusive_add_i32(i32 0, i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = add <4 x i32> [[SCAN]], [[SPLAT]] +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call + %0 = load i64, i64 addrspace(1)* %arrayidx, align 4 + %call1 = tail call i64 @__mux_work_group_scan_inclusive_add_i64(i32 0, i64 %0) + %arrayidx2 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call + store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i64( +; CHECK: [[SCAN:%.*]] = call <4 x i64> @__vecz_b_sub_group_scan_inclusive_add_Dv4_m(<4 x i64> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i64 @__mux_work_group_scan_exclusive_add_i64(i32 0, i64 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i64> poison, i64 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[HEAD]], <4 x i64> poison, <4 x i32> zeroinitializer +; 
CHECK: [[FINAL:%.*]] = add <4 x i64> [[SCAN]], [[SPLAT]]
+; CHECK: store <4 x i64> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = tail call float @__mux_work_group_scan_inclusive_fadd_f32(i32 0, float %0)
+  %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+  store float %call1, float addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_f32(
+; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_Dv4_f(<4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.0{{.*}}, <4 x float> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_work_group_scan_exclusive_fadd_f32(i32 0, float [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = fadd <4 x float> [[SCAN]], [[SPLAT]]
+; CHECK: store <4 x float> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = tail call i32 @__mux_work_group_scan_inclusive_smin_i32(i32 0, i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smin_i32(
+; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_Dv4_i(<4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_work_group_scan_exclusive_smin_i32(i32 0, i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]])
+; CHECK: store <4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = tail call i32 @__mux_work_group_scan_inclusive_umin_i32(i32 0, i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umin_i32(
+; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_Dv4_j(<4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_work_group_scan_exclusive_umin_i32(i32 0, i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison,
<4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]]) +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call i32 @__mux_work_group_scan_inclusive_smax_i32(i32 0, i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smax_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_Dv4_i(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_work_group_scan_exclusive_smax_i32(i32 0, i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]]) +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call i32 @__mux_work_group_scan_inclusive_umax_i32(i32 0, i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umax_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_Dv4_j(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_work_group_scan_exclusive_umax_i32(i32 0, i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]]) +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call float @__mux_work_group_scan_inclusive_fmin_f32(i32 0, float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmin_f32( +; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_Dv4_f(<4 x float> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_work_group_scan_exclusive_fmin_f32(i32 0, float [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float 
[[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[SCAN]], <4 x float> [[SPLAT]]) +; CHECK: store <4 x float> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call float @__mux_work_group_scan_inclusive_fmax_f32(i32 0, float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmax_f32( +; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_Dv4_f(<4 x float> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_work_group_scan_exclusive_fmax_f32(i32 0, float [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[SCAN]], <4 x float> [[SPLAT]]) +; CHECK: store <4 x float> [[FINAL]], +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll new file mode 100644 index 0000000000000..3461a335d6845 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll @@ -0,0 +1,104 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dont_mask_workitem_builtins -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_local_id(i32 0) #5 + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %conv, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + fence syncscope("singlethread") acq_rel + %call2 = call i64 @__mux_get_global_id(i32 0) #5 + %conv3 = trunc i64 %call2 to i32 + %idxprom = sext i32 %conv3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 + %idxprom4 = sext i32 %conv3 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom4 + store i32 %0, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.else: ; preds = %entry + %call8 = call i64 @__mux_get_local_size(i32 0) #5 + %call9 = call i64 @__mux_get_group_id(i32 0) #5 + %mul = mul i64 %call9, %call8 + %add = add i64 %mul, %call + %sext = shl i64 %add, 32 + %idxprom11 = ashr exact i64 %sext, 32 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 42, i32 addrspace(1)* %arrayidx12, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_local_id(i32) #1 + +declare i64 @__mux_get_global_id(i32) #1 + +declare i64 @__mux_get_local_size(i32) #1 + +declare i64 @__mux_get_group_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { noinline } +attributes #3 = { argmemonly nounwind } +attributes #4 = { argmemonly nounwind readonly } +attributes #5 = { nobuiltin nounwind } +attributes #6 = { nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(2)*, i32 addrspace(1)*)* @dont_mask_workitem_builtins, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 2, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"int*", !"int*"} +!4 = !{!"kernel_arg_base_type", !"int*", !"int*"} +!5 = !{!"kernel_arg_type_qual", !"const", !""} +!6 = !{!"clang version 3.8.1 "} + +; The vectorized function +; CHECK: define spir_kernel void @__vecz_v[[WIDTH:[0-9]+]]_dont_mask_workitem_builtins( + +; Check if the builtins are still here +; CHECK: call i64 @__mux_get_local_id(i32 0) +; CHECK: call i64 @__mux_get_local_size(i32 0) +; CHECK: call i64 @__mux_get_group_id(i32 0) +; CHECK: fence syncscope("singlethread") acq_rel +; CHECK: call i64 @__mux_get_global_id(i32 0) +; CHECK-NOT: call spir_func i64 @__vecz_b_masked___mux_get_global_id(i32 +; CHECK-NOT: call spir_func i64 @__vecz_b_masked___mux_get_local_size(i32 +; CHECK-NOT: call spir_func i64 @__vecz_b_masked___mux_get_group_id(i32 + +; Function end +; CHECK: ret void + +; Also 
check that we haven't declared the masked functions
+; CHECK-NOT: define private spir_func i64 @__vecz_b_masked___mux_get_global_id(i32, i1)
+; CHECK-NOT: define private spir_func i64 @__vecz_b_masked___mux_get_local_size(i32, i1)
+; CHECK-NOT: define private spir_func i64 @__vecz_b_masked___mux_get_group_id(i32, i1)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/CMakeLists.txt
new file mode 100644
index 0000000000000..921204b382aa0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/CMakeLists.txt
@@ -0,0 +1,14 @@
+llvm_map_components_to_libnames(llvm_libs all ${LLVM_TARGETS_TO_BUILD})
+list(REMOVE_ITEM llvm_libs LTO OptRemarks)
+
+add_llvm_tool(veczc
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/veczc.cpp
+)
+target_compile_options(veczc PRIVATE ${VECZ_COMPILE_OPTIONS})
+target_compile_definitions(veczc PRIVATE ${VECZ_COMPILE_DEFINITIONS})
+target_include_directories(veczc PRIVATE
+  ${CMAKE_CURRENT_SOURCE_DIR}/include
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../compiler_pipeline/include
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../vecz/include
+  )
+target_link_libraries(veczc PUBLIC ${llvm_libs})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
new file mode 100644
index 0000000000000..5c4a4f228db00
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -0,0 +1,465 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <string>
+
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/device_info.h>
+#include <compiler/utils/metadata.h>
+#include <compiler/utils/optimal_builtin_replacement_pass.h>
+#include <compiler/utils/pass_machinery.h>
+#include <compiler/utils/sub_group_analysis.h>
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/Statistic.h>
+#include <llvm/Analysis/CGSCCPassManager.h>
+#include <llvm/Analysis/TargetTransformInfo.h>
+#include <llvm/Bitcode/BitcodeWriterPass.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/PassManager.h>
+#include <llvm/IRPrinter/IRPrintingPasses.h>
+#include <llvm/IRReader/IRReader.h>
+#include <llvm/InitializePasses.h>
+#include <llvm/MC/TargetRegistry.h>
+#include <llvm/PassRegistry.h>
+#include <llvm/Support/CommandLine.h>
+#include <llvm/Support/FileSystem.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/Support/Process.h>
+#include <llvm/Support/SourceMgr.h>
+#include <llvm/Support/TargetSelect.h>
+#include <llvm/Support/ToolOutputFile.h>
+#include <llvm/Target/TargetMachine.h>
+#include <llvm/Target/TargetOptions.h>
+#include <llvm/TargetParser/Triple.h>
+
+#include <multi_llvm/llvm_version.h>
+
+#include "vecz/pass.h"
+#include "vecz/vecz_target_info.h"
+
+static llvm::cl::opt<std::string>
+    InputFilename(llvm::cl::Positional, llvm::cl::desc("<input file>"),
+                  llvm::cl::init("-"));
+
+static llvm::cl::opt<std::string>
+    OutputFilename("o", llvm::cl::desc("Override output filename"),
+                   llvm::cl::value_desc("filename"));
+static llvm::cl::opt<bool>
+    WriteTextual("S", llvm::cl::desc("Write module as text"));
+
+static llvm::cl::list<std::string>
+    KernelNameSpecs("k", llvm::cl::desc("Kernel to vectorize"),
+                    llvm::cl::ZeroOrMore, llvm::cl::value_desc("name"));
+
+static llvm::cl::opt<unsigned>
+    SIMDDimIdx("d", llvm::cl::desc("Dimension index to vectorize on"),
+               llvm::cl::init(0), llvm::cl::value_desc("dimension"));
+
+static llvm::cl::opt<unsigned>
+    SIMDWidth("w", llvm::cl::desc("Width to vectorize to"), llvm::cl::init(0),
+              llvm::cl::value_desc("width"));
+
+static llvm::cl::opt<bool> FailQuietly(
+    "vecz-fail-quietly",
+    llvm::cl::desc("don't return an error code on vectorization failure"));
+
+static llvm::cl::opt<bool>
+    ChoicesHelp("vecz-choices-help",
+                llvm::cl::desc("see information about available choices"));
+
+static llvm::cl::opt<bool>
+    VeczAuto("vecz-auto",
+             llvm::cl::desc("run the vectorizer if it is found to be useful"));
+
+static llvm::cl::opt<unsigned> VeczSimdWidth(
+    "vecz-simd-width",
+    llvm::cl::desc("manually set the SIMD width for the vectorizer"));
+
+static llvm::cl::opt<llvm::cl::boolOrDefault> VeczScalable(
+    "vecz-scalable",
+    llvm::cl::desc("force scalable vectorization for the vectorizer"));
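+
+// Typical invocations look like the following (illustrative; the input files
+// and kernel names are placeholders):
+//
+//   veczc -k add:8 -o out.bc kernel.bc
+//   veczc -w 4 -S -o out.ll kernel.ll
+//   veczc -vecz-simd-width=4 -S < kernel.ll > out.ll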
+
+// Allow the passing of Vecz Choices string on the command line. This is parsed
+// after the choices environment variable, thus overriding it.
+static llvm::cl::opt<std::string>
+    ChoicesString("vecz-choices", llvm::cl::desc("Set vecz choices"));
+
+static llvm::cl::opt<bool>
+    VeczCollectStats("vecz-llvm-stats",
+                     llvm::cl::desc("enable reporting LLVM statistics"));
+
+static llvm::cl::opt<std::string>
+    UserTriple("vecz-target-triple", llvm::cl::desc("the target triple"));
+static llvm::cl::opt<std::string> UserCPU("vecz-target-mcpu",
+                                          llvm::cl::desc("Set the CPU model"));
+static llvm::cl::opt<std::string>
+    CPUFeatures("vecz-target-features",
+                llvm::cl::desc("Set the CPU feature string"));
+static llvm::cl::opt<bool> DoubleSupport(
+    "vecz-double-support", llvm::cl::init(true),
+    llvm::cl::desc(
+        "Assume the target has double-precision floating point support"));
+
+static llvm::cl::list<unsigned>
+    SGSizes("device-sg-sizes",
+            llvm::cl::desc("Comma-separated list of supported sub-group sizes"),
+            llvm::cl::CommaSeparated);
+
+static llvm::TargetMachine *initLLVMTarget(llvm::StringRef triple_string,
+                                           llvm::StringRef cpu_model,
+                                           llvm::StringRef target_features) {
+  const llvm::Triple triple(triple_string);
+  llvm::InitializeAllTargets();
+  llvm::InitializeAllTargetMCs();
+  llvm::InitializeAllAsmPrinters();
+  llvm::InitializeAllAsmParsers();
+  llvm::InitializeAllDisassemblers();
+
+  llvm::TargetOptions opts;
+  opts.DisableIntegratedAS = false;
+  std::string e;
+  const llvm::Target *target =
+      llvm::TargetRegistry::lookupTarget(triple.getTriple(), e);
+  if (!target) {
+    (void)::fprintf(stderr, "can't get target %s:%s\n",
+                    triple.getTriple().c_str(), e.c_str());
+    ::exit(1);
+  }
+  llvm::PassRegistry &registry = *llvm::PassRegistry::getPassRegistry();
+  llvm::initializeAlwaysInlinerLegacyPassPass(registry);
+#if LLVM_VERSION_GREATER_EQUAL(21, 0)
+  return target->createTargetMachine(triple, cpu_model, target_features, opts,
+                                     llvm::Reloc::Model::Static);
+#else
+  return target->createTargetMachine(triple.getTriple(), cpu_model,
+                                     target_features, opts,
+                                     llvm::Reloc::Model::Static);
+#endif
+}
+
+static vecz::VeczPassOptions getDefaultPassOptions() {
+  // Enable/disable Choices from the CODEPLAY_VECZ_CHOICES environment
+  // variable.
+  vecz::VectorizationChoices Choices;
+
+  const char *ptr = std::getenv("CODEPLAY_VECZ_CHOICES");
+  if (ptr && !Choices.parseChoicesString(ptr)) {
+    llvm::errs()
+        << "Failed to parse the CODEPLAY_VECZ_CHOICES env variable.\n"
+           "Use --vecz-choices-help for available choices and usage info.\n";
+    ::exit(1);
+  }
+
+  // Parse the Vecz choices given in the command line
+  const std::string &ch = ChoicesString;
+  if (!ch.empty() && !Choices.parseChoicesString(ch)) {
+    llvm::errs()
+        << "Failed to parse the --vecz-choices command line option.\n"
+           "Use --vecz-choices-help for available choices and usage info.\n";
+    ::exit(1);
+  }
+
+  if (VeczCollectStats) {
+    llvm::EnableStatistics(true);
+  }
+
+  const auto factor = SIMDWidth ? SIMDWidth : 4;
+  auto VF = llvm::ElementCount::get(VeczSimdWidth ? VeczSimdWidth : factor,
+                                    VeczScalable == llvm::cl::BOU_TRUE);
+
+  vecz::VeczPassOptions passOpts;
+  passOpts.choices = Choices;
+  passOpts.factor = VF;
+  passOpts.vecz_auto = VeczAuto;
+  passOpts.vec_dim_idx = SIMDDimIdx;
+  passOpts.local_size = SIMDWidth;
+  return passOpts;
+}
+
+// Parse a command line vectorization specification for a given kernel
+// <spec>        ::= <kernel-name> ':' <option-list>
+// <kernel-name> ::= <identifier>
+// <options>     ::= <width>(opt)<dim>(opt)
+//                   <local-size>(opt)<choices>(opt)
+// <option-list> ::= <options> ',' <option-list>
+// <number>      ::= [0-9]+
+// <identifier>  ::= [a-zA-Z_][a-zA-Z_0-9]+
+// <dim>         ::= '.' [123]
+// <width>       ::= <number>
+// <width>       ::= 'a' // automatic vectorization factor
+// <local-size>  ::= '@' <number>
+// <choices>     ::= 's' // scalable vectorization factor
+// <choices>     ::= 'p' // vector predication
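+//
+// For example (the kernel name 'foo' and the values are illustrative):
+//   -k foo          vectorize 'foo' with the default options
+//   -k foo:8        vectorize by a fixed factor of 8
+//   -k foo:4.2      vectorize by 4 on the dimension selected by '.2'
+//   -k foo:16s      vectorize by a scalable factor of 16
+//   -k foo:a,4@64p  auto-vectorize, plus a 4-wide variant that assumes a
+//                   local size of 64 and enables vector predication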
+static bool
+parsePassOptionsSwitch(const llvm::StringRef spec, llvm::StringRef &name,
+                       llvm::SmallVectorImpl<vecz::VeczPassOptions> &opts) {
+  auto pair = spec.split(':');
+  name = pair.first;
+  auto vals = pair.second;
+  auto defaults = getDefaultPassOptions();
+  if (!name.size()) {
+    return false;
+  }
+  if (!vals.empty()) {
+    do {
+      // HEREBEDRAGONS: The return status of `consumeInteger` and
+      // `consume_front` are "failed" and "succeeded" respectively. It's
+      // opposite day somewhere in llvm land...
+      unsigned vf;
+      auto opt = defaults;
+      if (vals.consume_front("a")) {
+        opt.vecz_auto = true;
+      } else if (!vals.consumeInteger(10, vf)) {
+        opt.factor = llvm::ElementCount::getFixed(vf);
+      }
+      if (vals.consume_front(".")) {
+        unsigned dim;
+        if (vals.consumeInteger(10, dim)) {
+          return false;
+        }
+        if (!dim || dim > 3) {
+          return false;
+        }
+        opt.vec_dim_idx = dim;
+      }
+      if (vals.consume_front("@")) {
+        unsigned simd_width;
+        if (vals.consumeInteger(10, simd_width)) {
+          return false;
+        }
+        opt.local_size = simd_width;
+      }
+      // <choices> ::= 's'
+      if (vals.consume_front("s")) {
+        opt.factor =
+            llvm::ElementCount::getScalable(opt.factor.getKnownMinValue());
+      }
+      // <choices> ::= 'p'
+      if (vals.consume_front("p")) {
+        opt.choices.enableVectorPredication();
+      }
+      opts.push_back(opt);
+    } while (vals.consume_front(",") && !vals.empty());
+    if (!vals.empty()) {
+      return false;
+    }
+  } else {
+    opts.push_back(defaults);
+  }
+  return true;
+}
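+
+// Map from each kernel name given on the command line (or, failing that,
+// each SPIR kernel found in the module) to the vectorization variants
+// requested for it.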
+using KernelOptMap =
+    llvm::SmallDenseMap<llvm::StringRef,
+                        llvm::SmallVector<vecz::VeczPassOptions, 1>, 1>;
+
+int main(const int argc, const char *const argv[]) {
+  llvm::cl::ParseCommandLineOptions(argc, argv);
+
+  if (ChoicesHelp) {
+    const auto &Infos = vecz::VectorizationChoices::queryAvailableChoices();
+    llvm::outs() << "Available Vecz Choices:\n\n";
+    for (const auto &Info : Infos) {
+      llvm::outs() << " * " << Info.name << ":\n";
+      llvm::outs() << "   " << Info.desc << "\n\n";
+    }
+    llvm::outs() << "Separate multiple items with any one of [:;,].\n"
+                    "Prefix any choice with \"no\" to disable that option.\n";
+    return 0;
+  }
+
+  // If the user didn't specify an output filename, but is reading from stdin,
+  // output to stdout. This may be emitting binary, but trust the user to know
+  // what they're doing. We could also emit a warning.
+  if (OutputFilename.empty() && InputFilename == "-") {
+    OutputFilename = "-";
+  }
+
+  if (OutputFilename.empty()) {
+    llvm::errs() << "Error: no output filename was given (use -o <filename>)\n";
+    return 1;
+  }
+
+  llvm::SMDiagnostic err;
+  llvm::LLVMContext context;
+
+  std::unique_ptr<llvm::Module> module =
+      llvm::parseIRFile(InputFilename, err, context);
+
+  if (!module) {
+    auto errorOrInputFile =
+        llvm::MemoryBuffer::getFileOrSTDIN(InputFilename.getValue());
+
+    // If there was an error in getting the input file.
+    if (!errorOrInputFile) {
+      llvm::errs() << "Error: " << errorOrInputFile.getError().message()
+                   << " '" << InputFilename.getValue() << "'\n";
+      return 1;
+    }
+
+    llvm::errs() << "Error: bitcode file was malformed\n";
+    err.print("veczc", llvm::errs(),
+              llvm::sys::Process::StandardErrHasColors());
+    return 1;
+  }
+
+  KernelOptMap kernelOpts;
+  if (KernelNameSpecs.empty()) {
+    auto defaults = getDefaultPassOptions();
+    for (const auto &f : *module) {
+      if (f.getCallingConv() != llvm::CallingConv::SPIR_KERNEL) {
+        continue;
+      }
+      kernelOpts[f.getName()].push_back(defaults);
+    }
+  } else {
+    for (const auto &S : KernelNameSpecs) {
+      llvm::StringRef name;
+      llvm::SmallVector<vecz::VeczPassOptions, 1> opts;
+      if (!parsePassOptionsSwitch(S, name, opts)) {
+        (void)::fprintf(
+            stderr,
+            "failed to parse kernel vectorization specification '%s'\n",
+            name.str().c_str());
+        return 1;
+      }
+      if (!module->getFunction(name)) {
+        llvm::errs() << "Error: no such kernel to vectorize ('" << name
+                     << "')\n";
+        return 1;
+      }
+      kernelOpts[name] = std::move(opts);
+    }
+  }
+
+  // Open the file.
+  std::error_code EC;
+  llvm::sys::fs::OpenFlags OpenFlags = llvm::sys::fs::OF_None;
+  if (WriteTextual) {
+    OpenFlags |= llvm::sys::fs::OF_Text;
+  }
+  auto Out =
+      std::make_unique<llvm::ToolOutputFile>(OutputFilename, EC, OpenFlags);
+  if (EC || !Out) {
+    llvm::errs() << EC.message() << '\n';
+    return 1;
+  }
+
+  std::unique_ptr<llvm::TargetMachine> tm(
+      UserTriple.size() ? initLLVMTarget(UserTriple, UserCPU, CPUFeatures)
+                        : nullptr);
+  assert(!UserTriple.size() || tm);
+  if (tm) {
+#if LLVM_VERSION_GREATER_EQUAL(21, 0)
+    module->setTargetTriple(tm->getTargetTriple());
+#else
+    module->setTargetTriple(tm->getTargetTriple().getTriple());
+#endif
+    module->setDataLayout(tm->createDataLayout());
+  }
+
+  compiler::utils::PassMachinery passMach(context, tm.get());
+
+  auto TICallback = [&](const llvm::Module &) {
+    return vecz::createTargetInfoFromTargetMachine(tm.get());
+  };
+
+  passMach.initializeStart();
+  passMach.getMAM().registerPass(
+      [&] { return vecz::TargetInfoAnalysis(TICallback); });
+  passMach.getMAM().registerPass(
+      [&] { return compiler::utils::BuiltinInfoAnalysis(); });
+  passMach.getMAM().registerPass(
+      [&] { return compiler::utils::SubgroupAnalysis(); });
+  passMach.getFAM().registerPass([] { return llvm::TargetIRAnalysis(); });
+  passMach.getMAM().registerPass([] {
+    compiler::utils::DeviceInfo Info{/*half*/ 0, /*float*/ 0, DoubleSupport,
+                                     /*MaxWorthWidth*/ 64};
+    for (const auto S : SGSizes) {
+      Info.reqd_sub_group_sizes.push_back(S);
+    }
+    return compiler::utils::DeviceInfoAnalysis(Info);
+  });
+  passMach.getMAM().registerPass([&kernelOpts] {
+    return vecz::VeczPassOptionsAnalysis(
+        [&kernelOpts](llvm::Function &F, llvm::ModuleAnalysisManager &,
+                      llvm::SmallVectorImpl<vecz::VeczPassOptions> &Opts) {
+          auto it = kernelOpts.find(F.getName());
+          if (it == kernelOpts.end()) {
+            return false;
+          }
+          Opts.assign(it->second.begin(), it->second.end());
+          return true;
+        });
+  });
+  passMach.initializeFinish();
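+
+  // Build the vectorization pipeline: a builtin-replacement pass over the
+  // call graph first, then the vectorizer itself.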
+  llvm::ModulePassManager PM;
+
+  // Forcibly compute the BuiltinInfoAnalysis so that cached retrievals work.
+  PM.addPass(llvm::RequireAnalysisPass<compiler::utils::BuiltinInfoAnalysis,
+                                       llvm::Module>());
+
+  PM.addPass(llvm::createModuleToPostOrderCGSCCPassAdaptor(
+      compiler::utils::OptimalBuiltinReplacementPass()));
+  PM.addPass(vecz::RunVeczPass());
+  PM.run(*module, passMach.getMAM());
+
+  // If the user has specified a list of kernels to vectorize, we need to
+  // check we've matched their expectations. If they didn't specify, we work
+  // on a "best-effort" basis.
+  if (!KernelNameSpecs.empty()) {
+    for (auto p : kernelOpts) {
+      auto &f = *module->getFunction(p.first);
+      const auto &requested = p.getSecond();
+      llvm::SmallVector<compiler::utils::LinkMetadataResult, 4> results;
+      compiler::utils::parseOrigToVeczFnLinkMetadata(f, results);
+      for (auto &expected : requested) {
+        if (expected.vecz_auto) {
+          continue;
+        }
+        bool found = false;
+        for (auto &result : results) {
+          // FIXME: this is probably not the best way to do this
+          found |= result.second.vf.getKnownMinValue() >=
+                   expected.factor.getKnownMinValue();
+        }
+        if (!found) {
+          llvm::errs() << "Error: Failed to vectorize function '" << f.getName()
+                       << "'\n";
+          return FailQuietly ? 0 : 1;
+        }
+      }
+    }
+  }
+
+  // Write the resulting module.
+  llvm::ModulePassManager printMPM;
+  if (WriteTextual) {
+    printMPM.addPass(llvm::PrintModulePass(Out->os()));
+  } else {
+    printMPM.addPass(llvm::BitcodeWriterPass(Out->os()));
+  }
+  printMPM.run(*module, passMach.getMAM());
+
+  Out->keep();
+
+  if (llvm::AreStatisticsEnabled()) {
+    llvm::PrintStatistics();
+  }
+  return 0;
+}