diff --git a/.github/workflows/sycl-linux-build.yml b/.github/workflows/sycl-linux-build.yml index 33c2269fb360b..fb593ee00b3eb 100644 --- a/.github/workflows/sycl-linux-build.yml +++ b/.github/workflows/sycl-linux-build.yml @@ -202,8 +202,7 @@ jobs: --ci-defaults ${{ inputs.build_configure_extra_args }} \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DLLVM_INSTALL_UTILS=ON \ - -DNATIVECPU_USE_OCK=Off + -DLLVM_INSTALL_UTILS=ON - name: Compile id: build # Emulate default value for manual dispatch as we've run out of available arguments. diff --git a/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt index fb2bf7703ab10..dce987133970b 100644 --- a/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt +++ b/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt @@ -1,3 +1,16 @@ +set(OCK_LIBS) +option(NATIVECPU_USE_OCK "Use the oneAPI Construction Kit for Native CPU" ON) + +# Don't use OCK compiler_passes if Native CPU is not enabled. +if(NOT "native_cpu" IN_LIST SYCL_ENABLE_BACKENDS) + set(NATIVECPU_USE_OCK Off CACHE BOOL "Use the oneAPI Construction Kit for Native CPU" FORCE) +endif() + +if(NATIVECPU_USE_OCK) + add_subdirectory(compiler_passes EXCLUDE_FROM_ALL) + set(OCK_LIBS NativeCPUPipeline NativeCPUVecz) +endif() + add_llvm_component_library(LLVMSYCLNativeCPUUtils PipelineSYCLNativeCPU.cpp PrepareSYCLNativeCPU.cpp @@ -17,80 +30,13 @@ add_llvm_component_library(LLVMSYCLNativeCPUUtils TargetParser TransformUtils ipo - ) + ${OCK_LIBS} +) -option(NATIVECPU_USE_OCK "Use the oneAPI Construction Kit for Native CPU" ON) - -# Don't fetch OCK if Native CPU is not enabled. -if(NOT "native_cpu" IN_LIST SYCL_ENABLE_BACKENDS) - set(NATIVECPU_USE_OCK Off CACHE BOOL "Use the oneAPI Construction Kit for Native CPU" FORCE) -endif() if(NATIVECPU_USE_OCK) - set(OCK_SEARCH_LOC "oneapi-construction-kit/compiler_passes") - if(NOT FETCHCONTENT_SOURCE_DIR_ONEAPI-CK) - find_path(OCK_SOURCE_DIR ${OCK_SEARCH_LOC} PATHS ${CMAKE_PREFIX_PATH}) - endif() - if(OCK_SOURCE_DIR) - message(STATUS "Found system source location of oneAPI Construction Kit in ${OCK_SOURCE_DIR}") - set(OCK_SOURCE_DIR "${OCK_SOURCE_DIR}/${OCK_SEARCH_LOC}") - set(OCK_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/oneapi-construction-kit") - else() - set(OCK_GIT_REPO "https://github.com/uxlfoundation/oneapi-construction-kit.git") - # commit d0a32d701e34b3285de7ce776ea36abfec673df7 - # Merge: a9f848e0e8 56473a8c25 - # Author: Harald van Dijk - # Date: Mon Jun 30 12:24:46 2025 +0100 - # - # Merge pull request #878 from hvdijk/specify-fuse-ld-lld - # - # [RefSi] Explicitly specify -fuse-ld=lld. 
-    set(OCK_GIT_TAG d0a32d701e34b3285de7ce776ea36abfec673df7)
-
-    include(FetchContent)
-    FetchContent_Declare(oneapi-ck
-      GIT_REPOSITORY "${OCK_GIT_REPO}"
-      GIT_TAG "${OCK_GIT_TAG}"
-    )
-    FetchContent_GetProperties(oneapi-ck)
-    if(NOT oneapi-ck_POPULATED)
-      if(FETCHCONTENT_SOURCE_DIR_ONEAPI-CK)
-        message(STATUS "Using specified oneAPI Construction Kit repo location at ${FETCHCONTENT_SOURCE_DIR_ONEAPI-CK}")
-      else()
-        message(STATUS "Cloning oneAPI Construction Kit from ${OCK_GIT_REPO}, tag ${OCK_GIT_TAG}")
-      endif()
-      FetchContent_Populate(oneapi-ck)
-      message(STATUS "oneAPI Construction Kit cloned in ${oneapi-ck_SOURCE_DIR}")
-      set(OCK_SOURCE_DIR ${oneapi-ck_SOURCE_DIR}/compiler_passes)
-      set(OCK_BINARY_DIR ${oneapi-ck_BINARY_DIR})
-    endif()
-  endif()
-
-  set(CA_ENABLE_API "cl" CACHE STRING "" FORCE)
-  add_subdirectory(
-    ${OCK_SOURCE_DIR}
-    ${OCK_BINARY_DIR} EXCLUDE_FROM_ALL)
-
-  install(TARGETS compiler-pipeline
-    EXPORT LLVMExports
-    LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT compiler-pipeline
-    ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT compiler-pipeline
-    RUNTIME DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT compiler-pipeline)
-  set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS compiler-pipeline)
-  install(TARGETS vecz
-    EXPORT LLVMExports
-    LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT vecz
-    ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT vecz
-    RUNTIME DESTINATION lib${LLVM_LIBDIR_SUFFIX} COMPONENT vecz)
-  set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS vecz)
-  install(TARGETS multi_llvm EXPORT LLVMExports)
-  set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS multi_llvm)
   target_compile_definitions(LLVMSYCLNativeCPUUtils PRIVATE  NATIVECPU_USE_OCK)
   target_include_directories(LLVMSYCLNativeCPUUtils PRIVATE
-    ${oneapi-ck_SOURCE_DIR}/modules/compiler/multi_llvm/include
-    ${oneapi-ck_SOURCE_DIR}/modules/cargo/include
-    ${oneapi-ck_SOURCE_DIR}/modules/compiler/vecz/include
-    ${oneapi-ck_SOURCE_DIR}/modules/compiler/utils/include)
-  target_link_libraries(LLVMSYCLNativeCPUUtils PRIVATE compiler-pipeline vecz)
-
+    ${CMAKE_CURRENT_SOURCE_DIR}/compiler_passes/compiler_pipeline/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/compiler_passes/vecz/include)
 endif()
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/CMakeLists.txt
new file mode 100644
index 0000000000000..de47b25e03a30
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/compiler_pipeline)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/vecz)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_passes.rst b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_passes.rst
new file mode 100644
index 0000000000000..cdfe3a9c79034
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_passes.rst
@@ -0,0 +1,63 @@
+Compiler passes
+===============
+
+Introduction
+------------
+
+Files under this directory are integrated from the `oneAPI Construction Kit`_
+using `git-filter-repo`. They are used by Native CPU to help create a pipeline
+that turns a base kernel into something which can be executed across multiple
+work items, including auto-vectorization.
+
+These files are largely from the sub-directories
+**modules/compiler/compiler_pipeline**, **modules/compiler/vecz** and
+**modules/compiler/multi_llvm**. Only files that are used have been integrated,
+and the **CMake** files have been updated to fit in with LLVM components.
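+
+As a rough sketch of how such a pipeline can be composed: the pass class
+names below are inferred from the integrated source file names (for example
+**work_item_loops_pass.cpp**), and the header paths and constructors are
+assumptions; the authoritative wiring lives in **PipelineSYCLNativeCPU.cpp**::
+
+  #include "compiler/utils/builtin_info.h"          // BuiltinInfoAnalysis
+  #include "compiler/utils/prepare_barriers_pass.h" // assumed header name
+  #include "compiler/utils/work_item_loops_pass.h"  // assumed header name
+
+  #include "llvm/IR/PassManager.h"
+  #include "llvm/Passes/PassBuilder.h"
+
+  void runWorkItemPipeline(llvm::Module &M) {
+    llvm::LoopAnalysisManager LAM;
+    llvm::FunctionAnalysisManager FAM;
+    llvm::CGSCCAnalysisManager CGAM;
+    llvm::ModuleAnalysisManager MAM;
+    // The BuiltinInfo result is queried by the barrier/work-item machinery.
+    MAM.registerPass([] { return compiler::utils::BuiltinInfoAnalysis(); });
+
+    llvm::PassBuilder PB;
+    PB.registerModuleAnalyses(MAM);
+    PB.registerCGSCCAnalyses(CGAM);
+    PB.registerFunctionAnalyses(FAM);
+    PB.registerLoopAnalyses(LAM);
+    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+    llvm::ModulePassManager MPM;
+    // Assumed to be default-constructible: make barriers explicit, then
+    // split kernels at barriers and wrap each region in work-item loops.
+    MPM.addPass(compiler::utils::PrepareBarriersPass());
+    MPM.addPass(compiler::utils::WorkItemLoopsPass());
+    MPM.run(M, MAM);
+  }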
+
+These sub-directories are used as follows:
+
+* **compiler_pipeline** provides the passes to build a pipeline from the
+  initial kernel, including generating work-item loops, handling local memory,
+  handling metadata and calling the vectorizer, **vecz**.
+
+* **vecz** provides a full-function vectorizer, which generates a copy of the
+  original function vectorized across the work group, taking subgroups into
+  account.
+
+* **multi_llvm** provides support for these passes to work across multiple
+  LLVM versions. Although this is not strictly needed inside LLVM, it has been
+  kept so that the integration could proceed smoothly, without changing files
+  directly. Note that it is header-only and lives under
+  **compiler_pipeline/include/multi_llvm**.
+
+**compiler_pipeline** and **vecz** will be documented under `sycl/docs`. Note
+that there are several limitations in the code that are a result of the
+initial integration. These should be addressed over time for maintainability;
+they do not affect correctness or performance.
+
+General limitations
+-------------------
+
+To simplify the integration and reduce risk, most of the files were integrated
+with no changes at all. This means there are currently the following
+limitations:
+
+* The namespace in **compiler_pipeline** is **compiler::utils**, the namespace
+  in **multi_llvm** is **multi_llvm** and the namespace in **vecz** is
+  **vecz**. These should be updated to reflect being under **LLVM**.
+* Include files should ideally be moved under **llvm/include**, but remain
+  under these directories after the integration.
+* **vecz** has a test tool, **veczc**, and associated **lit** tests. If
+  required, this tool should be moved under **llvm/tools** or **llvm/test**.
+  Building it also requires the `NATIVE_CPU_BUILD_VECZ_TEST_TOOLS` **CMake**
+  option; the tests can be run using the target `check-sycl-vecz`.
+* **compiler_pipeline** has **lit** tests for its passes which have not been
+  integrated, because they use a tool called **muxc**; these passes should,
+  however, be testable using **opt**. The tests can be found in the
+  `pipeline pass tests`_.
+* Many integrated files are unlikely to have any code coverage, but exist here
+  because they are referred to by other files which we do need. These should
+  be pruned over time as a better understanding develops of what is essential.
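+
+As an illustration of the attribute helpers this code is built around, the
+following sketch uses only functions declared in the integrated
+**compiler/utils/attributes.h** (the include path is an assumption based on
+this directory's layout)::
+
+  #include "compiler/utils/attributes.h"
+
+  #include "llvm/IR/Function.h"
+
+  void tagKernel(llvm::Function &F) {
+    // Mark F as a kernel and as an entry point; later passes query these
+    // function attributes rather than metadata or calling conventions.
+    compiler::utils::setIsKernel(F);
+    compiler::utils::setIsKernelEntryPt(F);
+    // Record an (estimated) local-memory footprint of 1 KiB.
+    compiler::utils::setLocalMemoryUsage(F, 1024);
+  }
+
+  bool usesLocalMemory(const llvm::Function &F) {
+    // Returns an empty optional when the attribute is absent, so default to 0.
+    return compiler::utils::getLocalMemoryUsage(F).value_or(0) > 0;
+  }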
+.. _oneAPI Construction Kit: https://github.com/uxlfoundation/oneapi-construction-kit
+.. _pipeline pass tests: https://github.com/uxlfoundation/oneapi-construction-kit/tree/main/modules/compiler/test/lit/passes
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/CMakeLists.txt
new file mode 100644
index 0000000000000..90981a1718dac
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/CMakeLists.txt
@@ -0,0 +1,32 @@
+add_llvm_component_library(LLVMNativeCPUPipeline
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/attributes.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/barrier_regions.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/builtin_info.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/cl_builtin_info.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/define_mux_builtins_pass.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/dma.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/encode_kernel_metadata_pass.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/group_collective_helpers.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/mangling.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/metadata.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/mux_builtin_info.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/pass_functions.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/optimal_builtin_replacement_pass.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/pass_machinery.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/prepare_barriers_pass.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/replace_local_module_scope_variables_pass.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/scheduling.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/sub_group_analysis.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/target_extension_types.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/work_item_loops_pass.cpp
+
+  LINK_COMPONENTS
+  Passes
+  Core
+  )
+
+# TODO: Move this under the LLVM include tree and work out why
+# ADDITIONAL_HEADER_DIRS does not capture it.
+target_include_directories(LLVMNativeCPUPipeline PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  )
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h
new file mode 100644
index 0000000000000..228097d1434d8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/address_spaces.h
@@ -0,0 +1,38 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// LLVM address space identifiers.
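+///
+/// For example, a pointer into work-group local memory appears in LLVM IR as
+/// `ptr addrspace(3)`, matching AddressSpace::Local below.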
+ +#ifndef COMPILER_UTILS_ADDRESS_SPACES_H_INCLUDED +#define COMPILER_UTILS_ADDRESS_SPACES_H_INCLUDED + +namespace compiler { +namespace utils { +namespace AddressSpace { +enum { + Private = 0, + Global = 1, + Constant = 2, + Local = 3, + Generic = 4, +}; +} +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_ADDRESS_SPACES_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h new file mode 100644 index 0000000000000..3ea0a5fad08ca --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/attributes.h @@ -0,0 +1,186 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef COMPILER_UTILS_ATTRIBUTES_H_INCLUDED +#define COMPILER_UTILS_ATTRIBUTES_H_INCLUDED + +#include + +#include + +namespace llvm { +class CallInst; +class Function; +} // namespace llvm + +namespace compiler { +namespace utils { + +/// @brief Encodes information that a function is a kernel +/// +/// @param[in] F Function in which to encode the information. +void setIsKernel(llvm::Function &F); + +/// @brief Encodes information that a function is a kernel entry point +/// +/// @param[in] F Function in which to encode the information. +void setIsKernelEntryPt(llvm::Function &F); + +/// @brief Returns whether the function is a kernel under compilation. +/// +/// @param[in] F Function to check. +bool isKernel(const llvm::Function &F); + +/// @brief Returns whether the function is a kernel entry point under +/// compilation. +/// +/// @param[in] F Function to check. +bool isKernelEntryPt(const llvm::Function &F); + +/// @brief Drops any information about whether a function is a kernel. +/// +/// @param[in] F Function to drop information from. +void dropIsKernel(llvm::Function &F); + +/// @brief Takes information about kernels from one function to another. +/// +/// Removes information from the old function, and overwrites any such +/// information in the new function. +/// +/// @param[in] ToF Function to copy to. +/// @param[in] FromF Function to copy from. +void takeIsKernel(llvm::Function &ToF, llvm::Function &FromF); + +/// @brief Sets the original function name as an attribute. +void setOrigFnName(llvm::Function &F); + +/// @brief Retrieves the original function name from the given Function. +/// +/// @return The original function name (via function attributes) or an empty +/// string if none is found. +llvm::StringRef getOrigFnName(const llvm::Function &F); + +/// @brief Retrieves the original function name from the given Function, or the +/// Function's name. +/// +/// @return The original function name (via function attributes) or the +/// function's name if none is found. 
+llvm::StringRef getOrigFnNameOrFnName(const llvm::Function &F);
+
+/// @brief Sets the base function name as an attribute.
+void setBaseFnName(llvm::Function &F, llvm::StringRef N);
+
+/// @brief Retrieves the base function name component from the given Function.
+///
+/// @return The base function name (via function attributes) or an empty string
+/// if none is found.
+llvm::StringRef getBaseFnName(const llvm::Function &F);
+
+/// @brief Retrieves the base function name component from the given Function,
+/// or the Function's name.
+///
+/// @return The base function name (via function attributes) or the function's
+/// name if none is found.
+llvm::StringRef getBaseFnNameOrFnName(const llvm::Function &F);
+
+/// @brief Retrieves the base function name from the given Function and
+/// sets it if none is found.
+/// @param F The function to read "base function name" attributes from
+/// @param SetFromF The function whose name is set as F's base function
+/// name if none is found in F.
+llvm::StringRef getOrSetBaseFnName(llvm::Function &F,
+                                   const llvm::Function &SetFromF);
+
+/// @brief Sets the local memory usage estimation for the given function.
+///
+/// @param[in] F the function in which to add the attribute
+/// @param[in] LocalMemUsage the (estimated) local memory usage in bytes
+void setLocalMemoryUsage(llvm::Function &F, uint64_t LocalMemUsage);
+
+/// @brief Gets the local memory usage estimation for the given function.
+///
+/// @param[in] F Function from which to pull the attribute
+/// @return the (estimated) local memory usage in bytes if present,
+/// std::nullopt otherwise.
+std::optional<uint64_t> getLocalMemoryUsage(const llvm::Function &F);
+
+/// @brief Sets information about a function's required DMA size as an
+/// attribute.
+///
+/// @param[in] F Function in which to add the attribute.
+/// @param[in] DMASizeBytes DMA size in bytes.
+void setDMAReqdSizeBytes(llvm::Function &F, uint32_t DMASizeBytes);
+
+/// @brief Retrieves information about a function's required DMA size as an
+/// attribute.
+///
+/// @param[in] F Function from which to pull the attribute
+/// @return The required DMA size in bytes if present, else `std::nullopt`
+std::optional<uint32_t> getDMAReqdSizeBytes(const llvm::Function &F);
+
+/// @brief Determines the ordering of work item execution after a barrier.
+enum class BarrierSchedule {
+  /// @brief The barrier pass is free to schedule work items in any order.
+  Unordered = 0,
+  /// @brief The barrier region is entirely uniform (no dependence on work item
+  /// ID) such that execution of multiple work items is redundant and we are
+  /// free to execute the region for only a single work item. Additionally,
+  /// such a region is not allowed to read from or write to the barrier struct
+  /// (the region cannot use any variables defined outwith it, nor define any
+  /// variables used outwith it). Used by work group collectives to initialize
+  /// their accumulators.
+  Once,
+  /// @brief The barrier region should execute all vectorized work items first,
+  /// followed by the scalar tail.
+  ScalarTail,
+  /// @brief The barrier region must be executed in Local Linear ID order.
+  Linear,
+};
+
+/// @brief Sets the work item execution schedule for the given barrier.
+///
+/// @param[in] CI the barrier call instruction
+/// @param[in] Sched the execution schedule to set
+void setBarrierSchedule(llvm::CallInst &CI, BarrierSchedule Sched);
+
+/// @brief Gets the work item execution schedule for the given barrier.
+/// +/// @param[in] CI the barrier call instruction +/// @return the execution schedule for this barrier +BarrierSchedule getBarrierSchedule(const llvm::CallInst &CI); + +/// @brief Marks a function as not explicitly using subgroups +/// +/// May be set even with unresolved external functions, assuming those don't +/// explicitly use subgroups. +/// +/// @param[in] F Function in which to encode the information. +void setHasNoExplicitSubgroups(llvm::Function &F); + +/// @brief Returns whether the kernel does not explicitly use subgroups +/// +/// @param[in] F Function to check. +bool hasNoExplicitSubgroups(const llvm::Function &F); + +/// @brief Returns the mux subgroup size for the current function. +/// +/// Currently always returns 1! +unsigned getMuxSubgroupSize(const llvm::Function &F); + +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_ATTRIBUTES_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h new file mode 100644 index 0000000000000..701ac4d0f3102 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/barrier_regions.h @@ -0,0 +1,365 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// Barrier regions, used by the WorkItemLoopsPass. + +#ifndef COMPILER_UTILS_BARRIER_REGIONS_H_INCLUDED +#define COMPILER_UTILS_BARRIER_REGIONS_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "pass_functions.h" + +namespace llvm { +class BasicBlock; +class CallInst; +class FenceInst; +class Function; +class Instruction; +class Module; +class StructType; +class Type; +class Value; +} // namespace llvm + +namespace compiler { +namespace utils { + +enum { kBarrier_EndID = 0, kBarrier_FirstID, kBarrier_StartNewID }; + +class Barrier; +class BuiltinInfo; + +template +using OrderedSet = + llvm::SetVector, llvm::SmallPtrSet>; + +/// @brief Struct to store information about an inter-barrier region. 
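+///
+/// An inter-barrier region is the set of blocks reached by traversing the
+/// CFG from the region's entry until a barrier is hit; the WorkItemLoopsPass
+/// wraps each such region in loops over the work-items.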
+struct BarrierRegion { + /// @brief the barrier id of this region + unsigned id = 0; + /// @brief the barrier call instruction for this region + llvm::Instruction *barrier_inst = nullptr; + /// @brief the entry block of this region + llvm::BasicBlock *entry = nullptr; + + llvm::DenseSet defs; + /// @brief barrier crossing uses that are defined in this region + OrderedSet uses_int; + /// @brief barrier crossing uses that are defined in another region + OrderedSet uses_ext; + /// @brief the blocks in this region + std::vector blocks; + /// @brief the exit blocks of this region + llvm::SmallPtrSet barrier_blocks; + /// @brief the barrier ids of the successor regions + llvm::SmallVector successor_ids; + /// @brief the work item execution schedule for this region + BarrierSchedule schedule = BarrierSchedule::Unordered; +}; + +class Barrier { +public: + /// @brief Type for ids of new kernel functions + using kernel_id_map_t = std::map; + + Barrier(llvm::Module &m, llvm::Function &f, bool IsDebug) + : live_var_mem_ty_(nullptr), + size_t_bytes(compiler::utils::getSizeTypeBytes(m)), module_(m), + func_(f), is_debug_(IsDebug), max_live_var_alignment(0) {} + + /// @brief perform the Barrier Region analysis and kernel splitting + void Run(llvm::ModuleAnalysisManager &mam); + + /// @brief return whether the barrier struct needs to contain anything + bool hasLiveVars() const { return !whole_live_variables_set_.empty(); } + + /// @brief returns the StructType of the barrier struct + llvm::StructType *getLiveVarsType() const { return live_var_mem_ty_; } + + /// @brief returns the maximum alignment of the barrier struct + unsigned getLiveVarMaxAlignment() const { return max_live_var_alignment; } + + /// @brief gets the split subkernels + const kernel_id_map_t &getSubkernels() const { return kernel_id_map_; } + + /// @brief gets the split subkernel for the given barrier id + llvm::Function *getSubkernel(unsigned id) const { + return kernel_id_map_.find(id)->second; + } + + /// @brief gets the number of regions/subkernels + size_t getNumSubkernels() const { return kernel_id_map_.size(); } + + llvm::CallInst *getBarrierCall(unsigned id) const { + return llvm::dyn_cast_or_null( + barrier_region_id_map_.find(id)->second.barrier_inst); + } + + /// @brief gets the size of the fixed sized part of the barrier struct + size_t getLiveVarMemSizeFixed() const { return live_var_mem_size_fixed; } + + /// @brief gets the minimum size of the scalable part of the barrier struct + size_t getLiveVarMemSizeScalable() const { + return live_var_mem_size_scalable; + } + + /// @brief gets the element index of the first scalable member of the barrier + /// struct + size_t getLiveVarMemScalablesIndex() const { + return live_var_mem_scalables_index; + } + + /// @brief gets the barrier IDs of the successors of the given barrier region + const llvm::SmallVectorImpl &getSuccessorIds(unsigned id) const { + return barrier_region_id_map_.find(id)->second.successor_ids; + } + + /// @brief gets the barrier IDs of the successors of the given barrier region + BarrierSchedule getSchedule(unsigned id) const { + return barrier_region_id_map_.find(id)->second.schedule; + } + + /// @brief replaces a subkernel with a given function + void replaceSubkernel(llvm::Function *from, llvm::Function *to); + + using debug_variable_records_t = + llvm::SmallVector, 4>; + const debug_variable_records_t &getDebugDbgVariableRecords() const { + return debug_variable_records_; + } + + /// @brief gets the original function + llvm::Function &getFunc() { return 
func_; } + const llvm::Function &getFunc() const { return func_; } + + /// @brief struct to help retrieval of values from the barrier struct + struct LiveValuesHelper { + const Barrier &barrier; + /// @brief A cache of queried live-values addresses (inside the live + /// variables struct), stored by the pair (value, member_idx). + llvm::DenseMap, llvm::Value *> + live_GEPs; + llvm::DenseMap reloads; + llvm::IRBuilder<> gepBuilder; + llvm::Value *barrier_struct = nullptr; + llvm::Value *vscale = nullptr; + + LiveValuesHelper(const Barrier &b, llvm::Instruction *i, llvm::Value *s) + : barrier(b), gepBuilder(i), barrier_struct(s) {} + + LiveValuesHelper(const Barrier &b, llvm::BasicBlock *bb, llvm::Value *s) + : barrier(b), gepBuilder(bb), barrier_struct(s) {} + + /// @brief Return a GEP instruction pointing to the given value/idx pair in + /// the barrier struct. + /// + /// @return The GEP corresponding to the address of the value in the + /// struct, or nullptr if the value could not be found in the struct. + llvm::Value *getGEP(const llvm::Value *live, unsigned member_idx = 0); + + /// @brief Return a GEP instruction corresponding to the address of + /// the given ExtractValueInst in the barriers struct. + /// + /// @return The GEP corresponding to the address of the value in the + /// struct, or nullptr if the value is not an ExtractValueInst. + llvm::Value *getExtractValueGEP(const llvm::Value *live); + + /// @brief get a value reloaded from the barrier struct. + /// + /// @param[in] live the live value to retrieve from the barrier + /// @param[in] ir where to insert new instructions + /// @param[in] name a postfix to append to new value names + /// @param[in] reuse whether to generate the load for a given value only + /// once, returning the previously cached value on further requests. + llvm::Value *getReload(llvm::Value *live, llvm::IRBuilderBase &ir, + const char *name, bool reuse = false); + }; + +private: + /// @brief The first is set for livein and the second is set for liveout + using live_in_out_t = + std::pair, llvm::DenseSet>; + /// @brief Type for memory allocation of live variables at all of barriers + using live_variable_mem_t = OrderedSet; + /// @brief Type for index of live variables on live variable information + /// Indexed by the pair (value, member_idx) + using live_variable_index_map_t = + llvm::DenseMap, unsigned>; + /// @brief Type for index of live variables on live variable information + /// Indexed by the pair (value, member_idx) + using live_variable_scalables_map_t = live_variable_index_map_t; + /// @brief Type for ids of barriers + using barrier_id_map_t = llvm::DenseMap; + /// @brief Type for ids of barrier regions + using barrier_region_id_map_t = std::map; + /// @brief Type for map from ids to fence instructions + using fence_id_map_t = llvm::DenseMap; + /// @brief Type between block and instruction for barrier. + using barrier_block_inst_map_t = + llvm::DenseMap; + /// @brief Type between block and block for barrier. + using barrier_block_block_set_t = llvm::DenseSet; + /// @brief Type between barrier id and stub call instructions. First + /// component of the pair is invoked before the barrier, the second after. + using debug_stub_map_t = + llvm::DenseMap>; + + /// @brief Keep whole live variables at all of barriers. + live_variable_mem_t whole_live_variables_set_; + /// @brief Keep index of live variables on live variable information. + live_variable_index_map_t live_variable_index_map_; + /// @brief Keep offsets of scalable live variables. 
+  live_variable_scalables_map_t live_variable_scalables_map_;
+  /// @brief Keep ids of barriers.
+  barrier_id_map_t barrier_id_map_;
+  /// @brief Look up a barrier region by its id.
+  barrier_region_id_map_t barrier_region_id_map_;
+  /// @brief Keep the split subkernels, keyed by barrier region id.
+  kernel_id_map_t kernel_id_map_;
+  /// @brief Keep struct types for live variables' memory layout.
+  llvm::StructType *live_var_mem_ty_;
+  /// @brief The total size of the non-scalable barrier struct
+  size_t live_var_mem_size_fixed = 0;
+  /// @brief The total unscaled size of the scalable barrier struct
+  size_t live_var_mem_size_scalable = 0;
+  /// @brief The index of the scalables buffer array in the barrier struct.
+  size_t live_var_mem_scalables_index = 0;
+  /// @brief Keep barriers.
+  llvm::SmallVector barriers_;
+  /// @brief Set of basic blocks that have a barrier as their successor
+  barrier_block_block_set_t barrier_successor_set_;
+  /// @brief Map between barrier ids and call instructions invoking stubs
+  debug_stub_map_t barrier_stub_call_map_;
+  /// @brief List of debug DbgVariableRecords and byte offsets into live
+  /// variable struct
+  debug_variable_records_t debug_variable_records_;
+
+  size_t size_t_bytes;
+
+  llvm::Module &module_;
+  llvm::Function &func_;
+
+  BuiltinInfo *bi_ = nullptr;
+
+  /// @brief Set to true if we want to debug the kernel. This involves adding
+  /// debug stub functions and an extra alloca to aid debugging.
+  const bool is_debug_;
+
+  /// @brief max alignment required for the live variables.
+  unsigned max_live_var_alignment;
+
+  /// @brief Find Barriers.
+  void FindBarriers();
+
+  /// @brief Split block with barrier.
+  void SplitBlockwithBarrier();
+
+  /// @brief Generate an empty kernel that only duplicates the source kernel's
+  /// CFG
+  ///
+  /// This is used to do a "dry run" of kernel splitting in order to obtain the
+  /// dominator tree, which is needed for correct identification of values that
+  /// cross the barrier.
+  ///
+  /// @param[in] region the region to clone into the new kernel.
+  /// @param[out] bbmap a mapping of original blocks onto the empty clones.
+  /// @return the fake kernel
+  llvm::Function *GenerateFakeKernel(
+      BarrierRegion &region,
+      llvm::DenseMap &bbmap);
+
+  /// @brief Obtain a set of Basic Blocks for an inter-barrier region
+  ///
+  /// It traverses the CFG, following successors, until it hits a barrier,
+  /// building the region's internal data.
+  ///
+  /// @param[out] region the region to process
+  void GatherBarrierRegionBlocks(BarrierRegion &region);
+
+  /// @brief Obtain a set of Values used in a region that cross a barrier
+  ///
+  /// A value use crosses a barrier in the following cases:
+  ///  * Its use is not in the same region as the definition
+  ///  * Its definition does not dominate the use
+  ///
+  /// @param[in] region The inter-barrier region
+  /// @param[in] ignore set of values to ignore
+  void GatherBarrierRegionUses(BarrierRegion &region,
+                               llvm::DenseSet &ignore);
+
+  /// @brief Find livein and liveout variables for each basic block.
+  void FindLiveVariables();
+
+  /// @brief Remove variables that are better recalculated than stored in the
+  /// barrier, for instance casts and vector splats.
+  void TidyLiveVariables();
+
+  /// @brief Pad the field types to an alignment by adding an int array if
+  /// needed
+  /// @param field_tys The vector of types representing the final structure
+  /// @param offset The current offset in the structure
+  /// @param alignment The required alignment
+  /// @return The new offset (or original offset if no padding needed)
+  unsigned PadTypeToAlignment(llvm::SmallVectorImpl<llvm::Type *> &field_tys,
+                              unsigned offset, unsigned alignment);
+
+  /// @brief Make type for whole live variables.
+  void MakeLiveVariableMemType();
+
+  /// @brief Generate a new kernel from an inter-barrier region such that no
+  /// call to a barrier occurs within it.
+  ///
+  /// @param[in] region the inter-barrier region to create the kernel from
+  /// @return the new kernel
+  llvm::Function *GenerateNewKernel(BarrierRegion &region);
+
+  /// @brief This function is a copy of llvm::CloneBasicBlock; some code has
+  /// been added in order to update live variable information.
+  ///
+  /// @param[in] bb Basic block to copy.
+  /// @param[out] vmap Value map for cloning.
+  /// @param[in] name_suffix Name suffix for the clone.
+  /// @param[out] live_defs_info Live definitions' info for the current basic
+  /// block.
+  /// @param[in] F Current function.
+  ///
+  /// @return The cloned basic block.
+  llvm::BasicBlock *CloneBasicBlock(llvm::BasicBlock *bb,
+                                    llvm::ValueToValueMapTy &vmap,
+                                    const llvm::Twine &name_suffix,
+                                    live_variable_mem_t &live_defs_info,
+                                    llvm::Function *F);
+
+  /// @brief Separate the kernel function at barrier boundaries.
+  void SeperateKernelWithBarrier();
+};
+
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_BARRIER_REGIONS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
new file mode 100644
index 0000000000000..b88b82aab6123
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/builtin_info.h
@@ -0,0 +1,860 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Information about compiler builtins.
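+///
+/// Illustrative use, assuming a pass holding a BuiltinInfo `BI`, a module
+/// `M` and an IRBuilder `Builder` (and assuming `__mux_get_local_id` takes a
+/// 32-bit dimension index):
+///
+///     // Declare (or fetch) the mux builtin for the local work-item id,
+///     // then emit a call to it for dimension 0.
+///     llvm::Function *GetLID =
+///         BI.getOrDeclareMuxBuiltin(compiler::utils::eMuxBuiltinGetLocalId, M);
+///     llvm::Value *LID = Builder.CreateCall(GetLID, {Builder.getInt32(0)});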
+ +#ifndef COMPILER_UTILS_BUILTIN_INFO_H_INCLUDED +#define COMPILER_UTILS_BUILTIN_INFO_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace compiler { +namespace utils { +/// @addtogroup utils +/// @{ + +using BuiltinID = int32_t; + +enum BaseBuiltinID { + eBuiltinUnknown, + + // Mux builtins + eMuxBuiltinIsFTZ, + eMuxBuiltinUseFast, + eMuxBuiltinIsEmbeddedProfile, + eMuxBuiltinGetGlobalSize, + eMuxBuiltinGetGlobalId, + eMuxBuiltinGetGlobalOffset, + eMuxBuiltinGetLocalSize, + eMuxBuiltinGetLocalId, + eMuxBuiltinSetLocalId, + eMuxBuiltinGetSubGroupId, + eMuxBuiltinSetSubGroupId, + eMuxBuiltinGetNumGroups, + eMuxBuiltinGetNumSubGroups, + eMuxBuiltinSetNumSubGroups, + eMuxBuiltinGetMaxSubGroupSize, + eMuxBuiltinSetMaxSubGroupSize, + eMuxBuiltinGetGroupId, + eMuxBuiltinGetWorkDim, + eMuxBuiltinDMARead1D, + eMuxBuiltinDMARead2D, + eMuxBuiltinDMARead3D, + eMuxBuiltinDMAWrite1D, + eMuxBuiltinDMAWrite2D, + eMuxBuiltinDMAWrite3D, + eMuxBuiltinDMAWait, + eMuxBuiltinGetGlobalLinearId, + eMuxBuiltinGetLocalLinearId, + eMuxBuiltinGetEnqueuedLocalSize, + eMuxBuiltinGetSubGroupSize, + eMuxBuiltinGetSubGroupLocalId, + // Synchronization builtins + eMuxBuiltinMemBarrier, + eMuxBuiltinSubGroupBarrier, + eMuxBuiltinWorkGroupBarrier, +#define GROUP_BUILTINS(SCOPE) \ + eFirstMux##SCOPE##groupCollectiveBuiltin, \ + eMuxBuiltin##SCOPE##groupAll = eFirstMux##SCOPE##groupCollectiveBuiltin, \ + eMuxBuiltin##SCOPE##groupAny, eMuxBuiltin##SCOPE##groupBroadcast, \ + eMuxBuiltin##SCOPE##groupReduceAdd, eMuxBuiltin##SCOPE##groupReduceFAdd, \ + eMuxBuiltin##SCOPE##groupReduceSMin, \ + eMuxBuiltin##SCOPE##groupReduceUMin, \ + eMuxBuiltin##SCOPE##groupReduceFMin, \ + eMuxBuiltin##SCOPE##groupReduceSMax, \ + eMuxBuiltin##SCOPE##groupReduceUMax, \ + eMuxBuiltin##SCOPE##groupReduceFMax, eMuxBuiltin##SCOPE##groupReduceMul, \ + eMuxBuiltin##SCOPE##groupReduceFMul, eMuxBuiltin##SCOPE##groupReduceAnd, \ + eMuxBuiltin##SCOPE##groupReduceOr, eMuxBuiltin##SCOPE##groupReduceXor, \ + eMuxBuiltin##SCOPE##groupReduceLogicalAnd, \ + eMuxBuiltin##SCOPE##groupReduceLogicalOr, \ + eMuxBuiltin##SCOPE##groupReduceLogicalXor, \ + eMuxBuiltin##SCOPE##groupScanAddInclusive, \ + eMuxBuiltin##SCOPE##groupScanFAddInclusive, \ + eMuxBuiltin##SCOPE##groupScanAddExclusive, \ + eMuxBuiltin##SCOPE##groupScanFAddExclusive, \ + eMuxBuiltin##SCOPE##groupScanSMinInclusive, \ + eMuxBuiltin##SCOPE##groupScanUMinInclusive, \ + eMuxBuiltin##SCOPE##groupScanFMinInclusive, \ + eMuxBuiltin##SCOPE##groupScanSMinExclusive, \ + eMuxBuiltin##SCOPE##groupScanUMinExclusive, \ + eMuxBuiltin##SCOPE##groupScanFMinExclusive, \ + eMuxBuiltin##SCOPE##groupScanSMaxInclusive, \ + eMuxBuiltin##SCOPE##groupScanUMaxInclusive, \ + eMuxBuiltin##SCOPE##groupScanFMaxInclusive, \ + eMuxBuiltin##SCOPE##groupScanSMaxExclusive, \ + eMuxBuiltin##SCOPE##groupScanUMaxExclusive, \ + eMuxBuiltin##SCOPE##groupScanFMaxExclusive, \ + eMuxBuiltin##SCOPE##groupScanMulInclusive, \ + eMuxBuiltin##SCOPE##groupScanFMulInclusive, \ + eMuxBuiltin##SCOPE##groupScanMulExclusive, \ + eMuxBuiltin##SCOPE##groupScanFMulExclusive, \ + eMuxBuiltin##SCOPE##groupScanAndInclusive, \ + eMuxBuiltin##SCOPE##groupScanAndExclusive, \ + eMuxBuiltin##SCOPE##groupScanOrInclusive, \ + eMuxBuiltin##SCOPE##groupScanOrExclusive, \ + eMuxBuiltin##SCOPE##groupScanXorInclusive, \ + eMuxBuiltin##SCOPE##groupScanXorExclusive, \ + eMuxBuiltin##SCOPE##groupScanLogicalAndInclusive, \ + eMuxBuiltin##SCOPE##groupScanLogicalAndExclusive, \ + 
eMuxBuiltin##SCOPE##groupScanLogicalOrInclusive,            \
+      eMuxBuiltin##SCOPE##groupScanLogicalOrExclusive,          \
+      eMuxBuiltin##SCOPE##groupScanLogicalXorInclusive,         \
+      eMuxBuiltin##SCOPE##groupScanLogicalXorExclusive
+  GROUP_BUILTINS(Work),
+  eLastMuxWorkgroupCollectiveBuiltin =
+      eMuxBuiltinWorkgroupScanLogicalXorExclusive,
+  GROUP_BUILTINS(Sub),
+  // Extra subgroup shuffle operations
+  eMuxBuiltinSubgroupShuffle,
+  eMuxBuiltinSubgroupShuffleUp,
+  eMuxBuiltinSubgroupShuffleDown,
+  eMuxBuiltinSubgroupShuffleXor,
+  eLastMuxSubgroupCollectiveBuiltin = eMuxBuiltinSubgroupShuffleXor,
+  GROUP_BUILTINS(Vec),
+  eLastMuxVecgroupCollectiveBuiltin =
+      eMuxBuiltinVecgroupScanLogicalXorExclusive,
+
+  // Marker - target builtins should start from here.
+  eFirstTargetBuiltin,
+};
+
+/// @brief Describes the uniformity of a builtin's return values. A uniform
+/// value is the same for all instances (e.g. SIMD lanes).
+enum BuiltinUniformity : int32_t {
+  /// @brief The uniformity of the builtin's return value cannot be determined.
+  eBuiltinUniformityUnknown,
+  /// @brief The builtin never returns uniform values.
+  eBuiltinUniformityNever,
+  /// @brief The builtin always returns uniform values.
+  eBuiltinUniformityAlways,
+  /// @brief The builtin returns uniform values if its inputs are uniform.
+  eBuiltinUniformityLikeInputs,
+  /// @brief The builtin returns a sequential instance ID value
+  /// (e.g. get_local_id in OpenCL).
+  eBuiltinUniformityInstanceID,
+  /// @brief The builtin might return a sequential instance ID value,
+  /// if its argument can be zero (e.g. get_local_id(x)).
+  eBuiltinUniformityMaybeInstanceID
+};
+
+/// @brief Describes certain properties of builtin functions that the
+/// vectorizer needs to know about.
+enum BuiltinProperties : int32_t {
+  /// @brief The builtin has no special property.
+  eBuiltinPropertyNone = 0,
+  /// @brief The builtin returns a value related to the geometry of the work
+  /// space, such as its dimension or an index into those dimensions.
+  eBuiltinPropertyWorkItem = (1 << 0),
+  /// @brief The builtin can affect the execution flow (e.g. barrier).
+  eBuiltinPropertyExecutionFlow = (1 << 1),
+  /// @brief The builtin implements a reduction, that is, it takes vector
+  /// arguments and returns a scalar value.
+  eBuiltinPropertyReduction = (1 << 2),
+  /// @brief The builtin has known side-effects.
+  eBuiltinPropertySideEffects = (1 << 3),
+  /// @brief The builtin is known to have no runtime side-effects. This is
+  /// equivalent to 'readonly' or 'readnone' in IR. The return value depends
+  /// only on the values of the arguments.
+  eBuiltinPropertyNoSideEffects = (1 << 4),
+  /// @brief The builtin can be instantiated, even if it has side-effects.
+  /// Builtins with 'NoSideEffects' should not be instantiated unless they
+  /// also have this flag, because of the 'noduplicate' IR attribute.
+  eBuiltinPropertySupportsInstantiation = (1 << 5),
+  /// @brief The builtin has no vector equivalent. There may be functions that
+  /// have the same signature that a vector equivalent function would have,
+  /// but these functions should not be used for that purpose. This can also
+  /// mean that a vector builtin has no scalar equivalent.
+  eBuiltinPropertyNoVectorEquivalent = (1 << 6),
+  /// @brief The builtin has a vector equivalent. This is used for the LLVM
+  /// intrinsics, since for the OpenCL builtins we can determine that
+  /// programmatically. It can also mean that a builtin has a scalar
+  /// equivalent.
+  eBuiltinPropertyVectorEquivalent = (1 << 7),
+  /// @brief The builtin can be emitted inline.
+  eBuiltinPropertyCanEmitInline = (1 << 8),
+  /// @brief The builtin returns a value through its pointer argument. The
+  /// returned type is equal to the function return type.
+  eBuiltinPropertyPointerReturnEqualRetTy = (1 << 9),
+  /// @brief The builtin wants to be inlined post vectorization
+  eBuiltinPropertyInlinePostVectorization = (1 << 10),
+  /// @brief The builtin returns a value through its pointer argument. The
+  /// returned value is an i32 scalar or vector, matching the function return
+  /// type: float -> i32, <4 x float> -> <4 x i32>, etc
+  eBuiltinPropertyPointerReturnEqualIntRetTy = (1 << 11),
+  /// @brief The builtin returns local work item ID.
+  eBuiltinPropertyLocalID = (1 << 12),
+  /// @brief The builtin is atomic
+  eBuiltinPropertyAtomic = (1 << 13),
+  /// @brief The builtin is rematerializable on the other side of a barrier
+  ///
+  /// The WorkItemLoopsPass queries this property to prune the number of live
+  /// variables that are stored and passed between barrier regions. Calls to
+  /// rematerializable builtins are removed from the live variable structure,
+  /// and are re-inserted into each barrier region that requires their results.
+  eBuiltinPropertyRematerializable = (1 << 14),
+  /// @brief The builtin should be lowered to a mux builtin.
+  ///
+  /// This mapping takes place in BuiltinInfo::lowerBuiltinToMuxBuiltin.
+  eBuiltinPropertyLowerToMuxBuiltin = (1 << 15),
+  /// @brief The builtin is known not to be convergent, i.e., it does not
+  /// depend on any other work-item in any way.
+  eBuiltinPropertyKnownNonConvergent = (1 << 16),
+};
+
+/// @brief struct to hold information about a builtin function
+struct Builtin {
+  /// @brief the builtin Function
+  const llvm::Function &function;
+  /// @brief ID for internal use
+  const BuiltinID ID;
+  /// @brief the Builtin Properties
+  const BuiltinProperties properties;
+  /// @brief list of types used in overloading this builtin (only relevant for
+  /// overloadable mux builtins)
+  std::vector<llvm::Type *> mux_overload_info = {};
+
+  /// @brief returns whether the builtin is unknown
+  bool isUnknown() const { return ID == eBuiltinUnknown; }
+};
+
+/// @brief struct to hold information about a builtin function call
+struct BuiltinCall : public Builtin {
+  /// @brief the call instruction
+  const llvm::CallInst &call;
+  /// @brief the uniformity of the builtin call
+  const BuiltinUniformity uniformity;
+
+  /// @brief constructor
+  BuiltinCall(const Builtin &B, const llvm::CallInst &CI, BuiltinUniformity U)
+      : Builtin(B), call(CI), uniformity(U) {}
+};
+
+namespace MuxBuiltins {
+constexpr const char isftz[] = "__mux_isftz";
+constexpr const char usefast[] = "__mux_usefast";
+constexpr const char isembeddedprofile[] = "__mux_isembeddedprofile";
+constexpr const char get_global_size[] = "__mux_get_global_size";
+constexpr const char get_global_id[] = "__mux_get_global_id";
+constexpr const char get_global_offset[] = "__mux_get_global_offset";
+constexpr const char get_local_size[] = "__mux_get_local_size";
+constexpr const char get_local_id[] = "__mux_get_local_id";
+constexpr const char get_sub_group_id[] = "__mux_get_sub_group_id";
+constexpr const char get_num_groups[] = "__mux_get_num_groups";
+constexpr const char get_num_sub_groups[] = "__mux_get_num_sub_groups";
+constexpr const char get_max_sub_group_size[] = "__mux_get_max_sub_group_size";
+constexpr const char get_group_id[] = "__mux_get_group_id";
+constexpr const char
get_work_dim[] = "__mux_get_work_dim"; +constexpr const char dma_read_1d[] = "__mux_dma_read_1D"; +constexpr const char dma_read_2d[] = "__mux_dma_read_2D"; +constexpr const char dma_read_3d[] = "__mux_dma_read_3D"; +constexpr const char dma_write_1d[] = "__mux_dma_write_1D"; +constexpr const char dma_write_2d[] = "__mux_dma_write_2D"; +constexpr const char dma_write_3d[] = "__mux_dma_write_3D"; +constexpr const char dma_wait[] = "__mux_dma_wait"; +constexpr const char get_global_linear_id[] = "__mux_get_global_linear_id"; +constexpr const char get_local_linear_id[] = "__mux_get_local_linear_id"; +constexpr const char get_enqueued_local_size[] = + "__mux_get_enqueued_local_size"; +constexpr const char get_sub_group_size[] = "__mux_get_sub_group_size"; +constexpr const char get_sub_group_local_id[] = "__mux_get_sub_group_local_id"; + +// Barriers +constexpr const char mem_barrier[] = "__mux_mem_barrier"; +constexpr const char sub_group_barrier[] = "__mux_sub_group_barrier"; +constexpr const char work_group_barrier[] = "__mux_work_group_barrier"; + +// DMA Event Type +constexpr const char dma_event_type[] = "__mux_dma_event_t"; + +// Internal Mux Functions +constexpr const char set_local_id[] = "__mux_set_local_id"; +constexpr const char set_sub_group_id[] = "__mux_set_sub_group_id"; +constexpr const char set_num_sub_groups[] = "__mux_set_num_sub_groups"; +constexpr const char set_max_sub_group_size[] = "__mux_set_max_sub_group_size"; +} // namespace MuxBuiltins + +static inline llvm::Type *getPointerReturnPointeeTy(const llvm::Function &F, + BuiltinProperties Props) { + if (Props & eBuiltinPropertyPointerReturnEqualRetTy) { + return F.getReturnType(); + } + if (Props & eBuiltinPropertyPointerReturnEqualIntRetTy) { + llvm::Type *I32Ty = llvm::IntegerType::getInt32Ty(F.getContext()); + if (auto *VTy = llvm::dyn_cast(F.getReturnType())) { + return llvm::VectorType::get(I32Ty, + multi_llvm::getVectorElementCount(VTy)); + } + return I32Ty; + } + return nullptr; +} + +/// @brief Describes how builtins should be materialized. +enum BuiltinMatFlags : int32_t { + /// @brief Use default materialization options. + eBuiltinMatDefault = 0, + /// @brief The body of the builtin should be materialized. + eBuiltinMatDefinition = (1 << 0) +}; + +class BIMuxInfoConcept; +class BILangInfoConcept; + +/// @brief A class that encapsulates information and transformations concerning +/// compiler builtin functions. +/// +/// It provides methods for querying data about builtin functions, methods for +/// emitting bodies of builtins "inline", and methods for materializing +/// builtins from an external source. +/// +/// It contains a BIMuxInfoConcept implementation to provide mux builtin +/// information on a target-by-target basis. +/// +/// It contains an optional BILangInfoConcept implementation to provide builtin +/// information on a target-by-target basis. +class BuiltinInfo { +public: + // Default-construct a BuiltinInfo without a concrete set of language-level + // builtins. + BuiltinInfo() : MuxImpl(std::make_unique()) {} + + BuiltinInfo(std::unique_ptr &&LangImpl) + : MuxImpl(std::make_unique()), + LangImpl(std::move(LangImpl)) {} + + BuiltinInfo(std::unique_ptr &&MuxImpl, + std::unique_ptr &&LangImpl) + : MuxImpl(std::move(MuxImpl)), LangImpl(std::move(LangImpl)) {} + + BuiltinInfo(BuiltinInfo &&) = default; + BuiltinInfo &operator=(BuiltinInfo &&RHS) = default; + + /// @brief Retrieves the optional module containing builtin definitions. 
+  llvm::Module *getBuiltinsModule();
+
+  /// @brief Determine general properties for the given builtin function.
+  /// @param[in] F Function to analyze.
+  /// @return Analyzed properties for the builtin.
+  std::optional<Builtin> analyzeBuiltin(const llvm::Function &F) const;
+
+  /// @brief Determine general properties for the given builtin function call.
+  /// @param[in] CI Call instruction to analyze.
+  /// @param[in] SimdDimIdx Index of the current vectorization dimension.
+  /// @return Analyzed properties for the builtin call.
+  std::optional<BuiltinCall> analyzeBuiltinCall(const llvm::CallInst &CI,
+                                                unsigned SimdDimIdx) const;
+
+  /// @brief Try to find a builtin function that is a vector equivalent of the
+  /// given function with the given vector width, if it exists.
+  /// @param[in] B Builtin to query for a vector equivalent.
+  /// @param[in] Width Vector width.
+  /// @param[in] M Optional module where the vector equivalent should be
+  /// declared.
+  /// @return Equivalent vector builtin function on success.
+  llvm::Function *getVectorEquivalent(const Builtin &B, unsigned Width,
+                                      llvm::Module *M = nullptr);
+
+  /// @brief Try to find a builtin function that is a scalar equivalent of the
+  /// given function, if it exists.
+  /// @param[in] B Builtin to query for a scalar equivalent.
+  /// @param[in] M Optional module where the scalar equivalent should be
+  /// declared.
+  /// @return Equivalent scalar builtin function on success.
+  llvm::Function *getScalarEquivalent(const Builtin &B, llvm::Module *M);
+
+  /// @brief Emit an inline implementation of the builtin function F.
+  /// @param[in] Builtin Builtin function to emit an implementation for.
+  /// @param[in] B Insertion point for the implementation.
+  /// @param[in] Args Arguments to the builtin function.
+  /// @return A value that implements the builtin function or null.
+  llvm::Value *emitBuiltinInline(llvm::Function *Builtin, llvm::IRBuilder<> &B,
+                                 llvm::ArrayRef<llvm::Value *> Args);
+
+  /// @brief Return a known range of values this call may return.
+  /// @param[in] CI Call instruction to analyze.
+  /// @param[in] MaxLocalSizes The maximum local work-group sizes in each of
+  /// the 3 dimensions that this target supports.
+  /// @param[in] MaxGlobalSizes The maximum global sizes in each of
+  /// the 3 dimensions that this target supports.
+  std::optional<llvm::ConstantRange>
+  getBuiltinRange(llvm::CallInst &CI,
+                  std::array<std::optional<uint64_t>, 3> MaxLocalSizes,
+                  std::array<std::optional<uint64_t>, 3> MaxGlobalSizes) const;
+
+  /// @brief Lowers a call to a language-level builtin to an instruction
+  /// sequence calling a mux builtin.
+  ///
+  /// For a call to a builtin for which the property
+  /// eBuiltinPropertyLowerToMuxBuiltin is set, the target must re-express
+  /// the call as a new sequence, usually involving mux builtins.
+  llvm::Instruction *lowerBuiltinToMuxBuiltin(llvm::CallInst &CI);
+
+  /// @brief Get a builtin for printf.
+  /// @return An identifier for the builtin, or `std::nullopt` if there is
+  /// none. This builtin is expected to have a `printf`-like signature: an
+  /// integer return type, a pointer to the format string as the first
+  /// argument, and varargs.
+  std::optional<BuiltinID> getPrintfBuiltin() const;
+
+  /// @brief Returns true if the given ID is a ComputeMux builtin ID.
+  static bool isMuxBuiltinID(BuiltinID ID) {
+    return ID > eBuiltinUnknown && ID < eFirstTargetBuiltin;
+  }
+
+  /// @brief Returns true if the given ID is an overloadable ComputeMux builtin
+  /// ID.
+  ///
+  /// These builtins *require* extra overloading info when declaring or
+  /// defining.
+  static bool isOverloadableMuxBuiltinID(BuiltinID ID);
+
+  /// @brief Returns true if the given ID is a ComputeMux barrier builtin ID.
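+  /// (namely `__mux_sub_group_barrier` or `__mux_work_group_barrier`).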
+  static bool isMuxControlBarrierID(BuiltinID ID) {
+    return ID == eMuxBuiltinSubGroupBarrier ||
+           ID == eMuxBuiltinWorkGroupBarrier;
+  }
+
+  /// @brief Returns true if the given ID is a ComputeMux DMA builtin ID.
+  static bool isMuxDmaBuiltinID(BuiltinID ID) {
+    return ID == eMuxBuiltinDMAWait || ID == eMuxBuiltinDMARead1D ||
+           ID == eMuxBuiltinDMARead2D || ID == eMuxBuiltinDMARead3D ||
+           ID == eMuxBuiltinDMAWrite1D || ID == eMuxBuiltinDMAWrite2D ||
+           ID == eMuxBuiltinDMAWrite3D;
+  }
+
+  /// @brief Gets information about a mux group operation builtin
+  static std::optional<GroupCollective> isMuxGroupCollective(BuiltinID ID);
+
+  /// @brief Returns the mux builtin ID matching the group collective, or
+  /// `std::nullopt` if there is none.
+  static std::optional<BuiltinID>
+  getMuxGroupCollective(const GroupCollective &Group);
+
+  /// @brief Returns true if the mux builtin has a barrier ID as its first
+  /// operand.
+  static bool isMuxBuiltinWithBarrierID(BuiltinID ID) {
+    if (isMuxControlBarrierID(ID)) {
+      return true;
+    }
+    auto Info = isMuxGroupCollective(ID);
+    return Info && Info->isWorkGroupScope();
+  }
+
+  /// @brief Returns true if the mux builtin has a barrier ID as its first
+  /// operand, and applies at Work Group scope.
+  static bool isMuxBuiltinWithWGBarrierID(BuiltinID ID) {
+    if (ID == eMuxBuiltinWorkGroupBarrier) {
+      return true;
+    }
+    auto Info = isMuxGroupCollective(ID);
+    return Info && Info->isWorkGroupScope();
+  }
+
+  /// @brief Maps a ComputeMux builtin ID to its function name.
+  ///
+  /// @param OverloadInfo An array of types required to resolve certain
+  /// overloadable builtins, e.g., group builtins.
+  static std::string
+  getMuxBuiltinName(BuiltinID ID,
+                    llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+
+  /// @brief Mangles a type using the LLVM intrinsic scheme
+  ///
+  /// This is an extremely simple mangling scheme matching LLVM's intrinsic
+  /// mangling system. It is only designed to be used with a specific set of
+  /// types and is not a general-purpose mangler.
+  ///
+  /// * iXXX -> iXXX
+  /// * half -> f16
+  /// * float -> f32
+  /// * double -> f64
+  /// * <N x Ty> -> vNTy
+  /// * <vscale x N x Ty> -> nxvNTy
+  static std::string getMangledTypeStr(llvm::Type *Ty);
+
+  /// @brief Demangles a type using the LLVM intrinsic scheme - returns nullptr
+  /// if it was unable to demangle a type.
+  ///
+  /// @see getMangledTypeStr
+  static std::pair<llvm::Type *, llvm::StringRef>
+  getDemangledTypeFromStr(llvm::StringRef TyStr, llvm::LLVMContext &Ctx);
+
+  /// @brief Defines the body of a ComputeMux builtin declaration
+  ///
+  /// If the Module already has a function definition with the corresponding
+  /// function name, it is left alone and returned.
+  ///
+  /// Will declare any builtins it requires as transitive dependencies.
+  ///
+  /// @param OverloadInfo An array of types required to resolve certain
+  /// overloadable builtins, e.g., group builtins.
+  llvm::Function *
+  defineMuxBuiltin(BuiltinID, llvm::Module &M,
+                   llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+
+  /// @brief Gets a ComputeMux builtin from the module, or declares it
+  ///
+  /// @param OverloadInfo An array of types required to resolve certain
+  /// overloadable builtins, e.g., group builtins.
+  llvm::Function *
+  getOrDeclareMuxBuiltin(BuiltinID, llvm::Module &M,
+                         llvm::ArrayRef<llvm::Type *> OverloadInfo = {});
+
+  struct SchedParamInfo {
+    /// @brief An identifier providing resolution for targets to identify
+    /// specific scheduling parameters.
+    ///
+    /// By default, will be the index into the list returned by
+    /// getMuxSchedulingParameters.
+    unsigned ID;
+    /// @brief The parameter type
+    llvm::Type *ParamTy;
+    /// @brief A (possibly empty) set of parameter attributes to apply to all
+    /// functions featuring this parameter.
+    llvm::AttributeSet ParamAttrs;
+    /// @brief The name of the parameter, to aid debugging. May be empty.
+    std::string ParamName;
+    /// @brief A human-readable name to be emitted in !mux-scheduling-params
+    std::string ParamDebugName;
+    /// @brief True if the parameter is passed externally by the driver to the
+    /// kernel entry point; false if this parameter is initialized by the
+    /// kernel at the top level.
+    ///
+    /// This provides an interface to passes such as AddKernelWrapperPass.
+    ///
+    /// If true, the parameter is passed through every layer of kernels. If
+    /// false, the parameter must be initialized by
+    /// initializeSchedulingParamForWrappedKernel.
+    bool PassedExternally;
+    /// @brief An optional type to aid targets in remembering the underlying
+    /// parameter type, if the parameter is a pointer.
+    llvm::Type *ParamPointeeTy = nullptr;
+    /// @brief An optional value specifying the concrete function argument.
+    llvm::Argument *ArgVal = nullptr;
+  };
+
+  /// @brief Returns a target-specific list of scheduling parameters to be
+  /// applied to all builtins for which requiresSchedulingParameters returns
+  /// true.
+  ///
+  /// This list dictates the order of parameters added to each builtin. As
+  /// such it must be constant and immutable for each Module.
+  ///
+  /// This list is emitted into the module as metadata by the
+  /// AddSchedulingParametersPass for user reference.
+  ///
+  /// This function does not have to fill in SchedParamInfo::ArgVal, as this
+  /// query is not specific to one function.
+  llvm::SmallVector<SchedParamInfo>
+  getMuxSchedulingParameters(llvm::Module &);
+
+  /// @brief Returns target-specific scheduling parameters from a concrete
+  /// function.
+  ///
+  /// Uses metadata returned via
+  /// compiler::utils::getSchedulingParameterFunctionMetadata to determine
+  /// whether the function contains scheduling parameters.
+  ///
+  /// If set, this function should return the same result as
+  /// getMuxSchedulingParameters, but with SchedParamInfo::ArgVal filled in to
+  /// correspond to the actual concrete llvm::Argument values of the given
+  /// function. Note that not all ArgVals are guaranteed to be populated, as a
+  /// function may contain only a subset of the target's list of scheduling
+  /// parameters.
+  ///
+  /// If not set, this function returns an empty list.
+  llvm::SmallVector<SchedParamInfo>
+  getFunctionSchedulingParameters(llvm::Function &);
+
+  /// @brief Responsible for initializing a scheduling parameter for which
+  /// PassedExternally is 'false'.
+  ///
+  /// This is conceptually used to initialize scheduling parameters which are
+  /// used for scheduling "internally" and do not make up the driver-facing
+  /// kernel ABI.
+  ///
+  /// @param Info The SchedParamInfo dictating which kind of scheduling
+  /// parameter to initialize.
+  /// @param B An IRBuilder providing the insertion point at which to insert
+  /// initialization instructions.
+  /// @param IntoF The function into which initialization instructions are to
+  /// be inserted.
+  /// @param CalleeF The function for which the initialization is taking place.
+  /// CalleeF will be called by IntoF.
+  llvm::Value *initializeSchedulingParamForWrappedKernel(
+      const SchedParamInfo &Info, llvm::IRBuilder<> &B, llvm::Function &IntoF,
+      llvm::Function &CalleeF);
+
+  /// @brief Returns true if the builtin ID requires extra scheduling
+  /// parameters to function.
+  ///
+  /// This function only handles mux builtins, and does not defer to any of
+  /// BuiltinInfo's implementation instances.
+  ///
+  /// These parameters will be added to the function (and its callers) by
+  /// the AddSchedulingParametersPass.
+  bool requiresSchedulingParameters(BuiltinID ID);
+
+  /// @brief Returns the remapped type for a target extension type
+  ///
+  /// This method is intended for target implementations to be able to signal
+  /// to the DefineTargetExtTysPass how LLVM's target extension types should be
+  /// remapped across the module. There is a default implementation: see
+  /// BIMuxInfoConcept::getRemappedTargetExtTy
+  ///
+  /// This method is safe to call before LLVM 17 but will do nothing (there are
+  /// no target extension types before LLVM 17). Otherwise this method asserts
+  /// that the type is a target extension type.
+  ///
+  /// @param Ty The target extension type to remap
+  /// @param M The Module in which to replace the type
+  /// @return The remapped type, or nullptr if the type does not require
+  /// remapping
+  llvm::Type *getRemappedTargetExtTy(llvm::Type *Ty, llvm::Module &M);
+
+  /// Handle the invalidation of this information.
+  ///
+  /// When used as a result of BuiltinInfoAnalysis this method will be called
+  /// when the function this was computed for changes. When it returns false,
+  /// the information is preserved across those changes.
+  bool invalidate(llvm::Module &, const llvm::PreservedAnalyses &,
+                  llvm::ModuleAnalysisManager::Invalidator &) {
+    return false;
+  }
+
+private:
+  /// @brief Try to identify a builtin function.
+  /// @param[in] F The function to identify.
+  /// @return Valid builtin ID if the name was identified, as well as any types
+  /// required to overload the builtin ID.
+  std::optional>>
+  identifyMuxBuiltin(const llvm::Function &F) const;
+
+  /// @brief Determine whether the given builtin function returns uniform values
+  /// or not. An optional call instruction can be passed for more accuracy.
+  /// @param[in] B the builtin to analyze uniformity.
+  /// @param[in] CI Optional argument list from a call instruction.
+  /// @param[in] SimdDimIdx Index of current vectorization dimension.
+  /// @return Uniformity value for the builtin.
+  BuiltinUniformity isBuiltinUniform(const Builtin &B, const llvm::CallInst *CI,
+                                     unsigned SimdDimIdx) const;
+
+  std::unique_ptr MuxImpl;
+  std::unique_ptr LangImpl;
+};
+
+/// @brief An interface class that provides mux- and target-specific
+/// information and transformations to an instance of BuiltinInfo. All methods
+/// are to be called through from the equivalent methods in BuiltinInfo.
+class BIMuxInfoConcept {
+public:
+  virtual ~BIMuxInfoConcept() = default;
+
+  /// @brief See BuiltinInfo::defineMuxBuiltin.
+  virtual llvm::Function *
+  defineMuxBuiltin(BuiltinID, llvm::Module &M,
+                   llvm::ArrayRef OverloadInfo = {});
+
+  /// @brief See BuiltinInfo::getOrDeclareMuxBuiltin.
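+  ///
+  /// For illustration only (a sketch; BIMux names an instance of this
+  /// concept), declaring the work-group barrier builtin might look like:
+  /// @code
+  /// llvm::Function *Barrier =
+  ///     BIMux.getOrDeclareMuxBuiltin(eMuxBuiltinWorkGroupBarrier, M);
+  /// @endcode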
+ virtual llvm::Function * + getOrDeclareMuxBuiltin(BuiltinID, llvm::Module &M, + llvm::ArrayRef OverloadInfo = {}); + + /// @brief See BuiltinInfo::getMuxSchedulingParameters + virtual llvm::SmallVector + getMuxSchedulingParameters(llvm::Module &); + + /// @brief See BuiltinInfo::getFunctionSchedulingParameters + virtual llvm::SmallVector + getFunctionSchedulingParameters(llvm::Function &); + + /// @brief See BuiltinInfo::initializeSchedulingParamForWrappedKernel + virtual llvm::Value *initializeSchedulingParamForWrappedKernel( + const BuiltinInfo::SchedParamInfo &Info, llvm::IRBuilder<> &B, + llvm::Function &IntoF, llvm::Function &CalleeF); + + /// @brief Sets default builtin attributes on the given function. + static void setDefaultBuiltinAttributes(llvm::Function &F, + bool AlwaysInline = true); + + /// @brief Returns true if the mux builtin requires scheduling parameters to + /// function. + virtual bool requiresSchedulingParameters(BuiltinID); + + /// @brief See BuiltinInfo::getRemappedTargetExtTy + /// + /// This method is overridable but the default implementation provides the + /// following mappings: + /// * spirv.Event -> i32 + /// * spirv.Sampler -> i32 + /// * spirv.Image -> MuxImage* (regardless of image parameters) + virtual llvm::Type *getRemappedTargetExtTy(llvm::Type *Ty, llvm::Module &M); + + /// @see BuiltinInfo::getBuiltinRange + virtual std::optional + getBuiltinRange(llvm::CallInst &, BuiltinID ID, + std::array, 3>, + std::array, 3>) const; + + enum MemScope : uint32_t { + MemScopeCrossDevice = 0, + MemScopeDevice = 1, + MemScopeWorkGroup = 2, + MemScopeSubGroup = 3, + MemScopeWorkItem = 4, + }; + + enum MemSemantics : uint32_t { + // Only set one of the following bits at a time: + MemSemanticsRelaxed = 0x0, + MemSemanticsAcquire = 0x2, + MemSemanticsRelease = 0x4, + MemSemanticsAcquireRelease = 0x8, + MemSemanticsSequentiallyConsistent = 0x10, + MemSemanticsMask = 0x1F, + // What kind of memory is controlled by a barrier + MemSemanticsSubGroupMemory = 0x80, + MemSemanticsWorkGroupMemory = 0x100, + MemSemanticsCrossWorkGroupMemory = 0x200, + }; + +protected: + llvm::Function *defineGetGlobalId(llvm::Module &M); + llvm::Function *defineGetGlobalSize(llvm::Module &M); + llvm::Function *defineGetLocalLinearId(llvm::Module &M); + llvm::Function *defineGetGlobalLinearId(llvm::Module &M); + llvm::Function *defineGetEnqueuedLocalSize(llvm::Module &M); + llvm::Function *defineMemBarrier(llvm::Function &F, unsigned ScopeIdx, + unsigned SemanticsIdx); + llvm::Function *defineGetSubGroupSize(llvm::Function &F); + llvm::Function *defineGetSubGroupLocalId(llvm::Function &F); + /// @brief Provides a default implementation for `__mux_dma_read_1D` and + /// `__mux_dma_write_1D`. + /// + /// These routines are not intended to be efficient for a + /// particular architecture and are really a placeholder for customers until + /// they are ready to define these functions with DMA calls. They are + /// essentially a memcpy. + llvm::Function *defineDMA1D(llvm::Function &F); + /// @brief Provides a default implementation for `__mux_dma_read_2D` + /// and `__mux_dma_write_2D`. + /// + /// These routines are not intended to be efficient for a + /// particular architecture and are really a placeholder for customers until + /// they are ready to define these functions with DMA calls. They are + /// essentially a memcpy. + llvm::Function *defineDMA2D(llvm::Function &F); + /// @brief Provides a default implementation for `__mux_dma_read_3D` + /// and `__mux_dma_write_3D`. 
+  ///
+  /// These routines are not intended to be efficient for a
+  /// particular architecture and are really a placeholder for customers until
+  /// they are ready to define these functions with DMA calls. They are
+  /// essentially a memcpy.
+  llvm::Function *defineDMA3D(llvm::Function &F);
+  /// @brief Provides a default implementation for `__mux_dma_wait`.
+  ///
+  /// This routine is not intended to be efficient for a
+  /// particular architecture and is really a placeholder for customers until
+  /// they are ready to define this function with DMA calls. This
+  /// implementation does nothing and simply returns.
+  llvm::Function *defineDMAWait(llvm::Function &F);
+};
+
+/// @brief An interface class that provides language-specific information and
+/// transformations to an instance of BuiltinInfo. All methods are to be called
+/// through from the equivalent methods in BuiltinInfo.
+class BILangInfoConcept {
+public:
+  virtual ~BILangInfoConcept() = default;
+
+  /// @see BuiltinInfo::getBuiltinsModule
+  virtual llvm::Module *getBuiltinsModule() { return nullptr; }
+  /// @see BuiltinInfo::analyzeBuiltin
+  virtual std::optional
+  analyzeBuiltin(const llvm::Function &F) const = 0;
+  /// @see BuiltinInfo::isBuiltinUniform
+  virtual BuiltinUniformity isBuiltinUniform(const Builtin &B,
+                                             const llvm::CallInst *,
+                                             unsigned) const = 0;
+  /// @see BuiltinInfo::getVectorEquivalent
+  virtual llvm::Function *getVectorEquivalent(const Builtin &B, unsigned Width,
+                                              llvm::Module *M = nullptr) = 0;
+  /// @see BuiltinInfo::getScalarEquivalent
+  virtual llvm::Function *getScalarEquivalent(const Builtin &B,
+                                              llvm::Module *M) = 0;
+  /// @see BuiltinInfo::emitBuiltinInline
+  virtual llvm::Value *
+  emitBuiltinInline(llvm::Function *Builtin, llvm::IRBuilder<> &B,
+                    llvm::ArrayRef Args) = 0;
+  /// @see BuiltinInfo::getBuiltinRange
+  virtual std::optional
+  getBuiltinRange(llvm::CallInst &, std::array, 3>,
+                  std::array, 3>) const {
+    return std::nullopt;
+  }
+
+  /// @see BuiltinInfo::lowerBuiltinToMuxBuiltin
+  virtual llvm::Instruction *lowerBuiltinToMuxBuiltin(llvm::CallInst &,
+                                                      BIMuxInfoConcept &) {
+    return nullptr;
+  }
+  /// @see BuiltinInfo::getPrintfBuiltin
+  virtual std::optional getPrintfBuiltin() const = 0;
+};
+
+/// @brief Caches and returns the BuiltinInfo for a Module.
+class BuiltinInfoAnalysis
+    : public llvm::AnalysisInfoMixin {
+  friend AnalysisInfoMixin;
+
+public:
+  using Result = BuiltinInfo;
+  using CallbackFn = std::function;
+
+  BuiltinInfoAnalysis();
+
+  BuiltinInfoAnalysis(CallbackFn BICallback) : BICallback(BICallback) {}
+
+  /// @brief Retrieve the BuiltinInfo for the requested module.
+  Result run(llvm::Module &M, llvm::ModuleAnalysisManager &) {
+    return BICallback(M);
+  }
+
+  /// @brief Return the name of the pass.
+  static llvm::StringRef name() { return "BuiltinInfo analysis"; }
+
+private:
+  /// @brief Unique pass identifier.
+  static llvm::AnalysisKey Key;
+
+  /// @brief Callback function producing a BuiltinInfo on demand.
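+  ///
+  /// For example (a sketch only; the BuiltinInfo constructor arguments are
+  /// target-specific and assumed here):
+  /// @code
+  /// BuiltinInfoAnalysis BIA([](llvm::Module &M) {
+  ///   return BuiltinInfo(/* target-specific mux and language impls */);
+  /// });
+  /// @endcode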
+ CallbackFn BICallback; +}; + +/// @} +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_BUILTIN_INFO_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h new file mode 100644 index 0000000000000..16be8450d5124 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/cl_builtin_info.h @@ -0,0 +1,216 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief OpenCL's BuiltinInfo implementation. + +#ifndef COMPILER_UTILS_CL_BUILTIN_INFO_H_INCLUDED +#define COMPILER_UTILS_CL_BUILTIN_INFO_H_INCLUDED + +#include +#include + +namespace compiler { +namespace utils { +/// @addtogroup utils +/// @{ + +/// @brief Convenience function for constructing a CLBuiltinInfo as a unique_ptr +/// @param[in] builtins the Builtin module +/// @return a std::unique_ptr to a new CLBuiltinInfo +std::unique_ptr createCLBuiltinInfo(llvm::Module *builtins); + +/// @brief Builtin loader base class. +class CLBuiltinLoader { +protected: + CLBuiltinLoader() = default; + +public: + virtual ~CLBuiltinLoader() = default; + + /// @brief Load a builtin function. + /// @param[in] BuiltinName Name of the builtin function to materialize. + /// @param[in] DestM Optional module in which to load the builtin function. + /// @param[in] Flags Materialization flags to use. + /// @return Pointer to the materialized builtin function on success. + /// If a module is passed, the returned builtin function must live in + /// that module. + virtual llvm::Function *materializeBuiltin(llvm::StringRef BuiltinName, + llvm::Module *DestM, + BuiltinMatFlags Flags); + + /// @brief Expose any builtins Module + virtual llvm::Module *getBuiltinsModule() { return nullptr; } +}; + +/// @brief Simple Builtin loader wrapping a given builtins module. +class SimpleCLBuiltinLoader final : public CLBuiltinLoader { +public: + SimpleCLBuiltinLoader(llvm::Module *builtins) : BuiltinModule(builtins) {} + + ~SimpleCLBuiltinLoader() = default; + + /// @brief Expose any builtins Module + virtual llvm::Module *getBuiltinsModule() override { return BuiltinModule; } + +private: + /// @brief Loaded builtins module. + llvm::Module *BuiltinModule; +}; + +/// @brief A class that encapsulates information and transformations concerning +/// compiler OpenCL builtin functions. 
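+///
+/// A typical way to obtain one is via createCLBuiltinInfo above (a sketch; a
+/// null builtins module is assumed acceptable when nothing needs
+/// materializing):
+/// @code
+/// auto CLInfo = createCLBuiltinInfo(/*builtins*/ nullptr);
+/// @endcode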
+class CLBuiltinInfo : public BILangInfoConcept { +public: + /// @brief Constructs a CLBuiltinInfo from a given Builtins module + CLBuiltinInfo(llvm::Module *Builtins); + + /// @brief Constructs a CLBuiltinInfo with a user-provided loader + CLBuiltinInfo(std::unique_ptr L) : Loader(std::move(L)) {} + + ~CLBuiltinInfo(); + + llvm::Module *getBuiltinsModule() override; + + /// @see BuiltinInfo::isBuiltinUniform + BuiltinUniformity isBuiltinUniform(const Builtin &B, const llvm::CallInst *CI, + unsigned SimdDimIdx) const override; + + /// @see BuiltinInfo::analyzeBuiltin + std::optional analyzeBuiltin(const llvm::Function &F) const override; + /// @see BuiltinInfo::getVectorEquivalent + llvm::Function *getVectorEquivalent(const Builtin &B, unsigned Width, + llvm::Module *M = nullptr) override; + /// @see BuiltinInfo::getScalarEquivalent + llvm::Function *getScalarEquivalent(const Builtin &B, + llvm::Module *M) override; + /// @see BuiltinInfo::emitBuiltinInline + llvm::Value *emitBuiltinInline(llvm::Function *Builtin, llvm::IRBuilder<> &B, + llvm::ArrayRef Args) override; + + /// @see BuiltinInfo::lowerBuiltinToMuxBuiltin + llvm::Instruction *lowerBuiltinToMuxBuiltin(llvm::CallInst &, + BIMuxInfoConcept &) override; + /// @see BuiltinInfo::getPrintfBuiltin + std::optional getPrintfBuiltin() const override; + +private: + std::optional identifyBuiltin(const llvm::Function &) const; + + llvm::Function * + materializeBuiltin(llvm::StringRef BuiltinName, llvm::Module *DestM = nullptr, + BuiltinMatFlags Flags = eBuiltinMatDefault); + + llvm::Instruction *lowerGroupBuiltinToMuxBuiltin(llvm::CallInst &CI, + BuiltinID ID, + BIMuxInfoConcept &BIMuxImpl); + llvm::Instruction *lowerAsyncBuiltinToMuxBuiltin(llvm::CallInst &CI, + BuiltinID ID, + BIMuxInfoConcept &BIMuxImpl); + + llvm::Value *emitBuiltinInline(BuiltinID ID, llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineAsLLVMBinaryIntrinsic(llvm::IRBuilder<> &B, + llvm::Value *LHS, + llvm::Value *RHS, + llvm::Intrinsic::ID ID); + // 6.2 Conversions & Type Casting + llvm::Value *emitBuiltinInlineAs(llvm::Function *F, llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineConvert(llvm::Function *F, BuiltinID ID, + llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + + // 6.11.5 Geometric Built-in Functions + llvm::Value *emitBuiltinInlineGeometrics(BuiltinID builtinID, + llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineDot(llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineCross(llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineLength(llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineNormalize(llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + + // 6.11.6 Relational Built-in Functions + llvm::Value *emitBuiltinInlineRelationalsWithTwoArguments( + BuiltinID BuiltinID, llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineRelationalsWithOneArgument(BuiltinID BuiltinID, + llvm::IRBuilder<> &B, + llvm::Value *Arg); + llvm::Value *emitBuiltinInlineAll(llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineAny(llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value *emitBuiltinInlineSelect(llvm::Function *F, llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + + // 6.11.7 Vector Data Load/Store Functions + llvm::Value *emitBuiltinInlineVLoad(llvm::Function *F, unsigned Width, + llvm::IRBuilder<> &B, + llvm::ArrayRef Args); + llvm::Value 
*emitBuiltinInlineVStore(llvm::Function *F, unsigned Width,
+                                       llvm::IRBuilder<> &B,
+                                       llvm::ArrayRef Args);
+  llvm::Value *emitBuiltinInlineVLoadHalf(llvm::Function *F,
+                                          llvm::IRBuilder<> &B,
+                                          llvm::ArrayRef Args);
+  llvm::Value *emitBuiltinInlineVStoreHalf(llvm::Function *F,
+                                           llvm::StringRef Mode,
+                                           llvm::IRBuilder<> &B,
+                                           llvm::ArrayRef Args);
+
+  // 6.11.12 Miscellaneous Vector Functions
+  llvm::Value *emitBuiltinInlineShuffle(BuiltinID BuiltinID,
+                                        llvm::IRBuilder<> &B,
+                                        llvm::ArrayRef Args);
+
+  llvm::Value *emitBuiltinInlinePrintf(BuiltinID BuiltinID,
+                                       llvm::IRBuilder<> &B,
+                                       llvm::ArrayRef Args);
+
+  /// @brief Return the name of the builtin with the given identifier.
+  /// @param[in] ID Identifier of the builtin whose name to return.
+  /// @return Name of the builtin.
+  llvm::StringRef getBuiltinName(BuiltinID ID) const;
+
+  /// @brief Declare the specified OpenCL builtin in the given module.
+  /// @param[in] M Module in which to declare the builtin.
+  /// @param[in] ID Builtin identifier.
+  /// @param[in] RetTy Return type for the builtin.
+  /// @param[in] ArgTys List of argument types.
+  /// @param[in] ArgQuals List of argument qualifiers.
+  /// @param[in] Suffix Optional builtin name suffix.
+  /// @return Builtin function declaration.
+  llvm::Function *declareBuiltin(llvm::Module *M, BuiltinID ID,
+                                 llvm::Type *RetTy,
+                                 llvm::ArrayRef ArgTys,
+                                 llvm::ArrayRef ArgQuals,
+                                 llvm::Twine Suffix = "");
+
+  /// @brief BuiltinLoader used to load builtins.
+  std::unique_ptr Loader;
+};
+
+/// @}
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_CL_BUILTIN_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h
new file mode 100644
index 0000000000000..af33fbce17788
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/define_mux_builtins_pass.h
@@ -0,0 +1,36 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+
+#ifndef COMPILER_UTILS_DEFINE_MUX_BUILTINS_PASS_H_INCLUDED
+#define COMPILER_UTILS_DEFINE_MUX_BUILTINS_PASS_H_INCLUDED
+
+#include 
+
+namespace compiler {
+namespace utils {
+
+class DefineMuxBuiltinsPass final
+    : public llvm::PassInfoMixin {
+public:
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+};
+
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_DEFINE_MUX_BUILTINS_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
new file mode 100644
index 0000000000000..c1002430aadc1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/device_info.h
@@ -0,0 +1,125 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Device information used by the compiler.
+
+#ifndef COMPILER_UTILS_DEVICE_INFO_H_INCLUDED
+#define COMPILER_UTILS_DEVICE_INFO_H_INCLUDED
+
+#include 
+
+#include 
+
+namespace compiler {
+namespace utils {
+
+/// @brief Bitfield of all possible floating point capabilities.
+///
+/// Each Mux device struct has a member which denotes the floating point
+/// capabilities of that device, as a bitfield of the following enum.
+///
+/// NOTE: Must be kept in sync with mux_floating_point_capabilities_e in
+/// mux/include/mux/mux.h! This should probably be placed in an intermediary
+/// mux/compiler library and shared.
+enum device_floating_point_capabilities_e {
+  /// @brief Denormals supported.
+  device_floating_point_capabilities_denorm = 0x1,
+  /// @brief INF and NaN are supported.
+  device_floating_point_capabilities_inf_nan = 0x2,
+  /// @brief Round to nearest even supported.
+  device_floating_point_capabilities_rte = 0x4,
+  /// @brief Round to zero supported.
+  device_floating_point_capabilities_rtz = 0x8,
+  /// @brief Round to positive infinity supported.
+  device_floating_point_capabilities_rtp = 0x10,
+  /// @brief Round to negative infinity supported.
+  device_floating_point_capabilities_rtn = 0x20,
+  /// @brief Fused multiply add supported.
+  device_floating_point_capabilities_fma = 0x40,
+  /// @brief Floating point operations are written in software.
+  device_floating_point_capabilities_soft = 0x80,
+  /// @brief Binary format conforms to the IEEE-754 specification.
+ device_floating_point_capabilities_full = 0x100 +}; + +struct DeviceInfo { + DeviceInfo() = default; + + /// @brief Construct a DeviceInfo from individual properties + /// + /// @param h Enumeration of half-precision floating-point capabilities + /// @param f Enumeration of single-precision floating-point capabilities + /// @param d Enumeration of double-precision floating-point capabilities + /// @param max_work_width The maximum number of work-items of a work-group + /// allowed to execute in one invocation of a kernel. + DeviceInfo(uint32_t h, uint32_t f, uint32_t d, uint32_t max_work_width) + : half_capabilities(h), float_capabilities(f), double_capabilities(d), + max_work_width(max_work_width) {} + + uint32_t half_capabilities = 0; + uint32_t float_capabilities = 0; + uint32_t double_capabilities = 0; + uint32_t max_work_width = 0; + + /// @brief List of supported 'required' sub-group sizes reported by this + /// device. + /// + /// These are only the sub-group sizes that can be requested as 'required' for + /// a kernel; the compiler may produce a wide range of other sub-group sizes + /// on undecorated kernels, assuming sub-groups are supported by the device. + std::vector reqd_sub_group_sizes; + + /// @brief Handle invalidation events from the new pass manager. + /// + /// @return false, as this analysis can never be invalidated. + bool invalidate(llvm::Module &, const llvm::PreservedAnalyses &, + llvm::ModuleAnalysisManager::Invalidator &) { + return false; + } +}; + +/// @brief Caches and returns the device information for a Module. +class DeviceInfoAnalysis : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + using Result = DeviceInfo; + + DeviceInfoAnalysis() = default; + DeviceInfoAnalysis(Result res) : Info(res) {} + + /// @brief Retrieve the DeviceInfo for the requested module. + Result run(llvm::Module &, llvm::ModuleAnalysisManager &) { + return Info ? *Info : Result(); + } + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Device info analysis"; } + +private: + /// @brief Optional device information + std::optional Info; + + /// @brief Unique pass identifier. + static llvm::AnalysisKey Key; +}; + +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_DEVICE_INFO_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h new file mode 100644 index 0000000000000..815188761f272 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/dma.h @@ -0,0 +1,91 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// LLVM DMA pass utility functions. 
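+///
+/// As a usage sketch (block and function names here are illustrative, not
+/// part of this header), a default DMA implementation might guard the actual
+/// transfer so that it runs once per work-group:
+/// @code
+/// // Branch to copyBlock on work-item (0, 0, 0), otherwise to exitBlock.
+/// buildThreadCheck(entryBlock, copyBlock, exitBlock, *getLocalIDFn);
+/// @endcode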
+
+#ifndef COMPILER_UTILS_DMA_H_INCLUDED
+#define COMPILER_UTILS_DMA_H_INCLUDED
+
+#include 
+#include 
+
+#include 
+
+namespace llvm {
+class BasicBlock;
+class Module;
+class Value;
+} // namespace llvm
+
+namespace compiler {
+namespace utils {
+
+class BIMuxInfoConcept;
+
+/// @addtogroup utils
+/// @{
+
+/// @brief Helper function to check the local ID of the current thread.
+///
+/// @param[in] bb Basic block to generate the check in.
+/// @param[in] x The local id in the x dimension to compare against.
+/// @param[in] y The local id in the y dimension to compare against.
+/// @param[in] z The local id in the z dimension to compare against.
+/// @param[in] GetLocalIDFn Function used to get the local work-item ID
+///
+/// @return A true Value if the local ID equals that passed via the index
+/// arguments, false otherwise.
+llvm::Value *isThreadEQ(llvm::BasicBlock *bb, unsigned x, unsigned y,
+                        unsigned z, llvm::Function &GetLocalIDFn);
+
+/// @brief Helper function to check if the local ID of the current thread is
+/// {0, 0, 0}.
+///
+/// @param[in] bb Basic block to generate the check in.
+/// @param[in] GetLocalIDFn Function used to get the local work-item ID
+///
+/// @return A true Value if the local ID is {0, 0, 0}, false otherwise.
+llvm::Value *isThreadZero(llvm::BasicBlock *bb, llvm::Function &GetLocalIDFn);
+
+/// @brief Insert 'thread-checking' logic in the entry block, so that control
+/// branches to the 'true' block when the current work-item is the first in the
+/// work-group (i.e. ID zero in all dimensions), or to the 'false' block for
+/// all other work-items.
+///
+/// @param[in] entryBlock Block to insert the 'thread-checking' logic
+/// @param[in] trueBlock Block to execute only on the first work-item
+/// @param[in] falseBlock Block to execute on all other work-items
+/// @param[in] GetLocalIDFn Function used to get the local work-item ID
+void buildThreadCheck(llvm::BasicBlock *entryBlock, llvm::BasicBlock *trueBlock,
+                      llvm::BasicBlock *falseBlock,
+                      llvm::Function &GetLocalIDFn);
+
+/// @brief Gets or creates the __mux_dma_event_t type.
+///
+/// This type may be declared by other passes hence we "get or create it".
+///
+/// @param[in] m LLVM Module to get or create the type in.
+///
+/// @return The opaque struct declaration of the __mux_dma_event_t type.
+llvm::StructType *getOrCreateMuxDMAEventType(llvm::Module &m);
+
+/// @}
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_DMA_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h
new file mode 100644
index 0000000000000..261a5bbc7d4f8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/encode_kernel_metadata_pass.h
@@ -0,0 +1,60 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// EncodeKernelMetadataPass pass. + +#ifndef COMPILER_UTILS_ENCODE_KERNEL_METADATA_PASS_H_INCLUDED +#define COMPILER_UTILS_ENCODE_KERNEL_METADATA_PASS_H_INCLUDED + +#include +#include + +#include + +namespace compiler { +namespace utils { + +/// @brief Sets up the per-function mux metadata used by later passes. +/// Transfers per-module !opencl.kernel metadata to mux kernel metadata. +struct TransferKernelMetadataPass + : public llvm::PassInfoMixin { + explicit TransferKernelMetadataPass() {} + + llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM); +}; + +struct EncodeKernelMetadataPassOptions { + std::string KernelName; + std::optional> LocalSizes = std::nullopt; +}; + +struct EncodeKernelMetadataPass + : public llvm::PassInfoMixin { + EncodeKernelMetadataPass(EncodeKernelMetadataPassOptions Options) + : KernelName(Options.KernelName), LocalSizes(Options.LocalSizes) {} + + llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM); + +private: + std::string KernelName; + std::optional> LocalSizes; +}; +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_ENCODE_KERNEL_METADATA_PASS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h new file mode 100644 index 0000000000000..fcbd07825fb22 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/group_collective_helpers.h @@ -0,0 +1,112 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// Helper functions for working with sub_group and work_group functions. + +#ifndef COMPILER_UTILS_GROUP_COLLECTIVE_HELPERS_H_INCLUDED +#define COMPILER_UTILS_GROUP_COLLECTIVE_HELPERS_H_INCLUDED + +#include + +namespace llvm { +class Constant; +class Function; +class Type; +} // namespace llvm + +namespace compiler { +namespace utils { +/// @brief Utility function for retrieving the neutral value of a +/// reduction/scan operation. A neutral value is one that does not affect the +/// result of a given operation, e.g., adding 0 or multiplying by 1. +/// +/// @param[in] Kind The kind of scan/reduction operation +/// @param[in] Ty The type of the returned neutral value. Must match the type +/// assumed by @a Kind, e.g., a floating-point type for floating-point +/// operations. +/// +/// @return The neutral value, or nullptr if unhandled. 
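+///
+/// For example, the neutral value of an integer add reduction is 0, of a
+/// multiply is 1, and of a floating-point add is -0.0 (contrast with
+/// getIdentityVal below). A sketch, where Int32Ty is an assumed i32 type:
+/// @code
+/// llvm::Constant *Zero = getNeutralVal(llvm::RecurKind::Add, Int32Ty);
+/// @endcode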
+llvm::Constant *getNeutralVal(llvm::RecurKind Kind, llvm::Type *Ty);
+
+/// @brief Utility function for retrieving the identity value of a
+/// reduction/scan operation. The identity value is one that is expected to be
+/// found in the first element of an exclusive scan. It is equal to the neutral
+/// value (see @ref getNeutralVal) in all cases except in floating-point
+/// min/max, where -INF/+INF is the expected identity, and in floating-point
+/// addition, where 0.0 (not -0.0, which is the neutral value) is the expected
+/// identity.
+///
+/// @param[in] Kind The kind of scan/reduction operation
+/// @param[in] Ty The type of the returned neutral value. Must match the type
+/// assumed by @a Kind, e.g., a floating-point type for floating-point
+/// operations.
+///
+/// @return The identity value, or nullptr if unhandled.
+llvm::Constant *getIdentityVal(llvm::RecurKind Kind, llvm::Type *Ty);
+
+/// @brief Represents a work-group or sub-group collective operation.
+struct GroupCollective {
+  /// @brief The different operation types a group collective can represent.
+  enum class OpKind {
+    All,
+    Any,
+    Reduction,
+    ScanInclusive,
+    ScanExclusive,
+    Broadcast,
+    Shuffle,
+    ShuffleUp,
+    ShuffleDown,
+    ShuffleXor,
+  };
+
+  /// @brief The possible scopes of a group collective.
+  enum class ScopeKind { WorkGroup, SubGroup, VectorGroup };
+
+  /// @brief The operation type of the group collective.
+  OpKind Op = OpKind::All;
+  /// @brief The scope of the group collective operation.
+  ScopeKind Scope = ScopeKind::WorkGroup;
+  /// @brief The llvm recurrence operation this can be mapped to. For broadcasts
+  /// this will be None.
+  llvm::RecurKind Recurrence = llvm::RecurKind::None;
+  /// @brief True if the operation is logical, rather than bitwise.
+  bool IsLogical = false;
+  /// @brief Returns true for Any/All type collective operations.
+  bool isAnyAll() const { return Op == OpKind::Any || Op == OpKind::All; }
+  /// @brief Returns true for inclusive/exclusive scan collective operations.
+  bool isScan() const {
+    return Op == OpKind::ScanExclusive || Op == OpKind::ScanInclusive;
+  }
+  /// @brief Returns true for reduction collective operations.
+  bool isReduction() const { return Op == OpKind::Reduction; }
+  /// @brief Returns true for broadcast collective operations.
+  bool isBroadcast() const { return Op == OpKind::Broadcast; }
+  /// @brief Returns true for shuffle-like collective operations.
+  bool isShuffleLike() const {
+    return Op == OpKind::Shuffle || Op == OpKind::ShuffleUp ||
+           Op == OpKind::ShuffleDown || Op == OpKind::ShuffleXor;
+  }
+  /// @brief Returns true for sub-group collective operations.
+  bool isSubGroupScope() const { return Scope == ScopeKind::SubGroup; }
+  /// @brief Returns true for work-group collective operations.
+  bool isWorkGroupScope() const { return Scope == ScopeKind::WorkGroup; }
+};
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_GROUP_COLLECTIVE_HELPERS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
new file mode 100644
index 0000000000000..66e6a89bd5d43
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/mangling.h
@@ -0,0 +1,408 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Simple function mangling framework.
+
+#ifndef COMPILER_UTILS_MANGLING_H_INCLUDED
+#define COMPILER_UTILS_MANGLING_H_INCLUDED
+
+#include 
+#include 
+#include 
+
+#include 
+
+namespace llvm {
+class LLVMContext;
+class Type;
+class raw_ostream;
+} // namespace llvm
+
+namespace compiler {
+namespace utils {
+/// @brief Describes type qualifiers, which are aspects that need to be taken
+/// into account when mangling function names. Such aspects are not represented
+/// in the LLVM type. This is why such qualifiers need to be used alongside
+/// types.
enum TypeQualifier : int32_t {
+  /// @brief The type has no special qualifier.
+  eTypeQualNone = 0,
+  /// @brief The type is a signed integer.
+  eTypeQualSignedInt = 1,
+  /// @brief The type is a constant pointer.
+  eTypeQualPointerConst = 2,
+  /// @brief The type is a volatile pointer.
+  eTypeQualPointerVolatile = 4,
+  /// @brief The type is a restrict pointer.
+  eTypeQualPointerRestrict = 8
+};
+
+/// @brief Contains a small hierarchical list of TypeQualifier.
+///
+/// This hierarchy maps to derived types such as pointers or vectors:
+/// * First qualifier for the pointer type.
+/// * Second qualifier for the pointed-to type.
+class TypeQualifiers final {
+  using StorageT = uint64_t;
+
+public:
+  /// @brief Create a type qualifier list with no qualifiers.
+  TypeQualifiers();
+  /// @brief Create a type qualifier list with one qualifier.
+  ///
+  /// @param[in] Qual First qualifier.
+  TypeQualifiers(TypeQualifier Qual);
+  /// @brief Create a type qualifier list with two qualifiers.
+  ///
+  /// @param[in] Qual1 First qualifier.
+  /// @param[in] Qual2 Second qualifier.
+  TypeQualifiers(TypeQualifier Qual1, TypeQualifier Qual2);
+
+  /// @brief Create a type qualifier list with one qualifier.
+  /// @note Convenience function that allows bit manipulation of qualifiers.
+  ///
+  /// @param[in] Qual First qualifier.
+  TypeQualifiers(unsigned Qual);
+  /// @brief Create a type qualifier list with two qualifiers.
+  /// @note Convenience function that allows bit manipulation of qualifiers.
+  ///
+  /// @param[in] Qual1 First qualifier.
+  /// @param[in] Qual2 Second qualifier.
+  TypeQualifiers(unsigned Qual1, unsigned Qual2);
+
+  /// @brief Number of type qualifiers contained in the list.
+  StorageT getCount() const;
+
+  /// @brief Top-most qualifier from the list.
+  TypeQualifier front() const;
+
+  /// @brief Remove the top-most qualifier from the list and return it.
+  TypeQualifier pop_front();
+
+  /// @brief Return the qualifier at the given index.
+  TypeQualifier at(unsigned Idx) const;
+
+  /// @brief Add a qualifier to the list, making it bottom-most.
+  ///
+  /// @param[in] Qual Qualifier to add to the list.
+  ///
+  /// @return true if there was enough space to add the qualifier, or false.
+  bool push_back(TypeQualifier Qual);
+  /// @brief Add a qualifier to the list, making it bottom-most.
+  /// @note Convenience function that allows bit manipulation of qualifiers.
+ /// + /// @param[in] Qual Qualifier to add to the list. + /// + /// @return true if there was enough space to add the qualifier, or false. + bool push_back(unsigned Qual); + /// @brief Add qualifiers to the end of the list. + /// + /// @param[in] Quals Qualifiers to add to the list. + /// + /// @return true if there was enough space to add the qualifiers, or false. + bool push_back(TypeQualifiers Quals); + + /// @brief Determine whether two qualifier lists are equal. + bool operator==(const TypeQualifiers &other) { + return storage_ == other.storage_; + } + + /// @brief Determine whether two qualifier lists are different. + bool operator!=(const TypeQualifiers &other) { return !(*this == other); } + +private: + /// @brief Set the number of type qualifiers contained in the list. + void setCount(StorageT newCount); + + /// @brief Bits that make up the list. Deliberately small to pass by value. + StorageT storage_; + + /// @brief Number of bits used to encode the size of the list. + static const unsigned NumCountBits = 4; + + /// @brief Number of bits used to encode one qualifier in the list. + static const unsigned NumQualBits = 10; + + /// @brief Number of bits that can be used to store the list. + static const unsigned NumStorageBits = sizeof(StorageT) * 8; + + /// @brief Maximum size of the list. + static const unsigned MaxSize = (NumStorageBits - NumCountBits) / NumQualBits; + + static_assert(MaxSize < (1 << NumCountBits) - 1, "MaxSize cannot be encoded"); +}; + +/// @brief Helps with light parsing such as demangling function names. +class Lexer final { +public: + /// @brief Create a new lexer with the given text. + /// + /// @param[in] text Text to lex. + Lexer(llvm::StringRef text); + + /// @brief Number of characters left to lex. + unsigned Left() const; + /// @brief Current lexing position in the text. + unsigned CurrentPos() const; + /// @brief String containing the text remaining to be lexed. + llvm::StringRef TextLeft() const; + /// @brief Current character. + /// @return Character or negative value if no text is left. + int Current() const; + + /// @brief Consume one character, advancing to the next character in the + /// string. + /// @return true if a character was consumed, false if no text left. + bool Consume(); + /// @brief Consume several characters, advancing through the string. + /// + /// @param[in] Size Number of characters to consume. + /// + /// @return true if Size characters were consumed, false otherwise. + bool Consume(unsigned Size); + /// @brief Consume a string, and skip past it. + /// + /// @param[in] Pattern String to consume. + /// + /// @return true if Pattern was found and consumed, false otherwise. + bool Consume(llvm::StringRef Pattern); + /// @brief Consume an unsigned integer, and skip past it. + /// + /// @param[out] Result Consumed unsigned integer. + /// + /// @return true if an unsigned integer was consumed, false otherwise. + bool ConsumeInteger(unsigned &Result); + /// @brief Consume a signed integer, and skip past it. + /// + /// @param[out] Result Consumed signed integer. + /// + /// @return true if a signed integer was consumed, false otherwise. + bool ConsumeSignedInteger(int &Result); + /// @brief Consume consecutive alphabetic characters and skip past them. + /// + /// @param[out] Result Consumed string. + /// + /// @return true if an alphabetic string was consumed, false otherwise. + bool ConsumeAlpha(llvm::StringRef &Result); + /// @brief Consume consecutive alphanumeric characters and skip past them. 
+  ///
+  /// @param[out] Result Consumed string.
+  ///
+  /// @return true if an alphanumeric string was consumed, false otherwise.
+  bool ConsumeAlphanumeric(llvm::StringRef &Result);
+  /// @brief Consume all characters until C is found. C is not consumed.
+  ///
+  /// @param[in] C Delimiter character.
+  /// @param[out] Result Consumed string.
+  ///
+  /// @return true if C was found, false otherwise.
+  bool ConsumeUntil(char C, llvm::StringRef &Result);
+  /// @brief Consume all whitespace characters.
+  ///
+  /// @return true if any whitespace was consumed, false otherwise.
+  bool ConsumeWhitespace();
+
+private:
+  /// @brief Text to lex.
+  llvm::StringRef Text;
+  /// @brief Current lexing position into the text.
+  unsigned Pos;
+};
+
+/// @brief Converts between mangled and non-mangled function names.
+class NameMangler final {
+public:
+  /// @brief Create a new name mangler.
+  ///
+  /// @param[in] context LLVM context to use.
+  NameMangler(llvm::LLVMContext *context);
+
+  /// @brief Determine the mangled name of a function.
+  ///
+  /// @param[in] Name Non-mangled name of the function.
+  /// @param[in] Tys List of types, one for each function argument.
+  /// @param[in] Quals Qualifiers, one for each type in Tys.
+  ///
+  /// @return The mangled name of the function.
+  std::string mangleName(llvm::StringRef Name, llvm::ArrayRef Tys,
+                         llvm::ArrayRef Quals);
+
+  /// @brief Try to mangle the given qualified type.
+  ///
+  /// @param[in] O Output stream to write the mangled name to.
+  /// @param[in] Type Type to mangle.
+  /// @param[in] Quals Type qualifiers.
+  ///
+  /// @return true if the type name could be mangled.
+  bool mangleType(llvm::raw_ostream &O, llvm::Type *Type, TypeQualifiers Quals);
+
+  /// @brief Try to mangle the given qualified type, taking substitutions into
+  /// account.
+  ///
+  /// @param[in] O Output stream to write the mangled name to.
+  /// @param[in] Type Type to mangle.
+  /// @param[in] Quals Type qualifiers.
+  /// @param[in] PrevTys Previously mangled types.
+  /// @param[in] PrevQuals Qualifiers for previously mangled types.
+  ///
+  /// @return true if the type name could be mangled.
+  bool mangleType(llvm::raw_ostream &O, llvm::Type *Type, TypeQualifiers Quals,
+                  llvm::ArrayRef PrevTys,
+                  llvm::ArrayRef PrevQuals);
+
+  /// @brief Remove the mangling of a function name, retrieving argument types
+  /// and qualifiers in the process.
+  ///
+  /// @param[in] Name Mangled function name to demangle.
+  /// @param[out] Types Vector that will receive LLVM types for the arguments.
+  /// @param[out] Quals Vector that will receive type qualifiers for the
+  /// arguments.
+  ///
+  /// @return Demangled name or an empty string on failure
+  llvm::StringRef demangleName(llvm::StringRef Name,
+                               llvm::SmallVectorImpl &Types,
+                               llvm::SmallVectorImpl &Quals);
+
+  /// @brief Remove the mangling of a function name, retrieving argument types
+  /// and qualifiers in the process.
+  ///
+  /// @param[in] Name Mangled function name to demangle.
+  /// @param[out] Types Vector that will receive LLVM types for the arguments.
+  /// @param[out] PointerElementTypes Vector that will receive LLVM types for
+  /// the *first level* of pointer element types.
+  /// @param[out] Quals Vector that will receive type qualifiers for the
+  /// arguments.
+  ///
+  /// For example:
+  /// _Z3fooPii
+  /// Types[0] = PointerType
+  /// PointerElementTypes[0] = i32
+  /// Quals[0] = (PointerQual, SignedIntQual)
+  ///
+  /// Types[1] = i32
+  /// PointerElementTypes[1] = nullptr
+  /// Quals[1] = (SignedIntQual)
+  ///
+  /// @return Demangled name or an empty string on failure
+  llvm::StringRef
+  demangleName(llvm::StringRef Name, llvm::SmallVectorImpl &Types,
+               llvm::SmallVectorImpl &PointerElementTypes,
+               llvm::SmallVectorImpl &Quals);
+
+  /// @brief Remove the mangling of a function name.
+  ///
+  /// @param[in] Name Mangled function name to demangle.
+  ///
+  /// @return Demangled name or original name if not mangled.
+  llvm::StringRef demangleName(llvm::StringRef Name);
+
+private:
+  /// @brief Try to mangle the given qualified type. This only works for simple
+  /// types that do not require string manipulation.
+  ///
+  /// @param[in] Ty Type to mangle.
+  /// @param[in] Qual Type qualifier.
+  ///
+  /// @return Mangled name of the type or nullptr.
+  const char *mangleSimpleType(llvm::Type *Ty, TypeQualifier Qual);
+  /// @brief Try to mangle the given builtin type name. This only works for
+  /// 'spirv' target extension types (LLVM 17+).
+  ///
+  /// @param[in] Ty type to mangle.
+  ///
+  /// @return string if builtin type could be mangled otherwise empty string.
+  std::optional mangleBuiltinType(llvm::Type *Ty);
+  /// @brief Try to demangle the given type name. This only works for simple
+  /// types that do not require string manipulation.
+  ///
+  /// @param[in,out] L Lexer for the mangled type name.
+  /// @param[out] Ty Demangled type.
+  /// @param[out] Qual Demangled type qualifier.
+  ///
+  /// @return true if the type name could be demangled.
+  bool demangleSimpleType(Lexer &L, llvm::Type *&Ty, TypeQualifier &Qual);
+  /// @brief Try to demangle the given type name. This only works for OpenCL
+  /// builtin types.
+  ///
+  /// @param[in,out] L Lexer for the mangled type name.
+  /// @param[out] Ty Demangled type.
+  ///
+  /// @return true if the type name could be demangled.
+  bool demangleOpenCLBuiltinType(Lexer &L, llvm::Type *&Ty);
+  /// @brief Try to demangle the given type.
+  ///
+  /// @param[in] L Lexer currently pointing at a type.
+  /// @param[out] Ty Demangled type.
+  /// @param[out] PointerEltTy If null, unchanged. Else, set to the demangled
+  /// pointer element type, if Ty is a non-opaque pointer type. Else set to
+  /// nullptr.
+  /// @param[out] Quals Demangled type qualifiers.
+  /// @param[in] CtxTypes Previously demangled types, used for substitutions.
+  /// @param[in] CtxQuals Previously demangled qualifiers.
+  ///
+  /// @return true if the type could be demangled, false otherwise.
+  bool demangleType(Lexer &L, llvm::Type *&Ty, llvm::Type **PointerEltTy,
+                    TypeQualifiers &Quals,
+                    llvm::SmallVectorImpl &CtxTypes,
+                    llvm::SmallVectorImpl &CtxQuals);
+
+  /// @brief Demangle a name.
+  ///
+  /// @param[in] L Lexer currently pointing at a mangled name.
+  ///
+  /// @return Demangled name or an empty string.
+  llvm::StringRef demangleName(Lexer &L);
+  /// @brief Determine the type 'index' the substitution refers to.
+  ///
+  /// @param[in] SubID Substitution ID.
+  /// @param[in] Tys List of types.
+  /// @param[in] Quals Qualifiers for the types.
+  ///
+  /// @return Resolved type index or negative value.
+  int resolveSubstitution(unsigned SubID,
+                          llvm::SmallVectorImpl &Tys,
+                          llvm::SmallVectorImpl &Quals);
+  /// @brief Try to emit a substitution for the given type instead of mangling
+  /// it.
+  ///
+  /// @param[in,out] O Stream to write the substitution to.
+  /// @param[in] Ty Type to mangle
+  /// @param[in] Quals Qualifiers for the type.
+  /// @param[in] PrevTys Types that have previously been mangled.
+  /// @param[in] PrevQuals Qualifiers for the previously mangled types.
+  ///
+  /// @return true if a substitution was emitted, false otherwise.
+  bool emitSubstitution(llvm::raw_ostream &O, llvm::Type *Ty,
+                        TypeQualifiers Quals,
+                        llvm::ArrayRef PrevTys,
+                        llvm::ArrayRef PrevQuals);
+  /// @brief Determine whether the type is a builtin type or not. Builtin types
+  /// are not considered for substitutions.
+  ///
+  /// @param[in] Ty Type to analyze.
+  /// @param[in] Quals Type qualifiers.
+  ///
+  /// @return true if the type is a builtin type, or false.
+  bool isTypeBuiltin(llvm::Type *Ty, TypeQualifiers &Quals);
+
+  /// @brief LLVM context used to access LLVM types.
+  llvm::LLVMContext *Context;
+};
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_MANGLING_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
new file mode 100644
index 0000000000000..eda860477aaee
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/metadata.h
@@ -0,0 +1,296 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef COMPILER_UTILS_METADATA_H_INCLUDED
+#define COMPILER_UTILS_METADATA_H_INCLUDED
+
+#include 
+#include 
+#include 
+
+#include 
+
+namespace llvm {
+class Function;
+class Module;
+} // namespace llvm
+
+namespace compiler {
+namespace utils {
+
+/// @brief OpenCL C standard to target.
+enum OpenCLCVer {
+  /// @brief OpenCL C 1.0
+  OpenCLC10 = (1 * 100 + 0) * 1000,
+  /// @brief OpenCL C 1.1
+  OpenCLC11 = (1 * 100 + 1) * 1000,
+  /// @brief OpenCL C 1.2
+  OpenCLC12 = (1 * 100 + 2) * 1000,
+  /// @brief OpenCL C 2.0
+  OpenCLC20 = (2 * 100 + 0) * 1000,
+  /// @brief OpenCL C 3.0
+  OpenCLC30 = (3 * 100 + 0) * 1000,
+};
+
+/// @brief Returns the OpenCL version, encoded as (Major*100 + Minor)*1000.
+///
+/// If the Module does not contain any information, then OpenCLC12 is returned.
+uint32_t getOpenCLVersion(const llvm::Module &m);
+
+/// @brief Describes the state of vectorization on a function/loop.
+struct VectorizationInfo {
+  /// @brief The VectorizationFactor. A scalar value if unvectorized.
+  llvm::ElementCount vf;
+  /// @brief The dimension along which vectorization took place.
+  unsigned simdDimIdx;
+  /// @brief Whether or not the function/loop was vector-predicated.
+  bool IsVectorPredicated;
+};
+
+/// @brief Encodes metadata indicating vectorization failure to a kernel, along
+/// with the vectorization factor and dimension that failed.
+///
+/// @param[in] f Function in which to encode the link.
+/// @param[in] info Vectorization info serving as the key.
+void encodeVectorizationFailedMetadata(llvm::Function &f,
+                                       const VectorizationInfo &info);
+
+/// @brief Encodes the vectorization metadata linking the original kernel to a
+/// vectorized one, using the vectorization factor and dimension as the key.
+///
+/// @param[in] origF Original function in which to encode the link.
+/// @param[in] vectorizedF Vectorized function to link.
+/// @param[in] info Vectorization factor serving as the key.
+void linkOrigToVeczFnMetadata(llvm::Function &origF,
+                              llvm::Function &vectorizedF,
+                              const VectorizationInfo &info);
+
+/// @brief Encodes the vectorization metadata linking a vectorized kernel back
+/// to its original one, using the vectorization factor and dimension as the
+/// key.
+///
+/// @param[in] vectorizedF Vectorized function in which to encode the link.
+/// @param[in] origF Original function to link.
+/// @param[in] info Vectorization factor serving as the key.
+void linkVeczToOrigFnMetadata(llvm::Function &vectorizedF,
+                              llvm::Function &origF,
+                              const VectorizationInfo &info);
+
+using LinkMetadataResult = std::pair;
+
+/// @brief Decodes the metadata linking a kernel to its vectorized variant.
+///
+/// @param[in] f Function for which to decode the metadata.
+/// @param[out] factors unordered vector of recovered vectorization links.
+///
+/// @return true on success, false if there is no vectorization metadata for the
+/// function.
+bool parseOrigToVeczFnLinkMetadata(
+    llvm::Function &f, llvm::SmallVectorImpl &factors);
+
+/// @brief Decodes the metadata linking a vectorized kernel back to its
+/// original one.
+///
+/// @param[in] f Function for which to decode the metadata.
+///
+/// @return On success, a pair containing a pointer to the original kernel
+/// function and the vectorization factor used as the key. The original
+/// function may be null. On decoding failure, std::nullopt.
+std::optional
+parseVeczToOrigFnLinkMetadata(llvm::Function &f);
+
+/// @brief Drops "base" vectorization metadata from a function, if present.
+///
+/// @param[in] f Function to drop metadata from.
+void dropVeczOrigMetadata(llvm::Function &f);
+
+/// @brief Drops "derived" vectorization metadata from a function, if present.
+///
+/// @param[in] f Function to drop metadata from.
+void dropVeczDerivedMetadata(llvm::Function &f);
+
+/// @brief Encodes metadata indicating the various components that constitute a
+/// kernel function wrapped with the WorkItemLoopsPass.
+///
+/// @param[in] f Function in which to encode the metadata.
+/// @param[in] mainInfo VectorizationInfo used on the 'main' work-item
+/// iterations.
+/// @param[in] tailInfo VectorizationInfo used on the tail iterations, if
+/// applicable.
+///
+/// Note that a 'tail' is defined as the work done to execute work-items not
+/// covered by the 'main' body. Therefore an unvectorized kernel should expect
+/// a scalar 'main' vectorization factor and no 'tail' (rather than the other
+/// way round).
+///
+/// Some examples of *typical* usage:
+/// 1. An unvectorized kernel will encode a scalar VF for the main iterations
+/// and nothing for the tail ones.
+/// 2. A vectorized kernel will encode a vectorization factor for its main
+/// iterations.
If it handles the case in which the local work-group size does +/// not evenly divide the vectorization factor, it will encode how it manages +/// the tail iterations. This is *typically* with a series of scalar +/// iterations, encoded in tailVF. +/// 3. Vector-predicated kernels with no tails will encode the *maximum* VF used +/// for the main loop, with no tail iterations. +/// +/// This metadata is encoded as: +/// define void @foo() !codeplay_ca_wrapper !X +/// !X = { !Main, !Tail } +/// !Main = { i32 mKnownMin, i32 mIsScalable, i32 simdDimIdx, i32 mIsVP } +/// if tailVF is None: +/// !Tail = {} +/// else +/// !Tail = { i32 tKnownMin, i32 tIsScalable, i32 simdDimIdx, i32 tIsVP } +void encodeWrapperFnMetadata(llvm::Function &f, + const VectorizationInfo &mainInfo, + std::optional tailInfo); + +/// @brief Decodes the metadata describing a wrapped kernel's loop structure. +/// +/// @param[in] f Function for which to decode the metadata. +/// +/// @return On success, a pair containing the VectorizationInfo for the main +/// loop(s) and the (optional) VectorizationInfo info for the tail loop(s). On +/// decoding failure, std::nullopt. +std::optional>> +parseWrapperFnMetadata(llvm::Function &f); + +/// @brief Copies function metadata from one function to another. +/// +/// @param[in] fromF Function from which to copy the metadata. +/// @param[in] toF Function onto which to copy the metadata. +/// @param[in] includeDebug Whether or not to copy debug function metadata. +void copyFunctionMetadata(llvm::Function &fromF, llvm::Function &toF, + bool includeDebug = false); + +/// @brief Encodes information about a function's local work group size as +/// metadata. +/// +/// @param[in] f Function in which to encode the metadata. +/// @param[in] localSizes array of size information to encode. +void encodeLocalSizeMetadata(llvm::Function &f, + const std::array &localSizes); + +/// @brief Retrieves information about a function's local sizes via metadata. +/// +/// @param[in] f Function from which to decode the metadata +/// @returns The local size array if present, else `std::nullopt` +std::optional> +getLocalSizeMetadata(const llvm::Function &f); + +/// @brief Drops all !mux_scheduled_fn metadata from a function. +void dropSchedulingParameterMetadata(llvm::Function &f); + +/// @brief Retrieves the indices of scheduling parameters from the function. +llvm::SmallVector +getSchedulingParameterFunctionMetadata(const llvm::Function &f); + +/// @brief Sets scheduling-parameter metadata on the given function +void setSchedulingParameterFunctionMetadata(llvm::Function &f, + llvm::ArrayRef idxs); + +/// @brief Sets module-level metadata describing the set of scheduling +/// parameters. +void setSchedulingParameterModuleMetadata(llvm::Module &m, + llvm::ArrayRef names); + +/// @brief Retrieves module-level metadata describing the set of scheduling +/// parameters or nullptr. +llvm::NamedMDNode *getSchedulingParameterModuleMetadata(const llvm::Module &m); + +/// @brief If the given function parameter index is considered a scheduling +/// parameter, it returns the corresponding index into the target's list of +/// scheduling parameters. +/// +/// It uses !mux_scheduled_fn metadata for this check. +std::optional isSchedulingParameter(const llvm::Function &f, + unsigned idx); + +/// @brief Extracts the required work group size from a kernel's function +/// metadata. +/// +/// @param[in] f Kernel for extraction. +/// +/// @return The work group size or std::nullopt if there is no such metadata. 
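+///
+/// A minimal usage sketch (the kernel function F and the specialization step
+/// are hypothetical; the return value is the three-element work-group size
+/// described above):
+///
+///   if (auto WGS = parseRequiredWGSMetadata(F)) {
+///     const uint64_t Total = (*WGS)[0] * (*WGS)[1] * (*WGS)[2];
+///     // e.g. specialize or validate the kernel for the known group size.
+///   }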
+std::optional>
+parseRequiredWGSMetadata(const llvm::Function &f);
+
+/// @brief Extracts the required work group size from an opencl.kernels subnode,
+/// which is similar to the function metadata, but the size is stored under
+/// different indices than on a function.
+///
+/// @param[in] node Kernel's subnode for extraction.
+///
+/// @return The work group size or std::nullopt if there is no such metadata.
+std::optional>
+parseRequiredWGSMetadata(const llvm::MDNode &node);
+
+/// @brief Extracts the maximum work dimension from a kernel's function
+/// metadata
+///
+/// @param[in] f Kernel for extraction.
+///
+/// @return The maximum work dimension or std::nullopt if there is no such
+/// metadata.
+std::optional parseMaxWorkDimMetadata(const llvm::Function &f);
+
+/// @brief Describes a kernel function and the associated information parsed
+/// from its metadata.
+struct KernelInfo {
+  explicit KernelInfo(llvm::StringRef name) : Name(name) {}
+  /// @brief The function name
+  std::string Name;
+  /// @brief The required work-group size. Optional.
+  std::optional> ReqdWGSize;
+};
+
+/// @brief Helper function to populate a list of kernels and associated
+/// information from a module.
+///
+/// @param m Module to retrieve kernels from
+/// @param results List of kernel info parsed from metadata or taken from the
+/// module.
+void populateKernelList(llvm::Module &m,
+                        llvm::SmallVectorImpl &results);
+
+/// @brief Replaces instances of kernel fromF with toF in module-level
+/// !opencl.kernels metadata.
+/// @param fromF Function to replace with toF in metadata
+/// @param toF Function with which to replace references to fromF
+/// @param M Module in which to find the metadata
+void replaceKernelInOpenCLKernelsMetadata(llvm::Function &fromF,
+                                          llvm::Function &toF, llvm::Module &M);
+
+/// @brief Encodes information about a function's required sub-group size as
+/// metadata.
+///
+/// @param[in] f Function in which to encode the metadata.
+/// @param[in] size sub-group size information to encode.
+void encodeReqdSubgroupSizeMetadata(llvm::Function &f, uint32_t size);
+
+/// @brief Retrieves information about a function's required sub-group size via
+/// metadata.
+///
+/// @param[in] f Function from which to decode the metadata
+/// @returns The required sub-group size if present, else `std::nullopt`
+std::optional getReqdSubgroupSize(const llvm::Function &f);
+
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_METADATA_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h
new file mode 100644
index 0000000000000..678b753b98a7e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/optimal_builtin_replacement_pass.h
@@ -0,0 +1,115 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Optimal builtin replacement pass.
+
+#ifndef COMPILER_UTILS_OPTIMAL_BUILTIN_REPLACEMENT_PASS_H_INCLUDED
+#define COMPILER_UTILS_OPTIMAL_BUILTIN_REPLACEMENT_PASS_H_INCLUDED
+
+#include
+#include
+#include
+#include
+
+namespace compiler {
+namespace utils {
+
+/// @brief A Callgraph optimization pass which replaces calls to builtin
+/// functions with more optimal versions, either via inlined code, or calls to
+/// suitable llvm intrinsics which will later be lowered to optimal machine
+/// code. When run with a non-null BuiltinInfo analysis, the builtin info is
+/// queried to determine the properties of each call in the graph.
+///
+/// A set of replacement functions with identical signatures is kept by this
+/// pass. These are invoked in order one after another on each call instruction
+/// in the call graph. If any replacement returns a non-null `Value*` it is
+/// used to replace the call and no further replacements are attempted on that
+/// call. It is assumed that no replacement introduces new calls to the graph.
+/// The set of replacements can be modified by users by setting
+/// `adjustReplacements`.
+///
+/// The default set of replacement functions, in order, is:
+/// * replaceAbacusCLZ
+/// * replaceAbacusMulhi
+/// * replaceAbacusFMinFMax
+/// * Invoking emitBuiltinInline from BuiltinInfo analysis
+class OptimalBuiltinReplacementPass
+    : public llvm::PassInfoMixin {
+public:
+  using ReplacementFnTy = std::function &,
+      const llvm::SmallVectorImpl &)>;
+
+  /// @brief Constructor. Sets up default builtin replacements.
+  OptimalBuiltinReplacementPass();
+
+  llvm::PreservedAnalyses run(llvm::LazyCallGraph::SCC &C,
+                              llvm::CGSCCAnalysisManager &AM,
+                              llvm::LazyCallGraph &CG,
+                              llvm::CGSCCUpdateResult &UR);
+
+  /// @brief A callback invoked per-SCC before any replacements are performed,
+  /// allowing customization of the replacements to be performed. The default
+  /// set of replacements are passed in and may be modified in any way.
+  std::function &)> adjustReplacements;
+
+  /// @brief Replaces calls __abacus_clz(ty) with @llvm.ctlz(ty, i1 false)
+  /// indicating that zero does not produce a poison result.
+  /// Note: This replacement is not performed on 64-bit scalar or vectors of
+  /// 64-bit scalar types.
+  static llvm::Value *replaceAbacusCLZ(
+      llvm::CallBase &CB, llvm::StringRef,
+      const llvm::SmallVectorImpl &,
+      const llvm::SmallVectorImpl &);
+
+  /// @brief Replaces __abacus_mul_hi(ty lhs, ty rhs) with a sequence:
+  /// %lhs.ext = ext ty %lhs to x2bw(ty)
+  /// %rhs.ext = ext ty %rhs to x2bw(ty)
+  /// %mul.ext = mul x2bw(ty) %lhs.ext, %rhs.ext
+  /// %lo.part = ashr x2bw(ty) %mul.ext, bw(ty)
+  /// %res = trunc x2bw(ty) %lo.part to ty
+  /// Where x2bw(ty) returns a type with twice the (element) bit-width, and
+  /// bw(ty) returns the bit-width of a (element) type as an integer.
+  /// This pattern is better matched by LLVM and target backends often produce
+  /// "mul_hi" instructions as a result.
+  static llvm::Value *replaceAbacusMulhi(
+      llvm::CallBase &, llvm::StringRef,
+      const llvm::SmallVectorImpl &,
+      const llvm::SmallVectorImpl &);
+
+  /// @brief Replaces __abacus_(fmin|fmax)(ty1 lhs, ty2 rhs) with
+  /// @llvm.(minnum|maxnum)(ty1 lhs, ty1 rhs), where ty2 may be a scalar type
+  /// which is splatted to a vector of ty1, where appropriate.
+ /// Note: This replacement is not performed on ARM or AArch64 targets, due to + /// LLVM backend bugs (https://llvm.org/PR27363). + static llvm::Value *replaceAbacusFMinFMax( + llvm::CallBase &, llvm::StringRef, + const llvm::SmallVectorImpl &, + const llvm::SmallVectorImpl &); + +private: + std::vector replacements; + + llvm::Value *replaceBuiltinWithInlineIR(llvm::CallBase &CB) const; +}; + +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_OPTIMAL_BUILTIN_REPLACEMENT_PASS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h new file mode 100644 index 0000000000000..b60847eb53f1f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_functions.h @@ -0,0 +1,319 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// LLVM pass utility functions. + +#ifndef COMPILER_UTILS_PASS_FUNCTIONS_H_INCLUDED +#define COMPILER_UTILS_PASS_FUNCTIONS_H_INCLUDED + +#include +#include +#include +#include +#include + +#include +#include + +namespace llvm { +class Argument; +class BasicBlock; +class Constant; +class ConstantExpr; +class Function; +class IntegerType; +class LLVMContext; +class Module; +class ModulePass; +class Type; +class Value; +class IRBuilderBase; +} // namespace llvm + +namespace compiler { +namespace utils { + +/// @addtogroup utils +/// @{ + +/// @brief Calculate (approximately) the amount of private memory used by a +/// kernel. +/// +/// @param fn The kernel function +/// +/// @return uint64_t The private memory used by the kernel function in bytes. 
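+///
+/// A hedged usage sketch (Fn and the budget check are illustrative only; the
+/// result is an approximation, best used for heuristics rather than hard
+/// resource limits):
+///
+///   const uint64_t Bytes = computeApproximatePrivateMemoryUsage(Fn);
+///   if (Bytes > Budget) {
+///     // e.g. fall back to a smaller work-group size.
+///   }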
+uint64_t computeApproximatePrivateMemoryUsage(const llvm::Function &fn);
+
+/// @brief Forces a constant expression or constant vector back to a normal
+/// instruction
+///
+/// @param[in] constant to be replaced
+void replaceConstantExpressionWithInstruction(llvm::Constant *const constant);
+
+/// @brief remap operands of a constant expression
+///
+/// @note This will create a new constant expression and replace references to
+/// the original constant with the new one
+///
+/// @param[in] expr Constant expression to be remapped
+/// @param[in] from Constant which if found in expression will be
+/// replaced
+/// @param[in] to Constant which will replace any operands which are `from`
+void remapConstantExpr(llvm::ConstantExpr *expr, llvm::Constant *from,
+                       llvm::Constant *to);
+
+/// @brief remap operands of a constant array
+///
+/// @note This will create a new constant array and replace references to
+/// the original constant with the new one
+///
+/// @param[in] arr Constant array to be remapped
+/// @param[in] from Constant which if found in array will be
+/// replaced
+/// @param[in] to Constant which will replace any operands which are `from`
+void remapConstantArray(llvm::ConstantArray *arr, llvm::Constant *from,
+                        llvm::Constant *to);
+
+/// @brief Discover if input function references debug info metadata nodes
+///
+/// @param[in] func Function to check
+/// @param[in,out] vmap Value map updated with identity mappings of any debug
+/// info metadata found
+///
+/// @return bool True if function contains debug info, false otherwise
+bool funcContainsDebugMetadata(const llvm::Function &func,
+                               llvm::ValueToValueMapTy &vmap);
+
+/// @brief Return a copy of a function's function, return, and parameter
+/// attributes.
+///
+/// Only parameter attributes from indices 0 to numParams are copied. If
+/// numParams is negative, all parameter attributes are copied.
+llvm::AttributeList getCopiedFunctionAttrs(const llvm::Function &oldFn,
+                                           int numParams = -1);
+
+/// @brief Copy a function's attributes to a new function.
+///
+/// @param[in] oldFn Function to copy function attributes from.
+/// @param[in] newFn Function to copy function attributes to.
+/// @param[in] numParams number of parameters to copy attributes from, starting
+/// from the first parameter. If set to a negative number, will copy all
+/// parameter attributes.
+void copyFunctionAttrs(const llvm::Function &oldFn, llvm::Function &newFn,
+                       int numParams = -1);
+
+using ParamTypeAttrsPair = std::pair;
+
+using UpdateMDCallbackFn =
+    std::function;
+
+/// @brief Clone functions in a module and add an argument to them
+///
+/// @param module LLVM module containing the functions
+/// @param paramTypeFunc Additional parameter to be added, defined as a function
+/// returning the type and set of attributes.
+/// This function takes a module, primarily to access DataLayout
+/// @param toBeClonedFunc function which dictates whether each function is
+/// cloned
+/// @param updateMetaDataCallback if set, is invoked with the old function, new
+/// function and new argument index.
+///
+/// @return bool if the module has changed (currently always true)
+///
+/// This iterates through all the functions in a module but only clones and adds
+/// the extra param for those that meet the following criteria, after setting
+/// `ClonedNoBody` and `ClonedWithBody` from the `toBeClonedFunc` callback:
+///
+/// 1. Is not a function declaration or `ClonedNoBody` is set, _or_ is a
+///    function declaration and `ClonedWithBody` is set
+/// 2. Not already processed
+bool cloneFunctionsAddArg(
+    llvm::Module &module,
+    std::function paramTypeFunc,
+    std::function
+        toBeClonedFunc,
+    const UpdateMDCallbackFn &updateMetaDataCallback = nullptr);
+
+/// @brief Updates call instructions after a function clone to point to
+/// `newFunc` instead of `oldFunc`; old call instructions are deleted.
+///
+/// @param[in] oldFunc Function which has been cloned
+/// @param[in] newFunc Cloned function to point callsites to
+/// @param[in] extraArg Whether the cloned callee has an extra argument added
+void remapClonedCallsites(llvm::Function &oldFunc, llvm::Function &newFunc,
+                          bool extraArg);
+
+using CreateLoopBodyFn = std::function,
+                                       llvm::MutableArrayRef)>;
+
+struct CreateLoopOpts {
+  /// @brief indexInc Value by which to increment the loop counter. If nullptr,
+  /// then it is created as the constant 1, based on type of `indexStart`,
+  /// which is a parameter to compiler::utils::createLoop proper.
+  llvm::Value *indexInc = nullptr;
+  /// @brief disableVectorize Sets loop metadata disabling further
+  /// vectorization.
+  bool disableVectorize = false;
+  /// @brief headerName Optional name for the loop header block. Defaults to:
+  /// "loopIR".
+  llvm::StringRef headerName = "loopIR";
+  /// @brief An optional list of incoming IV values.
+  ///
+  /// Each of these is used as the incoming value to a PHI created by
+  /// createLoop. These PHIs are provided to the 'body' function of createLoop,
+  /// which should in turn set the 'next' version of the IV.
+  std::vector IVs;
+  /// @brief An optional list of IV names, to be set on the PHIs provided by
+  /// 'IVs' field/parameter.
+  ///
+  /// If set, the names are assumed to correlate 1:1 with those IVs. The list
+  /// may be shorter than the list of IVs, in which case the trailing IVs are
+  /// not named.
+  std::vector loopIVNames;
+};
+
+/// @brief Create a loop around a body, creating an implicit induction variable
+/// (IV) between specified start and end values, and incremented by a
+/// user-specified amount. The loop thus has a trip count equal to the
+/// following C-style loop: `for (auto i = start; i < end; i += incr)`.
+///
+/// Note that this helper always creates a CFG loop, even if the loop bounds
+/// are known not to produce a loop at compile time. Users can use stock LLVM
+/// optimizations to eliminate/simplify the loop in such a case.
+///
+/// @param entry Loop pre-header block. This block will be rewired to jump into
+/// the new loop.
+/// @param exit Loop exit block. The new loop will jump to this once it exits.
+/// @param indexStart The start index
+/// @param indexEnd The end index (we compare for <)
+/// @param opts Set of options configuring the generation of this loop.
+/// @param body Body of code to insert into loop.
+///
+/// The parameters of the `body` function are as follows: the loop body
+/// BasicBlock; the Value corresponding to the IV beginning at `indexStart` and
+/// incremented each iteration by `indexInc` while less than `indexEnd`; the
+/// list of IVs for this iteration of the loop (may or may not be PHIs,
+/// depending on the loop bounds); the list of IVs for the next iteration of
+/// the loop (the function is required to fill these in). Both these sets of
+/// IVs will be arrays of equal length to the original list of IVs, in the same
+/// order. The function returns the loop latch/exiting block: this block will
+/// be given the branch that decides between continuing the loop and exiting
+/// from it.
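+///
+/// A minimal sketch (all names hypothetical) building the equivalent of
+/// `for (i = 0; i < n; ++i) acc += i;` with one extra IV for the accumulator:
+///
+///   CreateLoopOpts Opts;
+///   Opts.IVs = {Builder.getInt64(0)}; // initial accumulator value
+///   Opts.loopIVNames = {"acc"};
+///   createLoop(Entry, Exit, Builder.getInt64(0), N, Opts,
+///              [&](llvm::BasicBlock *BB, llvm::Value *I,
+///                  llvm::ArrayRef<llvm::Value *> IVs,
+///                  llvm::MutableArrayRef<llvm::Value *> NextIVs) {
+///                llvm::IRBuilder<> IRB(BB);
+///                NextIVs[0] = IRB.CreateAdd(IVs[0], I); // acc += i
+///                return BB; // single-block body doubles as the latch
+///              });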
+///
+/// @return llvm::BasicBlock* The exit block
+llvm::BasicBlock *createLoop(llvm::BasicBlock *entry, llvm::BasicBlock *exit,
+                             llvm::Value *indexStart, llvm::Value *indexEnd,
+                             const CreateLoopOpts &opts, CreateLoopBodyFn body);
+
+/// @brief Get the last argument of a function.
+///
+/// @param f An LLVM function to get an argument from.
+///
+/// @return An LLVM argument.
+llvm::Argument *getLastArgument(llvm::Function *f);
+
+/// @brief get the device-side size of size_t type in bytes.
+unsigned getSizeTypeBytes(const llvm::Module &m);
+
+/// @brief get a size_t type.
+/// @return a LLVM IntegerType representing size_t.
+llvm::IntegerType *getSizeType(const llvm::Module &m);
+
+/// @brief Creates a wrapper function (without body), intended for calling @p F
+/// @param M Containing module
+/// @param F Kernel function which is being replaced
+/// @param ArgTypes List of types to be used for the new function
+/// @param Suffix String to append to the new function's name
+/// @param OldSuffix String to append to the old function's name
+/// @note This takes the metadata and debug from the original function.
+/// This is intended to be used for creating a function which replaces
+/// the original function but calls the original.
+///
+/// @note The name of the wrapper function is computed as the original name of
+/// F followed by the Suffix. The original name of F is taken from F's
+/// 'mux-base-fn-name' attribute, if set, else it is F's name:
+///
+/// declare void @foo()
+/// ; Function attrs "mux-base-fn-name"="baz"
+/// declare void @bar()
+///
+/// With suffix '.wrapper', this function will produce:
+///
+/// declare void @foo.wrapper()
+/// declare void @baz.wrapper()
+///
+/// With suffix '.new' and old suffix '.old', this function will produce:
+///
+/// declare void @foo.old()
+/// ; Function attrs "mux-base-fn-name"="baz"
+/// declare void @bar.old()
+///
+/// declare void @foo.new()
+/// declare void @baz.new()
+///
+/// It is advised that the suffix begins with a character that may not
+/// occur in the original source language, to avoid clashes with user
+/// functions.
+llvm::Function *createKernelWrapperFunction(
+    llvm::Module &M, llvm::Function &F, llvm::ArrayRef ArgTypes,
+    llvm::StringRef Suffix, llvm::StringRef OldSuffix = "");
+
+/// @brief As above, but creating a wrapper with the exact function signature
+/// of @p F.
+///
+/// Copies over all parameter names and attributes.
+llvm::Function *createKernelWrapperFunction(llvm::Function &F,
+                                            llvm::StringRef Suffix,
+                                            llvm::StringRef OldSuffix = "");
+
+/// @brief Creates a call to a wrapped function
+///
+/// Sets the calling convention and call-site attributes to match the wrapped
+/// function.
+///
+/// @param WrappedF the function to call
+/// @param Args the list of arguments to pass to the call
+/// @param BB the basic block into which to insert the call. May be null, in
+/// which case the call is not inserted anywhere.
+/// @param InsertPt the point in BB at which to insert the call
+/// @param Name the name of the call instruction. May be empty.
+/// @return The call instruction
+llvm::CallInst *createCallToWrappedFunction(
+    llvm::Function &WrappedF, const llvm::SmallVectorImpl &Args,
+    llvm::BasicBlock *BB, llvm::BasicBlock::iterator InsertPt,
+    llvm::StringRef Name = "");
+
+/// @brief Create a binary operation corresponding to the given
+/// `llvm::RecurKind` with the two provided arguments. It may not
+/// necessarily return one of LLVM's in-built `BinaryOperator`s, or even one
+/// operation: integer min/max operations may defer to multiple instructions or
+/// intrinsics depending on the LLVM version.
+///
+/// @param[in] B the IRBuilder to build new instructions
+/// @param[in] LHS the left-hand value for the operation
+/// @param[in] RHS the right-hand value for the operation
+/// @param[in] Kind the kind of operation to create
+/// @return The binary operation.
+llvm::Value *createBinOpForRecurKind(llvm::IRBuilderBase &B, llvm::Value *LHS,
+                                     llvm::Value *RHS, llvm::RecurKind Kind);
+/// @}
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_PASS_FUNCTIONS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
new file mode 100644
index 0000000000000..671cc9baf7051
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/pass_machinery.h
@@ -0,0 +1,148 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Hold global state and objects used for managing a pass pipeline.
+
+#ifndef COMPILER_UTILS_PASS_MACHINERY_H_INCLUDED
+#define COMPILER_UTILS_PASS_MACHINERY_H_INCLUDED
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace llvm {
+class TargetMachine;
+}
+
+namespace compiler {
+namespace utils {
+extern bool VerifyEachIsEnabled;
+
+/// @brief Mirrors LLVM's DebugLogging options in its `opt` tool. Clang has
+/// a boolean on/off version.
+enum class DebugLogging { None, Normal, Verbose, Quiet };
+
+extern DebugLogging DebugPasses;
+
+/// @brief A class that manages the lifetime and initialization of all
+/// components required to set up a new-style LLVM pass manager.
+class PassMachinery {
+public:
+  PassMachinery(llvm::LLVMContext &Ctx, llvm::TargetMachine *TM,
+                bool VerifyEach = false,
+                DebugLogging debugLogLevel = DebugLogging::None);
+
+  virtual ~PassMachinery();
+
+  /// @brief Initializes the PassBuilder and calls registerPasses.
+  void initializeStart(
+      llvm::PipelineTuningOptions PTO = llvm::PipelineTuningOptions());
+
+  /// @brief Cross-registers analysis managers, adds callbacks and
+  /// instrumentation support. Calls addClassToPassNames and
+  /// registerPassCallbacks.
+  void initializeFinish();
+
+  /// @brief Calls buildDefaultAAPipeline and registerLLVMAnalyses.
+  virtual void registerPasses();
+
+  /// @brief Helper method to register the standard LLVM AA pipeline.
+  ///
+  /// Registers:
+  /// * llvm::PassBuilder::buildDefaultAAPipeline
+  void buildDefaultAAPipeline();
+
+  /// @brief Helper method to register the standard LLVM analyses.
+  ///
+  /// Calls:
+  /// * llvm::PassBuilder::registerModuleAnalyses
+  /// * llvm::PassBuilder::registerCGSCCAnalyses
+  /// * llvm::PassBuilder::registerFunctionAnalyses
+  /// * llvm::PassBuilder::registerLoopAnalyses
+  void registerLLVMAnalyses();
+
+  /// @brief Method to allow customization of class-to-pass-names for
+  /// instrumentation purposes. By default, none are set up by
+  /// PassMachinery::initialize.
+  virtual void addClassToPassNames() {}
+
+  /// @brief Method to allow customization of pass callbacks via
+  /// llvm::PassBuilder. By default, no callbacks are set up by
+  /// PassMachinery::initialize.
+  virtual void registerPassCallbacks() {}
+
+  /// @brief print pass names in style of opt --print-passes
+  /// @note This should print parameters too
+  virtual void printPassNames(llvm::raw_ostream &) {}
+
+  llvm::ModuleAnalysisManager &getMAM() { return MAM; }
+  const llvm::ModuleAnalysisManager &getMAM() const { return MAM; }
+
+  llvm::FunctionAnalysisManager &getFAM() { return FAM; }
+  const llvm::FunctionAnalysisManager &getFAM() const { return FAM; }
+
+  llvm::PassBuilder &getPB() { return PB; }
+  const llvm::PassBuilder &getPB() const { return PB; }
+
+  llvm::TargetMachine *getTM() { return TM; }
+  const llvm::TargetMachine *getTM() const { return TM; }
+
+protected:
+  /// @brief TargetMachine to be used for passes. May be nullptr.
+  llvm::TargetMachine *TM;
+  // Note: the order here is important! They must be destructed in this order.
+  /// @brief Holds state for Loop analyses.
+  llvm::LoopAnalysisManager LAM;
+  /// @brief Holds state for Function analyses.
+  llvm::FunctionAnalysisManager FAM;
+  /// @brief Holds state for CGSCC analyses.
+  llvm::CGSCCAnalysisManager CGAM;
+  /// @brief Holds state for Module analyses.
+  llvm::ModuleAnalysisManager MAM;
+  /// @brief Manages the state for any instrumentation callbacks.
+  std::unique_ptr SI;
+  /// @brief Provides an interface to register callbacks.
+  llvm::PassInstrumentationCallbacks PIC;
+  /// @brief Helper to build and parse pass pipelines.
+  llvm::PassBuilder PB;
+};
+
+/// Helper functions for pass printing.
+
+/// @brief Helper function for printing a pass name, to be used by
+/// printPassNames.
+/// @param PassName Name of pass from a debug/parsing perspective.
+/// @param OS stream to write to.
+/// @note This is a direct copy from PassBuilder.cpp.
+void printPassName(llvm::StringRef PassName, llvm::raw_ostream &OS);
+
+/// @brief Helper function for printing a pass name with parameters, to be
+/// used by printPassNames.
+/// @param PassName Name of pass from a debug/parsing perspective.
+/// @param Params Textual representation of the parameters.
+/// @param OS stream to write to.
+/// @note This is a direct copy from PassBuilder.cpp.
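+///
+/// For example (the pass name, parameter string, and output format shown here
+/// are illustrative only):
+///
+///   printPassName("work-item-loops", "debug", OS);
+///   // prints: work-item-loops<debug>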
+void printPassName(llvm::StringRef PassName, llvm::StringRef Params,
+                   llvm::raw_ostream &OS);
+
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_PASS_MACHINERY_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h
new file mode 100644
index 0000000000000..4bdcb2da83969
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/prepare_barriers_pass.h
@@ -0,0 +1,45 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Prepare barriers pass.
+
+#ifndef COMPILER_UTILS_PREPARE_BARRIERS_PASS_H_INCLUDED
+#define COMPILER_UTILS_PREPARE_BARRIERS_PASS_H_INCLUDED
+
+#include
+
+namespace compiler {
+namespace utils {
+
+/// @brief Pass for ensuring consistent barrier handling.
+///
+/// It inlines functions that contain barriers and gives each barrier call a
+/// unique ID as metadata to ensure consistent handling of barriers in
+/// different versions of the kernel (i.e. Scalar vs Vector). It must be run
+/// before Vecz for mixed wrapper kernels made up of multiple kernels to work.
+///
+/// Runs over all kernels with "kernel entry point" metadata.
+class PrepareBarriersPass final
+    : public llvm::PassInfoMixin {
+public:
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+};
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_PREPARE_BARRIERS_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h
new file mode 100644
index 0000000000000..bde53d712aab7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/replace_local_module_scope_variables_pass.h
@@ -0,0 +1,44 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Replace local module-scope variables pass.
+
+#ifndef COMPILER_UTILS_REPLACE_LOCAL_MODULE_SCOPE_VARIABLES_PASS_H_INCLUDED
+#define COMPILER_UTILS_REPLACE_LOCAL_MODULE_SCOPE_VARIABLES_PASS_H_INCLUDED
+
+#include
+
+namespace compiler {
+namespace utils {
+
+/// @brief __local address space automatic variables are represented in the
+/// LLVM module as global variables with address space 3. This pass identifies
+/// these variables and places them into a struct allocated (via alloca) in a
+/// newly created wrapper function. A pointer to the struct is then passed
+/// via a parameter to the original kernel.
+///
+/// Runs over all kernels with "kernel" metadata.
+class ReplaceLocalModuleScopeVariablesPass final
+    : public llvm::PassInfoMixin {
+public:
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+};
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_REPLACE_LOCAL_MODULE_SCOPE_VARIABLES_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h
new file mode 100644
index 0000000000000..08c923b5e56f8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/scheduling.h
@@ -0,0 +1,143 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Various utilities to help with work-item and work-group scheduling.
+
+#ifndef COMPILER_UTILS_SCHEDULING_H_INCLUDED
+#define COMPILER_UTILS_SCHEDULING_H_INCLUDED
+
+#include
+#include
+
+namespace llvm {
+class Function;
+class Module;
+class StructType;
+class Argument;
+} // namespace llvm
+
+namespace compiler {
+namespace utils {
+
+namespace WorkItemInfoStructField {
+enum Type : uint32_t {
+  local_id,
+  sub_group_id,
+  num_sub_groups,
+  max_sub_group_size,
+  total
+};
+}
+
+namespace WorkGroupInfoStructField {
+enum Type : uint32_t {
+  group_id = 0,
+  num_groups,
+  global_offset,
+  local_size,
+  work_dim,
+  total
+};
+}
+
+/// @brief Computes the work item info structure type for the given module.
+llvm::StructType *getWorkItemInfoStructTy(llvm::Module &M);
+
+/// @brief Computes the work group info structure type for the given module.
+llvm::StructType *getWorkGroupInfoStructTy(llvm::Module &M);
+
+/// @brief Populates an empty function with code to look up and return a value
+/// from a pointer-to-struct argument.
+///
+/// The function may optionally have a 'rank', in which case the struct field
+/// index is expected to be a 3D array of values. Ranked functions must have an
+/// integer index as their first parameter. Any integer type is supported. The
+/// generated code for ranked functions is given a bounds check to ensure the
+/// index is less than 3. If the index is out of bounds, the default value is
+/// returned.
+///
+/// The pointer-to-struct may be any parameter other than the index, which
+/// comes first.
+///
+/// if !hasRankArg:
+/// ; where structFieldIdx identifies the field.
+/// %struct = type { ..., i64, ... }
+/// declare i64 @foo(ptr %struct-ptr)
+///
+/// if hasRankArg:
+/// ; where structFieldIdx identifies the field and the %idx parameter
+/// ; identifies the sub-field.
+/// %struct = type { ..., [i64, i64, i64], ... }
+/// declare i64 @foo(i32 %idx, ptr %struct-ptr)
+///
+/// @param[in,out] F The function to define
+/// @param[in] structPtrArg The pointer-to-struct argument
+/// @param[in] structTy The underlying type of the pointer-to-struct argument,
+/// used for offset calculations
+/// @param[in] structFieldIdx The struct type's field index to load from
+/// @param[in] hasRankArg True if the struct type's field index is a 3D array,
+/// and thus the function's first parameter is an index parameter.
+/// @param[in] defaultValue The default value returned if the index is out of
+/// bounds. Only valid for ranked functions.
+void populateStructGetterFunction(llvm::Function &F,
+                                  llvm::Argument &structPtrArg,
+                                  llvm::StructType *const structTy,
+                                  uint32_t structFieldIdx, bool hasRankArg,
+                                  size_t defaultValue = 0);
+
+/// @brief Populates an empty function with code to store a value into a
+/// pointer-to-struct argument.
+///
+/// The function may optionally have a 'rank', in which case the struct field
+/// index is expected to be a 3D array of values. Ranked functions must have an
+/// integer index as their first parameter. Any integer type is supported.
+///
+/// The value to store is the next parameter (either first or second) and the
+/// pointer-to-struct may be any other unoccupied parameter.
+///
+/// if !hasRankArg:
+/// ; where structFieldIdx identifies the field.
+/// %struct = type { ..., i64, ... }
+/// declare void @foo(i64 %val, ptr %struct-ptr)
+///
+/// if hasRankArg:
+/// ; where structFieldIdx identifies the field and the %idx parameter
+/// ; identifies the sub-field.
+/// %struct = type { ..., [i64, i64, i64], ... }
+/// declare void @foo(i32 %idx, i64 %val, ptr %struct-ptr)
+///
+/// Note that unlike populateStructGetterFunction, no bounds check is
+/// performed. The setter functions are only available internally to the
+/// compiler, and thus the indices are assumed to be within bounds.
+///
+/// @param[in,out] F The function to define
+/// @param[in] structPtrArg The pointer-to-struct argument
+/// @param[in] structTy The underlying type of the pointer-to-struct argument,
+/// used for offset calculations
+/// @param[in] structFieldIdx The struct type's field index to store to
+/// @param[in] hasRankArg True if the struct type's field index is a 3D array,
+/// and thus the function's first parameter is an index parameter.
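+///
+/// A hedged C++ sketch (assumes F is an empty ranked local_id setter whose
+/// struct pointer is its final parameter; all names are illustrative):
+///
+///   auto *InfoTy = getWorkItemInfoStructTy(M);
+///   populateStructSetterFunction(F, *getLastArgument(&F), InfoTy,
+///                                WorkItemInfoStructField::local_id,
+///                                /*hasRankArg=*/true);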
+void populateStructSetterFunction(llvm::Function &F,
+                                  llvm::Argument &structPtrArg,
+                                  llvm::StructType *const structTy,
+                                  uint32_t structFieldIdx, bool hasRankArg);
+
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_SCHEDULING_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
new file mode 100644
index 0000000000000..af615f3a6f4bf
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/sub_group_analysis.h
@@ -0,0 +1,115 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef COMPILER_UTILS_SUB_GROUP_ANALYSIS_H_INCLUDED
+#define COMPILER_UTILS_SUB_GROUP_ANALYSIS_H_INCLUDED
+
+#include
+#include
+#include
+
+#include
+#include
+
+namespace compiler {
+namespace utils {
+
+/// @brief Provides module-level information about the sub-group usage of each
+/// function contained within.
+///
+/// The results for each function are cached in a map. Declarations are not
+/// processed. Thus an external function declaration that uses sub-group
+/// builtins will be missed.
+///
+/// Internal mux sub-group 'setter' functions are not counted. This is because
+/// they are only used internally by the oneAPI Construction Kit as scaffolding
+/// for the sub-group support that the user can observe.
+///
+/// Each function contains the set of mux sub-group builtins it (transitively)
+/// calls.
+class GlobalSubgroupInfo {
+  struct SubgroupInfo {
+    std::set UsedSubgroupBuiltins;
+  };
+
+  using FunctionMapTy =
+      std::map>;
+
+  FunctionMapTy FunctionMap;
+
+  compiler::utils::BuiltinInfo &BI;
+
+public:
+  GlobalSubgroupInfo(llvm::Module &M, BuiltinInfo &);
+
+  compiler::utils::BuiltinInfo &getBuiltinInfo() { return BI; }
+
+  using iterator = FunctionMapTy::iterator;
+  using const_iterator = FunctionMapTy::const_iterator;
+
+  /// @brief Returns the SubgroupInfo for the provided function.
+  ///
+  /// The function must already exist in the map.
+  const SubgroupInfo *operator[](const llvm::Function *F) const {
+    const const_iterator I = FunctionMap.find(F);
+    assert(I != FunctionMap.end() && "Function not in sub-group info!");
+    return I->second.get();
+  }
+
+  bool usesSubgroups(const llvm::Function &F) const;
+
+  /// @brief Returns true if the provided function is a mux sub-group
+  /// collective builtin or sub-group barrier.
+  std::optional
+  isMuxSubgroupBuiltin(const llvm::Function *F) const;
+};
+
+/// @brief Computes and returns the GlobalSubgroupInfo for a Module.
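+///
+/// A usage sketch (assumes a configured llvm::ModuleAnalysisManager MAM):
+///
+///   auto &SGI = MAM.getResult<SubgroupAnalysis>(M);
+///   if (SGI.usesSubgroups(F)) {
+///     // F (transitively) calls mux sub-group builtins.
+///   }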
+class SubgroupAnalysis : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + using Result = GlobalSubgroupInfo; + + explicit SubgroupAnalysis() {} + + /// @brief Retrieve the GlobalSubgroupInfo for the module. + Result run(llvm::Module &M, llvm::ModuleAnalysisManager &); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Sub-group analysis"; } + +private: + /// @brief Unique pass identifier. + static llvm::AnalysisKey Key; +}; + +/// @brief Helper pass to print out the contents of the SubgroupAnalysis +/// analysis. +class SubgroupAnalysisPrinterPass + : public llvm::PassInfoMixin { + llvm::raw_ostream &OS; + +public: + explicit SubgroupAnalysisPrinterPass(llvm::raw_ostream &OS) : OS(OS) {} + + llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM); +}; + +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_SUB_GROUP_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h new file mode 100644 index 0000000000000..c8c97f7848a2e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/target_extension_types.h @@ -0,0 +1,144 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef COMPILER_UTILS_TARGET_EXTENSION_TYPES_H_INCLUDED +#define COMPILER_UTILS_TARGET_EXTENSION_TYPES_H_INCLUDED + +namespace llvm { +class Type; +class LLVMContext; +} // namespace llvm + +namespace compiler { +namespace utils { +namespace tgtext { + +/// @brief The indices of the *integer* parameters of a "spirv.Image" type. +enum ImageTyIntParamIdx { + ImageTyDimensionalityIdx = 0, + ImageTyDepthIdx, + ImageTyArrayedIdx, + ImageTyMSIdx, + ImageTySampledIdx, + ImageTyFormatIdx, + ImageTyAccessQualIdx, +}; + +/// @brief Values the 'dimensionality' parameter of a "spirv.Image" type may +/// hold. +/// +/// Note that not all of these are supported by the compiler. +enum ImageTyDimensionalityParam { + ImageDim1D = 0, + ImageDim2D, + ImageDim3D, + ImageDimCube, + ImageDimRect, + ImageDimBuffer, + ImageDimSubpassData, +}; + +/// @brief Values the 'depth' parameter of a "spirv.Image" type may hold. +enum ImageTyDepthParam { + ImageDepthNone = 0, // Not a depth image + ImageDepth, // A depth image + ImageDepthUnknown, // No indication as to whether this is a depth or + // non-depth image +}; + +/// @brief Values the 'arrayed' parameter of a "spirv.Image" type may hold. +enum ImageTyArrayedParam { + ImageNonArrayed = 0, + ImageArrayed, +}; + +/// @brief Values the 'MS' parameter of a "spirv.Image" type may hold. 
+enum ImageTyMSParam {
+  ImageMSSingleSampled = 0,
+  ImageMSMultiSampled,
+};
+
+/// @brief Values the 'Sampled' parameter of a "spirv.Image" type may hold.
+enum ImageTySampledParam {
+  ImageSampledRuntime = 0,     // only known at run time
+  ImageSampledCompat,          // compatible with sampling operations
+  ImageSampledReadWriteCompat, // compatible with read/write operations (a
+                               // storage or subpass data image)
+};
+
+enum ImageTyAccessQualParam {
+  ImageAccessQualReadOnly = 0,
+  ImageAccessQualWriteOnly,
+  ImageAccessQualReadWrite,
+};
+
+/// @brief Returns the TargetExtType representing an 'event' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *getEventTy(llvm::LLVMContext &Ctx);
+
+/// @brief Returns the TargetExtType representing a 'sampler' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *getSamplerTy(llvm::LLVMContext &Ctx);
+
+/// @brief Returns the TargetExtType representing an 'image1d_t' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *
+getImage1DTy(llvm::LLVMContext &Ctx,
+             ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+/// @brief Returns the TargetExtType representing an 'image1d_array_t' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *
+getImage1DArrayTy(llvm::LLVMContext &Ctx,
+                  ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+/// @brief Returns the TargetExtType representing an 'image1d_buffer_t' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *
+getImage1DBufferTy(llvm::LLVMContext &Ctx,
+                   ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+/// @brief Returns the TargetExtType representing an 'image2d_t' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *
+getImage2DTy(llvm::LLVMContext &Ctx, bool Depth = false, bool MS = false,
+             ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+/// @brief Returns the TargetExtType representing an 'image2d_array_t' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *
+getImage2DArrayTy(llvm::LLVMContext &Ctx, bool Depth = false, bool MS = false,
+                  ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+/// @brief Returns the TargetExtType representing an 'image3d_t' type.
+///
+/// Note: Only intended for use with LLVM 17+ - throws 'unreachable' otherwise.
+llvm::Type *
+getImage3DTy(llvm::LLVMContext &Ctx,
+             ImageTyAccessQualParam AccessQual = ImageAccessQualReadOnly);
+
+} // namespace tgtext
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_TARGET_EXTENSION_TYPES_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h
new file mode 100644
index 0000000000000..88dd7a6fb0c50
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/unique_opaque_structs_pass.h
@@ -0,0 +1,55 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Make opaque structure types unique.
+
+#ifndef COMPILER_UTILS_UNIQUE_OPAQUE_STRUCTS_PASS_H_INCLUDED
+#define COMPILER_UTILS_UNIQUE_OPAQUE_STRUCTS_PASS_H_INCLUDED
+
+#include
+
+namespace compiler {
+namespace utils {
+
+/// @addtogroup utils
+/// @{
+
+/// @brief This pass replaces instances of suffixed opaque structure types
+/// with unsuffixed versions if an unsuffixed version exists in the context.
+///
+/// When linking together two modules that declare the same opaque struct
+/// type, or deserializing a module referencing an opaque struct type in a
+/// context that already contains an opaque type with the same name, LLVM
+/// will attempt to resolve the clash by appending a suffix to the name in
+/// the module. For example, deserializing a module referencing the
+/// opencl.event_t type in a context that already has this type will result
+/// in the references all being renamed to opencl.event_t.0. This is
+/// problematic if passes rely on the names of structs to identify them.
+/// This pass can be used to resolve this issue by searching for
+/// problematic types and replacing them with their unsuffixed version.
+class UniqueOpaqueStructsPass
+    : public llvm::PassInfoMixin {
+public:
+  UniqueOpaqueStructsPass() = default;
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+};
+
+} // namespace utils
+} // namespace compiler
+
+#endif // COMPILER_UTILS_UNIQUE_OPAQUE_STRUCTS_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
new file mode 100644
index 0000000000000..21c7b62dff496
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/compiler/utils/work_item_loops_pass.h
@@ -0,0 +1,116 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// Work-item loops pass, splitting into "barrier regions"
+
+#ifndef COMPILER_UTILS_WORK_ITEM_LOOPS_PASS_H_INCLUDED
+#define COMPILER_UTILS_WORK_ITEM_LOOPS_PASS_H_INCLUDED
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace llvm {
+class DominatorTree;
+}
+
+namespace compiler {
+namespace utils {
+
+class BuiltinInfo;
+class BarrierWithLiveVars;
+
+struct WorkItemLoopsPassOptions {
+  /// @brief Set to true if the pass should add extra alloca
+  /// instructions to preserve the values of variables between barriers.
+  bool IsDebug = false;
+  /// @brief Set to true if the pass should forcibly omit scalar
+  /// tail loops from wrapped vector kernels, even if the local work-group size
+  /// is not known to be a multiple of the vectorization factor.
+  bool ForceNoTail = false;
+};
+
+/// @brief The "work-item loops" pass.
+///
+/// This pass adds loops around implicitly SIMT kernels such that the original
+/// kernel is wrapped in a new function that runs over each work-item in the
+/// work-group and calls the original kernel: the scheduling model thus becomes
+/// explicit.
+///
+/// The work-item loops pass assumes that:
+///
+/// * Any functions containing barrier-like functions have already been inlined
+/// into the kernel entry points
+/// * the IDs of pairs of barrier-like functions align between 'main' and
+/// 'tail' kernels.
+///
+/// Both of these can be achieved by first running the PrepareBarriersPass.
+///
+/// The pass will query a kernel function for the `reqd_work_group_size`
+/// metadata and optimize accordingly in the presence of it.
+///
+/// Runs over all kernels with "kernel entry point" metadata. Work-item orders
+/// are sourced from the "work item order" function metadata on each kernel.
+class WorkItemLoopsPass final : public llvm::PassInfoMixin {
+public:
+  /// @brief Constructor.
+  WorkItemLoopsPass(const WorkItemLoopsPassOptions &Options)
+      : IsDebug(Options.IsDebug), ForceNoTail(Options.ForceNoTail) {}
+
+  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &);
+
+private:
+  /// @brief Make the work-item-loop wrapper function.
+  /// This creates a wrapper function that iterates over a work group, calling
+  /// the kernel for each work item, respecting the semantics of any barriers
+  /// present. The wrapped kernel may be a scalar kernel, a vectorized kernel,
+  /// or both. When the wrapped kernel wraps both a vector and scalar kernel,
+  /// all vectorized work items will be executed first, and the scalar tail
+  /// last.
+  ///
+  /// The wrapper function is created as a new function suffixed by
+  /// ".mux-barrier-wrapper". The original unwrapped kernel(s) will be left in
+  /// the Module, but given internal linkage so later passes can remove
+  /// them, if uncalled, once inlined into the wrapper function.
+  ///
+  /// When wrapping only a scalar kernel, or only a vector kernel, pass the
+  /// same Barrier object as both Barrier input parameters.
+  ///
+  /// @param[in] barrierMain the Barrier object of the main kernel function
+  /// @param[in] barrierTail the Barrier object of the tail kernel function
+  /// (may be nullptr).
+ /// @param[in] baseName the base name to use on the new wrapper function + /// @param[in] M the module the kernels live in + /// @param[in] BI BuiltinInfo providing builtin information + /// @return The new wrapper function + llvm::Function *makeWrapperFunction(BarrierWithLiveVars &barrierMain, + BarrierWithLiveVars *barrierTail, + llvm::StringRef baseName, llvm::Module &M, + BuiltinInfo &BI); + + const bool IsDebug; + const bool ForceNoTail; +}; +} // namespace utils +} // namespace compiler + +#endif // COMPILER_UTILS_WORK_ITEM_LOOPS_PASS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h new file mode 100644 index 0000000000000..8da2fdcae20dd --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.h @@ -0,0 +1,54 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef MULTI_LLVM_MULTI_INSTRUCTIONS_H_INCLUDED +#define MULTI_LLVM_MULTI_INSTRUCTIONS_H_INCLUDED + +#include +#include + +namespace multi_llvm { + +namespace detail { + +template +struct BinOpHelper; + +// TODO Make this entirely version-based once we no longer have to account for +// older LLVM 21 snapshots that use the LLVM 20 definition of +// llvm::AtomicRMWInst::BinOp. +#define LLVM 21 +#include +#undef LLVM +#define LLVM 20 +#include +#undef LLVM + +} // namespace detail + +static std::optional +consume_binop_with_underscore(llvm::StringRef &String) { + return multi_llvm::detail::BinOpHelper<>::consume_front_with_underscore( + String); +} + +static llvm::StringRef to_string(llvm::AtomicRMWInst::BinOp BinOp) { + return multi_llvm::detail::BinOpHelper<>::to_string(BinOp); +} + +} // namespace multi_llvm + +#endif // MULTI_LLVM_MULTI_INSTRUCTIONS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc new file mode 100644 index 0000000000000..787822d16859b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc @@ -0,0 +1,76 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc
new file mode 100644
index 0000000000000..787822d16859b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/instructions.inc
@@ -0,0 +1,76 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#if LLVM == 20
+template
+struct BinOpHelper>
+#define BINOP_LLVM21(OP, STR)
+#elif LLVM == 21
+template
+struct BinOpHelper
+#define BINOP_LLVM21(OP, STR) BINOP(OP, STR)
+#endif
+{
+#define BINOPS()                         \
+  BINOP(Xchg, "xchg")                    \
+  BINOP(Add, "add")                      \
+  BINOP(Sub, "sub")                      \
+  BINOP(And, "and")                      \
+  BINOP(Nand, "nand")                    \
+  BINOP(Or, "or")                        \
+  BINOP(Xor, "xor")                      \
+  BINOP(Max, "max")                      \
+  BINOP(Min, "min")                      \
+  BINOP(UMax, "umax")                    \
+  BINOP(UMin, "umin")                    \
+  BINOP(FAdd, "fadd")                    \
+  BINOP(FSub, "fsub")                    \
+  BINOP(FMax, "fmax")                    \
+  BINOP(FMin, "fmin")                    \
+  BINOP_LLVM21(FMaximum, "fmaximum")     \
+  BINOP_LLVM21(FMinimum, "fminimum")     \
+  BINOP(UIncWrap, "uincwrap")            \
+  BINOP(UDecWrap, "udecwrap")            \
+  BINOP(USubCond, "usubcond")            \
+  BINOP(USubSat, "usubsat")
+
+  static std::optional<T>
+  consume_front_with_underscore(llvm::StringRef &String) {
+#define BINOP(BINOP, STR)               \
+  if (String.consume_front(STR "_")) { \
+    return T::BINOP;                    \
+  }
+    BINOPS()
+#undef BINOP
+    return std::nullopt;
+  }
+
+  static llvm::StringRef to_string(T BinOp) {
+    switch (BinOp) {
+#define BINOP(BINOP, STR) \
+  case T::BINOP:          \
+    return STR;
+      BINOPS()
+#undef BINOP
+      case T::BAD_BINOP:
+        break;
+    }
+    llvm_unreachable("Unexpected BinOp");
+  }
+
+#undef BINOPS
+#undef BINOP_LLVM21
+};
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
new file mode 100644
index 0000000000000..cecbb7f02ddae
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/intrinsic.h
@@ -0,0 +1,49 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef MULTI_LLVM_MULTI_INTRINSIC_H_INCLUDED
+#define MULTI_LLVM_MULTI_INTRINSIC_H_INCLUDED
+
+#include
+#include
+
+namespace multi_llvm {
+
+// Drop getAttributes workaround when LLVM 21 is minimum version
+namespace detail {
+template <typename... T>
+auto getAttributes(T... args)
+    -> decltype(llvm::Intrinsic::getAttributes(args...)) {
+  return llvm::Intrinsic::getAttributes(args...);
+}
+template <typename... T>
+auto getAttributes(T... args, llvm::FunctionType *)
+    -> decltype(llvm::Intrinsic::getAttributes(args...)) {
+  return llvm::Intrinsic::getAttributes(args...);
+}
+} // namespace detail
+
+namespace Intrinsic {
+static inline auto getAttributes(llvm::LLVMContext &C, llvm::Intrinsic::ID ID,
+                                 llvm::FunctionType *FT) {
+  return detail::getAttributes<llvm::LLVMContext &, llvm::Intrinsic::ID>(C, ID,
+                                                                         FT);
+}
+} // namespace Intrinsic
+
+} // namespace multi_llvm
+
+#endif // MULTI_LLVM_MULTI_INTRINSIC_H_INCLUDED
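The wrapper above always accepts the newer three-argument form; on older LLVM the detail overload drops the trailing FunctionType. A minimal sketch of a call site (the intrinsic chosen here is an arbitrary example):

llvm::AttributeList getIntrinsicAttrs(llvm::LLVMContext &Ctx,
                                      llvm::FunctionType *FT) {
  return multi_llvm::Intrinsic::getAttributes(Ctx, llvm::Intrinsic::vscale,
                                              FT);
}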
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h
new file mode 100644
index 0000000000000..802471f4562cc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/llvm_version.h
@@ -0,0 +1,38 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#ifndef MULTI_LLVM_LLVM_VERSION_H_INCLUDED
+#define MULTI_LLVM_LLVM_VERSION_H_INCLUDED
+
+#include <llvm/Config/llvm-config.h>
+
+#define LLVM_VERSION_EQUAL(MAJOR, MINOR) \
+  (LLVM_VERSION_MAJOR == (MAJOR) && LLVM_VERSION_MINOR == (MINOR))
+
+#define LLVM_VERSION_LESS(MAJOR, MINOR) \
+  ((LLVM_VERSION_MAJOR < (MAJOR)) ||    \
+   (LLVM_VERSION_MAJOR == (MAJOR) && LLVM_VERSION_MINOR < (MINOR)))
+
+#define LLVM_VERSION_LESS_EQUAL(MAJOR, MINOR) \
+  (LLVM_VERSION_EQUAL(MAJOR, MINOR) || LLVM_VERSION_LESS(MAJOR, MINOR))
+
+#define LLVM_VERSION_GREATER(MAJOR, MINOR) \
+  ((LLVM_VERSION_MAJOR > (MAJOR)) ||       \
+   (LLVM_VERSION_MAJOR == (MAJOR) && LLVM_VERSION_MINOR > (MINOR)))
+
+#define LLVM_VERSION_GREATER_EQUAL(MAJOR, MINOR) \
+  (LLVM_VERSION_EQUAL(MAJOR, MINOR) || LLVM_VERSION_GREATER(MAJOR, MINOR))
+
+#endif // MULTI_LLVM_LLVM_VERSION_H_INCLUDED
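Editor's sketch of how these macros are meant to be used to guard API differences; the branch bodies are placeholders:

#include <multi_llvm/llvm_version.h>

#if LLVM_VERSION_GREATER_EQUAL(21, 0)
// ... call the LLVM 21+ form of an interface ...
#else
// ... fall back to the older form ...
#endif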
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
new file mode 100644
index 0000000000000..ea350fd4bdec2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/multi_llvm.h
@@ -0,0 +1,22 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef MULTI_LLVM_MULTI_LLVM_H_INCLUDED
+#define MULTI_LLVM_MULTI_LLVM_H_INCLUDED
+
+#include
+
+#endif // MULTI_LLVM_MULTI_LLVM_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h
new file mode 100644
index 0000000000000..6d8e608b860bd
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/target_transform_info.h
@@ -0,0 +1,74 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef MULTI_LLVM_TARGET_TRANSFORM_INFO_H_INCLUDED
+#define MULTI_LLVM_TARGET_TRANSFORM_INFO_H_INCLUDED
+
+#include
+#include
+
+namespace multi_llvm {
+
+namespace detail {
+
+template <typename TargetTransformInfo>
+auto isLegalMaskedLoadImpl(const TargetTransformInfo &TTI, llvm::Type *Ty,
+                           llvm::Align Alignment, unsigned)
+    -> decltype(TTI.isLegalMaskedLoad(Ty, Alignment)) {
+  return TTI.isLegalMaskedLoad(Ty, Alignment);
+}
+
+template <typename TargetTransformInfo>
+auto isLegalMaskedStoreImpl(const TargetTransformInfo &TTI, llvm::Type *Ty,
+                            llvm::Align Alignment, unsigned)
+    -> decltype(TTI.isLegalMaskedStore(Ty, Alignment)) {
+  return TTI.isLegalMaskedStore(Ty, Alignment);
+}
+
+#if LLVM_VERSION_GREATER_EQUAL(21, 0)
+// TODO: Make this depend only on LLVM version once we do not have to remain
+// compatible with slightly older LLVM 21 snapshots.
+
+template <typename TargetTransformInfo>
+auto isLegalMaskedLoadImpl(const TargetTransformInfo &TTI, llvm::Type *Ty,
+                           llvm::Align Alignment, unsigned AddrSpace)
+    -> decltype(TTI.isLegalMaskedLoad(Ty, Alignment, AddrSpace)) {
+  return TTI.isLegalMaskedLoad(Ty, Alignment, AddrSpace);
+}
+
+template <typename TargetTransformInfo>
+auto isLegalMaskedStoreImpl(const TargetTransformInfo &TTI, llvm::Type *Ty,
+                            llvm::Align Alignment, unsigned AddrSpace)
+    -> decltype(TTI.isLegalMaskedStore(Ty, Alignment, AddrSpace)) {
+  return TTI.isLegalMaskedStore(Ty, Alignment, AddrSpace);
+}
+#endif
+
+} // namespace detail
+
+inline bool isLegalMaskedLoad(const llvm::TargetTransformInfo &TTI,
+                              llvm::Type *Ty, llvm::Align Alignment,
+                              unsigned AddrSpace) {
+  return detail::isLegalMaskedLoadImpl(TTI, Ty, Alignment, AddrSpace);
+}
+
+inline bool isLegalMaskedStore(const llvm::TargetTransformInfo &TTI,
+                               llvm::Type *Ty, llvm::Align Alignment,
+                               unsigned AddrSpace) {
+  return detail::isLegalMaskedStoreImpl(TTI, Ty, Alignment, AddrSpace);
+}
+
+} // namespace multi_llvm
+
+#endif // MULTI_LLVM_TARGET_TRANSFORM_INFO_H_INCLUDED
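Editor's sketch of a call site: the address space is always passed, and the detail overloads above select whichever TTI signature the LLVM in use provides. The alignment value here is an arbitrary example:

bool canUseMaskedLoad(const llvm::TargetTransformInfo &TTI,
                      llvm::Type *VecTy, unsigned AddrSpace) {
  return multi_llvm::isLegalMaskedLoad(TTI, VecTy, llvm::Align(16),
                                       AddrSpace);
}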
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h
new file mode 100644
index 0000000000000..576b04f284d8e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/targetinfo.h
@@ -0,0 +1,58 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef MULTI_LLVM_TARGET_TARGETINFO_H_INCLUDED
+#define MULTI_LLVM_TARGET_TARGETINFO_H_INCLUDED
+
+#include
+#include
+
+namespace multi_llvm {
+
+namespace detail {
+
+#if LLVM_VERSION_GREATER_EQUAL(21, 0)
+
+template <typename TargetInfo = clang::TargetInfo>
+auto createTargetInfo(clang::DiagnosticsEngine &Diags,
+                      clang::TargetOptions &Opts)
+    -> decltype(TargetInfo::CreateTargetInfo(Diags, Opts)) {
+  return TargetInfo::CreateTargetInfo(Diags, Opts);
+}
+
+#endif
+
+template <typename TargetInfo = clang::TargetInfo>
+auto createTargetInfo(clang::DiagnosticsEngine &Diags,
+                      clang::TargetOptions &Opts)
+    -> decltype(TargetInfo::CreateTargetInfo(
+        Diags, std::make_shared<clang::TargetOptions>(Opts))) {
+  return TargetInfo::CreateTargetInfo(
+      Diags, std::make_shared<clang::TargetOptions>(Opts));
+}
+
+} // namespace detail
+
+struct TargetInfo {
+  static clang::TargetInfo *CreateTargetInfo(clang::DiagnosticsEngine &Diags,
+                                             clang::TargetOptions &Opts) {
+    return multi_llvm::detail::createTargetInfo(Diags, Opts);
+  }
+};
+
+} // namespace multi_llvm
+
+#endif // MULTI_LLVM_TARGET_TARGETINFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h
new file mode 100644
index 0000000000000..d13b9d531b8a9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/include/multi_llvm/vector_type_helper.h
@@ -0,0 +1,69 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#ifndef MULTI_LLVM_VECTOR_TYPE_HELPER_H_INCLUDED
+#define MULTI_LLVM_VECTOR_TYPE_HELPER_H_INCLUDED
+
+#include
+#include
+#include
+
+namespace multi_llvm {
+
+// The functions defined below are common functions to allow us to generically
+// get VectorType information from a base Type class, due to either deprecation
+// or removal of these in LLVM 11 (result of scalable/fixed vectors separation)
+
+inline llvm::Type *getVectorElementType(llvm::Type *ty) {
+  assert(llvm::isa<llvm::VectorType>(ty) && "Not a vector type");
+  return llvm::cast<llvm::VectorType>(ty)->getElementType();
+}
+inline llvm::Type *getVectorElementType(const llvm::Type *ty) {
+  assert(llvm::isa<llvm::VectorType>(ty) && "Not a vector type");
+  return llvm::cast<llvm::VectorType>(ty)->getElementType();
+}
+
+inline uint64_t getVectorNumElements(llvm::Type *ty) {
+  assert(ty->getTypeID() == llvm::Type::FixedVectorTyID &&
+         "Not a fixed vector type");
+  return llvm::cast<llvm::FixedVectorType>(ty)
+      ->getElementCount()
+      .getFixedValue();
+}
+inline uint64_t getVectorNumElements(const llvm::Type *ty) {
+  assert(ty->getTypeID() == llvm::Type::FixedVectorTyID &&
+         "Not a fixed vector type");
+  return llvm::cast<llvm::FixedVectorType>(ty)
+      ->getElementCount()
+      .getFixedValue();
+}
+
+inline llvm::ElementCount getVectorElementCount(llvm::Type *ty) {
+  return llvm::cast<llvm::VectorType>(ty)->getElementCount();
+}
+inline llvm::ElementCount getVectorElementCount(const llvm::Type *ty) {
+  return llvm::cast<llvm::VectorType>(ty)->getElementCount();
+}
+
+inline unsigned getVectorKnownMinNumElements(llvm::Type *ty) {
+  return getVectorElementCount(ty).getKnownMinValue();
+}
+
+inline unsigned getVectorKnownMinNumElements(const llvm::Type *ty) {
+  return getVectorElementCount(ty).getKnownMinValue();
+}
+} // namespace multi_llvm
+
+#endif // MULTI_LLVM_VECTOR_TYPE_HELPER_H_INCLUDED
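Editor's illustration of the helpers above on fixed and scalable vectors (a sketch, not part of the patch):

#include <multi_llvm/vector_type_helper.h>

#include <llvm/IR/DerivedTypes.h>

void vectorQueries(llvm::LLVMContext &Ctx) {
  auto *FloatTy = llvm::Type::getFloatTy(Ctx);
  auto *Fixed = llvm::FixedVectorType::get(FloatTy, 4);        // <4 x float>
  auto *Scalable = llvm::ScalableVectorType::get(FloatTy, 4);  // <vscale x 4 x float>

  // Both have a known-minimum element count of 4 ...
  unsigned MinF = multi_llvm::getVectorKnownMinNumElements(Fixed);     // 4
  unsigned MinS = multi_llvm::getVectorKnownMinNumElements(Scalable);  // 4
  // ... but only the fixed type may be given to getVectorNumElements.
  uint64_t N = multi_llvm::getVectorNumElements(Fixed);  // 4
  (void)MinF;
  (void)MinS;
  (void)N;
}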
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
new file mode 100644
index 0000000000000..98d63c713e0d4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/attributes.cpp
@@ -0,0 +1,206 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace compiler {
+namespace utils {
+using namespace llvm;
+
+static constexpr const char *MuxKernelAttrName = "mux-kernel";
+
+void setIsKernel(Function &F) { F.addFnAttr(MuxKernelAttrName, ""); }
+
+void setIsKernelEntryPt(Function &F) {
+  F.addFnAttr(MuxKernelAttrName, "entry-point");
+}
+
+bool isKernel(const Function &F) {
+  return F.getFnAttribute(MuxKernelAttrName).isValid();
+}
+
+bool isKernelEntryPt(const Function &F) {
+  const Attribute Attr = F.getFnAttribute(MuxKernelAttrName);
+  if (Attr.isValid()) {
+    return Attr.getValueAsString() == "entry-point";
+  }
+  return false;
+}
+
+void dropIsKernel(Function &F) { F.removeFnAttr(MuxKernelAttrName); }
+
+void takeIsKernel(Function &ToF, Function &FromF) {
+  if (!isKernel(FromF)) {
+    return;
+  }
+  // Check whether we need to add entry-point data.
+  const bool IsEntryPt = isKernelEntryPt(FromF);
+  // Drop all data for simplicity
+  dropIsKernel(ToF);
+  dropIsKernel(FromF);
+  // Add the new data
+  IsEntryPt ? setIsKernelEntryPt(ToF) : setIsKernel(ToF);
+}
+
+static StringRef getFnNameFromAttr(const Function &F, StringRef AttrName) {
+  const Attribute Attr = F.getFnAttribute(AttrName);
+  if (Attr.isValid()) {
+    return Attr.getValueAsString();
+  }
+  return "";
+}
+
+static constexpr const char *OrigFnNameAttr = "mux-orig-fn";
+
+void setOrigFnName(Function &F) { F.addFnAttr(OrigFnNameAttr, F.getName()); }
+
+StringRef getOrigFnName(const Function &F) {
+  return getFnNameFromAttr(F, OrigFnNameAttr);
+}
+
+StringRef getOrigFnNameOrFnName(const Function &F) {
+  auto N = getFnNameFromAttr(F, OrigFnNameAttr);
+  return N.empty() ? F.getName() : N;
+}
+
+static constexpr const char *BaseFnNameAttr = "mux-base-fn-name";
+
+void setBaseFnName(Function &F, StringRef N) { F.addFnAttr(BaseFnNameAttr, N); }
+
+StringRef getBaseFnName(const Function &F) {
+  return getFnNameFromAttr(F, BaseFnNameAttr);
+}
+
+StringRef getBaseFnNameOrFnName(const Function &F) {
+  auto N = getFnNameFromAttr(F, BaseFnNameAttr);
+  return N.empty() ? F.getName() : N;
+}
+
+StringRef getOrSetBaseFnName(Function &F, const Function &SetFromF) {
+  const Attribute Attr = F.getFnAttribute(BaseFnNameAttr);
+  if (Attr.isValid()) {
+    return Attr.getValueAsString();
+  }
+
+  // Try and peer through the original function's name
+  StringRef BaseFnName = getBaseFnNameOrFnName(SetFromF);
+  setBaseFnName(F, BaseFnName);
+  return BaseFnName;
+}
+
+static std::optional<int> getStringFnAttrAsInt(const Attribute &Attr) {
+  if (Attr.isValid()) {
+    int AttrValue = 0;
+    if (!Attr.getValueAsString().getAsInteger(10, AttrValue)) {
+      return AttrValue;
+    }
+  }
+  return std::nullopt;
+}
+
+static constexpr const char *LocalMemUsageAttrName = "mux-local-mem-usage";
+
+void setLocalMemoryUsage(Function &F, uint64_t LocalMemUsage) {
+  const Attribute Attr = Attribute::get(F.getContext(), LocalMemUsageAttrName,
+                                        itostr(LocalMemUsage));
+  F.addFnAttr(Attr);
+}
+
+std::optional<uint64_t> getLocalMemoryUsage(const Function &F) {
+  const Attribute Attr = F.getFnAttribute(LocalMemUsageAttrName);
+  auto Val = getStringFnAttrAsInt(Attr);
+  // Only return non-negative integers
+  return Val && Val >= 0 ? std::optional<uint64_t>(*Val) : std::nullopt;
+}
+
+static constexpr const char *DMAReqdSizeBytesAttrName = "mux-dma-reqd-size";
+
+void setDMAReqdSizeBytes(Function &F, uint32_t DMASizeBytes) {
+  const Attribute Attr = Attribute::get(
+      F.getContext(), DMAReqdSizeBytesAttrName, itostr(DMASizeBytes));
+  F.addFnAttr(Attr);
+}
+
+std::optional<uint32_t> getDMAReqdSizeBytes(const Function &F) {
+  const Attribute Attr = F.getFnAttribute(DMAReqdSizeBytesAttrName);
+  auto Val = getStringFnAttrAsInt(Attr);
+  // Only return non-negative integers
+  return Val && Val >= 0 ? std::optional<uint32_t>(*Val) : std::nullopt;
+}
+
+static constexpr const char *BarrierScheduleAttrName = "mux-barrier-schedule";
+
+void setBarrierSchedule(CallInst &CI, BarrierSchedule Sched) {
+  StringRef Val = "unknown";
+  switch (Sched) {
+    case BarrierSchedule::Unordered:
+      Val = "unordered";
+      break;
+    case BarrierSchedule::Once:
+      Val = "once";
+      break;
+    case BarrierSchedule::ScalarTail:
+      Val = "scalar-tail";
+      break;
+    case BarrierSchedule::Linear:
+      Val = "linear";
+      break;
+  }
+
+  const Attribute Attr =
+      Attribute::get(CI.getContext(), BarrierScheduleAttrName, Val);
+  CI.addFnAttr(Attr);
+}
+
+BarrierSchedule getBarrierSchedule(const CallInst &CI) {
+  const Attribute Attr = CI.getFnAttr(BarrierScheduleAttrName);
+  if (Attr.isValid()) {
+    return StringSwitch<BarrierSchedule>(Attr.getValueAsString())
+        .Case("once", BarrierSchedule::Once)
+        .Case("scalar-tail", BarrierSchedule::ScalarTail)
+        .Case("linear", BarrierSchedule::Linear)
+        .Default(BarrierSchedule::Unordered);
+  }
+  return BarrierSchedule::Unordered;
+}
+
+static constexpr const char *MuxNoSubgroupsAttrName = "mux-no-subgroups";
+
+void setHasNoExplicitSubgroups(Function &F) {
+  F.addFnAttr(MuxNoSubgroupsAttrName);
+}
+
+bool hasNoExplicitSubgroups(const Function &F) {
+  const Attribute Attr = F.getFnAttribute(MuxNoSubgroupsAttrName);
+  return Attr.isValid();
+}
+
+unsigned getMuxSubgroupSize(const llvm::Function &) {
+  // FIXME: The mux sub-group size is currently assumed to be 1 for all
+  // functions, kernels, and targets. This helper function is just to avoid
+  // hard-coding the constant 1 in places that will eventually need updating.
+  return 1;
+}
+} // namespace utils
+} // namespace compiler
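Editor's sketch of how the string-attribute helpers defined above compose in a pass; the function names are from this file, while the scenario is invented:

void tagKernel(llvm::Function &F, llvm::Function &Wrapper) {
  compiler::utils::setIsKernelEntryPt(F);  // "mux-kernel"="entry-point"
  compiler::utils::setLocalMemoryUsage(F, 1024);
  if (compiler::utils::isKernelEntryPt(F)) {
    // Transfer the kernel marking onto a wrapper, e.g. after outlining;
    // F keeps its code but loses the "mux-kernel" attribute.
    compiler::utils::takeIsKernel(Wrapper, F);
  }
  // getLocalMemoryUsage(F) still returns 1024; that attribute is untouched.
}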
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
new file mode 100644
index 0000000000000..df6cf77da1b8e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/barrier_regions.cpp
@@ -0,0 +1,1467 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+using namespace llvm;
+
+#define NDEBUG_BARRIER
+#define DEBUG_TYPE "barrier-regions"
+
+namespace {
+using AlignIntTy = uint64_t;
+
+/// @brief Returns the work-group collective information if the instruction
+/// is a call to a work-group collective builtin, and std::nullopt otherwise.
+std::optional<compiler::utils::GroupCollective>
+getWorkGroupCollectiveCall(Instruction *inst,
+                           compiler::utils::BuiltinInfo &bi) {
+  auto *const ci = dyn_cast_or_null<CallInst>(inst);
+  if (!ci) {
+    return std::nullopt;
+  }
+
+  if (Function *callee = ci->getCalledFunction()) {
+    if (const auto b = bi.analyzeBuiltin(*callee)) {
+      const auto info = bi.isMuxGroupCollective(b->ID);
+      if (info && info->isWorkGroupScope()) {
+        return info;
+      }
+    }
+  }
+  return std::nullopt;
+}
+
+/// @brief Builds a stub function containing only a void return instruction.
+///
+/// @note This is useful for client debuggers that want to break on a
+/// particular barrier and work item. Customer specific passes can fill the
+/// contents since it may involve inline assembly for breakpoint traps. The
+/// stub function takes a single i32 argument which is an id identifying the
+/// barrier which invoked the stub. A client debugger should be able to read
+/// this argument using the arch calling convention even without debug info
+/// since it's always the first argument, although customer passes may
+/// rearrange parameters later.
+///
+/// @param[in] name What to name the stub function.
+/// @param[in] module Current module.
+/// @param[in] cc Calling convention for function.
+///
+/// @return Return function created.
+Function *MakeStubFunction(StringRef name, Module &module,
+                           CallingConv::ID cc) {
+  // If we've already created a stub, return the existing function
+  if (Function *existing = module.getFunction(name)) {
+    return existing;
+  }
+
+  auto &context = module.getContext();
+  // 32-bit integer parameter
+  IntegerType *int32_type = IntegerType::get(context, 32);
+  // Function returns void
+  FunctionType *func_type =
+      FunctionType::get(Type::getVoidTy(context), {int32_type}, false);
+
+  // Create function in module
+  Function *stub_func =
+      Function::Create(func_type, Function::ExternalLinkage, name, &module);
+
+  // Don't inline the function since we want the debugger to be able to hook it
+  stub_func->addFnAttr(Attribute::NoInline);
+
+  // We don't use exceptions
+  stub_func->addFnAttr(Attribute::NoUnwind);
+  stub_func->setCallingConv(cc);
+
+  // No stub or cloned function should have SPIR_KERNEL calling convention.
+  // Please consider using SPIR_FUNC instead of SPIR_KERNEL. In case the
+  // original code has a different calling convention, we should preserve that
+  // one.
+  assert(cc != CallingConv::SPIR_KERNEL && "calling convention mismatch");
+
+  // Single basic block containing only a return void instruction
+  IRBuilder<> IRBuilder(BasicBlock::Create(context, "entry", stub_func));
+  IRBuilder.CreateRetVoid();
+
+  // Build debug info for function if compiled with -g
+  DIBuilder DIB(module, /*AllowUnresolved*/ false);
+
+  // Find module compile unit
+  auto *cu = DIB.createCompileUnit(
+      dwarf::DW_LANG_OpenCL, DIB.createFile("debug", "/"), "", false, "", 0);
+
+  // Create DISubprogram metadata for function
+  auto type_array =
+      DIB.getOrCreateTypeArray({DIB.createUnspecifiedParameter()});
+  auto subprogram_type = DIB.createSubroutineType(type_array);
+  auto DISubprogram = DIB.createFunction(
+      cu->getFile(), name, name, cu->getFile(), 0, subprogram_type, 0,
+      DINode::FlagZero, DISubprogram::SPFlagDefinition);
+
+  // Set function compile unit
+  DISubprogram->replaceUnit(cu);
+
+  // Assign debug info to function
+  stub_func->setSubprogram(DISubprogram);
+
+  DIB.finalize();
+
+  return stub_func;
+}
+
+/// @brief Check whether this value is valid as def.
+///
+/// @param[in] v Value for checking.
+///
+/// @return True = valid for definition, False = not valid.
+inline bool CheckValidDef(Value *v) {
+  return !(isa(v) || isa(v));
+}
+
+/// @brief Check whether this value is valid as use.
+///
+/// @param[in] v Value for checking.
+///
+/// @return True = valid for use, False = not valid.
+inline bool CheckValidUse(Value *v) {
+  return !(isa(v) || isa(v) || isa(v));
+}
+
+bool IsRematerializableBuiltinCall(Value *v, compiler::utils::BuiltinInfo &bi) {
+  if (auto *call = dyn_cast<CallInst>(v)) {
+    if (auto *F = call->getCalledFunction()) {
+      if (const auto B = bi.analyzeBuiltin(*F)) {
+        if (B->properties & compiler::utils::eBuiltinPropertyRematerializable) {
+          for (auto &op : call->operands()) {
+            if (isa<Instruction>(op.get())) {
+              return false;
+            }
+          }
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+// It traces through instructions with a single Instruction operand, looking
+// for work item functions or function arguments.
+bool IsTrivialValue(Value *v, unsigned depth,
+                    compiler::utils::BuiltinInfo &bi) {
+  while (depth--) {
+    auto *const I = dyn_cast<Instruction>(v);
+    if (!I || IsRematerializableBuiltinCall(v, bi)) {
+      return true;
+    }
+
+    // Pass through a vector splat to the splatted value
+    if (auto *const shuffle = dyn_cast<ShuffleVectorInst>(I)) {
+      if (shuffle->isZeroEltSplat()) {
+        if (auto *const ins =
+                dyn_cast<InsertElementInst>(shuffle->getOperand(0))) {
+          if (auto *const src = dyn_cast<Instruction>(ins->getOperand(1))) {
+            v = src;
+            continue;
+          } else {
+            // Splat of a non-Instruction (i.e. an Argument)
+            return true;
+          }
+        }
+      }
+      return false;
+    }
+
+    // Consider only certain trivial operations
+    if (!I->isBinaryOp() && !I->isCast() && !I->isUnaryOp()) {
+      return false;
+    }
+
+    Value *chain = nullptr;
+    for (auto *op : I->operand_values()) {
+      if (auto *const opI = dyn_cast<Instruction>(op)) {
+        if (!chain) {
+          chain = opI;
+        } else if (chain != op) {
+          // It's non-trivial if it has more than one Instruction operand.
+          return false;
+        }
+      }
+    }
+
+    // It's trivial if it didn't have any operands that were instructions.
+    if (!chain) {
+      return true;
+    }
+
+    v = chain;
+  }
+  return false;
+}
+
+// GEPs typically have a low cost, allow up to 1 non-trivial operand
+// (including the pointer operand as well as the indices).
+bool IsTrivialGEP(Value *v, SmallVectorImpl<Value *> &operands) {
+  auto *const GEP = dyn_cast<GetElementPtrInst>(v);
+  if (!GEP) {
+    return false;
+  }
+
+  unsigned inst_ops = 0;
+  for (auto *op : GEP->operand_values()) {
+    if (isa<Instruction>(op) && ++inst_ops > 1) {
+      return false;
+    }
+  }
+
+  for (auto *op : GEP->operand_values()) {
+    if (isa<Instruction>(op)) {
+      operands.push_back(op);
+    }
+  }
+  return true;
+}
+
+/// @brief Update all basic block edges for PHINodes, and drop edges from
+/// basic blocks that are not in the new function (which only consists of
+/// the subset of blocks that make up one region).
+///
+/// @param[in] BB Basic block to process.
+/// @param[in] vmap Map for value for cloning.
+void UpdateAndTrimPHINodeEdges(BasicBlock *BB, ValueToValueMapTy &vmap) {
+  for (auto &phi : BB->phis()) {
+    for (unsigned i = 0; i < phi.getNumIncomingValues(); i++) {
+      const BasicBlock *incoming_bb = phi.getIncomingBlock(i);
+
+      // If the incoming basic block was processed during cloning then
+      // update the edge, if it wasn't then it is not in the region so
+      // remove it.
+      if (vmap.count(incoming_bb)) {
+        Value *updated_bb = vmap[incoming_bb];
+        phi.setIncomingBlock(i, cast<BasicBlock>(updated_bb));
+      } else {
+        // Note: Updating the loop iterator to reflect the updated
+        // post-deletion indices.
+        phi.removeIncomingValue(i--);
+      }
+    }
+  }
+}
+
+/// @brief Returns true if the type is a struct type containing any scalable
+/// vectors in its list of elements
+bool isStructWithScalables(Type *ty) {
+  if (auto *const struct_ty = dyn_cast<StructType>(ty)) {
+    return any_of(struct_ty->elements(),
+                  [](Type *ty) { return isa<ScalableVectorType>(ty); });
+  }
+  return false;
+}
+
+} // namespace
+
+Value *compiler::utils::Barrier::LiveValuesHelper::getExtractValueGEP(
+    const Value *live) {
+  if (auto *const extract = dyn_cast<ExtractValueInst>(live)) {
+    // We can't handle extracts with multiple indices
+    if (extract->getIndices().size() == 1) {
+      return getGEP(extract->getAggregateOperand(), extract->getIndices()[0]);
+    }
+  }
+  return nullptr;
+}
+
+Value *compiler::utils::Barrier::LiveValuesHelper::getGEP(const Value *live,
+                                                          unsigned member_idx) {
+  auto key = std::make_pair(live, member_idx);
+  if (auto gep_it = live_GEPs.find(key); gep_it != live_GEPs.end()) {
+    return gep_it->second;
+  }
+
+  Value *gep;
+
+  if (auto field_it = barrier.live_variable_index_map_.find(key);
+      field_it != barrier.live_variable_index_map_.end()) {
+    LLVMContext &context = barrier.module_.getContext();
+    const unsigned field_index = field_it->second;
+    Value *live_variable_info_idxs[2] = {
+        ConstantInt::get(Type::getInt32Ty(context), 0),
+        ConstantInt::get(Type::getInt32Ty(context), field_index)};
+
+    gep = gepBuilder.CreateInBoundsGEP(barrier.live_var_mem_ty_, barrier_struct,
+                                       live_variable_info_idxs,
+                                       Twine("live_gep_") + live->getName());
+  } else if (auto field_it = barrier.live_variable_scalables_map_.find(key);
+             field_it != barrier.live_variable_scalables_map_.end()) {
+    const unsigned field_offset = field_it->second;
+    Value *scaled_offset = nullptr;
+
+    LLVMContext &context = barrier.module_.getContext();
+    if (field_offset != 0) {
+      if (!vscale) {
+        Type *size_type = gepBuilder.getIntNTy(barrier.size_t_bytes * 8);
+        vscale = gepBuilder.CreateIntrinsic(Intrinsic::vscale, size_type, {});
+      }
+      scaled_offset = gepBuilder.CreateMul(
+          vscale, gepBuilder.getIntN(barrier.size_t_bytes * 8, field_offset));
+    } else {
+      scaled_offset = ConstantInt::get(Type::getInt32Ty(context), 0);
+    }
+
+    Value *live_variable_info_idxs[3] = {
+        ConstantInt::get(Type::getInt32Ty(context), 0),
+        ConstantInt::get(Type::getInt32Ty(context),
+                         barrier.live_var_mem_scalables_index),
+        scaled_offset,
+    };
+
+    // Gep into the raw byte buffer
+    gep = gepBuilder.CreateInBoundsGEP(
+        barrier.live_var_mem_ty_, barrier_struct, live_variable_info_idxs,
+        Twine("live_gep_scalable_") + live->getName());
+  } else {
+    // Fall back and see if this live variable is actually a decomposed
+    // structure type.
+    return getExtractValueGEP(live);
+  }
+
+  // Cache this GEP for later
+  live_GEPs[key] = gep;
+
+  return gep;
+}
+
+Value *compiler::utils::Barrier::LiveValuesHelper::getReload(Value *live,
+                                                             IRBuilderBase &ir,
+                                                             const char *name,
+                                                             bool reuse) {
+  auto &mapped = reloads[live];
+  if (reuse && mapped) {
+    return mapped;
+  }
+
+  if (Value *v = getGEP(live)) {
+    if (!isa<AllocaInst>(live)) {
+      // If the live variable is not an alloca, insert a load.
+      if (!isStructWithScalables(live->getType())) {
+        v = ir.CreateLoad(live->getType(), v, Twine(live->getName(), name));
+      } else {
+        auto *const struct_ty = cast<StructType>(live->getType());
+        // Start off with a poison value, and build the struct up member by
+        // member, reloading each member at a time from their respective
+        // offsets.
+        v = PoisonValue::get(struct_ty);
+        for (auto [idx, ty] : enumerate(struct_ty->elements())) {
+          auto *const elt_addr = getGEP(live, idx);
+          assert(elt_addr && "Could not get address of struct element");
+          auto *const reload =
+              ir.CreateLoad(ty, elt_addr, Twine(live->getName(), name));
+          v = ir.CreateInsertValue(v, reload, idx);
+        }
+      }
+    }
+    mapped = v;
+    return v;
+  }
+
+  if (auto *I = dyn_cast<Instruction>(live)) {
+    // Save these
+    auto insPoint = ir.GetInsertPoint();
+    auto *const insBB = ir.GetInsertBlock();
+
+    if (!reuse || !mapped) {
+      auto *clone = I->clone();
+      clone->setName(I->getName());
+      clone->setDebugLoc(DebugLoc());
+      ir.Insert(clone);
+      if (gepBuilder.GetInsertPoint() == ir.GetInsertPoint()) {
+        gepBuilder.SetInsertPoint(clone);
+      }
+      ir.SetInsertPoint(clone);
+      mapped = clone;
+      I = clone;
+    } else {
+      return mapped;
+    }
+
+    for (auto op_it = I->op_begin(); op_it != I->op_end();) {
+      auto &op = *op_it++;
+      if (auto *op_inst = dyn_cast<Instruction>(op.get())) {
+        ir.SetInsertPoint(I);
+        op.set(getReload(op_inst, ir, name, reuse));
+      }
+    }
+
+    // Restore the original insert point
+    ir.SetInsertPoint(insBB, insPoint);
+    return I;
+  }
+
+  return live;
+}
+
+void compiler::utils::Barrier::Run(llvm::ModuleAnalysisManager &mam) {
+  bi_ = &mam.getResult<BuiltinInfoAnalysis>(module_);
+  FindBarriers();
+
+  kernel_id_map_[kBarrier_EndID] = nullptr;
+
+  if (barriers_.empty()) {
+    // If there are no barriers, we can use the original function as the
+    // single barrier region.
+    auto &node = barrier_region_id_map_[kBarrier_FirstID];
+    node.entry = &func_.getEntryBlock();
+    node.id = kBarrier_FirstID;
+    node.successor_ids.push_back(kBarrier_EndID);
+    kernel_id_map_[kBarrier_FirstID] = &func_;
+    return;
+  }
+
+  // If we found some barriers, we need to split up our kernel across them!
+  {
+    ModulePassManager pm;
+    // It's convenient to create LCSSA PHI nodes to stop values defined
+    // within a loop being stored to the barrier unnecessarily on every
+    // iteration (if, for instance, the loop is entirely between two
+    // barriers, but the value is used outside of that barrier region).
+    pm.addPass(llvm::createModuleToFunctionPassAdaptor(LCSSAPass()));
+    pm.run(module_, mam);
+    mam.invalidate(module_, PreservedAnalyses::allInSet<CFGAnalyses>());
+  }
+
+  // Do the splitting first in case a value is used on both sides of a barrier
+  // within the same basic block.
+  SplitBlockwithBarrier();
+  FindLiveVariables();
+
+  // Tidy up the barrier struct, removing values that we can
+  // reload/rematerialize on the other side of the barrier.
+  // NB: We don't do this if any of the barriers is a work-group broadcast. In
+  // the case that a broadcasted value is non-uniform (i.e., it depends on
+  // work-item builtins), we must preserve it in the barrier struct! This is
+  // because we can't rematerialize the local ID and broadcast that; we need
+  // to broadcast the specific local ID for the broadcasted work-item.
+  // This is very crude. We could either:
+  // 1. Trace through all candidate values we want to remove and ensure they're
+  //    not being broadcasted.
+  // 2. Add some more advanced rematerialization logic to substitute
+  //    rematerializable work-item functions with values specific to a given
+  //    work-item. Note that the builtins we rematerialize are ultimately up to
+  //    the BuiltinInfo to identify, so we can't assume anything here and would
+  //    have to defer back to the BuiltinInfo to do this correctly.
+  if (llvm::none_of(barriers_, [this](llvm::CallInst *const CI) {
+        auto Info = getWorkGroupCollectiveCall(CI, *bi_);
+        return Info && Info->isBroadcast();
+      })) {
+    TidyLiveVariables();
+  }
+
+  MakeLiveVariableMemType();
+  SeperateKernelWithBarrier();
+}
+
+void compiler::utils::Barrier::replaceSubkernel(Function *from, Function *to) {
+  for (auto &k : kernel_id_map_) {
+    if (k.second == from) {
+      k.second = to;
+    }
+  }
+}
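As a mental model for the splitting machinery that follows (an editor's sketch in plain C++, not the pass output): each inter-barrier region becomes a function returning the ID of the barrier it reached, and the eventual work-item-loop wrapper runs the current region for every work item before advancing to the region named by that ID. All names below are hypothetical.

#include <functional>
#include <map>

// Hypothetical driver loop mirroring the i32 "next barrier ID" protocol
// given to the split kernels created in GenerateNewKernel below.
void runRegions(std::map<unsigned, std::function<unsigned(unsigned)>> &regions,
                unsigned workGroupSize, unsigned firstId, unsigned endId) {
  unsigned current = firstId;
  while (current != endId) {
    unsigned next = endId;
    for (unsigned wi = 0; wi != workGroupSize; ++wi) {
      // Barriers must be reached uniformly, so every work item returns the
      // same successor ID; values live across the boundary travel through a
      // per-work-item live-variables struct (see MakeLiveVariableMemType).
      next = regions[current](wi);
    }
    current = next;
  }
}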
+
+/// @brief Find Barriers.
+void compiler::utils::Barrier::FindBarriers() {
+  SmallVector<std::pair<uint64_t, CallInst *>, 8> orderedBarriers;
+
+  // Check whether current function has barrier or not.
+  for (BasicBlock &b : func_) {
+    for (Instruction &bi : b) {
+      // Check call instructions for barrier.
+      if (CallInst *call_inst = dyn_cast<CallInst>(&bi)) {
+        if (Function *callee = call_inst->getCalledFunction()) {
+          const auto B = bi_->analyzeBuiltin(*callee);
+          if (B && BuiltinInfo::isMuxBuiltinWithWGBarrierID(B->ID)) {
+            auto *const id_param = call_inst->getOperand(0);
+            auto *const id_param_c = cast<ConstantInt>(id_param);
+            const auto id = id_param_c->getZExtValue();
+            orderedBarriers.emplace_back(id, call_inst);
+          }
+        }
+      }
+    }
+  }
+
+  std::sort(orderedBarriers.begin(), orderedBarriers.end());
+  for (const auto &barrier : orderedBarriers) {
+    barriers_.push_back(barrier.second);
+  }
+}
+
+/// @brief Split block with barrier.
+void compiler::utils::Barrier::SplitBlockwithBarrier() {
+  // If debugging, create stub functions in the module which will be invoked
+  // before each barrier, and after each barrier, by every work item.
+  Function *entry_stub = nullptr;
+  Function *exit_stub = nullptr;
+  if (is_debug_) {
+    CallingConv::ID stub_cc;
+    if (func_.getCallingConv() == CallingConv::SPIR_KERNEL) {
+      stub_cc = CallingConv::SPIR_FUNC;
+    } else {
+      stub_cc = func_.getCallingConv();
+    }
+    entry_stub = MakeStubFunction("__barrier_entry", module_, stub_cc);
+    exit_stub = MakeStubFunction("__barrier_exit", module_, stub_cc);
+  }
+
+  auto &node = barrier_region_id_map_[kBarrier_FirstID];
+  node.entry = &func_.getEntryBlock();
+  node.id = kBarrier_FirstID;
+
+  for (CallInst *split_point : barriers_) {
+    // ID identifying which barrier invoked the stub, used as the call
+    // argument.
+    auto *id = cast<ConstantInt>(split_point->getOperand(0));
+    const auto barrier_id = kBarrier_StartNewID + id->getZExtValue();
+
+    if (is_debug_) {
+      assert(entry_stub != nullptr); // Guaranteed as is_debug_ is const.
+      assert(exit_stub != nullptr);  // Guaranteed as is_debug_ is const.
+
+      // Create call instructions invoking debug stubs for every barrier. We
+      // don't insert these into a basic block yet since we want to insert
+      // them at a point where live variables have already been loaded. This
+      // info won't be available till later.
+
+      // Call invoking entry stub
+      auto entry_caller = CallInst::Create(entry_stub, id);
+      entry_caller->setDebugLoc(split_point->getDebugLoc());
+      entry_caller->setCallingConv(entry_stub->getCallingConv());
+
+      // Call invoking exit stub
+      auto exit_caller = CallInst::Create(exit_stub, id);
+      exit_caller->setDebugLoc(split_point->getDebugLoc());
+      exit_caller->setCallingConv(exit_stub->getCallingConv());
+
+      // Store call instructions in map for later insertion
+      barrier_stub_call_map_[barrier_id] =
+          std::make_pair(entry_caller, exit_caller);
+    }
+
+    auto &node = barrier_region_id_map_[barrier_id];
+    node.barrier_inst = split_point;
+    node.id = barrier_id;
+    node.schedule = getBarrierSchedule(*split_point);
+
+    // Our scan implementation requires a linear work-item ordering, to loop
+    // over all of the 'main' and 'tail' work-items in order.
+    if (auto collective = getWorkGroupCollectiveCall(split_point, *bi_)) {
+      if (collective->isScan()) {
+        node.schedule = BarrierSchedule::Linear;
+      }
+    }
+
+    split_point->getParent()->splitBasicBlock(split_point, "barrier");
+  }
+
+  // We have to gather the basic block data after splitting, because we
+  // might not be processing barriers in program order, and things can get
+  // awfully confused.
+  for (auto &[i, node] : barrier_region_id_map_) {
+    if (node.barrier_inst) {
+      auto *const bb = node.barrier_inst->getParent();
+      barrier_id_map_[bb] = node.id;
+      barrier_successor_set_.insert(*predecessors(bb).begin());
+      node.entry = bb;
+    }
+  }
+}
+
+/// @brief Generate an empty kernel that only duplicates the source kernel's
+/// CFG
+///
+/// This is used to do a "dry run" of kernel splitting in order to obtain the
+/// dominator tree, which is needed for correct identification of values that
+/// cross the barrier.
+///
+/// @param[in] region the region to clone into the new kernel.
+/// @param[out] bbmap a mapping of original blocks onto the empty clones.
+/// @return the fake kernel
+Function *compiler::utils::Barrier::GenerateFakeKernel(
+    BarrierRegion &region, DenseMap<BasicBlock *, BasicBlock *> &bbmap) {
+  LLVMContext &context = module_.getContext();
+
+  // Make new kernel function.
+  FunctionType *new_fty = FunctionType::get(Type::getVoidTy(context), false);
+  Function *new_kernel =
+      Function::Create(new_fty, Function::InternalLinkage, "tmp", &module_);
+  ValueToValueMapTy vmap;
+
+  for (auto *bb : region.blocks) {
+    BasicBlock *new_bb = BasicBlock::Create(context, "", new_kernel);
+    if (region.barrier_blocks.contains(bb)) {
+      ReturnInst::Create(context, nullptr, new_bb);
+    } else {
+      bb->getTerminator()->clone()->insertInto(new_bb, new_bb->end());
+    }
+    vmap[bb] = new_bb;
+    bbmap[bb] = new_bb;
+  }
+
+  const RemapFlags remapFlags =
+      RF_IgnoreMissingLocals | llvm::RF_ReuseAndMutateDistinctMDs;
+  for (auto &f : *new_kernel) {
+    auto *term = f.getTerminator();
+    RemapInstruction(term, vmap, remapFlags);
+  }
+
+  return new_kernel;
+}
+
+/// @brief Obtain a set of Basic Blocks for an inter-barrier region
+///
+/// It traverses the CFG, following successors, until it hits a barrier,
+/// building the region's internal data.
+///
+/// @param[out] region the region to process
+void compiler::utils::Barrier::GatherBarrierRegionBlocks(
+    BarrierRegion &region) {
+  DenseSet<BasicBlock *> visited;
+  region.blocks.push_back(region.entry);
+  visited.insert(region.entry);
+  size_t index = 0;
+  while (index < region.blocks.size()) {
+    BasicBlock *BB = region.blocks[index++];
+    if (barrier_successor_set_.contains(BB)) {
+      region.barrier_blocks.insert(BB);
+    } else {
+      for (BasicBlock *succ : successors(BB)) {
+        if (visited.insert(succ).second) {
+          region.blocks.push_back(succ);
+        }
+      }
+    }
+  }
+}
+
+/// @brief Obtain a set of Values used in a region that cross a barrier
+///
+/// A value use crosses a barrier in the following cases:
+/// * Its use is not in the same region as the definition
+/// * Its definition does not dominate the use
+///
+/// @param[in] region The inter-barrier region
+/// @param[in] ignore set of values to ignore
+void compiler::utils::Barrier::GatherBarrierRegionUses(
+    BarrierRegion &region, DenseSet<Value *> &ignore) {
+  DenseMap<BasicBlock *, BasicBlock *> bbmap;
+  Function *fake_func = GenerateFakeKernel(region, bbmap);
+
+  // We should check the dominance relation between the definition bb of live
+  // variables and the user bb. If the def bb does not dominate the user bb,
+  // the user is modified by live variable information.
+  DominatorTree DT;
+  DT.recalculate(*fake_func);
+
+  for (auto *BB : region.blocks) {
+    BasicBlock *BBclone = bbmap[BB];
+    for (auto &I : *BB) {
+      if (PHINode *pn = dyn_cast<PHINode>(&I)) {
+        for (unsigned i = 0, e = pn->getNumIncomingValues(); i != e; i++) {
+          Value *val = pn->getIncomingValue(i);
+          if (CheckValidUse(val) && !ignore.contains(val)) {
+            if (auto *inst = dyn_cast<Instruction>(val)) {
+              BasicBlock *incoming = pn->getIncomingBlock(i);
+              BasicBlock *parent = inst->getParent();
+              // If the incoming edge comes from outside the region, it is
+              // going to get removed anyway, so disregard it
+              if (bbmap.contains(incoming)) {
+                if (!bbmap.contains(parent)) {
+                  region.uses_ext.insert(val);
+                } else if (!DT.dominates(bbmap[parent], bbmap[incoming])) {
+                  region.uses_int.insert(val);
+                }
+              }
+            }
+          }
+        }
+      } else {
+        for (Value *val : I.operands()) {
+          if (CheckValidUse(val) && !ignore.contains(val)) {
+            if (auto *inst = dyn_cast<Instruction>(val)) {
+              BasicBlock *parent = inst->getParent();
+              if (!bbmap.contains(parent)) {
+                region.uses_ext.insert(val);
+              } else if (!DT.dominates(bbmap[parent], BBclone)) {
+                region.uses_int.insert(val);
+              }
+            }
+          }
+        }
+      }
+      if (CheckValidDef(&I) && !I.use_empty()) {
+        region.defs.insert(&I);
+      }
+    }
+  }
+  DT.reset();
+  fake_func->eraseFromParent();
+}
+
+/// @brief Find livein and liveout variables per each basic block.
+void compiler::utils::Barrier::FindLiveVariables() {
+  DenseSet<Value *> func_args;
+  for (Argument &arg : func_.args()) {
+    func_args.insert(&arg);
+  }
+
+#ifndef NDEBUG
+  // Make sure there aren't any stray allocas outside the entry block.
+  for (auto block = func_.begin(); ++block != func_.end();) {
+    for (auto &inst : *block) {
+      assert(!isa<AllocaInst>(inst) && "Alloca found outside entry block!");
+    }
+  }
+#endif // ndef NDEBUG
+
+  // Put all the original allocas into the barrier struct, in case they get
+  // indirectly referenced from the other side of a barrier.
+  for (Instruction &bi : func_.front()) {
+    if (isa<AllocaInst>(&bi)) {
+      whole_live_variables_set_.insert(&bi);
+    }
+  }
+
+  for (auto &[i, region] : barrier_region_id_map_) {
+    GatherBarrierRegionBlocks(region);
+    GatherBarrierRegionUses(region, func_args);
+    whole_live_variables_set_.set_union(region.uses_int);
+    whole_live_variables_set_.set_union(region.uses_ext);
+  }
+}
+
+/// @brief Remove variables that are better recalculated than stored in the
+/// barrier, for instance casts and vector splats.
+void compiler::utils::Barrier::TidyLiveVariables() {
+  const auto &dl = module_.getDataLayout();
+
+  // Start off by doing a simple sweep of stuff that is better off not in the
+  // barrier: vector splats, no-op/widening casts, and single/zero index GEPs,
+  // since we might as well put their source operand in the barrier instead.
+  SmallVector<Value *> removals;
+  SmallVector<Value *> redirects;
+  for (auto v : whole_live_variables_set_) {
+    if (auto *const shuffle = dyn_cast<ShuffleVectorInst>(v)) {
+      if (shuffle->isZeroEltSplat()) {
+        // If we remove a vector splat, we have to make sure the scalar
+        // source operand is in the barrier instead.
+        Value *const op = shuffle->getOperand(0);
+        if (auto *const ins = dyn_cast<InsertElementInst>(op)) {
+          removals.push_back(v);
+
+          Value *const src = ins->getOperand(1);
+          // Put the source instruction in the barrier instead.
+          // If it's not an instruction, it is probably a function argument.
+          if (isa<Instruction>(src) && !IsTrivialGEP(src, redirects)) {
+            redirects.push_back(src);
+          }
+        }
+      }
+    } else if (auto *const cast = dyn_cast<CastInst>(v)) {
+      if (auto *const src = dyn_cast<Instruction>(cast->getOperand(0))) {
+        if (cast->isNoopCast(dl) ||
+            (cast->getSrcTy()->getScalarSizeInBits() <
+             cast->getDestTy()->getScalarSizeInBits())) {
+          removals.push_back(v);
+
+          // Put the source instruction in the barrier instead.
+          if (isa<Instruction>(src) && !IsTrivialGEP(src, redirects)) {
+            redirects.push_back(src);
+          }
+        }
+      } else {
+        // No casts of non-instructions in the barrier, please.
+        removals.push_back(v);
+      }
+    } else if (IsTrivialGEP(v, redirects)) {
+      removals.push_back(v);
+    }
+  }
+
+  // We put the redirects into the barrier first, so that if they in turn
+  // turn out to be redundant, we can remove them again.
+  whole_live_variables_set_.set_union(redirects);
+
+  // Remove work item calls and casts of arguments or other barrier members.
+  for (auto v : whole_live_variables_set_) {
+    if (IsTrivialValue(v, 4u, *bi_)) {
+      removals.push_back(v);
+    } else if (auto *cast = dyn_cast<CastInst>(v)) {
+      Value *op = cast->getOperand(0);
+      if (whole_live_variables_set_.contains(op)) {
+        removals.push_back(v);
+      }
+    }
+  }
+  whole_live_variables_set_.set_subtract(removals);
+}
+
+/// @brief Pad the field types to an alignment by adding a byte array if
+/// needed.
+/// @param field_tys The vector of types representing the final structure
+/// @param offset The current offset in the structure
+/// @param alignment The required alignment
+/// @return The new offset (or the original offset if no padding was needed)
+unsigned compiler::utils::Barrier::PadTypeToAlignment(
+    SmallVectorImpl<Type *> &field_tys, unsigned offset, unsigned alignment) {
+  if (alignment) {
+    // Check if the member is not already aligned
+    const unsigned int remainder = offset % alignment;
+    if (0 != remainder) {
+      // Calculate the number of padding bytes
+      const unsigned int padding = alignment - remainder;
+
+      // Use a byte array to pad the struct rather than trying to create
+      // an arbitrary intNTy, since this may not be supported by the backend.
+      const auto padByteType = Type::getInt8Ty(module_.getContext());
+      const auto padByteArrayType = ArrayType::get(padByteType, padding);
+      field_tys.push_back(padByteArrayType);
+
+      // Bump the offset by the padding size
+      offset += padding;
+    }
+  }
+  return offset;
+}
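A worked instance of the padding rule above (editor's illustration):

// PadTypeToAlignment(field_tys, /*offset*/ 4, /*alignment*/ 8):
//   remainder = 4 % 8 = 4   -> not aligned
//   padding   = 8 - 4 = 4   -> [4 x i8] is appended to field_tys
//   returns 4 + 4 = 8       -> the next field starts 8-byte aligned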
+
+/// @brief Make type for whole live variables.
+void compiler::utils::Barrier::MakeLiveVariableMemType() {
+  SmallVector<Type *> field_tys;
+  max_live_var_alignment = 0;
+
+  const auto &dl = module_.getDataLayout();
+
+  struct member_info {
+    /// @brief The root `value` being stored.
+    Value *value;
+    /// @brief The member index of this member inside `value`, if `value` is a
+    /// decomposed structure type. Zero otherwise.
+    unsigned member_idx;
+    /// @brief The type of `value`, or of the specific member of `value`.
+    Type *type;
+    /// @brief The alignment of the value being stored
+    unsigned alignment;
+    /// @brief The size of the value being stored
+    unsigned size;
+  };
+
+  SmallVector<member_info> barrier_members;
+  barrier_members.reserve(whole_live_variables_set_.size());
+  for (Value *live_var : whole_live_variables_set_) {
+    LLVM_DEBUG(dbgs() << "whole live set:" << *live_var << '\n';
+               dbgs() << "type:" << *(live_var->getType()) << '\n';);
+    Type *field_ty = live_var->getType();
+
+    Type *member_ty = nullptr;
+    unsigned alignment = 0;
+    // If an alloca is a live variable, get the element type of the pointer
+    // type from field_ty and remember the alignment
+    if (const auto *AI = dyn_cast<AllocaInst>(live_var)) {
+      member_ty = AI->getAllocatedType();
+      alignment = AI->getAlign().value();
+    } else {
+      member_ty = field_ty;
+    }
+
+    std::vector<Type *> member_tys = {member_ty};
+    // If this is a struct type containing any scalable members, we must
+    // decompose the value into its individual components.
+    if (isStructWithScalables(member_ty)) {
+      member_tys = cast<StructType>(member_ty)->elements().vec();
+    }
+
+    for (auto [idx, ty] : enumerate(member_tys)) {
+      // For a scalable vector, we need the size of the equivalent fixed vector
+      // based on its known minimum size.
+      auto member_ty_fixed = ty;
+      if (isa<ScalableVectorType>(ty)) {
+        auto *const eltTy = multi_llvm::getVectorElementType(ty);
+        auto n = multi_llvm::getVectorElementCount(ty).getKnownMinValue();
+        member_ty_fixed = VectorType::get(eltTy, ElementCount::getFixed(n));
+      }
+
+      // Need to ensure that alloc alignment or preferred alignment is kept
+      // in the new struct so pad as necessary.
+      const unsigned size = dl.getTypeAllocSize(member_ty_fixed);
+      alignment = std::max(dl.getPrefTypeAlign(ty).value(),
+                           static_cast<AlignIntTy>(alignment));
+      max_live_var_alignment = std::max(alignment, max_live_var_alignment);
+
+      barrier_members.push_back(
+          {live_var, static_cast<unsigned>(idx), ty, alignment, size});
+    }
+  }
+
+  // Sort the barrier members by decreasing alignment to minimise the amount
+  // of padding required (use a stable sort so it's deterministic).
+  std::stable_sort(barrier_members.begin(), barrier_members.end(),
+                   [](const member_info &lhs, const member_info &rhs) -> bool {
+                     return lhs.alignment > rhs.alignment;
+                   });
+
+  // Deal with non-scalable members first
+  unsigned offset = 0;
+  for (auto &member : barrier_members) {
+    if (isa<ScalableVectorType>(member.type)) {
+      continue;
+    }
+
+    offset = PadTypeToAlignment(field_tys, offset, member.alignment);
+
+    // Check if the alloca has a debug info source variable attached. If
+    // so record this and the matching byte offset into the struct.
+    const auto DVRDeclares = findDVRDeclares(member.value);
+    for (auto *const DVRDeclare : DVRDeclares) {
+      debug_variable_records_.push_back(std::make_pair(DVRDeclare, offset));
+    }
+    offset += member.size;
+    live_variable_index_map_[std::make_pair(member.value, member.member_idx)] =
+        field_tys.size();
+    field_tys.push_back(member.type);
+  }
+  // Pad the end of the struct to the max alignment as we are creating an
+  // array
+  offset = PadTypeToAlignment(field_tys, offset, max_live_var_alignment);
+  live_var_mem_size_fixed = offset; // No more offsets required.
+
+  // Now deal with any scalable members. We reset the offset to zero because
+  // scalables are indexed bytewise starting from the beginning of the
+  // variable-sized scalables section at the end of the struct.
+  SmallVector<Type *> field_tys_scalable;
+  offset = 0;
+  for (auto &member : barrier_members) {
+    if (!isa<ScalableVectorType>(member.type)) {
+      continue;
+    }
+
+    offset = PadTypeToAlignment(field_tys_scalable, offset, member.alignment);
+
+    live_variable_scalables_map_[std::make_pair(member.value,
+                                                member.member_idx)] = offset;
+    offset += member.size;
+    field_tys_scalable.push_back(member.type);
+  }
+  // Pad the end of the struct to the max alignment as we are creating an
+  // array
+  offset =
+      PadTypeToAlignment(field_tys_scalable, offset, max_live_var_alignment);
+  live_var_mem_size_scalable = offset; // No more offsets required.
+
+  LLVMContext &context = module_.getContext();
+  // If the barrier contains scalables, add a flexible byte array on the end
+  if (offset != 0) {
+    live_var_mem_scalables_index = field_tys.size();
+    field_tys.push_back(ArrayType::get(IntegerType::getInt8Ty(context), 0));
+  }
+
+  // Create struct type for live variable memory allocation; we create this
+  // even when the type is empty. The big entry point pass depends on this
+  // to detect that the barrier pass has been executed.
+  SmallString<128> name;
+  live_var_mem_ty_ = StructType::create(
+      context, field_tys,
+      (Twine(func_.getName() + "_live_mem_info")).toStringRef(name), false);
+
+  name.clear();
+
+  LLVM_DEBUG(dbgs() << "Barrier size: " << offset << "\n";
+             dbgs() << "whole live set type:" << *(live_var_mem_ty_) << '\n';);
+}
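Editor's illustration of the layout this produces. Suppose the live set holds an i32, a <4 x float>, and an alloca of double; sorting by decreasing alignment and padding to the maximum alignment (16 here) yields something equivalent to:

// %foo_live_mem_info = type { <4 x float>,  ; align 16, offset  0
//                             double,       ; align  8, offset 16
//                             i32,          ; align  4, offset 24
//                             [4 x i8] }    ; tail padding to 32 bytes
// Scalable vectors are not placed here; they are addressed byte-wise via a
// trailing flexible [0 x i8] member (live_var_mem_scalables_index).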
+      ConstantInt *cst_endid =
+          ConstantInt::get(Type::getInt32Ty(context), kBarrier_EndID);
+      ReturnInst *new_ret = ReturnInst::Create(context, cst_endid);
+      new_ret->insertBefore(ret->getIterator());
+      ret->replaceAllUsesWith(new_ret);
+      ret->eraseFromParent();
+
+      // We can have multiple return points, but should only count them once.
+      returns_from_kernel = true;
+    }
+  }
+  if (returns_from_kernel) {
+    region.successor_ids.push_back(kBarrier_EndID);
+  }
+  // Keep things consistent.
+  std::sort(region.successor_ids.begin(), region.successor_ids.end());
+
+  // Update the incoming edges to phi nodes, and drop edges to basic blocks
+  // that are not present in the new function. Note that this must happen
+  // after all the basic blocks have been cloned, so that we know how to
+  // update the incoming edges to phi nodes that represent back edges.
+  for (auto *block : region.blocks) {
+    UpdateAndTrimPHINodeEdges(cast<BasicBlock>(vmap[block]), vmap);
+  }
+
+  BasicBlock *new_kernel_entry_block = &(new_kernel->getEntryBlock());
+  Instruction *insert_point = &*new_kernel_entry_block->getFirstNonPHIOrDbg();
+  auto *const cloned_barrier_call =
+      region.barrier_inst ? insert_point : nullptr;
+
+  // If we have a work group collective call, we need to remap its result from
+  // the arguments list.
+  if (collective) {
+    vmap[insert_point] = &*(new_arg++);
+  }
+
+  // The entry kernel might have allocas in it that don't get removed, so
+  // make sure to insert after them.
+  while (isa<AllocaInst>(insert_point)) {
+    insert_point = insert_point->getNextNode();
+  }
+
+  // LiveValuesHelper puts all the GEPs at the start of the kernel, but
+  // creates each one only once.
+  LiveValuesHelper live_values(
+      *this, insert_point,
+      hasBarrierStruct ? compiler::utils::getLastArgument(new_kernel)
+                       : nullptr);
+
+  // Load live variables and map them.
+  // These variables are defined in a different kernel, so we insert the
+  // relevant load instructions in the entry block of the kernel.
+  {
+    // Note that if our barrier is a work group collective, its operand will
+    // probably still get reloaded here, even though it's going to get
+    // deleted, so we hope that it gets optimized away later, in this case.
+    for (const auto cur_live : region.uses_ext) {
+      IRBuilder<> insertIR(insert_point);
+      vmap[cur_live] = live_values.getReload(cur_live, insertIR, "_load", true);
+    }
+  }
+
+  SmallVector<Instruction *> allocas_and_intrinsics_to_remove;
+
+  // Store only live variables that are defined in this kernel.
+  //
+  // We might like to store the variables at the point we hit the barrier.
+  // However, this is not always possible because the value definition might
+  // not dominate any or all of the exit blocks. Furthermore, if this value
+  // is used again in the same kernel after looping around the barrier, we
+  // have to be aware that the usage might be expecting the updated value.
+  // (This can happen in nested loops, where the outer increment becomes a
+  // conditional block.) Therefore, we put the store right after the
+  // definition instead.
+  for (const auto live_var : live_vars_defs_in_kernel) {
+    // If the live variable is an alloca defined in this function, change the
+    // alloca to a GEP directly into the live variables struct; otherwise we
+    // store the value to the struct. This is needed because it is possible
+    // for one live variable to reference another by pointer; when we then
+    // save them to the live variable struct they will point to the wrong
+    // address.
+    // By GEPping directly into the final live struct we resolve this issue,
+    // as the final address is always used.
+    if (auto *alloca_inst = dyn_cast<AllocaInst>(live_var)) {
+      // Check to see if it is still an alloca after vmap. If not, we may
+      // have processed it before and no work needs doing, as we are using
+      // the live variable struct directly.
+      if (auto *new_alloca_inst = dyn_cast<AllocaInst>(vmap[alloca_inst])) {
+        allocas_and_intrinsics_to_remove.push_back(new_alloca_inst);
+        // Also remove any assume-like intrinsics that are users of this
+        // alloca. These assumptions may not hold. For example, lifetime
+        // intrinsics are definitely dangerous, as by directly replacing their
+        // alloca operands with the address of the live variable struct, we
+        // are telling LLVM that *all* accesses of the live variable struct
+        // also start/end at that point, which is not true.
+        // Similarly, llvm.assume and llvm.experimental.noalias.scope.decl may
+        // hold for the alloca but not the live variables struct.
+        for (auto *const user : alloca_inst->users()) {
+          if (auto *const intrinsic = dyn_cast<IntrinsicInst>(user);
+              intrinsic && intrinsic->isAssumeLikeIntrinsic()) {
+            allocas_and_intrinsics_to_remove.push_back(intrinsic);
+          }
+        }
+        // Change the vmap to point to the GEP instead of the original alloca.
+        vmap[live_var] = live_values.getGEP(live_var);
+      }
+    } else {
+      // Place the new store immediately after the definition, but if it's a
+      // PHI node we have to make sure to put it after any other PHI nodes.
+      Instruction *inst = cast<Instruction>(vmap[live_var]);
+      Instruction *insert_point = inst->getNextNode();
+      while (isa<PHINode>(insert_point)) {
+        insert_point = insert_point->getNextNode();
+      }
+      IRBuilder<> B(insert_point);
+      if (!isStructWithScalables(live_var->getType())) {
+        auto *addr = live_values.getGEP(live_var);
+        B.CreateStore(live_var, addr);
+      } else {
+        // Store this struct containing scalable members piece-wise.
+        auto member_tys = cast<StructType>(live_var->getType())->elements();
+        for (auto [idx, ty] : enumerate(member_tys)) {
+          auto *extract = B.CreateExtractValue(live_var, idx);
+          auto *extract_addr = live_values.getGEP(extract);
+          assert(extract_addr);
+          B.CreateStore(extract, extract_addr);
+        }
+      }
+    }
+  }
+
+  // Remap the instructions in the entry basic block, starting from the
+  // insert point.
+  insert_point = &*new_kernel_entry_block->getFirstNonPHIOrDbg();
+  const RemapFlags remapFlags =
+      RF_IgnoreMissingLocals | llvm::RF_ReuseAndMutateDistinctMDs;
+  BasicBlock::iterator b_iter = insert_point->getIterator();
+  while (b_iter != new_kernel_entry_block->end()) {
+    RemapInstruction(&*b_iter, vmap, remapFlags);
+    b_iter++;
+  }
+
+  // Remove the barrier. We do this after creating the stores so that if it's
+  // a work group collective, it will have been processed as normal above and
+  // written into the barrier struct where needed.
+  if (cloned_barrier_call) {
+    // When debugging, insert a call to the exit debug stub at the insert
+    // point; this location is important since all the live variables will
+    // have been loaded by this point.
+    if (is_debug_) {
+      const unsigned barrier_id = barrier_id_map_[entry_point];
+      // Get the call instruction invoking the exit stub from the map.
+      CallInst *exit_caller = barrier_stub_call_map_[barrier_id].second;
+      exit_caller->insertAfter(cloned_barrier_call);
+      // Use the updated debug info scope, since the call will have had this
+      // set by ModifyDebugInfoScopes().
+      exit_caller->setDebugLoc(cloned_barrier_call->getDebugLoc());
+    }
+    if (collective) {
+      cloned_barrier_call->replaceAllUsesWith(vmap[cloned_barrier_call]);
+    }
+    cloned_barrier_call->eraseFromParent();
+  }
+
+  // Don't remap the first basic block again.
+  Function::iterator cfi = ++(new_kernel->begin());
+  const Function::iterator cfie = new_kernel->end();
+  for (; cfi != cfie; cfi++) {
+    for (Instruction &cbi : *cfi) {
+      RemapInstruction(&cbi, vmap, remapFlags);
+    }
+  }
+
+  // Remove any allocas and their dependent intrinsics that have been
+  // replaced by a GEP instruction.
+  for (auto *inst : allocas_and_intrinsics_to_remove) {
+    inst->eraseFromParent();
+  }
+
+  // This needs resetting for the sake of any further new GEPs created.
+  live_values.gepBuilder.SetInsertPoint(
+      new_kernel_entry_block->getFirstNonPHIOrDbg());
+
+  // If there are definitions of live variables in this function, process
+  // them here. As mentioned above regarding value stores, the user might
+  // want to load the value after it has been updated. Therefore, we place
+  // the new loads right before their uses.
+  //
+  // Potentially, this is not optimal, since it might create multiple loads.
+  // Ideally we should use some kind of reachability query to determine if
+  // the load can be placed before the store, and if not, PHI nodes could
+  // be inserted instead to get the value directly from the new definition.
+  //
+  // It would be nice not to have to build the Dominator Tree here again,
+  // since we already did it when we gathered the barrier crossing values.
+  // The problem is it's a use/user pair that crosses a barrier, not just the
+  // use itself. Some users may be dominated, and others not.
+  //
+  // NOTE it is impossible for any of these to be an Alloca.
+  DominatorTree DT;
+  DT.recalculate(*new_kernel);
+
+  for (auto OldDef : region.uses_int) {
+    Instruction *NewDef = cast<Instruction>(vmap[OldDef]);
+    BasicBlock *DefBB = NewDef->getParent();
+
+    for (auto use_it = NewDef->use_begin(); use_it != NewDef->use_end();) {
+      auto &U = *use_it++;
+      Instruction *UserInst = cast<Instruction>(U.getUser());
+      BasicBlock *UserBB = UserInst->getParent();
+
+      // Check whether the user is in the current function.
+      if (UserBB->getParent() == new_kernel) {
+        Instruction *load_insert = nullptr;
+
+        // Check the dominance relation between the def BB and the user BB.
+        if (auto *PHI = dyn_cast<PHINode>(UserInst)) {
+          BasicBlock *incoming = PHI->getIncomingBlock(U);
+          if (!DT.dominates(DefBB, incoming)) {
+            load_insert = incoming->getTerminator();
+          }
+        } else if (!DT.dominates(DefBB, UserBB)) {
+          load_insert = UserInst;
+        }
+
+        if (load_insert) {
+          IRBuilder<> loadIR(load_insert);
+          U.set(live_values.getReload(OldDef, loadIR, "_reload"));
+        }
+      }
+    }
+  }
+
+  // Removing incoming PHI node edges might have created some redundant ones.
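+  // That is, PHIs whose remaining incoming values are all identical (e.g.
+  // %x = phi i32 [ %v, %a ], [ %v, %b ] folds to %v); hasConstantValue()
+  // detects these so they can be erased below.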
+  for (auto *BB : region.blocks) {
+    BasicBlock *cBB = cast<BasicBlock>(vmap[BB]);
+    for (auto I = cBB->begin(); I != cBB->end();) {
+      if (auto *PHI = dyn_cast<PHINode>(&*(I++))) {
+        if (auto *V = PHI->hasConstantValue()) {
+          PHI->replaceAllUsesWith(V);
+          PHI->eraseFromParent();
+        }
+      } else {
+        break;
+      }
+    }
+  }
+
+  // Remap any remaining unmapped instructions coming from DT-based reloads.
+  for (auto &BB : *new_kernel) {
+    for (Instruction &I : BB) {
+      RemapInstruction(&I, vmap, remapFlags);
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "new kernel function: " << new_kernel->getName()
+                    << "\n";);
+  return new_kernel;
+}
+
+/// @brief This function is a copy of llvm::CloneBasicBlock, with some code
+/// added to update live variable information.
+///
+/// @param[in] bb Basic block to copy.
+/// @param[out] vmap Value map updated while cloning.
+/// @param[in] name_suffix Suffix to append to cloned value names.
+/// @param[out] live_defs_info Live definitions found in this basic block.
+/// @param[in] F Function to clone into.
+///
+/// @return The cloned basic block.
+BasicBlock *compiler::utils::Barrier::CloneBasicBlock(
+    BasicBlock *bb, ValueToValueMapTy &vmap, const Twine &name_suffix,
+    live_variable_mem_t &live_defs_info, Function *F) {
+  BasicBlock *new_bb = BasicBlock::Create(bb->getContext(), "", F);
+  if (bb->hasName())
+    new_bb->setName(bb->getName() + name_suffix);
+
+  // Loop over all instructions, and copy them over.
+  for (Instruction &i : *bb) {
+    Instruction *new_inst = i.clone();
+    if (i.hasName())
+      new_inst->setName(i.getName() + name_suffix);
+    new_inst->insertInto(new_bb, new_bb->end());
+
+    // Record defs of live variables which are in the current kernel.
+    if (whole_live_variables_set_.contains(&i)) {
+      live_defs_info.insert(&i);
+    }
+
+    vmap[&i] = new_inst;
+  }
+  return new_bb;
+}
+
+/// @brief Separate the kernel function at barrier boundaries.
+void compiler::utils::Barrier::SeperateKernelWithBarrier() {
+  if (barriers_.empty())
+    return;
+
+  for (auto &[i, region] : barrier_region_id_map_) {
+    kernel_id_map_[region.id] = GenerateNewKernel(region);
+  }
+
+  // Record barrier information on metadata.
+  SmallString<128> name;
+  LLVMContext &context = module_.getContext();
+  ValueAsMetadata *num_barriers_ = ValueAsMetadata::get(
+      ConstantInt::get(Type::getInt32Ty(context), barriers_.size()));
+  MDNode *num_barriers__md =
+      MDNode::get(context, ArrayRef<Metadata *>(num_barriers_));
+  NamedMDNode *barrier_md = module_.getOrInsertNamedMetadata(
+      Twine(func_.getName() + "_barrier").toStringRef(name));
+  barrier_md->addOperand(num_barriers__md);
+
+  LLVM_DEBUG({
+    for (const auto &Kid : kernel_id_map_) {
+      dbgs() << "kernel_id[" << Kid.first << "] = " << Kid.second->getName()
+             << "\n";
+    }
+
+    dbgs() << "\n\n" << module_ << "\n\n";
+  });
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
new file mode 100644
index 0000000000000..372280d135302
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/builtin_info.cpp
@@ -0,0 +1,1270 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +namespace compiler { +namespace utils { + +AnalysisKey BuiltinInfoAnalysis::Key; + +BuiltinInfoAnalysis::BuiltinInfoAnalysis() + : BICallback([](const Module &) -> BuiltinInfo { + return BuiltinInfo(std::make_unique(nullptr)); + }) {} + +Module *BuiltinInfo::getBuiltinsModule() { + if (LangImpl) { + return LangImpl->getBuiltinsModule(); + } + // Mux builtins don't need a module. + return nullptr; +} + +std::optional>> +BuiltinInfo::identifyMuxBuiltin(const Function &F) const { + StringRef Name = F.getName(); + auto ID = + StringSwitch>(Name) + .Case(MuxBuiltins::isftz, eMuxBuiltinIsFTZ) + .Case(MuxBuiltins::usefast, eMuxBuiltinUseFast) + .Case(MuxBuiltins::isembeddedprofile, eMuxBuiltinIsEmbeddedProfile) + .Case(MuxBuiltins::get_global_size, eMuxBuiltinGetGlobalSize) + .Case(MuxBuiltins::get_global_id, eMuxBuiltinGetGlobalId) + .Case(MuxBuiltins::get_global_offset, eMuxBuiltinGetGlobalOffset) + .Case(MuxBuiltins::get_local_size, eMuxBuiltinGetLocalSize) + .Case(MuxBuiltins::get_local_id, eMuxBuiltinGetLocalId) + .Case(MuxBuiltins::set_local_id, eMuxBuiltinSetLocalId) + .Case(MuxBuiltins::get_sub_group_id, eMuxBuiltinGetSubGroupId) + .Case(MuxBuiltins::set_sub_group_id, eMuxBuiltinSetSubGroupId) + .Case(MuxBuiltins::get_num_groups, eMuxBuiltinGetNumGroups) + .Case(MuxBuiltins::get_num_sub_groups, eMuxBuiltinGetNumSubGroups) + .Case(MuxBuiltins::set_num_sub_groups, eMuxBuiltinSetNumSubGroups) + .Case(MuxBuiltins::get_max_sub_group_size, + eMuxBuiltinGetMaxSubGroupSize) + .Case(MuxBuiltins::set_max_sub_group_size, + eMuxBuiltinSetMaxSubGroupSize) + .Case(MuxBuiltins::get_group_id, eMuxBuiltinGetGroupId) + .Case(MuxBuiltins::get_work_dim, eMuxBuiltinGetWorkDim) + .Case(MuxBuiltins::dma_read_1d, eMuxBuiltinDMARead1D) + .Case(MuxBuiltins::dma_read_2d, eMuxBuiltinDMARead2D) + .Case(MuxBuiltins::dma_read_3d, eMuxBuiltinDMARead3D) + .Case(MuxBuiltins::dma_write_1d, eMuxBuiltinDMAWrite1D) + .Case(MuxBuiltins::dma_write_2d, eMuxBuiltinDMAWrite2D) + .Case(MuxBuiltins::dma_write_3d, eMuxBuiltinDMAWrite3D) + .Case(MuxBuiltins::dma_wait, eMuxBuiltinDMAWait) + .Case(MuxBuiltins::get_global_linear_id, eMuxBuiltinGetGlobalLinearId) + .Case(MuxBuiltins::get_local_linear_id, eMuxBuiltinGetLocalLinearId) + .Case(MuxBuiltins::get_enqueued_local_size, + eMuxBuiltinGetEnqueuedLocalSize) + .Case(MuxBuiltins::get_sub_group_size, eMuxBuiltinGetSubGroupSize) + .Case(MuxBuiltins::get_sub_group_local_id, + eMuxBuiltinGetSubGroupLocalId) + .Case(MuxBuiltins::work_group_barrier, eMuxBuiltinWorkGroupBarrier) + .Case(MuxBuiltins::sub_group_barrier, eMuxBuiltinSubGroupBarrier) + .Case(MuxBuiltins::mem_barrier, eMuxBuiltinMemBarrier) + .Default(std::nullopt); + if (ID) { + switch (*ID) { + default: + return {{*ID, {}}}; + case eMuxBuiltinDMARead1D: + case eMuxBuiltinDMARead2D: + case eMuxBuiltinDMARead3D: + case eMuxBuiltinDMAWrite1D: + case eMuxBuiltinDMAWrite2D: + case 
eMuxBuiltinDMAWrite3D: + // Return the event type used by these builtins. The event type is + // required to declare/define these builtins, so return it here for + // the sake of completeness. The event type doesn't change the + // builtins' name (i.e., it's not mangled) as it's required to be + // consistent at any single snapshot of the module, though it may + // change through time. + return {{*ID, {F.getReturnType()}}}; + } + } + + // Now check for group functions, which are a bit more involved as there's + // many of them and they're also mangled. We enforce that the mangling makes + // sense, otherwise the builtin is declared as invalid. + const bool IsSubgroupOp = Name.consume_front("__mux_sub_group_"); + const bool IsVecgroupOp = Name.consume_front("__mux_vec_group_"); + if (!IsSubgroupOp && !IsVecgroupOp && + !Name.consume_front("__mux_work_group_")) { + return std::nullopt; + } + +#define SCOPED_GROUP_OP(OP) \ + (IsSubgroupOp ? eMuxBuiltinSubgroup##OP \ + : IsVecgroupOp ? eMuxBuiltinVecgroup##OP \ + : eMuxBuiltinWorkgroup##OP) + + // Most group operations have one argument, except for broadcasts. Despite + // that, we don't mangle the indices as they're fixed. + const unsigned NumExpectedMangledArgs = 1; + + if (Name.consume_front("any")) { + ID = SCOPED_GROUP_OP(Any); + } else if (Name.consume_front("all")) { + ID = SCOPED_GROUP_OP(All); + } else if (Name.consume_front("broadcast")) { + ID = SCOPED_GROUP_OP(Broadcast); + } else if (Name.consume_front("shuffle_up")) { + if (!IsSubgroupOp) { + return std::nullopt; + } + ID = eMuxBuiltinSubgroupShuffleUp; + } else if (Name.consume_front("shuffle_down")) { + if (!IsSubgroupOp) { + return std::nullopt; + } + ID = eMuxBuiltinSubgroupShuffleDown; + } else if (Name.consume_front("shuffle_xor")) { + if (!IsSubgroupOp) { + return std::nullopt; + } + ID = eMuxBuiltinSubgroupShuffleXor; + } else if (Name.consume_front("shuffle")) { + if (!IsSubgroupOp) { + return std::nullopt; + } + ID = eMuxBuiltinSubgroupShuffle; + } else if (Name.consume_front("reduce_")) { + auto NextIdx = Name.find_first_of('_'); + std::string Group = Name.substr(0, NextIdx).str(); + Name = Name.drop_front(Group.size()); + + if (Group == "logical") { + Name = Name.drop_front(); // Drop the underscore + auto NextIdx = Name.find_first_of('_'); + auto RealGroup = Name.substr(0, NextIdx); + Group += "_" + RealGroup.str(); + Name = Name.drop_front(RealGroup.size()); + } + + ID = StringSwitch>(Group) + .Case("add", SCOPED_GROUP_OP(ReduceAdd)) + .Case("fadd", SCOPED_GROUP_OP(ReduceFAdd)) + .Case("mul", SCOPED_GROUP_OP(ReduceMul)) + .Case("fmul", SCOPED_GROUP_OP(ReduceFMul)) + .Case("smin", SCOPED_GROUP_OP(ReduceSMin)) + .Case("umin", SCOPED_GROUP_OP(ReduceUMin)) + .Case("fmin", SCOPED_GROUP_OP(ReduceFMin)) + .Case("smax", SCOPED_GROUP_OP(ReduceSMax)) + .Case("umax", SCOPED_GROUP_OP(ReduceUMax)) + .Case("fmax", SCOPED_GROUP_OP(ReduceFMax)) + .Case("and", SCOPED_GROUP_OP(ReduceAnd)) + .Case("or", SCOPED_GROUP_OP(ReduceOr)) + .Case("xor", SCOPED_GROUP_OP(ReduceXor)) + .Case("logical_and", SCOPED_GROUP_OP(ReduceLogicalAnd)) + .Case("logical_or", SCOPED_GROUP_OP(ReduceLogicalOr)) + .Case("logical_xor", SCOPED_GROUP_OP(ReduceLogicalXor)) + .Default(std::nullopt); + } else if (Name.consume_front("scan_")) { + const bool IsInclusive = Name.consume_front("inclusive_"); + if (!IsInclusive && !Name.consume_front("exclusive_")) { + return std::nullopt; + } + + auto NextIdx = Name.find_first_of('_'); + std::string Group = Name.substr(0, NextIdx).str(); + Name = 
Name.drop_front(Group.size()); + + if (Group == "logical") { + auto NextIdx = Name.find_first_of('_', /*From*/ 1); + auto RealGroup = Name.substr(0, NextIdx); + Group += RealGroup.str(); + Name = Name.drop_front(RealGroup.size()); + } + + ID = StringSwitch>(Group) + .Case("add", IsInclusive ? SCOPED_GROUP_OP(ScanAddInclusive) + : SCOPED_GROUP_OP(ScanAddExclusive)) + .Case("fadd", IsInclusive ? SCOPED_GROUP_OP(ScanFAddInclusive) + : SCOPED_GROUP_OP(ScanFAddExclusive)) + .Case("mul", IsInclusive ? SCOPED_GROUP_OP(ScanMulInclusive) + : SCOPED_GROUP_OP(ScanMulExclusive)) + .Case("fmul", IsInclusive ? SCOPED_GROUP_OP(ScanFMulInclusive) + : SCOPED_GROUP_OP(ScanFMulExclusive)) + .Case("smin", IsInclusive ? SCOPED_GROUP_OP(ScanSMinInclusive) + : SCOPED_GROUP_OP(ScanSMinExclusive)) + .Case("umin", IsInclusive ? SCOPED_GROUP_OP(ScanUMinInclusive) + : SCOPED_GROUP_OP(ScanUMinExclusive)) + .Case("fmin", IsInclusive ? SCOPED_GROUP_OP(ScanFMinInclusive) + : SCOPED_GROUP_OP(ScanFMinExclusive)) + .Case("smax", IsInclusive ? SCOPED_GROUP_OP(ScanSMaxInclusive) + : SCOPED_GROUP_OP(ScanSMaxExclusive)) + .Case("umax", IsInclusive ? SCOPED_GROUP_OP(ScanUMaxInclusive) + : SCOPED_GROUP_OP(ScanUMaxExclusive)) + .Case("fmax", IsInclusive ? SCOPED_GROUP_OP(ScanFMaxInclusive) + : SCOPED_GROUP_OP(ScanFMaxExclusive)) + .Case("and", IsInclusive ? SCOPED_GROUP_OP(ScanAndInclusive) + : SCOPED_GROUP_OP(ScanAndExclusive)) + .Case("or", IsInclusive ? SCOPED_GROUP_OP(ScanOrInclusive) + : SCOPED_GROUP_OP(ScanOrExclusive)) + .Case("xor", IsInclusive ? SCOPED_GROUP_OP(ScanXorInclusive) + : SCOPED_GROUP_OP(ScanXorExclusive)) + .Case("logical_and", + IsInclusive ? SCOPED_GROUP_OP(ScanLogicalAndInclusive) + : SCOPED_GROUP_OP(ScanLogicalAndExclusive)) + .Case("logical_or", IsInclusive + ? SCOPED_GROUP_OP(ScanLogicalOrInclusive) + : SCOPED_GROUP_OP(ScanLogicalOrExclusive)) + .Case("logical_xor", + IsInclusive ? SCOPED_GROUP_OP(ScanLogicalXorInclusive) + : SCOPED_GROUP_OP(ScanLogicalXorExclusive)) + .Default(std::nullopt); + } + if (!ID) { + return std::nullopt; + } + + std::vector OverloadInfo; + + // Consume the rest of this group Op function name. If we can't identify a + // series of mangled type names, this builtin is invalid. + unsigned NumMangledArgs = 0; + // Work-group builtins have an unmangled 'barrier ID' parameter first, which + // we want to skip. + const unsigned Offset = ID >= eFirstMuxWorkgroupCollectiveBuiltin && + ID <= eLastMuxWorkgroupCollectiveBuiltin; + while (!Name.empty()) { + if (!Name.consume_front("_")) { + return std::nullopt; + } + auto [Ty, NewName] = getDemangledTypeFromStr(Name, F.getContext()); + Name = NewName; + + auto ParamIdx = Offset + NumMangledArgs; + if (ParamIdx >= F.arg_size() || Ty != F.getArg(ParamIdx)->getType()) { + return std::nullopt; + } + + ++NumMangledArgs; + OverloadInfo.push_back(Ty); + } + if (NumMangledArgs != NumExpectedMangledArgs) { + return std::nullopt; + } + + return {{*ID, OverloadInfo}}; +#undef SCOPED_GROUP_OP +} + +BuiltinUniformity BuiltinInfo::isBuiltinUniform(const Builtin &B, + const CallInst *CI, + unsigned SimdDimIdx) const { + switch (B.ID) { + default: + break; + case eMuxBuiltinGetGlobalId: + case eMuxBuiltinGetLocalId: { + // We need to know the dimension requested from these builtins at compile + // time to infer their uniformity. 
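+    // For example (illustrative), when vectorizing along dimension 0,
+    // get_global_id(0) yields the per-work-item instance ID, whereas
+    // get_global_id(1) returns the same value for every work-item in the
+    // packet and is treated as uniform.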
+ if (!CI || CI->arg_empty()) { + return eBuiltinUniformityNever; + } + auto *Rank = dyn_cast(CI->getArgOperand(0)); + if (!Rank) { + // The Rank is some function, which "might" evaluate to zero + // sometimes, so we let the packetizer sort it out with some + // conditional magic. + // TODO Make sure this can never go haywire in weird edge cases. + // Where we have one get_global_id() dependent on another, this is + // not packetized correctly. Doing so is very hard! We should + // probably just fail to packetize in this case. We might also be + // able to return eBuiltinUniformityNever here, in cases where we can + // prove that the value can never be zero. + return eBuiltinUniformityMaybeInstanceID; + } + // Only vectorize on selected dimension. The value of get_global_id with + // other ranks is uniform. + if (Rank->getZExtValue() == SimdDimIdx) { + return eBuiltinUniformityInstanceID; + } + + return eBuiltinUniformityAlways; + } + case eMuxBuiltinGetSubGroupLocalId: + return eBuiltinUniformityInstanceID; + case eMuxBuiltinGetLocalLinearId: + case eMuxBuiltinGetGlobalLinearId: + // TODO: This is fine for vectorizing in the x-axis, but currently we do + // not support vectorizing along y or z. + return SimdDimIdx ? eBuiltinUniformityNever : eBuiltinUniformityInstanceID; + } + + // Reductions and broadcasts are always uniform + if (auto Info = isMuxGroupCollective(B.ID)) { + if (Info->isAnyAll() || Info->isReduction() || Info->isBroadcast()) { + return eBuiltinUniformityAlways; + } + } + + if (LangImpl) { + return LangImpl->isBuiltinUniform(B, CI, SimdDimIdx); + } + return eBuiltinUniformityUnknown; +} + +std::optional BuiltinInfo::analyzeBuiltin(const Function &F) const { + // Handle LLVM intrinsics. + if (F.isIntrinsic()) { + int32_t Properties = eBuiltinPropertyNone; + + const Intrinsic::ID IntrID = (Intrinsic::ID)F.getIntrinsicID(); + const AttributeList AS = multi_llvm::Intrinsic::getAttributes( + F.getContext(), IntrID, F.getFunctionType()); + const bool NoSideEffect = F.onlyReadsMemory(); + bool SafeIntrinsic = false; + switch (IntrID) { + default: + SafeIntrinsic = false; + break; + case Intrinsic::smin: + case Intrinsic::smax: + case Intrinsic::umin: + case Intrinsic::umax: + case Intrinsic::abs: + case Intrinsic::ctlz: + case Intrinsic::cttz: + case Intrinsic::sqrt: + case Intrinsic::sin: + case Intrinsic::cos: + case Intrinsic::pow: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::log: + case Intrinsic::log10: + case Intrinsic::log2: + case Intrinsic::fma: + case Intrinsic::fabs: + case Intrinsic::minnum: + case Intrinsic::maxnum: + case Intrinsic::copysign: + case Intrinsic::floor: + case Intrinsic::ceil: + case Intrinsic::trunc: + case Intrinsic::rint: + case Intrinsic::nearbyint: + case Intrinsic::round: + case Intrinsic::ctpop: + case Intrinsic::fmuladd: + case Intrinsic::fshl: + case Intrinsic::fshr: + case Intrinsic::sadd_sat: + case Intrinsic::uadd_sat: + case Intrinsic::ssub_sat: + case Intrinsic::usub_sat: + case Intrinsic::bitreverse: + // All these function are overloadable and have both scalar and vector + // versions. 
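+      // For instance, llvm.sqrt.f32 has the vector form llvm.sqrt.v4f32, so
+      // such calls can be scalarized or widened freely by the vectorizer.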
+ Properties |= eBuiltinPropertyVectorEquivalent; + SafeIntrinsic = true; + break; + case Intrinsic::assume: + case Intrinsic::dbg_declare: + case Intrinsic::dbg_value: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::objectsize: + case Intrinsic::ptr_annotation: + case Intrinsic::var_annotation: + case Intrinsic::experimental_noalias_scope_decl: + SafeIntrinsic = true; + break; + case Intrinsic::memset: + case Intrinsic::memcpy: + Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertySideEffects; + break; + } + if (NoSideEffect || SafeIntrinsic) { + Properties |= eBuiltinPropertyNoSideEffects; + if (!AS.hasFnAttr(Attribute::NoDuplicate)) { + Properties |= eBuiltinPropertySupportsInstantiation; + } + } + return Builtin{F, eBuiltinUnknown, (BuiltinProperties)Properties}; + } + + auto MB = identifyMuxBuiltin(F); + if (!MB) { + // It's not a Mux builtin, so defer to the language implementation + if (LangImpl) { + return LangImpl->analyzeBuiltin(F); + } + return std::nullopt; + } + + auto [ID, OverloadInfo] = *MB; + + // Check that all overloadable builtins have returned some overloading + // information, for API consistency. + assert((!isOverloadableMuxBuiltinID(ID) || !OverloadInfo.empty()) && + "Inconsistency in overloadable builtin APIs"); + + bool IsConvergent = false; + unsigned Properties = eBuiltinPropertyNone; + switch (ID) { + default: + break; + case eMuxBuiltinMemBarrier: + Properties = eBuiltinPropertySideEffects; + break; + case eMuxBuiltinSubGroupBarrier: + case eMuxBuiltinWorkGroupBarrier: + IsConvergent = true; + Properties = eBuiltinPropertyExecutionFlow | eBuiltinPropertySideEffects; + break; + case eMuxBuiltinDMARead1D: + case eMuxBuiltinDMARead2D: + case eMuxBuiltinDMARead3D: + case eMuxBuiltinDMAWrite1D: + case eMuxBuiltinDMAWrite2D: + case eMuxBuiltinDMAWrite3D: + case eMuxBuiltinDMAWait: + // Our DMA builtins, by default, rely on thread checks against specific + // work-item IDs, so they must be convergent. + IsConvergent = true; + Properties = eBuiltinPropertyNoSideEffects; + break; + case eMuxBuiltinGetWorkDim: + case eMuxBuiltinGetGroupId: + case eMuxBuiltinGetGlobalSize: + case eMuxBuiltinGetGlobalOffset: + case eMuxBuiltinGetLocalSize: + case eMuxBuiltinGetNumGroups: + case eMuxBuiltinGetGlobalLinearId: + case eMuxBuiltinGetLocalLinearId: + case eMuxBuiltinGetGlobalId: + case eMuxBuiltinGetSubGroupLocalId: + Properties = eBuiltinPropertyWorkItem | eBuiltinPropertyRematerializable; + break; + case eMuxBuiltinGetLocalId: + Properties = eBuiltinPropertyWorkItem | eBuiltinPropertyLocalID | + eBuiltinPropertyRematerializable; + break; + case eMuxBuiltinIsFTZ: + case eMuxBuiltinIsEmbeddedProfile: + case eMuxBuiltinUseFast: + Properties = eBuiltinPropertyNoSideEffects; + break; + } + + // Group functions are convergent. 
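+  // (Convergent operations communicate between work-items, so they must not
+  // be duplicated, removed, or moved across divergent control flow.)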
+ if (isMuxGroupCollective(ID)) { + IsConvergent = true; + } + + if (!IsConvergent) { + Properties |= eBuiltinPropertyKnownNonConvergent; + } + + return Builtin{F, ID, (BuiltinProperties)Properties, OverloadInfo}; +} + +std::optional +BuiltinInfo::analyzeBuiltinCall(const CallInst &CI, unsigned SimdDimIdx) const { + if (auto *const callee = dyn_cast(CI.getCalledOperand())) { + if (const auto B = analyzeBuiltin(*callee)) { + const auto U = isBuiltinUniform(*B, &CI, SimdDimIdx); + return BuiltinCall{*B, CI, U}; + } + } + return std::nullopt; +} + +Function *BuiltinInfo::getVectorEquivalent(const Builtin &B, unsigned Width, + Module *M) { + // We don't handle LLVM intrinsics here + if (B.function.isIntrinsic()) { + return nullptr; + } + + if (LangImpl) { + return LangImpl->getVectorEquivalent(B, Width, M); + } + return nullptr; +} + +Function *BuiltinInfo::getScalarEquivalent(const Builtin &B, Module *M) { + // We will first check to see if this is an LLVM intrinsic that has a scalar + // equivalent. + if (B.function.isIntrinsic()) { + // Analyze the builtin. Some functions have no scalar equivalent. + const auto Props = B.properties; + if (!(Props & eBuiltinPropertyVectorEquivalent)) { + return nullptr; + } + + // Check the return type. + auto *VecRetTy = dyn_cast(B.function.getReturnType()); + if (!VecRetTy) { + return nullptr; + } + + auto IntrinsicID = B.function.getIntrinsicID(); + // Currently, we can only handle correctly intrinsics that have one + // overloaded type, used for both the return type and all of the arguments. + // TODO: More generic support for intrinsics with vector equivalents. + for (Type *ArgTy : B.function.getFunctionType()->params()) { + // If the argument isn't a vector, then it isn't going to get scalarized, + // so don't worry about it. + if (ArgTy->isVectorTy() && ArgTy != VecRetTy) { + return nullptr; + } + } + Type *ScalarType = VecRetTy->getElementType(); + // Get the scalar version of the intrinsic + Function *ScalarIntrinsic = + Intrinsic::getOrInsertDeclaration(M, IntrinsicID, ScalarType); + + return ScalarIntrinsic; + } + + if (LangImpl) { + return LangImpl->getScalarEquivalent(B, M); + } + return nullptr; +} + +Value *BuiltinInfo::emitBuiltinInline(Function *Builtin, IRBuilder<> &B, + ArrayRef Args) { + if (LangImpl) { + return LangImpl->emitBuiltinInline(Builtin, B, Args); + } + return nullptr; +} + +std::optional BuiltinInfo::getBuiltinRange( + CallInst &CI, std::array, 3> MaxLocalSizes, + std::array, 3> MaxGlobalSizes) const { + auto *F = CI.getCalledFunction(); + // Ranges only apply to integer types, and ensure that there's a named + // function to analyze. + if (!F || !F->hasName() || !CI.getType()->isIntegerTy()) { + return std::nullopt; + } + + // First, check mux builtins + if (auto MB = identifyMuxBuiltin(*F); MB && isMuxBuiltinID(MB->first)) { + return MuxImpl->getBuiltinRange(CI, MB->first, MaxLocalSizes, + MaxGlobalSizes); + } + + // Next, ask the language builtin info + if (LangImpl) { + return LangImpl->getBuiltinRange(CI, MaxLocalSizes, MaxGlobalSizes); + } + + return std::nullopt; +} + +Instruction *BuiltinInfo::lowerBuiltinToMuxBuiltin(CallInst &CI) { + if (LangImpl) { + return LangImpl->lowerBuiltinToMuxBuiltin(CI, *MuxImpl); + } + // We shouldn't be mapping mux builtins to mux builtins, so we can stop here. 
+ return nullptr; +} + +std::optional BuiltinInfo::getPrintfBuiltin() const { + if (LangImpl) { + return LangImpl->getPrintfBuiltin(); + } + return std::nullopt; +} + +bool BuiltinInfo::requiresSchedulingParameters(BuiltinID ID) { + // Defer to mux for the scheduling parameters. + return MuxImpl->requiresSchedulingParameters(ID); +} + +Type *BuiltinInfo::getRemappedTargetExtTy(Type *Ty, Module &M) { + // Defer to mux for the scheduling parameters. + return MuxImpl->getRemappedTargetExtTy(Ty, M); +} + +SmallVector +BuiltinInfo::getMuxSchedulingParameters(Module &M) { + // Defer to mux for the scheduling parameters. + return MuxImpl->getMuxSchedulingParameters(M); +} + +SmallVector +BuiltinInfo::getFunctionSchedulingParameters(Function &F) { + // Defer to mux for the scheduling parameters. + return MuxImpl->getFunctionSchedulingParameters(F); +} + +Value *BuiltinInfo::initializeSchedulingParamForWrappedKernel( + const SchedParamInfo &Info, IRBuilder<> &B, Function &IntoF, + Function &CalleeF) { + return MuxImpl->initializeSchedulingParamForWrappedKernel(Info, B, IntoF, + CalleeF); +} + +// This provides an extremely simple mangling scheme matching LLVM's intrinsic +// mangling system. It is only designed to be used with a specific set of types +// and is not a general-purpose mangler. +std::string BuiltinInfo::getMangledTypeStr(Type *Ty) { + std::string Result; + if (VectorType *VTy = dyn_cast(Ty)) { + const ElementCount EC = VTy->getElementCount(); + if (EC.isScalable()) { + Result += "nx"; + } + return "v" + utostr(EC.getKnownMinValue()) + + getMangledTypeStr(VTy->getElementType()); + } + + if (Ty) { + switch (Ty->getTypeID()) { + default: + break; + case Type::HalfTyID: + return "f16"; + case Type::BFloatTyID: + return "bf16"; + case Type::FloatTyID: + return "f32"; + case Type::DoubleTyID: + return "f64"; + case Type::IntegerTyID: + return "i" + utostr(cast(Ty)->getBitWidth()); + } + } + llvm_unreachable("Unhandled type"); +} + +std::pair +BuiltinInfo::getDemangledTypeFromStr(StringRef TyStr, LLVMContext &Ctx) { + const bool IsScalable = TyStr.consume_front("nx"); + if (TyStr.consume_front("v")) { + unsigned EC; + if (TyStr.consumeInteger(10, EC)) { + return {nullptr, TyStr}; + } + if (auto [EltTy, NewTyStr] = getDemangledTypeFromStr(TyStr, Ctx); EltTy) { + return {VectorType::get(EltTy, EC, IsScalable), NewTyStr}; + } + return {nullptr, TyStr}; + } + if (TyStr.consume_front("f16")) { + return {Type::getHalfTy(Ctx), TyStr}; + } + if (TyStr.consume_front("bf16")) { + return {Type::getBFloatTy(Ctx), TyStr}; + } + if (TyStr.consume_front("f32")) { + return {Type::getFloatTy(Ctx), TyStr}; + } + if (TyStr.consume_front("f64")) { + return {Type::getDoubleTy(Ctx), TyStr}; + } + unsigned IntBitWidth; + if (TyStr.consume_front("i") && !TyStr.consumeInteger(10, IntBitWidth)) { + return {IntegerType::get(Ctx, IntBitWidth), TyStr}; + } + + return {nullptr, TyStr}; +} + +std::string BuiltinInfo::getMuxBuiltinName(BuiltinID ID, + ArrayRef OverloadInfo) { + assert(isMuxBuiltinID(ID)); + switch (ID) { + default: + break; + case eMuxBuiltinIsFTZ: + return MuxBuiltins::isftz; + case eMuxBuiltinUseFast: + return MuxBuiltins::usefast; + case eMuxBuiltinIsEmbeddedProfile: + return MuxBuiltins::isembeddedprofile; + case eMuxBuiltinGetGlobalSize: + return MuxBuiltins::get_global_size; + case eMuxBuiltinGetGlobalId: + return MuxBuiltins::get_global_id; + case eMuxBuiltinGetGlobalOffset: + return MuxBuiltins::get_global_offset; + case eMuxBuiltinGetLocalSize: + return MuxBuiltins::get_local_size; + case 
eMuxBuiltinGetLocalId: + return MuxBuiltins::get_local_id; + case eMuxBuiltinSetLocalId: + return MuxBuiltins::set_local_id; + case eMuxBuiltinGetSubGroupId: + return MuxBuiltins::get_sub_group_id; + case eMuxBuiltinSetSubGroupId: + return MuxBuiltins::set_sub_group_id; + case eMuxBuiltinGetNumGroups: + return MuxBuiltins::get_num_groups; + case eMuxBuiltinGetNumSubGroups: + return MuxBuiltins::get_num_sub_groups; + case eMuxBuiltinSetNumSubGroups: + return MuxBuiltins::set_num_sub_groups; + case eMuxBuiltinGetMaxSubGroupSize: + return MuxBuiltins::get_max_sub_group_size; + case eMuxBuiltinSetMaxSubGroupSize: + return MuxBuiltins::set_max_sub_group_size; + case eMuxBuiltinGetGroupId: + return MuxBuiltins::get_group_id; + case eMuxBuiltinGetWorkDim: + return MuxBuiltins::get_work_dim; + case eMuxBuiltinDMARead1D: + return MuxBuiltins::dma_read_1d; + case eMuxBuiltinDMARead2D: + return MuxBuiltins::dma_read_2d; + case eMuxBuiltinDMARead3D: + return MuxBuiltins::dma_read_3d; + case eMuxBuiltinDMAWrite1D: + return MuxBuiltins::dma_write_1d; + case eMuxBuiltinDMAWrite2D: + return MuxBuiltins::dma_write_2d; + case eMuxBuiltinDMAWrite3D: + return MuxBuiltins::dma_write_3d; + case eMuxBuiltinDMAWait: + return MuxBuiltins::dma_wait; + case eMuxBuiltinGetGlobalLinearId: + return MuxBuiltins::get_global_linear_id; + case eMuxBuiltinGetLocalLinearId: + return MuxBuiltins::get_local_linear_id; + case eMuxBuiltinGetEnqueuedLocalSize: + return MuxBuiltins::get_enqueued_local_size; + case eMuxBuiltinGetSubGroupSize: + return MuxBuiltins::get_sub_group_size; + case eMuxBuiltinGetSubGroupLocalId: + return MuxBuiltins::get_sub_group_local_id; + case eMuxBuiltinMemBarrier: + return MuxBuiltins::mem_barrier; + case eMuxBuiltinWorkGroupBarrier: + return MuxBuiltins::work_group_barrier; + case eMuxBuiltinSubGroupBarrier: + return MuxBuiltins::sub_group_barrier; + } + + // A sneaky macro to do case statements on all scopes of a group operation. + // Note that it is missing a leading 'case' and a trailing ':' to trick + // clang-format into formatting it like a regular case statement. +#define CASE_GROUP_OP_ALL_SCOPES(OP) \ + eMuxBuiltinVecgroup##OP : case eMuxBuiltinSubgroup##OP: \ + case eMuxBuiltinWorkgroup##OP + + std::string BaseName = [](BuiltinID ID) { + // For simplicity, return all group operations as 'work_group' and replace + // the string with 'sub_group' or 'vec_group' post-hoc. 
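+    // As a worked example (illustrative): eMuxBuiltinSubgroupReduceAdd on a
+    // v4i32 operand first yields "__mux_work_group_reduce_add" here; the
+    // scope fix-up below rewrites it to "__mux_sub_group_reduce_add", and the
+    // mangling suffix then gives "__mux_sub_group_reduce_add_v4i32".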
+ switch (ID) { + default: + return ""; + case CASE_GROUP_OP_ALL_SCOPES(All): + return "__mux_work_group_all"; + case CASE_GROUP_OP_ALL_SCOPES(Any): + return "__mux_work_group_any"; + case CASE_GROUP_OP_ALL_SCOPES(Broadcast): + return "__mux_work_group_broadcast"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceAdd): + return "__mux_work_group_reduce_add"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFAdd): + return "__mux_work_group_reduce_fadd"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceSMin): + return "__mux_work_group_reduce_smin"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceUMin): + return "__mux_work_group_reduce_umin"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMin): + return "__mux_work_group_reduce_fmin"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceSMax): + return "__mux_work_group_reduce_smax"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceUMax): + return "__mux_work_group_reduce_umax"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMax): + return "__mux_work_group_reduce_fmax"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceMul): + return "__mux_work_group_reduce_mul"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMul): + return "__mux_work_group_reduce_fmul"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceAnd): + return "__mux_work_group_reduce_and"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceOr): + return "__mux_work_group_reduce_or"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceXor): + return "__mux_work_group_reduce_xor"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalAnd): + return "__mux_work_group_reduce_logical_and"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalOr): + return "__mux_work_group_reduce_logical_or"; + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalXor): + return "__mux_work_group_reduce_logical_xor"; + case CASE_GROUP_OP_ALL_SCOPES(ScanAddInclusive): + return "__mux_work_group_scan_inclusive_add"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFAddInclusive): + return "__mux_work_group_scan_inclusive_fadd"; + case CASE_GROUP_OP_ALL_SCOPES(ScanAddExclusive): + return "__mux_work_group_scan_exclusive_add"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFAddExclusive): + return "__mux_work_group_scan_exclusive_fadd"; + case CASE_GROUP_OP_ALL_SCOPES(ScanSMinInclusive): + return "__mux_work_group_scan_inclusive_smin"; + case CASE_GROUP_OP_ALL_SCOPES(ScanUMinInclusive): + return "__mux_work_group_scan_inclusive_umin"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFMinInclusive): + return "__mux_work_group_scan_inclusive_fmin"; + case CASE_GROUP_OP_ALL_SCOPES(ScanSMinExclusive): + return "__mux_work_group_scan_exclusive_smin"; + case CASE_GROUP_OP_ALL_SCOPES(ScanUMinExclusive): + return "__mux_work_group_scan_exclusive_umin"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFMinExclusive): + return "__mux_work_group_scan_exclusive_fmin"; + case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxInclusive): + return "__mux_work_group_scan_inclusive_smax"; + case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxInclusive): + return "__mux_work_group_scan_inclusive_umax"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxInclusive): + return "__mux_work_group_scan_inclusive_fmax"; + case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxExclusive): + return "__mux_work_group_scan_exclusive_smax"; + case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxExclusive): + return "__mux_work_group_scan_exclusive_umax"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxExclusive): + return "__mux_work_group_scan_exclusive_fmax"; + case CASE_GROUP_OP_ALL_SCOPES(ScanMulInclusive): + return "__mux_work_group_scan_inclusive_mul"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFMulInclusive): + return "__mux_work_group_scan_inclusive_fmul"; + case CASE_GROUP_OP_ALL_SCOPES(ScanMulExclusive): + return 
"__mux_work_group_scan_exclusive_mul"; + case CASE_GROUP_OP_ALL_SCOPES(ScanFMulExclusive): + return "__mux_work_group_scan_exclusive_fmul"; + case CASE_GROUP_OP_ALL_SCOPES(ScanAndInclusive): + return "__mux_work_group_scan_inclusive_and"; + case CASE_GROUP_OP_ALL_SCOPES(ScanAndExclusive): + return "__mux_work_group_scan_exclusive_and"; + case CASE_GROUP_OP_ALL_SCOPES(ScanOrInclusive): + return "__mux_work_group_scan_inclusive_or"; + case CASE_GROUP_OP_ALL_SCOPES(ScanOrExclusive): + return "__mux_work_group_scan_exclusive_or"; + case CASE_GROUP_OP_ALL_SCOPES(ScanXorInclusive): + return "__mux_work_group_scan_inclusive_xor"; + case CASE_GROUP_OP_ALL_SCOPES(ScanXorExclusive): + return "__mux_work_group_scan_exclusive_xor"; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndInclusive): + return "__mux_work_group_scan_inclusive_logical_and"; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndExclusive): + return "__mux_work_group_scan_exclusive_logical_and"; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrInclusive): + return "__mux_work_group_scan_inclusive_logical_or"; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrExclusive): + return "__mux_work_group_scan_exclusive_logical_or"; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorInclusive): + return "__mux_work_group_scan_inclusive_logical_xor"; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorExclusive): + return "__mux_work_group_scan_exclusive_logical_xor"; + case eMuxBuiltinSubgroupShuffle: + return "__mux_work_group_shuffle"; + case eMuxBuiltinSubgroupShuffleUp: + return "__mux_work_group_shuffle_up"; + case eMuxBuiltinSubgroupShuffleDown: + return "__mux_work_group_shuffle_down"; + case eMuxBuiltinSubgroupShuffleXor: + return "__mux_work_group_shuffle_xor"; + } + }(ID); + + if (!BaseName.empty()) { + assert(!OverloadInfo.empty() && + "Must know how to overload group operation"); + if (ID >= eFirstMuxSubgroupCollectiveBuiltin && + ID <= eLastMuxSubgroupCollectiveBuiltin) { + // Replace 'work' with 'sub' + BaseName = BaseName.replace(6, 4, "sub"); + } else if (ID >= eFirstMuxVecgroupCollectiveBuiltin && + ID <= eLastMuxVecgroupCollectiveBuiltin) { + // Replace 'work' with 'vec' + BaseName = BaseName.replace(6, 4, "vec"); + } + auto *const Ty = OverloadInfo.front(); + return BaseName + "_" + getMangledTypeStr(Ty); + } + llvm_unreachable("Unhandled mux builtin"); +#undef CASE_GROUP_OP_ALL_SCOPES +} + +Function *BuiltinInfo::defineMuxBuiltin(BuiltinID ID, Module &M, + ArrayRef OverloadInfo) { + assert(isMuxBuiltinID(ID) && "Only handling mux builtins"); + // Check that all overloadable builtins have returned some overloading + // information, for API consistency. + assert((!isOverloadableMuxBuiltinID(ID) || !OverloadInfo.empty()) && + "Inconsistency in overloadable builtin APIs"); + + Function *F = M.getFunction(getMuxBuiltinName(ID, OverloadInfo)); + // FIXME: We'd ideally want to declare it here to reduce pass + // inter-dependencies. + assert(F && "Function should have been pre-declared"); + if (!F->isDeclaration()) { + return F; + } + // Defer to the mux implementation to define this builtin. + return MuxImpl->defineMuxBuiltin(ID, M, OverloadInfo); +} + +Function *BuiltinInfo::getOrDeclareMuxBuiltin(BuiltinID ID, Module &M, + ArrayRef OverloadInfo) { + assert(isMuxBuiltinID(ID) && "Only handling mux builtins"); + // Check that all overloadable builtins have returned some overloading + // information, for API consistency. 
+ assert((!isOverloadableMuxBuiltinID(ID) || !OverloadInfo.empty()) && + "Inconsistency in overloadable builtin APIs"); + // Defer to the mux implementation to get/declare this builtin. + return MuxImpl->getOrDeclareMuxBuiltin(ID, M, OverloadInfo); +} + +std::optional BuiltinInfo::isMuxGroupCollective(BuiltinID ID) { + GroupCollective Collective; + + if (ID >= eFirstMuxSubgroupCollectiveBuiltin && + ID <= eLastMuxSubgroupCollectiveBuiltin) { + Collective.Scope = GroupCollective::ScopeKind::SubGroup; + } else if (ID >= eFirstMuxWorkgroupCollectiveBuiltin && + ID <= eLastMuxWorkgroupCollectiveBuiltin) { + Collective.Scope = GroupCollective::ScopeKind::WorkGroup; + } else if (ID >= eFirstMuxVecgroupCollectiveBuiltin && + ID <= eLastMuxVecgroupCollectiveBuiltin) { + Collective.Scope = GroupCollective::ScopeKind::VectorGroup; + } else { + return std::nullopt; + } + + // A sneaky macro to do case statements on all scopes of a group operation. + // Note that it is missing a leading 'case' and a trailing ':' to trick + // clang-format into formatting it like a regular case statement. +#define CASE_GROUP_OP_ALL_SCOPES(OP) \ + eMuxBuiltinVecgroup##OP : case eMuxBuiltinSubgroup##OP: \ + case eMuxBuiltinWorkgroup##OP + + switch (ID) { + default: + llvm_unreachable("Unhandled mux group builtin"); + case CASE_GROUP_OP_ALL_SCOPES(All): + Collective.Op = GroupCollective::OpKind::All; + break; + case CASE_GROUP_OP_ALL_SCOPES(Any): + Collective.Op = GroupCollective::OpKind::Any; + break; + case CASE_GROUP_OP_ALL_SCOPES(Broadcast): + Collective.Op = GroupCollective::OpKind::Broadcast; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalAnd): + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalOr): + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalXor): + Collective.IsLogical = true; + [[fallthrough]]; + case CASE_GROUP_OP_ALL_SCOPES(ReduceAdd): + case CASE_GROUP_OP_ALL_SCOPES(ReduceFAdd): + case CASE_GROUP_OP_ALL_SCOPES(ReduceMul): + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMul): + case CASE_GROUP_OP_ALL_SCOPES(ReduceSMin): + case CASE_GROUP_OP_ALL_SCOPES(ReduceUMin): + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMin): + case CASE_GROUP_OP_ALL_SCOPES(ReduceSMax): + case CASE_GROUP_OP_ALL_SCOPES(ReduceUMax): + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMax): + case CASE_GROUP_OP_ALL_SCOPES(ReduceAnd): + case CASE_GROUP_OP_ALL_SCOPES(ReduceOr): + case CASE_GROUP_OP_ALL_SCOPES(ReduceXor): + Collective.Op = GroupCollective::OpKind::Reduction; + break; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorInclusive): + Collective.IsLogical = true; + [[fallthrough]]; + case CASE_GROUP_OP_ALL_SCOPES(ScanAddInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFAddInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanMulInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMulInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMinInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMinInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMinInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanAndInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanOrInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanXorInclusive): + Collective.Op = GroupCollective::OpKind::ScanInclusive; + break; + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrExclusive): + case 
CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorExclusive): + Collective.IsLogical = true; + [[fallthrough]]; + case CASE_GROUP_OP_ALL_SCOPES(ScanAddExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFAddExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanMulExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMulExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMinExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMinExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMinExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanAndExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanOrExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanXorExclusive): + Collective.Op = GroupCollective::OpKind::ScanExclusive; + break; + case eMuxBuiltinSubgroupShuffle: + Collective.Op = GroupCollective::OpKind::Shuffle; + break; + case eMuxBuiltinSubgroupShuffleUp: + Collective.Op = GroupCollective::OpKind::ShuffleUp; + break; + case eMuxBuiltinSubgroupShuffleDown: + Collective.Op = GroupCollective::OpKind::ShuffleDown; + break; + case eMuxBuiltinSubgroupShuffleXor: + Collective.Op = GroupCollective::OpKind::ShuffleXor; + break; + } + + // Then the recurrence kind. + if (Collective.Op == GroupCollective::OpKind::All) { + Collective.Recurrence = RecurKind::And; + } else if (Collective.Op == GroupCollective::OpKind::Any) { + Collective.Recurrence = RecurKind::Or; + } else if (Collective.Op == GroupCollective::OpKind::Reduction || + Collective.Op == GroupCollective::OpKind::ScanExclusive || + Collective.Op == GroupCollective::OpKind::ScanInclusive) { + switch (ID) { + case CASE_GROUP_OP_ALL_SCOPES(ReduceAdd): + case CASE_GROUP_OP_ALL_SCOPES(ScanAddInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanAddExclusive): + Collective.Recurrence = RecurKind::Add; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFAdd): + case CASE_GROUP_OP_ALL_SCOPES(ScanFAddInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFAddExclusive): + Collective.Recurrence = RecurKind::FAdd; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceMul): + case CASE_GROUP_OP_ALL_SCOPES(ScanMulInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanMulExclusive): + Collective.Recurrence = RecurKind::Mul; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMul): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMulInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMulExclusive): + Collective.Recurrence = RecurKind::FMul; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceSMin): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMinInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMinExclusive): + Collective.Recurrence = RecurKind::SMin; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceUMin): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMinInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMinExclusive): + Collective.Recurrence = RecurKind::UMin; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMin): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMinInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMinExclusive): + Collective.Recurrence = RecurKind::FMin; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceSMax): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanSMaxExclusive): + Collective.Recurrence = RecurKind::SMax; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceUMax): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanUMaxExclusive): + Collective.Recurrence = RecurKind::UMax; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceFMax): + case 
CASE_GROUP_OP_ALL_SCOPES(ScanFMaxInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanFMaxExclusive): + Collective.Recurrence = RecurKind::FMax; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceAnd): + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalAnd): + case CASE_GROUP_OP_ALL_SCOPES(ScanAndInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanAndExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalAndExclusive): + Collective.Recurrence = RecurKind::And; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceOr): + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalOr): + case CASE_GROUP_OP_ALL_SCOPES(ScanOrInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanOrExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalOrExclusive): + Collective.Recurrence = RecurKind::Or; + break; + case CASE_GROUP_OP_ALL_SCOPES(ReduceXor): + case CASE_GROUP_OP_ALL_SCOPES(ReduceLogicalXor): + case CASE_GROUP_OP_ALL_SCOPES(ScanXorInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanXorExclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorInclusive): + case CASE_GROUP_OP_ALL_SCOPES(ScanLogicalXorExclusive): + Collective.Recurrence = RecurKind::Xor; + break; + default: + llvm_unreachable("Unhandled mux group operation"); + } + } else if (!Collective.isBroadcast() && !Collective.isShuffleLike()) { + llvm_unreachable("Unhandled mux group operation"); + } + + return Collective; +#undef CASE_GROUP_OP_ALL_SCOPES +} + +std::optional +BuiltinInfo::getMuxGroupCollective(const GroupCollective &Group) { +#define SIMPLE_SCOPE_SWITCH(OP) \ + do { \ + switch (Group.Scope) { \ + case GroupCollective::ScopeKind::SubGroup: \ + return eMuxBuiltinSubgroup##OP; \ + case GroupCollective::ScopeKind::WorkGroup: \ + return eMuxBuiltinWorkgroup##OP; \ + case GroupCollective::ScopeKind::VectorGroup: \ + return eMuxBuiltinVecgroup##OP; \ + } \ + llvm_unreachable("Impossible scope kind"); \ + } while (0) + +#define COMPLEX_SCOPE_SWITCH(OP, SUFFIX) \ + do { \ + switch (Group.Recurrence) { \ + default: \ + llvm_unreachable("Unhandled recursion kind"); \ + case RecurKind::Add: \ + SIMPLE_SCOPE_SWITCH(OP##Add##SUFFIX); \ + case RecurKind::Mul: \ + SIMPLE_SCOPE_SWITCH(OP##Mul##SUFFIX); \ + case RecurKind::FAdd: \ + SIMPLE_SCOPE_SWITCH(OP##FAdd##SUFFIX); \ + case RecurKind::FMul: \ + SIMPLE_SCOPE_SWITCH(OP##FMul##SUFFIX); \ + case RecurKind::SMin: \ + SIMPLE_SCOPE_SWITCH(OP##SMin##SUFFIX); \ + case RecurKind::UMin: \ + SIMPLE_SCOPE_SWITCH(OP##UMin##SUFFIX); \ + case RecurKind::FMin: \ + SIMPLE_SCOPE_SWITCH(OP##FMin##SUFFIX); \ + case RecurKind::SMax: \ + SIMPLE_SCOPE_SWITCH(OP##SMax##SUFFIX); \ + case RecurKind::UMax: \ + SIMPLE_SCOPE_SWITCH(OP##UMax##SUFFIX); \ + case RecurKind::FMax: \ + SIMPLE_SCOPE_SWITCH(OP##FMax##SUFFIX); \ + case RecurKind::And: \ + if (Group.IsLogical) { \ + SIMPLE_SCOPE_SWITCH(OP##LogicalAnd##SUFFIX); \ + } else { \ + SIMPLE_SCOPE_SWITCH(OP##And##SUFFIX); \ + } \ + case RecurKind::Or: \ + if (Group.IsLogical) { \ + SIMPLE_SCOPE_SWITCH(OP##LogicalOr##SUFFIX); \ + } else { \ + SIMPLE_SCOPE_SWITCH(OP##Or##SUFFIX); \ + } \ + case RecurKind::Xor: \ + if (Group.IsLogical) { \ + SIMPLE_SCOPE_SWITCH(OP##LogicalXor##SUFFIX); \ + } else { \ + SIMPLE_SCOPE_SWITCH(OP##Xor##SUFFIX); \ + } \ + } \ + } while (0) + + switch (Group.Op) { + case GroupCollective::OpKind::All: + SIMPLE_SCOPE_SWITCH(All); + case GroupCollective::OpKind::Any: + SIMPLE_SCOPE_SWITCH(Any); + case GroupCollective::OpKind::Broadcast: + SIMPLE_SCOPE_SWITCH(Broadcast); + case 
GroupCollective::OpKind::Reduction: + COMPLEX_SCOPE_SWITCH(Reduce, ); + case GroupCollective::OpKind::ScanExclusive: + COMPLEX_SCOPE_SWITCH(Scan, Exclusive); + case GroupCollective::OpKind::ScanInclusive: + COMPLEX_SCOPE_SWITCH(Scan, Inclusive); + break; + case GroupCollective::OpKind::Shuffle: + case GroupCollective::OpKind::ShuffleUp: + case GroupCollective::OpKind::ShuffleDown: + case GroupCollective::OpKind::ShuffleXor: + if (!Group.isSubGroupScope()) { + break; + } + switch (Group.Op) { + default: + llvm_unreachable("Unhandled op"); + case GroupCollective::OpKind::Shuffle: + return eMuxBuiltinSubgroupShuffle; + case GroupCollective::OpKind::ShuffleUp: + return eMuxBuiltinSubgroupShuffleUp; + case GroupCollective::OpKind::ShuffleDown: + return eMuxBuiltinSubgroupShuffleDown; + case GroupCollective::OpKind::ShuffleXor: + return eMuxBuiltinSubgroupShuffleXor; + } + } + return std::nullopt; +#undef COMPLEX_SCOPE_SWITCH +#undef SCOPE_SWITCH +} + +bool BuiltinInfo::isOverloadableMuxBuiltinID(BuiltinID ID) { + if (!isMuxBuiltinID(ID)) { + return false; + } + switch (ID) { + default: + return isMuxGroupCollective(ID).has_value(); + case eMuxBuiltinDMARead1D: + case eMuxBuiltinDMAWrite1D: + case eMuxBuiltinDMARead2D: + case eMuxBuiltinDMAWrite2D: + case eMuxBuiltinDMARead3D: + case eMuxBuiltinDMAWrite3D: + return true; + } +} + +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp new file mode 100644 index 0000000000000..20b934795d20c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/cl_builtin_info.cpp @@ -0,0 +1,3654 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// For compatibility with the Android NDK, we need to use the C ilogb function. +namespace stdcompat { +#ifdef __ANDROID__ +// Note: This function accepts double only as its argument +using ::ilogb; +#else +using std::ilogb; +#endif // __ANDROID__ +} // namespace stdcompat + +namespace { +/// @brief Identifiers for recognized OpenCL builtins. +enum CLBuiltinID : compiler::utils::BuiltinID { + // Non-standard Builtin Functions + /// @brief Internal builtin 'convert_half_to_float'. + eCLBuiltinConvertHalfToFloat = compiler::utils::eFirstTargetBuiltin, + /// @brief Internal builtin 'convert_float_to_half'. 
+ eCLBuiltinConvertFloatToHalf, + /// @brief Internal builtin 'convert_float_to_half_rte' + eCLBuiltinConvertFloatToHalfRte, + /// @brief Internal builtin 'convert_float_to_half_rtz' + eCLBuiltinConvertFloatToHalfRtz, + /// @brief Internal builtin 'convert_float_to_half_rtp' + eCLBuiltinConvertFloatToHalfRtp, + /// @brief Internal builtin 'convert_float_to_half_rtn' + eCLBuiltinConvertFloatToHalfRtn, + /// @brief Internal builtin 'convert_half_to_double'. + eCLBuiltinConvertHalfToDouble, + /// @brief Internal builtin 'convert_double_to_half'. + eCLBuiltinConvertDoubleToHalf, + /// @brief Internal builtin 'convert_double_to_half_rte' + eCLBuiltinConvertDoubleToHalfRte, + /// @brief Internal builtin 'convert_double_to_half_rtz' + eCLBuiltinConvertDoubleToHalfRtz, + /// @brief Internal builtin 'convert_double_to_half_rtp' + eCLBuiltinConvertDoubleToHalfRtp, + /// @brief Internal builtin 'convert_double_to_half_rtn' + eCLBuiltinConvertDoubleToHalfRtn, + + // 6.2.3 Explicit Conversions + /// @brief OpenCL builtin `convert_char` + eCLBuiltinConvertChar, + /// @brief OpenCL builtin `convert_short` + eCLBuiltinConvertShort, + /// @brief OpenCL builtin `convert_int` + eCLBuiltinConvertInt, + /// @brief OpenCL builtin `convert_long` + eCLBuiltinConvertLong, + /// @brief OpenCL builtin `convert_uchar` + eCLBuiltinConvertUChar, + /// @brief OpenCL builtin `convert_ushort` + eCLBuiltinConvertUShort, + /// @brief OpenCL builtin `convert_uint` + eCLBuiltinConvertUInt, + /// @brief OpenCL builtin `convert_ulong` + eCLBuiltinConvertULong, + + // 6.12.1 Work-Item Functions + /// @brief OpenCL builtin 'get_work_dim'. + eCLBuiltinGetWorkDim, + /// @brief OpenCL builtin 'get_group_id'. + eCLBuiltinGetGroupId, + /// @brief OpenCL builtin 'get_global_size'. + eCLBuiltinGetGlobalSize, + /// @brief OpenCL builtin 'get_global_offset'. + eCLBuiltinGetGlobalOffset, + /// @brief OpenCL builtin 'get_local_id'. + eCLBuiltinGetLocalId, + /// @brief OpenCL builtin 'get_local_size'. + eCLBuiltinGetLocalSize, + /// @brief OpenCL builtin 'get_enqueued_local_size'. + eCLBuiltinGetEnqueuedLocalSize, + /// @brief OpenCL builtin 'get_num_groups'. + eCLBuiltinGetNumGroups, + /// @brief OpenCL builtin 'get_global_id'. + eCLBuiltinGetGlobalId, + /// @brief OpenCL builtin 'get_local_linear_id' (OpenCL >= 2.0). + eCLBuiltinGetLocalLinearId, + /// @brief OpenCL builtin 'get_global_linear_id' (OpenCL >= 2.0). + eCLBuiltinGetGlobalLinearId, + /// @brief OpenCL builtin 'get_sub_group_local_id' (OpenCL >= 3.0). + eCLBuiltinGetSubgroupLocalId, + /// @brief OpenCL builtin 'get_sub_group_size' (OpenCL >= 3.0). + eCLBuiltinGetSubgroupSize, + /// @brief OpenCL builtin 'get_max_sub_group_size' (OpenCL >= 3.0). + eCLBuiltinGetMaxSubgroupSize, + /// @brief OpenCL builtin 'get_num_sub_groups' (OpenCL >= 3.0). + eCLBuiltinGetNumSubgroups, + /// @brief OpenCL builtin 'get_enqueued_num_sub_groups' (OpenCL >= 3.0). + eCLBuiltinGetEnqueuedNumSubgroups, + /// @brief OpenCL builtin 'get_sub_group_id' (OpenCL >= 3.0). + eCLBuiltinGetSubgroupId, + + // 6.12.2 Math Functions + /// @brief OpenCL builtin 'fmax'. + eCLBuiltinFMax, + /// @brief OpenCL builtin 'fmin'. + eCLBuiltinFMin, + /// @brief OpenCL builtin 'fract'. + eCLBuiltinFract, + /// @brief OpenCL builtin 'frexp'. + eCLBuiltinFrexp, + /// @brief OpenCL builtin 'lgamma_r'. + eCLBuiltinLGammaR, + /// @brief OpenCL builtin 'modf'. + eCLBuiltinModF, + /// @brief OpenCL builtin 'sincos'. + eCLBuiltinSinCos, + /// @brief OpenCL builtin 'remquo'. 
+ eCLBuiltinRemquo, + + // 6.12.3 Integer Functions + /// @brief OpenCL builtin 'add_sat'. + eCLBuiltinAddSat, + /// @brief OpenCL builtin 'sub_sat'. + eCLBuiltinSubSat, + + // 6.12.5 Geometric Built-in Functions + /// @brief OpenCL builtin 'dot'. + eCLBuiltinDot, + /// @brief OpenCL builtin 'cross'. + eCLBuiltinCross, + /// @brief OpenCL builtin 'length'. + eCLBuiltinLength, + /// @brief OpenCL builtin 'distance'. + eCLBuiltinDistance, + /// @brief OpenCL builtin 'normalize'. + eCLBuiltinNormalize, + /// @brief OpenCL builtin 'fast_length'. + eCLBuiltinFastLength, + /// @brief OpenCL builtin 'fast_distance'. + eCLBuiltinFastDistance, + /// @brief OpenCL builtin 'fast_normalize'. + eCLBuiltinFastNormalize, + + // 6.12.6 Relational Functions + /// @brief OpenCL builtin 'all'. + eCLBuiltinAll, + /// @brief OpenCL builtin 'any'. + eCLBuiltinAny, + /// @brief OpenCL builtin 'isequal'. + eCLBuiltinIsEqual, + /// @brief OpenCL builtin 'isnotequal'. + eCLBuiltinIsNotEqual, + /// @brief OpenCL builtin 'isgreater'. + eCLBuiltinIsGreater, + /// @brief OpenCL builtin 'isgreaterequal'. + eCLBuiltinIsGreaterEqual, + /// @brief OpenCL builtin 'isless'. + eCLBuiltinIsLess, + /// @brief OpenCL builtin 'islessequal'. + eCLBuiltinIsLessEqual, + /// @brief OpenCL builtin 'islessgreater'. + eCLBuiltinIsLessGreater, + /// @brief OpenCL builtin 'isordered'. + eCLBuiltinIsOrdered, + /// @brief OpenCL builtin 'isunordered'. + eCLBuiltinIsUnordered, + /// @brief OpenCL builtin 'isfinite'. + eCLBuiltinIsFinite, + /// @brief OpenCL builtin 'isinf'. + eCLBuiltinIsInf, + /// @brief OpenCL builtin 'isnan'. + eCLBuiltinIsNan, + /// @brief OpenCL builtin 'isnormal'. + eCLBuiltinIsNormal, + /// @brief OpenCL builtin 'signbit'. + eCLBuiltinSignBit, + /// @brief OpenCL builtin `select`. + eCLBuiltinSelect, + + // 6.12.8 Synchronization Functions + /// @brief OpenCL builtin 'barrier'. + eCLBuiltinBarrier, + /// @brief OpenCL builtin 'mem_fence'. + eCLBuiltinMemFence, + /// @brief OpenCL builtin 'read_mem_fence'. + eCLBuiltinReadMemFence, + /// @brief OpenCL builtin 'write_mem_fence'. + eCLBuiltinWriteMemFence, + /// @brief OpenCL builtin 'atomic_work_item_fence'. + eCLBuiltinAtomicWorkItemFence, + /// @brief OpenCL builtin 'sub_group_barrier'. + eCLBuiltinSubGroupBarrier, + /// @brief OpenCL builtin 'work_group_barrier'. + eCLBuiltinWorkGroupBarrier, + + // 6.12.10 Async Copies and Prefetch Functions + /// @brief OpenCL builtin 'async_work_group_copy'. + eCLBuiltinAsyncWorkGroupCopy, + /// @brief OpenCL builtin 'async_work_group_strided_copy'. + eCLBuiltinAsyncWorkGroupStridedCopy, + /// @brief OpenCL builtin 'wait_group_events'. + eCLBuiltinWaitGroupEvents, + /// @brief OpenCL builtin 'async_work_group_copy_2D2D'. + eCLBuiltinAsyncWorkGroupCopy2D2D, + /// @brief OpenCL builtin 'async_work_group_copy_3D3D'. + eCLBuiltinAsyncWorkGroupCopy3D3D, + + // 6.12.11 Atomic Functions + /// @brief OpenCL builtins 'atomic_add', 'atom_add'. + eCLBuiltinAtomicAdd, + /// @brief OpenCL builtins 'atomic_sub', 'atom_sub'. + eCLBuiltinAtomicSub, + /// @brief OpenCL builtins 'atomic_xchg', 'atom_xchg'. + eCLBuiltinAtomicXchg, + /// @brief OpenCL builtins 'atomic_inc', 'atom_inc'. + eCLBuiltinAtomicInc, + /// @brief OpenCL builtins 'atomic_dec', 'atom_dec'. + eCLBuiltinAtomicDec, + /// @brief OpenCL builtins 'atomic_cmpxchg', 'atom_cmpxchg'. + eCLBuiltinAtomicCmpxchg, + /// @brief OpenCL builtins 'atomic_min', 'atom_min'. + eCLBuiltinAtomicMin, + /// @brief OpenCL builtins 'atomic_max', 'atom_max'. 
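+  // Note that each legacy 'atom_*' name (from the OpenCL 1.0
+  // cl_khr_*_atomics extensions) shares one ID with its core 'atomic_*'
+  // spelling, so a single lowering path serves both; in the name table
+  // further down this appears as, e.g.:
+  //   {eCLBuiltinAtomicAdd, "atom_add"},
+  //   {eCLBuiltinAtomicAdd, "atomic_add"},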
+ eCLBuiltinAtomicMax, + /// @brief OpenCL builtins 'atomic_and', 'atom_and'. + eCLBuiltinAtomicAnd, + /// @brief OpenCL builtins 'atomic_or', 'atom_or'. + eCLBuiltinAtomicOr, + /// @brief OpenCL builtins 'atomic_xor', 'atom_xor'. + eCLBuiltinAtomicXor, + + // 6.12.12 Miscellaneous Vector Functions + eCLBuiltinShuffle, + eCLBuiltinShuffle2, + + // 6.12.13 printf + /// @brief OpenCL builtin 'printf'. + eCLBuiltinPrintf, + + // 6.15.16 Work-group Collective Functions + /// @brief OpenCL builtin 'work_group_all'. + eCLBuiltinWorkgroupAll, + /// @brief OpenCL builtin 'work_group_any'. + eCLBuiltinWorkgroupAny, + /// @brief OpenCL builtin 'work_group_broadcast'. + eCLBuiltinWorkgroupBroadcast, + /// @brief OpenCL builtin 'work_group_reduce_add'. + eCLBuiltinWorkgroupReduceAdd, + /// @brief OpenCL builtin 'work_group_reduce_min'. + eCLBuiltinWorkgroupReduceMin, + /// @brief OpenCL builtin 'work_group_reduce_max'. + eCLBuiltinWorkgroupReduceMax, + /// @brief OpenCL builtin 'work_group_scan_inclusive_add'. + eCLBuiltinWorkgroupScanAddInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_add'. + eCLBuiltinWorkgroupScanAddExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_min'. + eCLBuiltinWorkgroupScanMinInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_min'. + eCLBuiltinWorkgroupScanMinExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_max'. + eCLBuiltinWorkgroupScanMaxInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_max'. + eCLBuiltinWorkgroupScanMaxExclusive, + + /// @brief OpenCL builtin 'work_group_reduce_mul'. + eCLBuiltinWorkgroupReduceMul, + /// @brief OpenCL builtin 'work_group_reduce_and'. + eCLBuiltinWorkgroupReduceAnd, + /// @brief OpenCL builtin 'work_group_reduce_or'. + eCLBuiltinWorkgroupReduceOr, + /// @brief OpenCL builtin 'work_group_reduce_xor'. + eCLBuiltinWorkgroupReduceXor, + /// @brief OpenCL builtin 'work_group_reduce_logical_and'. + eCLBuiltinWorkgroupReduceLogicalAnd, + /// @brief OpenCL builtin 'work_group_reduce_logical_or'. + eCLBuiltinWorkgroupReduceLogicalOr, + /// @brief OpenCL builtin 'work_group_reduce_logical_xor'. + eCLBuiltinWorkgroupReduceLogicalXor, + /// @brief OpenCL builtin 'work_group_scan_inclusive_mul'. + eCLBuiltinWorkgroupScanMulInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_mul'. + eCLBuiltinWorkgroupScanMulExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_and'. + eCLBuiltinWorkgroupScanAndInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_and'. + eCLBuiltinWorkgroupScanAndExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_or'. + eCLBuiltinWorkgroupScanOrInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_or'. + eCLBuiltinWorkgroupScanOrExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_xor'. + eCLBuiltinWorkgroupScanXorInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_xor'. + eCLBuiltinWorkgroupScanXorExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_logical_and'. + eCLBuiltinWorkgroupScanLogicalAndInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_logical_and'. + eCLBuiltinWorkgroupScanLogicalAndExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_logical_or'. + eCLBuiltinWorkgroupScanLogicalOrInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_logical_or'. + eCLBuiltinWorkgroupScanLogicalOrExclusive, + /// @brief OpenCL builtin 'work_group_scan_inclusive_logical_xor'. 
+ eCLBuiltinWorkgroupScanLogicalXorInclusive, + /// @brief OpenCL builtin 'work_group_scan_exclusive_logical_xor'. + eCLBuiltinWorkgroupScanLogicalXorExclusive, + + // 6.15.19 Subgroup Collective Functions + /// @brief OpenCL builtin 'sub_group_all'. + eCLBuiltinSubgroupAll, + /// @brief OpenCL builtin 'sub_group_any'. + eCLBuiltinSubgroupAny, + /// @brief OpenCL builtin 'sub_group_broadcast'. + eCLBuiltinSubgroupBroadcast, + /// @brief OpenCL builtin 'sub_group_reduce_add'. + eCLBuiltinSubgroupReduceAdd, + /// @brief OpenCL builtin 'sub_group_reduce_min'. + eCLBuiltinSubgroupReduceMin, + /// @brief OpenCL builtin 'sub_group_reduce_max'. + eCLBuiltinSubgroupReduceMax, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_add'. + eCLBuiltinSubgroupScanAddInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_add'. + eCLBuiltinSubgroupScanAddExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_min'. + eCLBuiltinSubgroupScanMinInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_min'. + eCLBuiltinSubgroupScanMinExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_max'. + eCLBuiltinSubgroupScanMaxInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_max'. + eCLBuiltinSubgroupScanMaxExclusive, + + /// @brief OpenCL builtin 'sub_group_reduce_mul'. + eCLBuiltinSubgroupReduceMul, + /// @brief OpenCL builtin 'sub_group_reduce_and'. + eCLBuiltinSubgroupReduceAnd, + /// @brief OpenCL builtin 'sub_group_reduce_or'. + eCLBuiltinSubgroupReduceOr, + /// @brief OpenCL builtin 'sub_group_reduce_xor'. + eCLBuiltinSubgroupReduceXor, + /// @brief OpenCL builtin 'sub_group_reduce_logical_and'. + eCLBuiltinSubgroupReduceLogicalAnd, + /// @brief OpenCL builtin 'sub_group_reduce_logical_or'. + eCLBuiltinSubgroupReduceLogicalOr, + /// @brief OpenCL builtin 'sub_group_reduce_logical_xor'. + eCLBuiltinSubgroupReduceLogicalXor, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_mul'. + eCLBuiltinSubgroupScanMulInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_mul'. + eCLBuiltinSubgroupScanMulExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_and'. + eCLBuiltinSubgroupScanAndInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_and'. + eCLBuiltinSubgroupScanAndExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_or'. + eCLBuiltinSubgroupScanOrInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_or'. + eCLBuiltinSubgroupScanOrExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_xor'. + eCLBuiltinSubgroupScanXorInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_xor'. + eCLBuiltinSubgroupScanXorExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_logical_and'. + eCLBuiltinSubgroupScanLogicalAndInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_logical_and'. + eCLBuiltinSubgroupScanLogicalAndExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_logical_or'. + eCLBuiltinSubgroupScanLogicalOrInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_logical_or'. + eCLBuiltinSubgroupScanLogicalOrExclusive, + /// @brief OpenCL builtin 'sub_group_scan_inclusive_logical_xor'. + eCLBuiltinSubgroupScanLogicalXorInclusive, + /// @brief OpenCL builtin 'sub_group_scan_exclusive_logical_xor'. 
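+  // Inclusive scans include the current work-item's own value, while
+  // exclusive scans start from the operation's identity. For example, for
+  // sub-group values {1, 2, 3, 4}:
+  //   sub_group_scan_inclusive_add -> {1, 3, 6, 10}
+  //   sub_group_scan_exclusive_add -> {0, 1, 3, 6}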
+ eCLBuiltinSubgroupScanLogicalXorExclusive, + + // 6.12.7 Vector Data Load and Store Functions + eCLBuiltinVLoad, + eCLBuiltinVLoadHalf, + eCLBuiltinVStore, + eCLBuiltinVStoreHalf, + + // 6.3 Conversions & Type Casting Examples + eCLBuiltinAs, +}; +} // namespace + +namespace { +using namespace llvm; +using namespace compiler::utils; + +// Returns whether the given integer is a valid vector width in OpenCL. +// Matches 2, 3, 4, 8, 16. +bool isValidVecWidth(unsigned w) { + return (w == 3 || (w >= 2 && w <= 16 && llvm::isPowerOf2_32(w))); +} + +/// @brief Copy global variables to a module on demand. +class GlobalValueMaterializer final : public llvm::ValueMaterializer { +public: + /// @brief Create a new global variable materializer. + /// @param[in] M Module to materialize the variables in. + GlobalValueMaterializer(Module &M) : DestM(M) {} + + /// @brief List of variables created during materialization. + const std::vector<GlobalVariable *> &variables() const { return Variables; } + + /// @brief Materialize the given value. + /// + /// @param[in] V Value to materialize. + /// + /// @return A value that lives in the destination module, or nullptr if the + /// given value could not be materialized (e.g. it is not a global variable). + Value *materialize(Value *V) override final { + GlobalVariable *GV = dyn_cast<GlobalVariable>(V); + if (!GV) { + return nullptr; + } + GlobalVariable *NewGV = DestM.getGlobalVariable(GV->getName()); + if (!NewGV) { + NewGV = new GlobalVariable( + DestM, GV->getValueType(), GV->isConstant(), GV->getLinkage(), + (Constant *)nullptr, GV->getName(), (GlobalVariable *)nullptr, + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); + NewGV->copyAttributesFrom(GV); + Variables.push_back(GV); + } + return NewGV; + } + +private: + /// @brief Module to materialize variables in. + Module &DestM; + /// @brief Materialized variables. + std::vector<GlobalVariable *> Variables; +}; +} // namespace + +namespace compiler { +namespace utils { +using namespace llvm; + +std::unique_ptr createCLBuiltinInfo(Module *Builtins) { + return std::make_unique<CLBuiltinInfo>(Builtins); +} + +CLBuiltinInfo::CLBuiltinInfo(Module *builtins) + : Loader(std::make_unique(builtins)) {} + +CLBuiltinInfo::~CLBuiltinInfo() = default; + +/// @brief Create a call instruction to the given builtin and set the correct +/// calling convention. +/// +/// This function is intended as a helper function for creating calls to +/// builtins. For each call generated we need to set the calling convention +/// manually, which can lead to code bloat. This function will create the call +/// instruction and then it will either copy the calling convention for the +/// called function (if possible) or set it to the default value of spir_func. +/// +/// @param[in] B The IRBuilder to use when creating the CallInst +/// @param[in] Builtin The Function to call +/// @param[in] Args The call arguments +/// @param[in] NameStr The name for the new CallInst +/// @return The newly emitted CallInst +static CallInst *CreateBuiltinCall(IRBuilder<> &B, Function *Builtin, + ArrayRef<Value *> Args, + const Twine &NameStr = "") { + CallInst *CI = + B.CreateCall(Builtin->getFunctionType(), Builtin, Args, NameStr); + CI->setCallingConv(Builtin->getCallingConv()); + return CI; +} + +struct CLBuiltinEntry { + /// @brief Identifier for the builtin function. + BuiltinID ID; + /// @brief OpenCL name of the builtin function. + const char *OpenCLFnName; + /// @brief Minimum OpenCL version that supports this builtin. + uint32_t MinVer = OpenCLC10; +}; + +/// @brief Information about known OpenCL builtins. 
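+// Each entry pairs a builtin ID with its OpenCL C name and, optionally, the
+// minimum OpenCL C version that provides the name; identifyBuiltin() only
+// matches an entry when the module's OpenCL version is at least MinVer.
+// For example:
+//   {eCLBuiltinGetGlobalId, "get_global_id"},              // any version
+//   {eCLBuiltinWorkgroupAll, "work_group_all", OpenCLC20}, // 2.0 onwards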
+static constexpr CLBuiltinEntry Builtins[] = { + // Non-standard Builtin Functions + {eCLBuiltinConvertHalfToFloat, "convert_half_to_float"}, + {eCLBuiltinConvertFloatToHalf, "convert_float_to_half"}, + {eCLBuiltinConvertFloatToHalfRte, "convert_float_to_half_rte"}, + {eCLBuiltinConvertFloatToHalfRtz, "convert_float_to_half_rtz"}, + {eCLBuiltinConvertFloatToHalfRtp, "convert_float_to_half_rtp"}, + {eCLBuiltinConvertFloatToHalfRtn, "convert_float_to_half_rtn"}, + {eCLBuiltinConvertHalfToDouble, "convert_half_to_double"}, + {eCLBuiltinConvertDoubleToHalf, "convert_double_to_half"}, + {eCLBuiltinConvertDoubleToHalfRte, "convert_double_to_half_rte"}, + {eCLBuiltinConvertDoubleToHalfRtz, "convert_double_to_half_rtz"}, + {eCLBuiltinConvertDoubleToHalfRtp, "convert_double_to_half_rtp"}, + {eCLBuiltinConvertDoubleToHalfRtn, "convert_double_to_half_rtn"}, + + // 6.2.3 Explicit Conversions + {eCLBuiltinConvertChar, "convert_char"}, + {eCLBuiltinConvertShort, "convert_short"}, + {eCLBuiltinConvertInt, "convert_int"}, + {eCLBuiltinConvertLong, "convert_long"}, + {eCLBuiltinConvertUChar, "convert_uchar"}, + {eCLBuiltinConvertUShort, "convert_ushort"}, + {eCLBuiltinConvertUInt, "convert_uint"}, + {eCLBuiltinConvertULong, "convert_ulong"}, + + // 6.12.1 Work-Item Functions + {eCLBuiltinGetWorkDim, "get_work_dim"}, + {eCLBuiltinGetGroupId, "get_group_id"}, + {eCLBuiltinGetGlobalSize, "get_global_size"}, + {eCLBuiltinGetGlobalOffset, "get_global_offset"}, + {eCLBuiltinGetLocalId, "get_local_id"}, + {eCLBuiltinGetLocalSize, "get_local_size"}, + {eCLBuiltinGetEnqueuedLocalSize, "get_enqueued_local_size"}, + {eCLBuiltinGetNumGroups, "get_num_groups"}, + {eCLBuiltinGetGlobalId, "get_global_id"}, + {eCLBuiltinGetLocalLinearId, "get_local_linear_id", OpenCLC20}, + {eCLBuiltinGetGlobalLinearId, "get_global_linear_id", OpenCLC20}, + {eCLBuiltinGetSubgroupLocalId, "get_sub_group_local_id", OpenCLC30}, + {eCLBuiltinGetSubgroupSize, "get_sub_group_size", OpenCLC30}, + {eCLBuiltinGetMaxSubgroupSize, "get_max_sub_group_size", OpenCLC30}, + {eCLBuiltinGetNumSubgroups, "get_num_sub_groups", OpenCLC30}, + {eCLBuiltinGetEnqueuedNumSubgroups, "get_enqueued_num_sub_groups", + OpenCLC30}, + {eCLBuiltinGetSubgroupId, "get_sub_group_id", OpenCLC30}, + + // 6.12.2 Math Functions + {eCLBuiltinFMax, "fmax"}, + {eCLBuiltinFMin, "fmin"}, + {eCLBuiltinFract, "fract"}, + {eCLBuiltinFrexp, "frexp"}, + {eCLBuiltinLGammaR, "lgamma_r"}, + {eCLBuiltinModF, "modf"}, + {eCLBuiltinSinCos, "sincos"}, + {eCLBuiltinRemquo, "remquo"}, + + // 6.12.3 Integer Functions + {eCLBuiltinAddSat, "add_sat"}, + {eCLBuiltinSubSat, "sub_sat"}, + + // 6.12.5 Geometric Functions + {eCLBuiltinDot, "dot"}, + {eCLBuiltinCross, "cross"}, + {eCLBuiltinLength, "length"}, + {eCLBuiltinDistance, "distance"}, + {eCLBuiltinNormalize, "normalize"}, + {eCLBuiltinFastLength, "fast_length"}, + {eCLBuiltinFastDistance, "fast_distance"}, + {eCLBuiltinFastNormalize, "fast_normalize"}, + + // 6.12.6 Relational Functions + {eCLBuiltinAll, "all"}, + {eCLBuiltinAny, "any"}, + {eCLBuiltinIsEqual, "isequal"}, + {eCLBuiltinIsNotEqual, "isnotequal"}, + {eCLBuiltinIsGreater, "isgreater"}, + {eCLBuiltinIsGreaterEqual, "isgreaterequal"}, + {eCLBuiltinIsLess, "isless"}, + {eCLBuiltinIsLessEqual, "islessequal"}, + {eCLBuiltinIsLessGreater, "islessgreater"}, + {eCLBuiltinIsOrdered, "isordered"}, + {eCLBuiltinIsUnordered, "isunordered"}, + {eCLBuiltinIsFinite, "isfinite"}, + {eCLBuiltinIsInf, "isinf"}, + {eCLBuiltinIsNan, "isnan"}, + {eCLBuiltinIsNormal, "isnormal"}, + {eCLBuiltinSignBit, 
"signbit"}, + {eCLBuiltinSelect, "select"}, + + // 6.12.8 Synchronization Functions + {eCLBuiltinBarrier, "barrier"}, + {eCLBuiltinMemFence, "mem_fence"}, + {eCLBuiltinReadMemFence, "read_mem_fence"}, + {eCLBuiltinWriteMemFence, "write_mem_fence"}, + {eCLBuiltinAtomicWorkItemFence, "atomic_work_item_fence", OpenCLC20}, + {eCLBuiltinSubGroupBarrier, "sub_group_barrier", OpenCLC30}, + {eCLBuiltinWorkGroupBarrier, "work_group_barrier", OpenCLC20}, + + // 6.12.10 Async Copies and Prefetch Functions + {eCLBuiltinAsyncWorkGroupCopy, "async_work_group_copy"}, + {eCLBuiltinAsyncWorkGroupStridedCopy, "async_work_group_strided_copy"}, + {eCLBuiltinWaitGroupEvents, "wait_group_events"}, + {eCLBuiltinAsyncWorkGroupCopy2D2D, "async_work_group_copy_2D2D"}, + {eCLBuiltinAsyncWorkGroupCopy3D3D, "async_work_group_copy_3D3D"}, + + // 6.12.11 Atomic Functions + {eCLBuiltinAtomicAdd, "atom_add"}, + {eCLBuiltinAtomicSub, "atom_sub"}, + {eCLBuiltinAtomicXchg, "atom_xchg"}, + {eCLBuiltinAtomicInc, "atom_inc"}, + {eCLBuiltinAtomicDec, "atom_dec"}, + {eCLBuiltinAtomicCmpxchg, "atom_cmpxchg"}, + {eCLBuiltinAtomicMin, "atom_min"}, + {eCLBuiltinAtomicMax, "atom_max"}, + {eCLBuiltinAtomicAnd, "atom_and"}, + {eCLBuiltinAtomicOr, "atom_or"}, + {eCLBuiltinAtomicXor, "atom_xor"}, + {eCLBuiltinAtomicAdd, "atomic_add"}, + {eCLBuiltinAtomicSub, "atomic_sub"}, + {eCLBuiltinAtomicXchg, "atomic_xchg"}, + {eCLBuiltinAtomicInc, "atomic_inc"}, + {eCLBuiltinAtomicDec, "atomic_dec"}, + {eCLBuiltinAtomicCmpxchg, "atomic_cmpxchg"}, + {eCLBuiltinAtomicMin, "atomic_min"}, + {eCLBuiltinAtomicMax, "atomic_max"}, + {eCLBuiltinAtomicAnd, "atomic_and"}, + {eCLBuiltinAtomicOr, "atomic_or"}, + {eCLBuiltinAtomicXor, "atomic_xor"}, + + // 6.11.12 Miscellaneous Vector Functions + {eCLBuiltinShuffle, "shuffle"}, + {eCLBuiltinShuffle2, "shuffle2"}, + + // 6.12.13 printf + {eCLBuiltinPrintf, "printf"}, + + // 6.15.16 Work-group Collective Functions + {eCLBuiltinWorkgroupAll, "work_group_all", OpenCLC20}, + {eCLBuiltinWorkgroupAny, "work_group_any", OpenCLC20}, + {eCLBuiltinWorkgroupBroadcast, "work_group_broadcast", OpenCLC20}, + {eCLBuiltinWorkgroupReduceAdd, "work_group_reduce_add", OpenCLC20}, + {eCLBuiltinWorkgroupReduceMin, "work_group_reduce_min", OpenCLC20}, + {eCLBuiltinWorkgroupReduceMax, "work_group_reduce_max", OpenCLC20}, + {eCLBuiltinWorkgroupScanAddInclusive, "work_group_scan_inclusive_add", + OpenCLC20}, + {eCLBuiltinWorkgroupScanAddExclusive, "work_group_scan_exclusive_add", + OpenCLC20}, + {eCLBuiltinWorkgroupScanMinInclusive, "work_group_scan_inclusive_min", + OpenCLC20}, + {eCLBuiltinWorkgroupScanMinExclusive, "work_group_scan_exclusive_min", + OpenCLC20}, + {eCLBuiltinWorkgroupScanMaxInclusive, "work_group_scan_inclusive_max", + OpenCLC20}, + {eCLBuiltinWorkgroupScanMaxExclusive, "work_group_scan_exclusive_max", + OpenCLC20}, + + /// Provided by SPV_KHR_uniform_group_instructions. 
+ {eCLBuiltinWorkgroupReduceMul, "work_group_reduce_mul", OpenCLC20}, + {eCLBuiltinWorkgroupReduceAnd, "work_group_reduce_and", OpenCLC20}, + {eCLBuiltinWorkgroupReduceOr, "work_group_reduce_or", OpenCLC20}, + {eCLBuiltinWorkgroupReduceXor, "work_group_reduce_xor", OpenCLC20}, + {eCLBuiltinWorkgroupReduceLogicalAnd, "work_group_reduce_logical_and", + OpenCLC20}, + {eCLBuiltinWorkgroupReduceLogicalOr, "work_group_reduce_logical_or", + OpenCLC20}, + {eCLBuiltinWorkgroupReduceLogicalXor, "work_group_reduce_logical_xor", + OpenCLC20}, + {eCLBuiltinWorkgroupScanMulInclusive, "work_group_scan_inclusive_mul", + OpenCLC20}, + {eCLBuiltinWorkgroupScanMulExclusive, "work_group_scan_exclusive_mul", + OpenCLC20}, + {eCLBuiltinWorkgroupScanAndInclusive, "work_group_scan_inclusive_and", + OpenCLC20}, + {eCLBuiltinWorkgroupScanAndExclusive, "work_group_scan_exclusive_and", + OpenCLC20}, + {eCLBuiltinWorkgroupScanOrInclusive, "work_group_scan_inclusive_or", + OpenCLC20}, + {eCLBuiltinWorkgroupScanOrExclusive, "work_group_scan_exclusive_or", + OpenCLC20}, + {eCLBuiltinWorkgroupScanXorInclusive, "work_group_scan_inclusive_xor", + OpenCLC20}, + {eCLBuiltinWorkgroupScanXorExclusive, "work_group_scan_exclusive_xor", + OpenCLC20}, + {eCLBuiltinWorkgroupScanLogicalAndInclusive, + "work_group_scan_inclusive_logical_and", OpenCLC20}, + {eCLBuiltinWorkgroupScanLogicalAndExclusive, + "work_group_scan_exclusive_logical_and", OpenCLC20}, + {eCLBuiltinWorkgroupScanLogicalOrInclusive, + "work_group_scan_inclusive_logical_or", OpenCLC20}, + {eCLBuiltinWorkgroupScanLogicalOrExclusive, + "work_group_scan_exclusive_logical_or", OpenCLC20}, + {eCLBuiltinWorkgroupScanLogicalXorInclusive, + "work_group_scan_inclusive_logical_xor", OpenCLC20}, + {eCLBuiltinWorkgroupScanLogicalXorExclusive, + "work_group_scan_exclusive_logical_xor", OpenCLC20}, + + // 6.15.19 Subgroup Collective Functions + {eCLBuiltinSubgroupAll, "sub_group_all", OpenCLC30}, + {eCLBuiltinSubgroupAny, "sub_group_any", OpenCLC30}, + {eCLBuiltinSubgroupBroadcast, "sub_group_broadcast", OpenCLC30}, + {eCLBuiltinSubgroupReduceAdd, "sub_group_reduce_add", OpenCLC30}, + {eCLBuiltinSubgroupReduceMin, "sub_group_reduce_min", OpenCLC30}, + {eCLBuiltinSubgroupReduceMax, "sub_group_reduce_max", OpenCLC30}, + {eCLBuiltinSubgroupScanAddInclusive, "sub_group_scan_inclusive_add", + OpenCLC30}, + {eCLBuiltinSubgroupScanAddExclusive, "sub_group_scan_exclusive_add", + OpenCLC30}, + {eCLBuiltinSubgroupScanMinInclusive, "sub_group_scan_inclusive_min", + OpenCLC30}, + {eCLBuiltinSubgroupScanMinExclusive, "sub_group_scan_exclusive_min", + OpenCLC30}, + {eCLBuiltinSubgroupScanMaxInclusive, "sub_group_scan_inclusive_max", + OpenCLC30}, + {eCLBuiltinSubgroupScanMaxExclusive, "sub_group_scan_exclusive_max", + OpenCLC30}, + /// Provided by SPV_KHR_uniform_group_instructions. 
+ {eCLBuiltinSubgroupReduceMul, "sub_group_reduce_mul", OpenCLC30}, + {eCLBuiltinSubgroupReduceAnd, "sub_group_reduce_and", OpenCLC30}, + {eCLBuiltinSubgroupReduceOr, "sub_group_reduce_or", OpenCLC30}, + {eCLBuiltinSubgroupReduceXor, "sub_group_reduce_xor", OpenCLC30}, + {eCLBuiltinSubgroupReduceLogicalAnd, "sub_group_reduce_logical_and", + OpenCLC30}, + {eCLBuiltinSubgroupReduceLogicalOr, "sub_group_reduce_logical_or", + OpenCLC30}, + {eCLBuiltinSubgroupReduceLogicalXor, "sub_group_reduce_logical_xor", + OpenCLC30}, + {eCLBuiltinSubgroupScanMulInclusive, "sub_group_scan_inclusive_mul", + OpenCLC30}, + {eCLBuiltinSubgroupScanMulExclusive, "sub_group_scan_exclusive_mul", + OpenCLC30}, + {eCLBuiltinSubgroupScanAndInclusive, "sub_group_scan_inclusive_and", + OpenCLC30}, + {eCLBuiltinSubgroupScanAndExclusive, "sub_group_scan_exclusive_and", + OpenCLC30}, + {eCLBuiltinSubgroupScanOrInclusive, "sub_group_scan_inclusive_or", + OpenCLC30}, + {eCLBuiltinSubgroupScanOrExclusive, "sub_group_scan_exclusive_or", + OpenCLC30}, + {eCLBuiltinSubgroupScanXorInclusive, "sub_group_scan_inclusive_xor", + OpenCLC30}, + {eCLBuiltinSubgroupScanXorExclusive, "sub_group_scan_exclusive_xor", + OpenCLC30}, + {eCLBuiltinSubgroupScanLogicalAndInclusive, + "sub_group_scan_inclusive_logical_and", OpenCLC30}, + {eCLBuiltinSubgroupScanLogicalAndExclusive, + "sub_group_scan_exclusive_logical_and", OpenCLC30}, + {eCLBuiltinSubgroupScanLogicalOrInclusive, + "sub_group_scan_inclusive_logical_or", OpenCLC30}, + {eCLBuiltinSubgroupScanLogicalOrExclusive, + "sub_group_scan_exclusive_logical_or", OpenCLC30}, + {eCLBuiltinSubgroupScanLogicalXorInclusive, + "sub_group_scan_inclusive_logical_xor", OpenCLC30}, + {eCLBuiltinSubgroupScanLogicalXorExclusive, + "sub_group_scan_exclusive_logical_xor", OpenCLC30}, + + {eBuiltinUnknown, nullptr}}; + +//////////////////////////////////////////////////////////////////////////////// + +Function *CLBuiltinInfo::declareBuiltin(Module *M, BuiltinID ID, Type *RetTy, + ArrayRef ArgTys, + ArrayRef ArgQuals, + Twine Suffix) { + // Determine the builtin function name. + if (!M) { + return nullptr; + } + std::string BuiltinName = getBuiltinName(ID).str(); + if (BuiltinName.empty()) { + return nullptr; + } + + // Add the optional suffix. + SmallVector SuffixVec; + Suffix.toVector(SuffixVec); + if (!SuffixVec.empty()) { + BuiltinName.append(SuffixVec.begin(), SuffixVec.end()); + } + + // Mangle the function name and look it up in the module. + NameMangler Mangler(&M->getContext()); + const std::string MangledName = + Mangler.mangleName(BuiltinName, ArgTys, ArgQuals); + Function *Builtin = M->getFunction(MangledName); + + // Declare the builtin if necessary. + if (!Builtin) { + FunctionType *FT = FunctionType::get(RetTy, ArgTys, false); + M->getOrInsertFunction(MangledName, FT); + Builtin = M->getFunction(MangledName); + Builtin->setCallingConv(CallingConv::SPIR_FUNC); + } + return Builtin; +} + +std::optional CLBuiltinInfo::getPrintfBuiltin() const { + return eCLBuiltinPrintf; +} + +Module *CLBuiltinInfo::getBuiltinsModule() { + if (!Loader) { + return nullptr; + } + return Loader->getBuiltinsModule(); +} + +Function *CLBuiltinInfo::materializeBuiltin(StringRef BuiltinName, + Module *DestM, + BuiltinMatFlags Flags) { + // First try to find the builtin in the target module. + if (DestM) { + Function *Builtin = DestM->getFunction(BuiltinName); + // If a builtin was found, it might be either a declaration or a definition. 
+ // If the definition flag (eBuiltinMatDefinition) is set, we cannot return + just a declaration. + if (Builtin && + (!(Flags & eBuiltinMatDefinition) || !Builtin->isDeclaration())) { + return Builtin; + } + } + + if (!Loader) { + return nullptr; + } + // Try to find the builtin in the builtins module. + return Loader->materializeBuiltin(BuiltinName, DestM, Flags); +} + +std::optional<BuiltinID> +CLBuiltinInfo::identifyBuiltin(const Function &F) const { + NameMangler Mangler(nullptr); + const StringRef Name = F.getName(); + const CLBuiltinEntry *entry = Builtins; + const auto Version = getOpenCLVersion(*F.getParent()); + const StringRef DemangledName = Mangler.demangleName(Name); + while (entry->ID != eBuiltinUnknown) { + if (Version >= entry->MinVer && DemangledName == entry->OpenCLFnName) { + return entry->ID; + } + entry++; + } + + if (DemangledName == Name) { + // The function name is not mangled and so it cannot be an OpenCL builtin. + return std::nullopt; + } + + Lexer L(Mangler.demangleName(Name)); + if (L.Consume("vload")) { + unsigned Width = 0; + if (L.Consume("_half")) { + // We have both `vload_half` and `vload_halfN` variants. + if (!L.ConsumeInteger(Width) || isValidVecWidth(Width)) { + // If there's nothing left to parse we're good to go. + if (!L.Left()) { + return eCLBuiltinVLoadHalf; + } + } + } else if (L.ConsumeInteger(Width) && !L.Left() && isValidVecWidth(Width)) { + // There are no scalar variants of this builtin. + return eCLBuiltinVLoad; + } + } else if (L.Consume("vstore")) { + unsigned Width = 0; + if (L.Consume("_half")) { + // We have both `vstore_half` and `vstore_halfN` variants. + if (!L.ConsumeInteger(Width) || isValidVecWidth(Width)) { + // Rounding modes are optional. + L.Consume("_rte") || L.Consume("_rtz") || L.Consume("_rtp") || + L.Consume("_rtn"); + + // If there's nothing left to parse we're good to go. + if (!L.Left()) { + return eCLBuiltinVStoreHalf; + } + } + } else if (L.ConsumeInteger(Width) && !L.Left() && isValidVecWidth(Width)) { + // There are no scalar variants of this builtin. + return eCLBuiltinVStore; + } + } else if (L.Consume("as_")) { + if (L.Consume("char") || L.Consume("uchar") || L.Consume("short") || + L.Consume("ushort") || L.Consume("int") || L.Consume("uint") || + L.Consume("long") || L.Consume("ulong") || L.Consume("float") || + L.Consume("double") || L.Consume("half")) { + unsigned Width = 0; + if (!L.ConsumeInteger(Width) || isValidVecWidth(Width)) { + if (!L.Left()) { + return eCLBuiltinAs; + } + } + } + } + + return eBuiltinUnknown; +} + +llvm::StringRef CLBuiltinInfo::getBuiltinName(BuiltinID ID) const { + const CLBuiltinEntry *entry = Builtins; + while (entry->ID != eBuiltinUnknown) { + if (ID == entry->ID) { + return entry->OpenCLFnName; + } + entry++; + } + return llvm::StringRef(); +} + +BuiltinUniformity CLBuiltinInfo::isBuiltinUniform(const Builtin &, + const CallInst *CI, + unsigned) const { + // Assume that builtins with side effects are varying. + if (Function *Callee = CI->getCalledFunction()) { + if (auto B = analyzeBuiltin(*Callee)) { + const auto Props = B->properties; + if (Props & eBuiltinPropertySideEffects) { + return eBuiltinUniformityNever; + } + } + } + + return eBuiltinUniformityLikeInputs; +} + +std::optional<Builtin> +CLBuiltinInfo::analyzeBuiltin(const Function &Callee) const { + const auto ID = identifyBuiltin(Callee); + if (!ID) { + return std::nullopt; + } + + bool IsConvergent = false; + unsigned Properties = eBuiltinPropertyNone; + switch (*ID) { + default: + // Assume convergence on unknown builtins.
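+    // For instance, an unrecognized 'sub_group_*' extension builtin could
+    // communicate between work-items; treating it as convergent stops later
+    // transforms from introducing divergent control flow around the call.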
IsConvergent = true; + break; + case eBuiltinUnknown: { + // Assume convergence on unknown builtins. + IsConvergent = true; + // If we know that this is an OpenCL builtin, but we don't have any + special information about it, we can determine if it has side effects + or not by its return type and its parameters. This depends on being + able to identify all the "special" builtins, such as barriers and + fences. + bool HasSideEffects = false; + + // Void functions have side effects. + if (Callee.getReturnType() == Type::getVoidTy(Callee.getContext())) { + HasSideEffects = true; + } + // Functions that take pointers probably have side effects. + for (const auto &arg : Callee.args()) { + if (arg.getType()->isPointerTy()) { + HasSideEffects = true; + } + } + Properties |= HasSideEffects ? eBuiltinPropertySideEffects + : eBuiltinPropertyNoSideEffects; + } break; + case eCLBuiltinBarrier: + IsConvergent = true; + Properties |= eBuiltinPropertyExecutionFlow; + Properties |= eBuiltinPropertySideEffects; + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + case eCLBuiltinMemFence: + case eCLBuiltinReadMemFence: + case eCLBuiltinWriteMemFence: + Properties |= eBuiltinPropertySupportsInstantiation; + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + case eCLBuiltinPrintf: + Properties |= eBuiltinPropertySideEffects; + Properties |= eBuiltinPropertySupportsInstantiation; + break; + case eCLBuiltinAsyncWorkGroupCopy: + case eCLBuiltinAsyncWorkGroupStridedCopy: + case eCLBuiltinWaitGroupEvents: + case eCLBuiltinAsyncWorkGroupCopy2D2D: + case eCLBuiltinAsyncWorkGroupCopy3D3D: + // Our implementation of these builtins uses thread checks against + // specific work-item IDs, so they are convergent. + IsConvergent = true; + Properties |= eBuiltinPropertyNoSideEffects; + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + case eCLBuiltinAtomicAdd: + case eCLBuiltinAtomicSub: + case eCLBuiltinAtomicXchg: + case eCLBuiltinAtomicInc: + case eCLBuiltinAtomicDec: + case eCLBuiltinAtomicCmpxchg: + case eCLBuiltinAtomicMin: + case eCLBuiltinAtomicMax: + case eCLBuiltinAtomicAnd: + case eCLBuiltinAtomicOr: + case eCLBuiltinAtomicXor: + Properties |= eBuiltinPropertySideEffects; + Properties |= eBuiltinPropertySupportsInstantiation; + Properties |= eBuiltinPropertyAtomic; + break; + case eCLBuiltinGetWorkDim: + case eCLBuiltinGetGroupId: + case eCLBuiltinGetGlobalSize: + case eCLBuiltinGetGlobalOffset: + case eCLBuiltinGetNumGroups: + case eCLBuiltinGetGlobalId: + case eCLBuiltinGetLocalSize: + case eCLBuiltinGetEnqueuedLocalSize: + case eCLBuiltinGetLocalLinearId: + case eCLBuiltinGetGlobalLinearId: + case eCLBuiltinGetSubgroupLocalId: + Properties |= eBuiltinPropertyWorkItem; + Properties |= eBuiltinPropertyRematerializable; + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + case eCLBuiltinGetLocalId: + Properties |= eBuiltinPropertyWorkItem; + Properties |= eBuiltinPropertyLocalID; + Properties |= eBuiltinPropertyRematerializable; + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + case eCLBuiltinDot: + case eCLBuiltinCross: + case eCLBuiltinFastDistance: + case eCLBuiltinFastLength: + case eCLBuiltinFastNormalize: + Properties |= eBuiltinPropertyReduction; + Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinDistance: + case eCLBuiltinLength: + case eCLBuiltinNormalize: + Properties |= eBuiltinPropertyReduction; + Properties |= eBuiltinPropertyNoVectorEquivalent; + // XXX The inline 
implementation seems to have precision issues. The dot + // product can overflow to +inf which results in the wrong result. + // See redmine #6427 and #9115 + // Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinIsEqual: + case eCLBuiltinIsNotEqual: + case eCLBuiltinIsGreater: + case eCLBuiltinIsGreaterEqual: + case eCLBuiltinIsLess: + case eCLBuiltinIsLessEqual: + case eCLBuiltinIsLessGreater: + case eCLBuiltinIsOrdered: + case eCLBuiltinIsUnordered: + case eCLBuiltinIsFinite: + case eCLBuiltinIsInf: + case eCLBuiltinIsNan: + case eCLBuiltinIsNormal: + case eCLBuiltinSignBit: + // Scalar variants return '0' or '1', vector variants '0' or '111...1'. + Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertyCanEmitInline; + Properties |= eBuiltinPropertySupportsInstantiation; + break; + case eCLBuiltinAny: + case eCLBuiltinAll: + Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinFract: + case eCLBuiltinModF: + case eCLBuiltinSinCos: + Properties |= eBuiltinPropertyPointerReturnEqualRetTy; + break; + case eCLBuiltinFrexp: + case eCLBuiltinLGammaR: + case eCLBuiltinRemquo: + Properties |= eBuiltinPropertyPointerReturnEqualIntRetTy; + break; + case eCLBuiltinShuffle: + case eCLBuiltinShuffle2: + // While there are vector equivalents for these builtins, they require a + // modified mask, so we cannot use them by simply packetizing their + // arguments. + Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinFMax: + case eCLBuiltinFMin: + case eCLBuiltinAddSat: + case eCLBuiltinSubSat: + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinConvertChar: + case eCLBuiltinConvertShort: + case eCLBuiltinConvertInt: + case eCLBuiltinConvertLong: + case eCLBuiltinConvertUChar: + case eCLBuiltinConvertUShort: + case eCLBuiltinConvertUInt: + case eCLBuiltinConvertULong: + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinVLoad: + case eCLBuiltinVLoadHalf: + Properties |= eBuiltinPropertyNoSideEffects; + Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinVStore: + case eCLBuiltinVStoreHalf: + Properties |= eBuiltinPropertySideEffects; + Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinSelect: + case eCLBuiltinAs: + // Some of these builtins do have vector equivalents, but since we can + // emit all variants inline, we mark them as having none for simplicity. 
+ Properties |= eBuiltinPropertyNoVectorEquivalent; + Properties |= eBuiltinPropertyCanEmitInline; + break; + case eCLBuiltinWorkGroupBarrier: + case eCLBuiltinSubGroupBarrier: + IsConvergent = true; + LLVM_FALLTHROUGH; + case eCLBuiltinAtomicWorkItemFence: + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + case eCLBuiltinGetSubgroupSize: + case eCLBuiltinGetMaxSubgroupSize: + case eCLBuiltinGetNumSubgroups: + case eCLBuiltinGetEnqueuedNumSubgroups: + case eCLBuiltinGetSubgroupId: + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + // Subgroup collectives + case eCLBuiltinSubgroupAll: + case eCLBuiltinSubgroupAny: + case eCLBuiltinSubgroupBroadcast: + case eCLBuiltinSubgroupReduceAdd: + case eCLBuiltinSubgroupReduceMin: + case eCLBuiltinSubgroupReduceMax: + case eCLBuiltinSubgroupScanAddInclusive: + case eCLBuiltinSubgroupScanAddExclusive: + case eCLBuiltinSubgroupScanMinInclusive: + case eCLBuiltinSubgroupScanMinExclusive: + case eCLBuiltinSubgroupScanMaxInclusive: + case eCLBuiltinSubgroupScanMaxExclusive: + case eCLBuiltinSubgroupReduceMul: + case eCLBuiltinSubgroupReduceAnd: + case eCLBuiltinSubgroupReduceOr: + case eCLBuiltinSubgroupReduceXor: + case eCLBuiltinSubgroupReduceLogicalAnd: + case eCLBuiltinSubgroupReduceLogicalOr: + case eCLBuiltinSubgroupReduceLogicalXor: + case eCLBuiltinSubgroupScanMulInclusive: + case eCLBuiltinSubgroupScanMulExclusive: + case eCLBuiltinSubgroupScanAndInclusive: + case eCLBuiltinSubgroupScanAndExclusive: + case eCLBuiltinSubgroupScanOrInclusive: + case eCLBuiltinSubgroupScanOrExclusive: + case eCLBuiltinSubgroupScanXorInclusive: + case eCLBuiltinSubgroupScanXorExclusive: + case eCLBuiltinSubgroupScanLogicalAndInclusive: + case eCLBuiltinSubgroupScanLogicalAndExclusive: + case eCLBuiltinSubgroupScanLogicalOrInclusive: + case eCLBuiltinSubgroupScanLogicalOrExclusive: + case eCLBuiltinSubgroupScanLogicalXorInclusive: + case eCLBuiltinSubgroupScanLogicalXorExclusive: + // Work-group collectives + case eCLBuiltinWorkgroupAll: + case eCLBuiltinWorkgroupAny: + case eCLBuiltinWorkgroupBroadcast: + case eCLBuiltinWorkgroupReduceAdd: + case eCLBuiltinWorkgroupReduceMin: + case eCLBuiltinWorkgroupReduceMax: + case eCLBuiltinWorkgroupScanAddInclusive: + case eCLBuiltinWorkgroupScanAddExclusive: + case eCLBuiltinWorkgroupScanMinInclusive: + case eCLBuiltinWorkgroupScanMinExclusive: + case eCLBuiltinWorkgroupScanMaxInclusive: + case eCLBuiltinWorkgroupScanMaxExclusive: + case eCLBuiltinWorkgroupReduceMul: + case eCLBuiltinWorkgroupReduceAnd: + case eCLBuiltinWorkgroupReduceOr: + case eCLBuiltinWorkgroupReduceXor: + case eCLBuiltinWorkgroupReduceLogicalAnd: + case eCLBuiltinWorkgroupReduceLogicalOr: + case eCLBuiltinWorkgroupReduceLogicalXor: + case eCLBuiltinWorkgroupScanMulInclusive: + case eCLBuiltinWorkgroupScanMulExclusive: + case eCLBuiltinWorkgroupScanAndInclusive: + case eCLBuiltinWorkgroupScanAndExclusive: + case eCLBuiltinWorkgroupScanOrInclusive: + case eCLBuiltinWorkgroupScanOrExclusive: + case eCLBuiltinWorkgroupScanXorInclusive: + case eCLBuiltinWorkgroupScanXorExclusive: + case eCLBuiltinWorkgroupScanLogicalAndInclusive: + case eCLBuiltinWorkgroupScanLogicalAndExclusive: + case eCLBuiltinWorkgroupScanLogicalOrInclusive: + case eCLBuiltinWorkgroupScanLogicalOrExclusive: + case eCLBuiltinWorkgroupScanLogicalXorInclusive: + case eCLBuiltinWorkgroupScanLogicalXorExclusive: + IsConvergent = true; + Properties |= eBuiltinPropertyLowerToMuxBuiltin; + break; + } + + if (!IsConvergent) { + Properties |= eBuiltinPropertyKnownNonConvergent; 
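+  // Illustrative outcomes of the classification above (derived from this
+  // switch, not an exhaustive list):
+  //   barrier        -> convergent, ExecutionFlow | SideEffects | LowerToMuxBuiltin
+  //   fmax           -> KnownNonConvergent, CanEmitInline
+  //   work_group_all -> convergent, LowerToMuxBuiltin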
+ } + + return Builtin{Callee, *ID, (BuiltinProperties)Properties}; +} + +Function *CLBuiltinInfo::getVectorEquivalent(const Builtin &B, unsigned Width, + Module *M) { + // Analyze the builtin. Some functions have no vector equivalent. + const auto Props = B.properties; + if (Props & eBuiltinPropertyNoVectorEquivalent) { + return nullptr; + } + + // Builtin functions have mangled names. If it's not mangled, there will be + // no vector equivalent. + NameMangler Mangler(&B.function.getContext()); + SmallVector BuiltinArgTypes, BuiltinPointeeTypes; + SmallVector BuiltinArgQuals; + const StringRef BuiltinName = + Mangler.demangleName(B.function.getName(), BuiltinArgTypes, + BuiltinPointeeTypes, BuiltinArgQuals); + if (BuiltinName.empty()) { + return nullptr; + } + + // Determine the mangled name of the vector equivalent. + // This means creating a list of qualified types for the arguments. + SmallVector VectorTypes; + SmallVector VectorQuals; + for (unsigned i = 0; i < BuiltinArgTypes.size(); i++) { + Type *OldTy = BuiltinArgTypes[i]; + const TypeQualifiers OldQuals = BuiltinArgQuals[i]; + if (isa(OldTy)) { + return nullptr; + } + PointerType *OldPtrTy = dyn_cast(OldTy); + if (OldPtrTy) { + if (auto *const PtrRetPointeeTy = + getPointerReturnPointeeTy(B.function, Props)) { + [[maybe_unused]] auto *OldPointeeTy = BuiltinPointeeTypes[i]; + assert(OldPointeeTy && OldPointeeTy == PtrRetPointeeTy && + "Demangling inconsistency"); + if (!FixedVectorType::isValidElementType(PtrRetPointeeTy)) { + return nullptr; + } + Type *NewType = OldPtrTy; + TypeQualifiers NewQuals; + TypeQualifiers EleQuals = OldQuals; + NewQuals.push_back(EleQuals.pop_front()); // Pointer qualifier + NewQuals.push_back(eTypeQualNone); // Vector qualifier + NewQuals.push_back(EleQuals); + + VectorTypes.push_back(NewType); + VectorQuals.push_back(NewQuals); + + continue; + } + } + + if (!FixedVectorType::isValidElementType(OldTy)) { + return nullptr; + } + TypeQualifiers NewQuals; + Type *NewType = FixedVectorType::get(OldTy, Width); + NewQuals.push_back(eTypeQualNone); // Vector qualifier + NewQuals.push_back(OldQuals); // Element qualifier + + VectorTypes.push_back(NewType); + VectorQuals.push_back(NewQuals); + } + + // Handle special builtin naming equivalents. + std::string EquivNameBase = BuiltinName.str(); + StringRef FirstChunk; + Lexer L(BuiltinName); + if (L.ConsumeUntil('_', FirstChunk)) { + const bool AsBuiltin = FirstChunk == "as"; + const bool ConvertBuiltin = FirstChunk == "convert"; + if (!L.Consume("_")) { + return nullptr; + } + StringRef SecondChunkNoWidth; + if (!L.ConsumeAlpha(SecondChunkNoWidth)) { + return nullptr; + } + if (AsBuiltin || ConvertBuiltin) { + // as_* and convert_* builtins have vector equivalents, with a vector + // width suffix. Add the width suffix to the scalar builtin name. + if (AsBuiltin && L.Left()) { + return nullptr; + } + const Twine WidthText(Width); + EquivNameBase.insert(L.CurrentPos(), WidthText.str()); + } + } + + const std::string EquivName = + Mangler.mangleName(EquivNameBase, VectorTypes, VectorQuals); + + // Lookup the vector equivalent and make sure the return type agrees. 
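+  // For example, with Width == 4 the scalar fmax(float, float) is looked up
+  // as fmax(float4, float4), while convert_int(float) additionally gains a
+  // width suffix and becomes convert_int4(float4) before re-mangling.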
+ Function *VectorBuiltin = materializeBuiltin(EquivName, M); + if (VectorBuiltin) { + Type *RetTy = B.function.getReturnType(); + auto *VecRetTy = dyn_cast(VectorBuiltin->getReturnType()); + if (!VecRetTy || (VecRetTy->getElementType() != RetTy) || + (VecRetTy->getNumElements() != Width)) { + VectorBuiltin = nullptr; + } + } + return VectorBuiltin; +} + +Function *CLBuiltinInfo::getScalarEquivalent(const Builtin &B, Module *M) { + // Analyze the builtin. Some functions have no scalar equivalent. + const auto Props = B.properties; + if (Props & eBuiltinPropertyNoVectorEquivalent) { + return nullptr; + } + + // Check the return type. + auto *VecRetTy = dyn_cast(B.function.getReturnType()); + if (!VecRetTy) { + return nullptr; + } + + // Builtin functions have mangled names. If it's not mangled, there will be + // no scalar equivalent. + NameMangler Mangler(&B.function.getContext()); + SmallVector BuiltinArgTypes, BuiltinPointeeTypes; + SmallVector BuiltinArgQuals; + const StringRef BuiltinName = + Mangler.demangleName(B.function.getName(), BuiltinArgTypes, + BuiltinPointeeTypes, BuiltinArgQuals); + if (BuiltinName.empty()) { + return nullptr; + } + + // Determine the mangled name of the scalar equivalent. + // This means creating a list of qualified types for the arguments. + const unsigned Width = VecRetTy->getNumElements(); + SmallVector ScalarTypes; + SmallVector ScalarQuals; + for (unsigned i = 0; i < BuiltinArgTypes.size(); i++) { + Type *OldTy = BuiltinArgTypes[i]; + const TypeQualifiers OldQuals = BuiltinArgQuals[i]; + if (auto *OldVecTy = dyn_cast(OldTy)) { + if (OldVecTy->getNumElements() != Width) { + return nullptr; + } + Type *NewTy = OldVecTy->getElementType(); + TypeQualifiers NewQuals = OldQuals; + NewQuals.pop_front(); + + ScalarTypes.push_back(NewTy); + ScalarQuals.push_back(NewQuals); + } else if (PointerType *OldPtrTy = dyn_cast(OldTy)) { + Type *const PtrRetPointeeTy = + getPointerReturnPointeeTy(B.function, Props); + if (PtrRetPointeeTy && PtrRetPointeeTy->isVectorTy()) { + [[maybe_unused]] auto *OldPointeeTy = BuiltinPointeeTypes[i]; + assert(OldPointeeTy && OldPointeeTy == PtrRetPointeeTy && + "Demangling inconsistency"); + Type *NewTy = OldPtrTy; + TypeQualifiers NewQuals = OldQuals; + const TypeQualifier PtrQual = NewQuals.pop_front(); + const TypeQualifier VecQual = NewQuals.pop_front(); + (void)VecQual; + const TypeQualifier EleQual = NewQuals.pop_front(); + NewQuals.push_back(PtrQual); + NewQuals.push_back(EleQual); + ScalarTypes.push_back(NewTy); + ScalarQuals.push_back(NewQuals); + } else { + ScalarTypes.push_back(OldTy); + ScalarQuals.push_back(OldQuals); + } + } else { + if (!OldTy) { + return nullptr; + } + ScalarTypes.push_back(OldTy); + ScalarQuals.push_back(OldQuals); + } + } + + // Handle special builtin naming equivalents. + std::string EquivNameBase = BuiltinName.str(); + StringRef FirstChunk; + Lexer L(BuiltinName); + if (L.ConsumeUntil('_', FirstChunk)) { + const bool AsBuiltin = FirstChunk == "as"; + const bool ConvertBuiltin = FirstChunk == "convert"; + if (!L.Consume("_")) { + return nullptr; + } + StringRef SecondChunkNoWidth; + if (!L.ConsumeAlpha(SecondChunkNoWidth)) { + return nullptr; + } + if (AsBuiltin || ConvertBuiltin) { + // as_* and convert_* builtins have scalar equivalents, with no width + // suffix. Remove the width suffix from the vector builtin name. 
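+      // For example, as_int4(float4) maps back to as_int(float) and
+      // convert_float4(int4) to convert_float(int); other builtins such as
+      // fmax(float4, float4) keep their base name and only have their
+      // operand types scalarized.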
+ const unsigned WidthStart = L.CurrentPos(); + unsigned Width = 0; + if (!L.ConsumeInteger(Width)) { + return nullptr; + } + const unsigned WidthEnd = L.CurrentPos(); + EquivNameBase.erase(WidthStart, WidthEnd - WidthStart); + } + } + + const std::string EquivName = + Mangler.mangleName(EquivNameBase, ScalarTypes, ScalarQuals); + + // Look up the scalar equivalent and make sure the return type agrees. + Function *ScalarBuiltin = materializeBuiltin(EquivName, M); + if (!ScalarBuiltin) { + return nullptr; + } + Type *RetTy = ScalarBuiltin->getReturnType(); + if (VecRetTy->getElementType() != RetTy) { + return nullptr; + } + return ScalarBuiltin; +} + +/// @brief Returns whether the parameter corresponding to the given index of +/// the (assumed builtin) Function is known to possess the given qualifier. +/// @return true if the parameter is known to have the qualifier, false if not, +/// and None on error. +static std::optional<bool> +paramHasTypeQual(const Function &F, unsigned ParamIdx, TypeQualifier Q) { + // Demangle the function name to get the type qualifiers. + SmallVector Types; + SmallVector Quals; + NameMangler Mangler(&F.getContext()); + if (Mangler.demangleName(F.getName(), Types, Quals).empty()) { + return std::nullopt; + } + + if (ParamIdx >= Quals.size()) { + return std::nullopt; + } + + auto &Qual = Quals[ParamIdx]; + while (Qual.getCount()) { + if (Qual.pop_front() == Q) { + return true; + } + } + return false; +} + +Value *CLBuiltinInfo::emitBuiltinInline(Function *F, IRBuilder<> &B, + ArrayRef<Value *> Args) { + if (!F) { + return nullptr; + } + + // Handle 'common' builtins. + const auto BuiltinID = identifyBuiltin(*F); + if (BuiltinID && *BuiltinID != eBuiltinUnknown) { + // Note we have to handle these specially since we need to deduce whether + // the source operand is signed or not. It is not possible to do this based + // solely on the BuiltinID. + switch (*BuiltinID) { + // 6.2.3 Explicit Conversions + case eCLBuiltinConvertChar: + case eCLBuiltinConvertShort: + case eCLBuiltinConvertInt: + case eCLBuiltinConvertLong: + case eCLBuiltinConvertUChar: + case eCLBuiltinConvertUShort: + case eCLBuiltinConvertUInt: + case eCLBuiltinConvertULong: + return emitBuiltinInlineConvert(F, *BuiltinID, B, Args); + // 6.12.3 Integer Functions + case eCLBuiltinAddSat: + case eCLBuiltinSubSat: { + std::optional<bool> IsParamSignedOrNone = + paramHasTypeQual(*F, 0, eTypeQualSignedInt); + if (!IsParamSignedOrNone.has_value()) { + return nullptr; + } + const bool IsSigned = *IsParamSignedOrNone; + const Intrinsic::ID IntrinsicOpc = [=] { + if (BuiltinID == eCLBuiltinSubSat) { + return IsSigned ? Intrinsic::ssub_sat : Intrinsic::usub_sat; + } else { + return IsSigned ? 
Intrinsic::sadd_sat : Intrinsic::uadd_sat; + } + }(); + return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1], + IntrinsicOpc); + } + case eCLBuiltinVLoad: { + NameMangler Mangler(&F->getContext()); + Lexer L(Mangler.demangleName(F->getName())); + if (L.Consume("vload")) { + unsigned Width = 0; + if (L.ConsumeInteger(Width)) { + return emitBuiltinInlineVLoad(F, Width, B, Args); + } + } + } break; + case eCLBuiltinVLoadHalf: { + NameMangler Mangler(&F->getContext()); + const auto name = Mangler.demangleName(F->getName()); + if (name == "vload_half") { + // TODO handle "vload_halfn" + return emitBuiltinInlineVLoadHalf(F, B, Args); + } + } break; + case eCLBuiltinVStore: { + NameMangler Mangler(&F->getContext()); + Lexer L(Mangler.demangleName(F->getName())); + if (L.Consume("vstore")) { + unsigned Width = 0; + if (L.ConsumeInteger(Width)) { + return emitBuiltinInlineVStore(F, Width, B, Args); + } + } + } break; + case eCLBuiltinVStoreHalf: { + NameMangler Mangler(&F->getContext()); + Lexer L(Mangler.demangleName(F->getName())); + if (L.Consume("vstore_half")) { + // TODO handle "vstore_halfn" + return emitBuiltinInlineVStoreHalf(F, L.TextLeft(), B, Args); + } + } break; + case eCLBuiltinSelect: + return emitBuiltinInlineSelect(F, B, Args); + case eCLBuiltinAs: + return emitBuiltinInlineAs(F, B, Args); + default: + break; + } + return emitBuiltinInline(*BuiltinID, B, Args); + } + + return nullptr; +} + +Value *CLBuiltinInfo::emitBuiltinInline(BuiltinID BuiltinID, IRBuilder<> &B, + ArrayRef Args) { + switch (BuiltinID) { + default: + return nullptr; + + case eCLBuiltinDot: + case eCLBuiltinCross: + case eCLBuiltinLength: + case eCLBuiltinDistance: + case eCLBuiltinNormalize: + case eCLBuiltinFastLength: + case eCLBuiltinFastDistance: + case eCLBuiltinFastNormalize: + return emitBuiltinInlineGeometrics(BuiltinID, B, Args); + // 6.12.2 Math Functions + case eCLBuiltinFMax: + return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1], + llvm::Intrinsic::maxnum); + case eCLBuiltinFMin: + return emitBuiltinInlineAsLLVMBinaryIntrinsic(B, Args[0], Args[1], + llvm::Intrinsic::minnum); + // 6.12.6 Relational Functions + case eCLBuiltinAll: + return emitBuiltinInlineAll(B, Args); + case eCLBuiltinAny: + return emitBuiltinInlineAny(B, Args); + case eCLBuiltinIsEqual: + case eCLBuiltinIsNotEqual: + case eCLBuiltinIsGreater: + case eCLBuiltinIsGreaterEqual: + case eCLBuiltinIsLess: + case eCLBuiltinIsLessEqual: + case eCLBuiltinIsLessGreater: + case eCLBuiltinIsOrdered: + case eCLBuiltinIsUnordered: + return emitBuiltinInlineRelationalsWithTwoArguments(BuiltinID, B, Args); + case eCLBuiltinIsFinite: + case eCLBuiltinIsInf: + case eCLBuiltinIsNan: + case eCLBuiltinIsNormal: + case eCLBuiltinSignBit: + assert(Args.size() == 1 && "Invalid number of arguments"); + return emitBuiltinInlineRelationalsWithOneArgument(BuiltinID, B, Args[0]); + // 6.12.12 Miscellaneous Vector Functions + case eCLBuiltinShuffle: + case eCLBuiltinShuffle2: + return emitBuiltinInlineShuffle(BuiltinID, B, Args); + + case eCLBuiltinPrintf: + return emitBuiltinInlinePrintf(BuiltinID, B, Args); + } +} + +Value *CLBuiltinInfo::emitBuiltinInlineGeometrics(BuiltinID BuiltinID, + IRBuilder<> &B, + ArrayRef Args) { + Value *Src = nullptr; + switch (BuiltinID) { + default: + return nullptr; + case eCLBuiltinDot: + return emitBuiltinInlineDot(B, Args); + case eCLBuiltinCross: + return emitBuiltinInlineCross(B, Args); + case eCLBuiltinLength: + case eCLBuiltinFastLength: + return emitBuiltinInlineLength(B, Args); + case 
eCLBuiltinDistance: + case eCLBuiltinFastDistance: + if (Args.size() != 2) { + return nullptr; + } + Src = B.CreateFSub(Args[0], Args[1], "distance"); + return emitBuiltinInlineLength(B, ArrayRef(&Src, 1)); + case eCLBuiltinNormalize: + case eCLBuiltinFastNormalize: + return emitBuiltinInlineNormalize(B, Args); + } +} + +Value *CLBuiltinInfo::emitBuiltinInlineDot(IRBuilder<> &B, + ArrayRef Args) { + if (Args.size() != 2) { + return nullptr; + } + Value *Src0 = Args[0]; + Value *Src1 = Args[1]; + auto *SrcVecTy = dyn_cast(Src0->getType()); + if (SrcVecTy) { + Value *LHS0 = B.CreateExtractElement(Src0, B.getInt32(0), "lhs"); + Value *RHS0 = B.CreateExtractElement(Src1, B.getInt32(0), "rhs"); + Value *Sum = B.CreateFMul(LHS0, RHS0, "dot"); + for (unsigned i = 1; i < SrcVecTy->getNumElements(); i++) { + Value *LHS = B.CreateExtractElement(Src0, B.getInt32(i), "lhs"); + Value *RHS = B.CreateExtractElement(Src1, B.getInt32(i), "rhs"); + Sum = B.CreateFAdd(Sum, B.CreateFMul(LHS, RHS, "dot"), "dot"); + } + return Sum; + } else { + return B.CreateFMul(Src0, Src1, "dot"); + } +} + +Value *CLBuiltinInfo::emitBuiltinInlineCross(IRBuilder<> &B, + ArrayRef Args) { + if (Args.size() != 2) { + return nullptr; + } + Value *Src0 = Args[0]; + Value *Src1 = Args[1]; + auto *RetTy = dyn_cast(Src0->getType()); + if (!RetTy) { + return nullptr; + } + const int SrcIndices[] = {1, 2, 2, 0, 0, 1}; + SmallVector Src0Lanes; + SmallVector Src1Lanes; + for (unsigned i = 0; i < 3; i++) { + Src0Lanes.push_back(B.CreateExtractElement(Src0, B.getInt32(i))); + Src1Lanes.push_back(B.CreateExtractElement(Src1, B.getInt32(i))); + } + + Value *Result = PoisonValue::get(RetTy); + for (unsigned i = 0; i < 3; i++) { + const int Idx0 = SrcIndices[(i * 2) + 0]; + const int Idx1 = SrcIndices[(i * 2) + 1]; + Value *Src0A = Src0Lanes[Idx0]; + Value *Src1A = Src1Lanes[Idx1]; + Value *TempA = B.CreateFMul(Src0A, Src1A); + Value *Src0B = Src0Lanes[Idx1]; + Value *Src1B = Src1Lanes[Idx0]; + Value *TempB = B.CreateFMul(Src0B, Src1B); + Value *Lane = B.CreateFSub(TempA, TempB); + Result = B.CreateInsertElement(Result, Lane, B.getInt32(i)); + } + if (RetTy->getNumElements() == 4) { + Type *EleTy = RetTy->getElementType(); + Result = B.CreateInsertElement(Result, Constant::getNullValue(EleTy), + B.getInt32(3)); + } + return Result; +} + +Value *CLBuiltinInfo::emitBuiltinInlineLength(IRBuilder<> &B, + ArrayRef Args) { + if (Args.size() != 1) { + return nullptr; + } + Value *Src0 = Args[0]; + Value *Src1 = Src0; + + NameMangler Mangler(&B.getContext()); + Type *SrcType = Src0->getType(); + auto *SrcVecType = dyn_cast(SrcType); + if (SrcVecType) { + SrcType = SrcVecType->getElementType(); + } + + TypeQualifiers SrcQuals; + SmallVector Tys; + SmallVector Quals; + SrcQuals.push_back(eTypeQualNone); + + // Materialize 'sqrt', 'fabs' and 'isinf'. + Tys.push_back(SrcType); + Quals.push_back(SrcQuals); + BasicBlock *BB = B.GetInsertBlock(); + if (!BB) { + return nullptr; + } + Function *F = BB->getParent(); + if (!F) { + return nullptr; + } + Module *M = F->getParent(); + if (!M) { + return nullptr; + } + + const std::string FabsName = Mangler.mangleName("fabs", Tys, Quals); + Function *Fabs = materializeBuiltin(FabsName, M); + if (!Fabs) { + return nullptr; + } + if (!SrcVecType) { + // The "length" of a scalar is just the absolute value. 
+ return CreateBuiltinCall(B, Fabs, Src0, "scalar_length"); + } + + const std::string SqrtName = Mangler.mangleName("sqrt", Tys, Quals); + Function *Sqrt = materializeBuiltin(SqrtName, M); + if (!Sqrt) { + return nullptr; + } + + const std::string IsInfName = Mangler.mangleName("isinf", Tys, Quals); + Function *IsInf = materializeBuiltin(IsInfName, M); + if (!IsInf) { + return nullptr; + } + Tys.clear(); + Quals.clear(); + + // Materialize 'fmax'. + Tys.push_back(SrcType); + Quals.push_back(SrcQuals); + Tys.push_back(SrcType); + Quals.push_back(SrcQuals); + const std::string FmaxName = Mangler.mangleName("fmax", Tys, Quals); + Function *Fmax = materializeBuiltin(FmaxName, M); + if (!Fmax) { + return nullptr; + } + + // Emit length or distance inline. + SmallVector Ops; + Ops.push_back(Src0); + Ops.push_back(Src1); + Value *Result = emitBuiltinInline(eCLBuiltinDot, B, Ops); + Result = CreateBuiltinCall(B, Sqrt, Result, "result"); + + // Handle the case where the result is infinite. + Value *AltResult = ConstantFP::get(SrcType, 0.0); + if (SrcVecType) { + for (unsigned i = 0; i < SrcVecType->getNumElements(); i++) { + Value *SrcLane = B.CreateExtractElement(Src0, B.getInt32(i), "src_lane"); + SrcLane = CreateBuiltinCall(B, Fabs, SrcLane, "src_lane"); + AltResult = + CreateBuiltinCall(B, Fmax, {SrcLane, AltResult}, "alt_result"); + } + } else { + Value *SrcLane = CreateBuiltinCall(B, Fabs, Src0, "src_lane"); + AltResult = CreateBuiltinCall(B, Fmax, {SrcLane, AltResult}, "alt_result"); + } + Value *Cond = CreateBuiltinCall(B, IsInf, Result, "cond"); + Cond = B.CreateICmpEQ(Cond, B.getInt32(0), "cmp"); + Result = B.CreateSelect(Cond, Result, AltResult, "final_result"); + return Result; +} + +Value *CLBuiltinInfo::emitBuiltinInlineNormalize(IRBuilder<> &B, + ArrayRef Args) { + if (Args.size() != 1) { + return nullptr; + } + + Value *Src0 = Args[0]; + + NameMangler Mangler(&B.getContext()); + Type *SrcType = Src0->getType(); + auto *SrcVecType = dyn_cast(SrcType); + if (SrcVecType) { + SrcType = SrcVecType->getElementType(); + } + + TypeQualifiers SrcQuals; + SmallVector Tys; + SmallVector Quals; + SrcQuals.push_back(eTypeQualNone); + + // Materialize 'rsqrt'. + Tys.push_back(SrcType); + Quals.push_back(SrcQuals); + BasicBlock *BB = B.GetInsertBlock(); + if (!BB) { + return nullptr; + } + Function *F = BB->getParent(); + if (!F) { + return nullptr; + } + Module *M = F->getParent(); + if (!M) { + return nullptr; + } + + if (!SrcVecType) { + // A normalized scalar is either 1.0 or -1.0, unless the input was NaN, or + // in other words, just the sign. + const std::string SignName = Mangler.mangleName("sign", Tys, Quals); + Function *Sign = materializeBuiltin(SignName, M); + if (!Sign) { + return nullptr; + } + return CreateBuiltinCall(B, Sign, Src0, "scalar_normalize"); + } + + const std::string RSqrtName = Mangler.mangleName("rsqrt", Tys, Quals); + Function *RSqrt = materializeBuiltin(RSqrtName, M); + if (!RSqrt) { + return nullptr; + } + + // Call 'dot' on the input. 
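+ // normalize(v) is computed as v * rsqrt(dot(v, v)); for example, + // normalize((float2)(3.0f, 4.0f)) yields (0.6f, 0.8f).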
+ SmallVector DotArgs; + DotArgs.push_back(Src0); + DotArgs.push_back(Src0); + Value *Result = emitBuiltinInlineDot(B, DotArgs); + Result = CreateBuiltinCall(B, RSqrt, Result, "normalize"); + if (SrcVecType) { + Result = B.CreateVectorSplat(SrcVecType->getNumElements(), Result); + } + Result = B.CreateFMul(Result, Src0, "normalized"); + return Result; +} + +static Value *emitAllAnyReduction(IRBuilder<> &B, ArrayRef Args, + Instruction::BinaryOps ReduceOp) { + if (Args.size() != 1) { + return nullptr; + } + Value *Arg0 = Args[0]; + IntegerType *EleTy = dyn_cast(Arg0->getType()->getScalarType()); + if (!EleTy) { + return nullptr; + } + + // Reduce the MSB of all vector lanes. + Value *ReducedVal = nullptr; + auto *VecTy = dyn_cast(Arg0->getType()); + if (VecTy) { + ReducedVal = B.CreateExtractElement(Arg0, B.getInt32(0)); + for (unsigned i = 1; i < VecTy->getNumElements(); i++) { + Value *Lane = B.CreateExtractElement(Arg0, B.getInt32(i)); + ReducedVal = B.CreateBinOp(ReduceOp, ReducedVal, Lane); + } + } else { + ReducedVal = Arg0; + } + + // Shift the MSB to return either 0 or 1. + const unsigned ShiftAmount = EleTy->getPrimitiveSizeInBits() - 1; + Value *ShiftAmountVal = ConstantInt::get(EleTy, ShiftAmount); + Value *Result = B.CreateLShr(ReducedVal, ShiftAmountVal); + return B.CreateZExtOrTrunc(Result, B.getInt32Ty()); +} + +Value *CLBuiltinInfo::emitBuiltinInlineAll(IRBuilder<> &B, + ArrayRef Args) { + return emitAllAnyReduction(B, Args, Instruction::And); +} + +Value *CLBuiltinInfo::emitBuiltinInlineAny(IRBuilder<> &B, + ArrayRef Args) { + return emitAllAnyReduction(B, Args, Instruction::Or); +} + +Value *CLBuiltinInfo::emitBuiltinInlineSelect(Function *F, IRBuilder<> &B, + ArrayRef Args) { + if (F->arg_size() != 3) { + return nullptr; + } + Value *FalseVal = Args[0]; + Value *TrueVal = Args[1]; + Value *Cond = Args[2]; + Type *RetTy = F->getReturnType(); + auto *VecRetTy = dyn_cast(RetTy); + Type *CondEleTy = Cond->getType()->getScalarType(); + const unsigned CondEleBits = CondEleTy->getPrimitiveSizeInBits(); + if (VecRetTy) { + const unsigned SimdWidth = VecRetTy->getNumElements(); + Constant *ShiftAmount = ConstantInt::get(CondEleTy, CondEleBits - 1); + Constant *VecShiftAmount = ConstantVector::getSplat( + ElementCount::getFixed(SimdWidth), ShiftAmount); + Value *Mask = B.CreateAShr(Cond, VecShiftAmount); + Value *TrueValRaw = TrueVal; + Value *FalseValRaw = FalseVal; + if (VecRetTy->getElementType()->isFloatingPointTy()) { + auto *RawType = FixedVectorType::getInteger(VecRetTy); + TrueValRaw = B.CreateBitCast(TrueVal, RawType); + FalseValRaw = B.CreateBitCast(FalseVal, RawType); + } + Value *Result = B.CreateXor(TrueValRaw, FalseValRaw); + Result = B.CreateAnd(Result, Mask); + Result = B.CreateXor(Result, FalseValRaw); + if (Result->getType() != VecRetTy) { + Result = B.CreateBitCast(Result, VecRetTy); + } + return Result; + } else { + Value *Cmp = B.CreateICmpNE(Cond, Constant::getNullValue(CondEleTy)); + return B.CreateSelect(Cmp, TrueVal, FalseVal); + } +} + +/// @brief Emit the body of a builtin function as a call to a binary LLVM +/// intrinsic. If one argument is a scalar type and the other a vector type, +/// the scalar argument is splatted to the vector type. +/// +/// @param[in] B Builder used to emit instructions. +/// @param[in] LHS first argument to be passed to the intrinsic. +/// @param[in] RHS second argument to be passed to the intrinsic. +/// @param[in] ID the LLVM intrinsic ID. +/// +/// @return Value returned by the builtin implementation or null on failure. 
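+/// +/// For example, fmax(float4, float) splats the scalar operand to a float4 and +/// emits a single call to the llvm.maxnum.v4f32 intrinsic.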
+Value *CLBuiltinInfo::emitBuiltinInlineAsLLVMBinaryIntrinsic( + IRBuilder<> &B, Value *LHS, Value *RHS, llvm::Intrinsic::ID ID) { + const Triple TT(B.GetInsertBlock()->getModule()->getTargetTriple()); + if (TT.getArch() == Triple::arm || TT.getArch() == Triple::aarch64) { + // fmin and fmax fail CTS on arm targets. + // This is a HACK and should be removed when it is resolved. + return nullptr; + } + + const auto *LHSTy = LHS->getType(); + const auto *RHSTy = RHS->getType(); + if (LHSTy->isVectorTy() != RHSTy->isVectorTy()) { + auto VectorEC = + multi_llvm::getVectorElementCount(LHSTy->isVectorTy() ? LHSTy : RHSTy); + if (!LHS->getType()->isVectorTy()) { + LHS = B.CreateVectorSplat(VectorEC, LHS); + } + if (!RHS->getType()->isVectorTy()) { + RHS = B.CreateVectorSplat(VectorEC, RHS); + } + } + return B.CreateBinaryIntrinsic(ID, LHS, RHS); +} + +/// @brief Emit the body of the 'as_*' builtin function. +/// +/// @param[in] F Function to emit the body inline. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. +Value *CLBuiltinInfo::emitBuiltinInlineAs(Function *F, llvm::IRBuilder<> &B, + llvm::ArrayRef Args) { + if (Args.size() != 1) { + return nullptr; + } + Value *Src = Args[0]; + Type *SrcTy = Src->getType(); + Type *DstTy = F->getReturnType(); + auto *SrcVecTy = dyn_cast(SrcTy); + auto *DstVecTy = dyn_cast(DstTy); + Type *SrcEleTy = SrcVecTy ? SrcVecTy->getElementType() : nullptr; + Type *DstEleTy = DstVecTy ? DstVecTy->getElementType() : nullptr; + const unsigned SrcEleBits = SrcEleTy ? SrcEleTy->getPrimitiveSizeInBits() : 0; + const unsigned DstEleBits = DstEleTy ? DstEleTy->getPrimitiveSizeInBits() : 0; + const bool SrcDstHaveSameWidth = + SrcEleTy && DstEleTy && (SrcEleBits == DstEleBits); + const bool SrcVec3 = SrcVecTy && (SrcVecTy->getNumElements() == 3); + const bool SrcVec4 = SrcVecTy && (SrcVecTy->getNumElements() == 4); + const bool DstVec3 = DstVecTy && (DstVecTy->getNumElements() == 3); + const bool DstVec4 = DstVecTy && (DstVecTy->getNumElements() == 4); + bool LowerAsShuffle = false; + if (SrcVec3 && !DstVec3) { + if (!DstVec4 || !SrcDstHaveSameWidth) { + return nullptr; + } + LowerAsShuffle = true; + } else if (DstVec3 && !SrcVec3) { + if (!SrcVec4 || !SrcDstHaveSameWidth) { + return nullptr; + } + LowerAsShuffle = true; + } + + // Lower some vec3 variants of as_* using vector shuffles. + if (LowerAsShuffle) { + SmallVector Indices; + for (unsigned i = 0; i < DstVecTy->getNumElements(); i++) { + if (i < SrcVecTy->getNumElements()) { + Indices.push_back(B.getInt32(i)); + } else { + Indices.push_back(PoisonValue::get(B.getInt32Ty())); + } + } + Value *Mask = ConstantVector::get(Indices); + Src = B.CreateShuffleVector(Src, PoisonValue::get(SrcVecTy), Mask); + } + + // Common case: as_* is a simple bitcast. + return B.CreateBitCast(Src, DstTy, "as"); +} + +/// @brief Emit the body of the 'convert_*' builtin functions. +/// +/// @param[in] F the function to emit inline. +/// @param[in] builtinID Builtin ID of the function. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. 
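+/// +/// For example, convert_int(float x) lowers to 'fptosi float %x to i32' and +/// convert_uchar(int x) lowers to 'trunc i32 %x to i8'.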
+Value *CLBuiltinInfo::emitBuiltinInlineConvert(Function *F, BuiltinID builtinID, + IRBuilder<> &B, + ArrayRef Args) { + if (Args.size() != 1) { + return nullptr; + } + Type *DstTy = nullptr; + bool DstIsSigned = false; + auto &Ctx = B.getContext(); + switch (builtinID) { + case eCLBuiltinConvertChar: + DstIsSigned = true; + LLVM_FALLTHROUGH; + case eCLBuiltinConvertUChar: + DstTy = IntegerType::getInt8Ty(Ctx); + break; + case eCLBuiltinConvertShort: + DstIsSigned = true; + LLVM_FALLTHROUGH; + case eCLBuiltinConvertUShort: + DstTy = IntegerType::getInt16Ty(Ctx); + break; + case eCLBuiltinConvertInt: + DstIsSigned = true; + LLVM_FALLTHROUGH; + case eCLBuiltinConvertUInt: + DstTy = IntegerType::getInt32Ty(Ctx); + break; + case eCLBuiltinConvertLong: + DstIsSigned = true; + LLVM_FALLTHROUGH; + case eCLBuiltinConvertULong: + DstTy = IntegerType::getInt64Ty(Ctx); + break; + + default: + return nullptr; + } + if (!DstTy) { + return nullptr; + } + + Value *Src = Args[0]; + bool SrcIsSigned; + if (Src->getType()->isFloatingPointTy()) { + // All floating point types are signed + SrcIsSigned = true; + } else { + auto IsParamSignedOrNone = paramHasTypeQual(*F, 0, eTypeQualSignedInt); + if (!IsParamSignedOrNone) { + return nullptr; + } + SrcIsSigned = *IsParamSignedOrNone; + } + + auto Opcode = CastInst::getCastOpcode(Src, SrcIsSigned, DstTy, DstIsSigned); + return B.CreateCast(Opcode, Src, DstTy, "inline_convert"); +} + +/// @brief Emit the body of the 'vloadN' builtin function. +/// +/// @param[in] F Function to emit the body inline. +/// @param[in] Width Number of elements to load. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. +Value *CLBuiltinInfo::emitBuiltinInlineVLoad(Function *F, unsigned Width, + IRBuilder<> &B, + ArrayRef Args) { + if (Width < 2) { + return nullptr; + } + (void)F; + + Type *RetTy = F->getReturnType(); + assert(isa(RetTy) && "vloadN must return a vector type"); + Type *EltTy = RetTy->getScalarType(); + + Value *Ptr = Args[1]; + PointerType *PtrTy = dyn_cast(Ptr->getType()); + if (!PtrTy) { + return nullptr; + } + auto *DataTy = FixedVectorType::get(EltTy, Width); + Value *Data = PoisonValue::get(DataTy); + + // Emit the base pointer. + Value *Offset = Args[0]; + IntegerType *OffsetTy = dyn_cast(Offset->getType()); + if (!OffsetTy) { + return nullptr; + } + Value *Stride = ConstantInt::get(OffsetTy, Width); + Offset = B.CreateMul(Offset, Stride); + Value *GEPBase = B.CreateGEP(EltTy, Ptr, Offset, "vload_base"); + + if (Width == 3) { + for (unsigned i = 0; i < Width; i++) { + Value *Index = B.getInt32(i); + Value *GEP = B.CreateGEP(EltTy, GEPBase, Index); + Value *Lane = B.CreateLoad(EltTy, GEP, false, "vload"); + Data = B.CreateInsertElement(Data, Lane, Index, "vload_insert"); + } + } else { + auto *Load = B.CreateLoad(DataTy, GEPBase, false, "vload"); + + const unsigned Align = DataTy->getScalarSizeInBits() / 8; + Load->setAlignment(MaybeAlign(Align).valueOrOne()); + Data = Load; + } + + return Data; +} + +/// @brief Emit the body of the 'vstoreN' builtin function. +/// +/// @param[in] F Function to emit the body inline. +/// @param[in] Width Number of elements to store. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. 
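+/// +/// As with vloadN, the three-element case is emitted as per-element stores so +/// that no memory beyond the third element is touched; all other widths use a +/// single vector store.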
+Value *CLBuiltinInfo::emitBuiltinInlineVStore(Function *F, unsigned Width, + IRBuilder<> &B, + ArrayRef Args) { + if (Width < 2) { + return nullptr; + } + (void)F; + + Value *Data = Args[0]; + auto *VecDataTy = dyn_cast(Data->getType()); + if (!VecDataTy || (VecDataTy->getNumElements() != Width)) { + return nullptr; + } + + Value *Ptr = Args[2]; + PointerType *PtrTy = dyn_cast(Ptr->getType()); + if (!PtrTy) { + return nullptr; + } + + // Emit the base pointer. + Value *Offset = Args[1]; + IntegerType *OffsetTy = dyn_cast(Offset->getType()); + if (!OffsetTy) { + return nullptr; + } + Value *Stride = ConstantInt::get(OffsetTy, Width); + Offset = B.CreateMul(Offset, Stride); + Value *GEPBase = + B.CreateGEP(VecDataTy->getElementType(), Ptr, Offset, "vstore_base"); + + // Emit store(s). + StoreInst *Store = nullptr; + if (Width == 3) { + for (unsigned i = 0; i < Width; i++) { + Value *Index = B.getInt32(i); + Value *Lane = B.CreateExtractElement(Data, Index, "vstore_extract"); + Value *GEP = B.CreateGEP(VecDataTy->getElementType(), GEPBase, Index); + Store = B.CreateStore(Lane, GEP, false); + } + } else { + Store = B.CreateStore(Data, GEPBase, false); + + const unsigned Align = VecDataTy->getScalarSizeInBits() / 8; + Store->setAlignment(MaybeAlign(Align).valueOrOne()); + } + return Store; +} + +/// @brief Emit the body of the 'vload_half' builtin function. +/// +/// @param[in] F Function to emit the body inline. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. +Value *CLBuiltinInfo::emitBuiltinInlineVLoadHalf(Function *F, IRBuilder<> &B, + ArrayRef Args) { + if (F->getType()->isVectorTy()) { + return nullptr; + } + + // Cast the pointer to ushort*. + Value *Ptr = Args[1]; + PointerType *PtrTy = dyn_cast(Ptr->getType()); + if (!PtrTy) { + return nullptr; + } + Type *U16Ty = B.getInt16Ty(); + + // Emit the base pointer. + Value *Offset = Args[0]; + Value *DataPtr = B.CreateGEP(U16Ty, Ptr, Offset, "vload_base"); + + // Load a ushort. + Value *Data = B.CreateLoad(B.getInt16Ty(), DataPtr, "vload_half"); + + // Declare the conversion builtin. + Module *M = F->getParent(); + Function *HalfToFloatFn = + declareBuiltin(M, eCLBuiltinConvertHalfToFloat, B.getFloatTy(), + {B.getInt16Ty()}, {eTypeQualNone}); + if (!HalfToFloatFn) { + return nullptr; + } + + // Convert it to float. + CallInst *CI = CreateBuiltinCall(B, HalfToFloatFn, {Data}); + CI->setCallingConv(F->getCallingConv()); + + return CI; +} + +/// @brief Emit the body of the 'vstore_half' builtin function. +/// +/// @param[in] F Function to emit the body inline. +/// @param[in] Mode Rounding mode to use, e.g. '_rte'. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. +Value *CLBuiltinInfo::emitBuiltinInlineVStoreHalf(Function *F, StringRef Mode, + IRBuilder<> &B, + ArrayRef Args) { + Value *Data = Args[0]; + if (!Data || Data->getType()->isVectorTy()) { + return nullptr; + } + + // Declare the conversion builtin. 
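+ // The rounding-mode suffix parsed from the mangled name selects the + // conversion builtin, e.g. vstore_half_rtz maps to the round-towards-zero + // float-to-half (or double-to-half) conversion.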
+ std::optional<BuiltinID> ConvID; + + if (Data->getType() == B.getFloatTy()) { + ConvID = StringSwitch<std::optional<BuiltinID>>(Mode) + .Case("", eCLBuiltinConvertFloatToHalf) + .Case("_rte", eCLBuiltinConvertFloatToHalfRte) + .Case("_rtz", eCLBuiltinConvertFloatToHalfRtz) + .Case("_rtp", eCLBuiltinConvertFloatToHalfRtp) + .Case("_rtn", eCLBuiltinConvertFloatToHalfRtn) + .Default(std::nullopt); + } else { + ConvID = StringSwitch<std::optional<BuiltinID>>(Mode) + .Case("", eCLBuiltinConvertDoubleToHalf) + .Case("_rte", eCLBuiltinConvertDoubleToHalfRte) + .Case("_rtz", eCLBuiltinConvertDoubleToHalfRtz) + .Case("_rtp", eCLBuiltinConvertDoubleToHalfRtp) + .Case("_rtn", eCLBuiltinConvertDoubleToHalfRtn) + .Default(std::nullopt); + } + if (!ConvID) { + return nullptr; + } + Module *M = F->getParent(); + + // Normally, the vstore_half functions take the number to store as a float. + // However, if the double extension is enabled, it is also possible to use + // double instead. This means that we might have to convert either a float or + // a double to a half. + Function *FloatToHalfFn = declareBuiltin(M, *ConvID, B.getInt16Ty(), + {Data->getType()}, {eTypeQualNone}); + if (!FloatToHalfFn) { + return nullptr; + } + + // Convert the data from float/double to half. + CallInst *CI = CreateBuiltinCall(B, FloatToHalfFn, {Data}); + CI->setCallingConv(F->getCallingConv()); + Data = CI; + + // Cast the pointer to ushort*. + Value *Ptr = Args[2]; + PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType()); + if (!PtrTy) { + return nullptr; + } + auto U16Ty = B.getInt16Ty(); + + // Emit the base pointer. + Value *Offset = Args[1]; + Value *DataPtr = B.CreateGEP(U16Ty, Ptr, Offset, "vstore_base"); + + // Store the ushort. + return B.CreateStore(Data, DataPtr); +} + +/// @brief Emit the body of a relational builtin function. +/// +/// This function handles relational builtins that accept two arguments, such as +/// the comparison builtins. +/// +/// @param[in] BuiltinID Identifier of the builtin to emit the body inline. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. +Value *CLBuiltinInfo::emitBuiltinInlineRelationalsWithTwoArguments( + BuiltinID BuiltinID, IRBuilder<> &B, ArrayRef<Value *> Args) { + CmpInst::Predicate Pred = CmpInst::FCMP_FALSE; + CmpInst::Predicate Pred2 = CmpInst::FCMP_FALSE; + switch (BuiltinID) { + default: + return nullptr; + case eCLBuiltinIsEqual: + Pred = CmpInst::FCMP_OEQ; + break; + case eCLBuiltinIsNotEqual: + Pred = CmpInst::FCMP_UNE; + break; + case eCLBuiltinIsGreater: + Pred = CmpInst::FCMP_OGT; + break; + case eCLBuiltinIsGreaterEqual: + Pred = CmpInst::FCMP_OGE; + break; + case eCLBuiltinIsLess: + Pred = CmpInst::FCMP_OLT; + break; + case eCLBuiltinIsLessEqual: + Pred = CmpInst::FCMP_OLE; + break; + case eCLBuiltinIsLessGreater: + Pred = CmpInst::FCMP_OLT; + Pred2 = CmpInst::FCMP_OGT; + break; + case eCLBuiltinIsOrdered: + Pred = CmpInst::FCMP_ORD; + break; + case eCLBuiltinIsUnordered: + Pred = CmpInst::FCMP_UNO; + break; + } + + if (Args.size() != 2) { + return nullptr; + } + Value *Src0 = Args[0], *Src1 = Args[1]; + Value *Cmp = B.CreateFCmp(Pred, Src0, Src1, "relational"); + + Type *ResultEleTy = nullptr; + Type *Src0Ty = Src0->getType(); + if (Src0->getType() == B.getDoubleTy()) { + // Special case because relational(doubleN, doubleN) returns longn while + // relational(double, double) returns int. 
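+ // e.g. isless(double2, double2) returns long2, whereas isless(double, + // double) returns int.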
+ if (Src0Ty->isVectorTy()) { + ResultEleTy = B.getInt64Ty(); + } else { + ResultEleTy = B.getInt32Ty(); + } + } else if (Src0->getType() == B.getHalfTy()) { + // Special case because relational(HalfTyN, HalfTyN) returns i16 while + // relational(HalfTy, HalfTy) returns int. + if (Src0Ty->isVectorTy()) { + ResultEleTy = B.getInt16Ty(); + } else { + ResultEleTy = B.getInt32Ty(); + } + } else { + // All the other cases can be handled here. + ResultEleTy = B.getIntNTy(Src0->getType()->getScalarSizeInBits()); + } + Value *Result = nullptr; + auto *SrcVecTy = dyn_cast(Src0->getType()); + if (SrcVecTy) { + auto *ResultVecTy = + FixedVectorType::get(ResultEleTy, SrcVecTy->getNumElements()); + Result = B.CreateSExt(Cmp, ResultVecTy, "relational"); + } else { + Result = B.CreateZExt(Cmp, ResultEleTy, "relational"); + } + + if (Pred2 != CmpInst::FCMP_FALSE) { + Value *Cmp2 = B.CreateFCmp(Pred2, Src0, Src1, "relational"); + Value *True = SrcVecTy ? Constant::getAllOnesValue(Result->getType()) + : ConstantInt::get(Result->getType(), 1); + Result = B.CreateSelect(Cmp2, True, Result); + } + + return Result; +} + +/// @brief Emit the body of a relational builtin function. +/// +/// This function handles relational builtins that accept a single argument, +/// such as the builtins checking if the argument is infinite or not. +/// +/// @param[in] BuiltinID Identifier of the builtin to emit the body inline. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Arg Argument passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. +Value *CLBuiltinInfo::emitBuiltinInlineRelationalsWithOneArgument( + BuiltinID BuiltinID, IRBuilder<> &B, Value *Arg) { + Value *Result = nullptr; + // The types (and misc info) that we will be using + Type *ArgTy = Arg->getType(); + const bool isVectorTy = ArgTy->isVectorTy(); + const unsigned Width = + isVectorTy ? multi_llvm::getVectorNumElements(ArgTy) : 0; + Type *ArgEleTy = isVectorTy ? multi_llvm::getVectorElementType(ArgTy) : ArgTy; + Type *SignedTy = ArgEleTy == B.getFloatTy() ? B.getInt32Ty() : B.getInt64Ty(); + Type *ReturnTy = (ArgEleTy == B.getDoubleTy() && isVectorTy) ? 
B.getInt64Ty() + : B.getInt32Ty(); + + if (ArgEleTy != B.getFloatTy() && ArgEleTy != B.getDoubleTy()) { + return nullptr; + } + // Create all the masks we are going to be using + Constant *ExponentMask = nullptr; + Constant *MantissaMask = nullptr; + Constant *NonSignMask = nullptr; + Constant *Zero = nullptr; + if (ArgEleTy == B.getFloatTy()) { + ExponentMask = B.getInt32(0x7F800000u); + MantissaMask = B.getInt32(0x007FFFFFu); + NonSignMask = B.getInt32(0x7FFFFFFFu); + Zero = B.getInt32(0u); + } else if (ArgEleTy == B.getDoubleTy()) { + ExponentMask = B.getInt64(0x7FF0000000000000u); + MantissaMask = B.getInt64(0x000FFFFFFFFFFFFFu); + NonSignMask = B.getInt64(0x7FFFFFFFFFFFFFFFu); + Zero = B.getInt64(0u); + } + + // For the vector versions, we need to create vector types and values + if (isVectorTy) { + SignedTy = FixedVectorType::get(SignedTy, Width); + ReturnTy = FixedVectorType::get(ReturnTy, Width); + const auto EC = ElementCount::getFixed(Width); + ExponentMask = ConstantVector::getSplat(EC, ExponentMask); + MantissaMask = ConstantVector::getSplat(EC, MantissaMask); + NonSignMask = ConstantVector::getSplat(EC, NonSignMask); + Zero = ConstantVector::getSplat(EC, Zero); + } + + // We will be needing access to the argument as an integer (bitcast) value + Value *STArg = B.CreateBitCast(Arg, SignedTy); + + // Emit the IR that will calculate the result + switch (BuiltinID) { + default: + llvm_unreachable("Invalid Builtin ID"); + break; + case eCLBuiltinIsFinite: + Result = B.CreateAnd(STArg, NonSignMask); + Result = B.CreateICmpSLT(Result, ExponentMask); + break; + case eCLBuiltinIsInf: + Result = B.CreateAnd(STArg, NonSignMask); + Result = B.CreateICmpEQ(Result, ExponentMask); + break; + case eCLBuiltinIsNan: { + Result = B.CreateAnd(STArg, NonSignMask); + // This checks if the exponent is all ones (the same as the ExponentMask) + // and also if the significand (the mantissa) is not zero. If the mantissa + // is zero then it would be infinite, not NaN. + Value *ExponentAllOnes = + B.CreateICmpEQ(ExponentMask, B.CreateAnd(ExponentMask, Result)); + Value *MantissaNotZero = + B.CreateICmpSGT(B.CreateAnd(MantissaMask, Result), Zero); + Result = B.CreateAnd(ExponentAllOnes, MantissaNotZero); + break; + } + case eCLBuiltinIsNormal: { + Result = B.CreateAnd(STArg, NonSignMask); + Value *ExponentBitsNotAllSet = B.CreateICmpSLT(Result, ExponentMask); + Value *ExponentBitsNonZero = B.CreateICmpSGT(Result, MantissaMask); + Result = B.CreateAnd(ExponentBitsNotAllSet, ExponentBitsNonZero); + break; + } + case eCLBuiltinSignBit: + Result = B.CreateICmpSLT(STArg, Zero); + break; + } + + // Convert the i1 result from the comparison instruction to the type that the + // builtin returns + if (isVectorTy) { + // 0 for false, -1 (all 1s) for true + Result = B.CreateSExt(Result, ReturnTy); + } else { + // 0 for false, 1 for true + Result = B.CreateZExt(Result, ReturnTy); + } + + return Result; +} + +/// @brief Emit the body of a vector shuffle builtin function. +/// +/// @param[in] BuiltinID Identifier of the builtin to emit the body inline. +/// @param[in] B Builder used to emit instructions. +/// @param[in] Args Arguments passed to the function. +/// +/// @return Value returned by the builtin implementation or null on failure. +Value *CLBuiltinInfo::emitBuiltinInlineShuffle(BuiltinID BuiltinID, + IRBuilder<> &B, + ArrayRef<Value *> Args) { + // Make sure we have the correct number of arguments. 
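+ // shuffle(x, mask) takes two arguments; shuffle2(x, y, mask) takes three.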
+ assert(((BuiltinID == eCLBuiltinShuffle && Args.size() == 2) || + (BuiltinID == eCLBuiltinShuffle2 && Args.size() == 3)) && + "Wrong number of arguments!"); + + // It is not worth splitting shuffle and shuffle2 into two functions as a lot + // of the code is the same. + const bool isShuffle2 = (BuiltinID == eCLBuiltinShuffle2); + + // Get the mask and the mask type. + Value *Mask = Args[isShuffle2 ? 2 : 1]; + auto MaskVecTy = cast<FixedVectorType>(Mask->getType()); + IntegerType *MaskTy = cast<IntegerType>(MaskVecTy->getElementType()); + const int MaskWidth = MaskVecTy->getNumElements(); + + // TODO: Support non-constant masks (in a less efficient way) + if (!isa<Constant>(Mask)) { + return nullptr; + } + + // We need to mask the mask elements, since the OpenCL standard specifies that + // we should only take the ilogb(2N-1)+1 least significant bits from each mask + // element into consideration, where N is the number of elements in the vector + // according to vec_step. + auto ShuffleTy = cast<FixedVectorType>(Args[0]->getType()); + const int Width = ShuffleTy->getNumElements(); + // Vectors of size 3 are not supported by the shuffle builtin. + assert(Width != 3 && "Invalid vector width of 3!"); + const int N = (Width == 3 ? 4 : Width); + const int SignificantBits = + stdcompat::ilogb((2 * N) - 1) + (isShuffle2 ? 1 : 0); + const unsigned BitMask = ~((~0u) << SignificantBits); + Value *BitMaskV = ConstantVector::getSplat(ElementCount::getFixed(MaskWidth), + ConstantInt::get(MaskTy, BitMask)); + // The builtin's mask may have different integer types, while the LLVM + // instruction only supports i32. + // Mask the mask. + Value *MaskedMask = B.CreateAnd(Mask, BitMaskV, "mask"); + MaskedMask = B.CreateIntCast( + MaskedMask, FixedVectorType::get(B.getInt32Ty(), MaskWidth), false); + + // Create the shufflevector instruction. + Value *Arg1 = (isShuffle2 ? Args[1] : PoisonValue::get(ShuffleTy)); + return B.CreateShuffleVector(Args[0], Arg1, MaskedMask, "shuffle"); +} + +Value *CLBuiltinInfo::emitBuiltinInlinePrintf(BuiltinID, IRBuilder<> &B, + ArrayRef<Value *> Args) { + Module &M = *(B.GetInsertBlock()->getModule()); + + // Declare printf if needed. + Function *Printf = M.getFunction("printf"); + if (!Printf) { + PointerType *PtrTy = B.getPtrTy(/*AddrSpace=*/0); + FunctionType *PrintfTy = FunctionType::get(B.getInt32Ty(), {PtrTy}, true); + Printf = + Function::Create(PrintfTy, GlobalValue::ExternalLinkage, "printf", &M); + Printf->setCallingConv(CallingConv::SPIR_FUNC); + } + + return CreateBuiltinCall(B, Printf, Args); +} + +// Must be kept in sync with our OpenCL headers! +enum : uint32_t { + CLK_LOCAL_MEM_FENCE = 1, + CLK_GLOBAL_MEM_FENCE = 2, + // FIXME: We don't support image fences in our headers +}; + +// Must be kept in sync with our OpenCL headers! +enum : uint32_t { + memory_scope_work_item = 1, + memory_scope_sub_group = 2, + memory_scope_work_group = 3, + memory_scope_device = 4, + memory_scope_all_svm_devices = 5, + memory_scope_all_devices = 6, +}; + +// Must be kept in sync with our OpenCL headers! +enum : uint32_t { + memory_order_relaxed = 0, + memory_order_acquire = 1, + memory_order_release = 2, + memory_order_acq_rel = 3, + memory_order_seq_cst = 4, +}; + +static std::optional<uint32_t> parseMemFenceFlagsParam(Value *const P) { + // Grab the 'flags' parameter. + if (auto *const Flags = dyn_cast<ConstantInt>(P)) { + // cl_mem_fence_flags is a bitfield and can be 0 or a combination of + // CLK_(GLOBAL|LOCAL|IMAGE)_MEM_FENCE values ORed together. 
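+ // e.g. the flags CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE map to the union + // of the work-group and cross-work-group memory semantics below.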
+ switch (Flags->getZExtValue()) { + case 0: + return std::nullopt; + case CLK_LOCAL_MEM_FENCE: + return BIMuxInfoConcept::MemSemanticsWorkGroupMemory; + case CLK_GLOBAL_MEM_FENCE: + return BIMuxInfoConcept::MemSemanticsCrossWorkGroupMemory; + case CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE: + return (BIMuxInfoConcept::MemSemanticsWorkGroupMemory | + BIMuxInfoConcept::MemSemanticsCrossWorkGroupMemory); + default: + llvm_unreachable("unhandled memory fence flags"); + } + } + return std::nullopt; +} + +static std::optional parseMemoryScopeParam(Value *const P) { + if (auto *const Scope = dyn_cast(P)) { + switch (Scope->getZExtValue()) { + case memory_scope_work_item: + return BIMuxInfoConcept::MemScopeWorkItem; + case memory_scope_sub_group: + return BIMuxInfoConcept::MemScopeSubGroup; + case memory_scope_work_group: + return BIMuxInfoConcept::MemScopeWorkGroup; + case memory_scope_device: + return BIMuxInfoConcept::MemScopeDevice; + // 3.3.5. memory_scope_all_devices is an alias for + // memory_scope_all_svm_devices. + case memory_scope_all_devices: + case memory_scope_all_svm_devices: + return BIMuxInfoConcept::MemScopeCrossDevice; + default: + llvm_unreachable("unhandled memory scope"); + } + } + return std::nullopt; +} + +static std::optional parseMemoryOrderParam(Value *const P) { + if (auto *const Order = dyn_cast(P)) { + switch (Order->getZExtValue()) { + case memory_order_relaxed: + return BIMuxInfoConcept::MemSemanticsRelaxed; + case memory_order_acquire: + return BIMuxInfoConcept::MemSemanticsAcquire; + case memory_order_release: + return BIMuxInfoConcept::MemSemanticsRelease; + case memory_order_acq_rel: + return BIMuxInfoConcept::MemSemanticsAcquireRelease; + case memory_order_seq_cst: + return BIMuxInfoConcept::MemSemanticsSequentiallyConsistent; + default: + llvm_unreachable("unhandled memory order"); + } + } + return std::nullopt; +} + +// This function returns a mux builtin ID for the corresponding CL builtin ID +// when that lowering is straightforward and the function types of each builtin +// are identical. +static std::optional get1To1BuiltinLowering(BuiltinID CLBuiltinID) { + switch (CLBuiltinID) { + default: + return std::nullopt; + case eCLBuiltinGetWorkDim: + return eMuxBuiltinGetWorkDim; + case eCLBuiltinGetGroupId: + return eMuxBuiltinGetGroupId; + case eCLBuiltinGetGlobalSize: + return eMuxBuiltinGetGlobalSize; + case eCLBuiltinGetGlobalOffset: + return eMuxBuiltinGetGlobalOffset; + case eCLBuiltinGetLocalId: + return eMuxBuiltinGetLocalId; + case eCLBuiltinGetLocalSize: + return eMuxBuiltinGetLocalSize; + case eCLBuiltinGetEnqueuedLocalSize: + return eMuxBuiltinGetEnqueuedLocalSize; + case eCLBuiltinGetNumGroups: + return eMuxBuiltinGetNumGroups; + case eCLBuiltinGetGlobalId: + return eMuxBuiltinGetGlobalId; + case eCLBuiltinGetLocalLinearId: + return eMuxBuiltinGetLocalLinearId; + case eCLBuiltinGetGlobalLinearId: + return eMuxBuiltinGetGlobalLinearId; + case eCLBuiltinGetSubgroupSize: + return eMuxBuiltinGetSubGroupSize; + case eCLBuiltinGetMaxSubgroupSize: + return eMuxBuiltinGetMaxSubGroupSize; + case eCLBuiltinGetSubgroupLocalId: + return eMuxBuiltinGetSubGroupLocalId; + case eCLBuiltinGetNumSubgroups: + return eMuxBuiltinGetNumSubGroups; + case eCLBuiltinGetEnqueuedNumSubgroups: + // Note - this is mapping to the same builtin as + // eCLBuiltinGetNumSubgroups, as we don't currently support + // non-uniform work-group sizes. 
+ return eMuxBuiltinGetNumSubGroups; + case eCLBuiltinGetSubgroupId: + return eMuxBuiltinGetSubGroupId; + } +} + +Instruction * +CLBuiltinInfo::lowerBuiltinToMuxBuiltin(CallInst &CI, + BIMuxInfoConcept &BIMuxImpl) { + auto &M = *CI.getModule(); + auto *const F = CI.getCalledFunction(); + if (!F) { + return nullptr; + } + const auto ID = identifyBuiltin(*F); + if (!ID) { + return nullptr; + } + + // Handle straightforward 1:1 mappings. + if (auto MuxID = get1To1BuiltinLowering(*ID)) { + auto *const MuxBuiltinFn = BIMuxImpl.getOrDeclareMuxBuiltin(*MuxID, M); + assert(MuxBuiltinFn && "Could not get/declare mux builtin"); + const SmallVector Args(CI.args()); + auto *const NewCI = CallInst::Create(MuxBuiltinFn, Args, CI.getName()); + NewCI->insertBefore(CI.getIterator()); + NewCI->takeName(&CI); + NewCI->setAttributes(MuxBuiltinFn->getAttributes()); + return NewCI; + } + + IRBuilder<> B(&CI); + LLVMContext &Ctx = M.getContext(); + auto *const I32Ty = Type::getInt32Ty(Ctx); + + auto CtrlBarrierID = eMuxBuiltinWorkGroupBarrier; + unsigned DefaultMemScope = BIMuxInfoConcept::MemScopeWorkGroup; + unsigned DefaultMemOrder = + BIMuxInfoConcept::MemSemanticsSequentiallyConsistent; + + switch (*ID) { + default: + // Sub-group and work-group builtins need lowering to their mux + // equivalents. + if (auto *const NewI = lowerGroupBuiltinToMuxBuiltin(CI, *ID, BIMuxImpl)) { + return NewI; + } + return nullptr; + case eCLBuiltinSubGroupBarrier: + CtrlBarrierID = eMuxBuiltinSubGroupBarrier; + DefaultMemScope = BIMuxInfoConcept::MemScopeSubGroup; + LLVM_FALLTHROUGH; + case eCLBuiltinBarrier: + case eCLBuiltinWorkGroupBarrier: { + // Memory Scope which the barrier controls. Defaults to 'workgroup' or + // 'subgroup' scope depending on the barrier, but sub_group_barrier and + // work_group_barrier can optionally provide a scope. + unsigned ScopeVal = DefaultMemScope; + if ((ID == eCLBuiltinSubGroupBarrier || ID == eCLBuiltinWorkGroupBarrier) && + F->arg_size() == 2) { + if (auto Scope = parseMemoryScopeParam(CI.getOperand(1))) { + ScopeVal = *Scope; + } + } + + const unsigned SemanticsVal = + DefaultMemOrder | parseMemFenceFlagsParam(CI.getOperand(0)).value_or(0); + + auto *const CtrlBarrier = + BIMuxImpl.getOrDeclareMuxBuiltin(CtrlBarrierID, M); + + auto *const BarrierID = ConstantInt::get(I32Ty, 0); + auto *const Scope = ConstantInt::get(I32Ty, ScopeVal); + auto *const Semantics = ConstantInt::get(I32Ty, SemanticsVal); + auto *const NewCI = + B.CreateCall(CtrlBarrier, {BarrierID, Scope, Semantics}, CI.getName()); + NewCI->setAttributes(CtrlBarrier->getAttributes()); + NewCI->takeName(&CI); + return NewCI; + } + case eCLBuiltinAtomicWorkItemFence: + // atomic_work_item_fence has two parameters which we can parse. + DefaultMemOrder = + parseMemoryOrderParam(CI.getOperand(1)).value_or(DefaultMemOrder); + DefaultMemScope = + parseMemoryScopeParam(CI.getOperand(2)).value_or(DefaultMemScope); + LLVM_FALLTHROUGH; + case eCLBuiltinMemFence: + case eCLBuiltinReadMemFence: + case eCLBuiltinWriteMemFence: { + // The deprecated 'fence' builtins default to memory_scope_work_group and + // have one possible order each. 
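+ // read_mem_fence is an acquire fence, write_mem_fence is a release fence, + // and mem_fence orders both reads and writes (acquire-release).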
+ if (ID == eCLBuiltinMemFence) { + DefaultMemOrder = BIMuxInfoConcept::MemSemanticsAcquireRelease; + } else if (ID == eCLBuiltinReadMemFence) { + DefaultMemOrder = BIMuxInfoConcept::MemSemanticsAcquire; + } else if (ID == eCLBuiltinWriteMemFence) { + DefaultMemOrder = BIMuxInfoConcept::MemSemanticsRelease; + } + const unsigned SemanticsVal = + DefaultMemOrder | parseMemFenceFlagsParam(CI.getOperand(0)).value_or(0); + auto *const MemBarrier = + BIMuxImpl.getOrDeclareMuxBuiltin(eMuxBuiltinMemBarrier, M); + auto *const Scope = ConstantInt::get(I32Ty, DefaultMemScope); + auto *const Semantics = ConstantInt::get(I32Ty, SemanticsVal); + auto *const NewCI = + B.CreateCall(MemBarrier, {Scope, Semantics}, CI.getName()); + NewCI->setAttributes(MemBarrier->getAttributes()); + NewCI->takeName(&CI); + return NewCI; + } + case eCLBuiltinAsyncWorkGroupCopy: + case eCLBuiltinAsyncWorkGroupStridedCopy: + case eCLBuiltinAsyncWorkGroupCopy2D2D: + case eCLBuiltinAsyncWorkGroupCopy3D3D: + return lowerAsyncBuiltinToMuxBuiltin(CI, *ID, BIMuxImpl); + case eCLBuiltinWaitGroupEvents: { + auto *const MuxWait = + BIMuxImpl.getOrDeclareMuxBuiltin(eMuxBuiltinDMAWait, M); + assert(MuxWait && "Could not get/declare __mux_dma_wait"); + auto *const Count = CI.getArgOperand(0); + auto *Events = CI.getArgOperand(1); + + assert(Events->getType()->isPointerTy() && + (Events->getType()->getPointerAddressSpace() == + compiler::utils::AddressSpace::Private || + Events->getType()->getPointerAddressSpace() == + compiler::utils::AddressSpace::Generic) && + "Pointer to event must be in address space 0 or 4."); + + Events = B.CreatePointerBitCastOrAddrSpaceCast( + Events, PointerType::getUnqual(Ctx), "mux.events"); + auto *const NewCI = B.CreateCall(MuxWait, {Count, Events}, CI.getName()); + NewCI->setAttributes(MuxWait->getAttributes()); + NewCI->takeName(&CI); + return NewCI; + } + } +} + +Instruction * +CLBuiltinInfo::lowerGroupBuiltinToMuxBuiltin(CallInst &CI, BuiltinID ID, + BIMuxInfoConcept &BIMuxImpl) { + auto &M = *CI.getModule(); + auto *const F = CI.getCalledFunction(); + assert(F && "No calling function?"); + + // Some ops need extra checking to determine their mux ID: + // * add/mul operations are split into integer/float + // * min/max operations are split into signed/unsigned/float + // So we set a 'base' builtin ID for these operations to the (unsigned) + // integer variant and do a checking step afterwards where we refine the + // builtin ID. 
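+ // e.g. sub_group_reduce_min starts out as eMuxBuiltinSubgroupReduceUMin and + // is refined below to the SMin or FMin variant if the demangled argument + // type turns out to be a signed integer or a floating point type.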
+ bool RecheckOpType = false; + BaseBuiltinID MuxBuiltinID; + switch (ID) { + default: + return nullptr; + case eCLBuiltinSubgroupAll: + MuxBuiltinID = eMuxBuiltinSubgroupAll; + break; + case eCLBuiltinSubgroupAny: + MuxBuiltinID = eMuxBuiltinSubgroupAny; + break; + case eCLBuiltinSubgroupBroadcast: + MuxBuiltinID = eMuxBuiltinSubgroupBroadcast; + break; + case eCLBuiltinSubgroupReduceAdd: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupReduceAdd; + break; + case eCLBuiltinSubgroupReduceMin: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupReduceUMin; + break; + case eCLBuiltinSubgroupReduceMax: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupReduceUMax; + break; + case eCLBuiltinSubgroupReduceMul: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupReduceMul; + break; + case eCLBuiltinSubgroupReduceAnd: + MuxBuiltinID = eMuxBuiltinSubgroupReduceAnd; + break; + case eCLBuiltinSubgroupReduceOr: + MuxBuiltinID = eMuxBuiltinSubgroupReduceOr; + break; + case eCLBuiltinSubgroupReduceXor: + MuxBuiltinID = eMuxBuiltinSubgroupReduceXor; + break; + case eCLBuiltinSubgroupReduceLogicalAnd: + MuxBuiltinID = eMuxBuiltinSubgroupReduceLogicalAnd; + break; + case eCLBuiltinSubgroupReduceLogicalOr: + MuxBuiltinID = eMuxBuiltinSubgroupReduceLogicalOr; + break; + case eCLBuiltinSubgroupReduceLogicalXor: + MuxBuiltinID = eMuxBuiltinSubgroupReduceLogicalXor; + break; + case eCLBuiltinSubgroupScanAddInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanAddInclusive; + break; + case eCLBuiltinSubgroupScanAddExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanAddExclusive; + break; + case eCLBuiltinSubgroupScanMinInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanUMinInclusive; + break; + case eCLBuiltinSubgroupScanMinExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanUMinExclusive; + break; + case eCLBuiltinSubgroupScanMaxInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanUMaxInclusive; + break; + case eCLBuiltinSubgroupScanMaxExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanUMaxExclusive; + break; + case eCLBuiltinSubgroupScanMulInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanMulInclusive; + break; + case eCLBuiltinSubgroupScanMulExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinSubgroupScanMulExclusive; + break; + case eCLBuiltinSubgroupScanAndInclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanAndInclusive; + break; + case eCLBuiltinSubgroupScanAndExclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanAndExclusive; + break; + case eCLBuiltinSubgroupScanOrInclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanOrInclusive; + break; + case eCLBuiltinSubgroupScanOrExclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanOrExclusive; + break; + case eCLBuiltinSubgroupScanXorInclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanXorInclusive; + break; + case eCLBuiltinSubgroupScanXorExclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanXorExclusive; + break; + case eCLBuiltinSubgroupScanLogicalAndInclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalAndInclusive; + break; + case eCLBuiltinSubgroupScanLogicalAndExclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalAndExclusive; + break; + case eCLBuiltinSubgroupScanLogicalOrInclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalOrInclusive; + break; + case eCLBuiltinSubgroupScanLogicalOrExclusive: + MuxBuiltinID = 
eMuxBuiltinSubgroupScanLogicalOrExclusive; + break; + case eCLBuiltinSubgroupScanLogicalXorInclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalXorInclusive; + break; + case eCLBuiltinSubgroupScanLogicalXorExclusive: + MuxBuiltinID = eMuxBuiltinSubgroupScanLogicalXorExclusive; + break; + case eCLBuiltinWorkgroupAll: + MuxBuiltinID = eMuxBuiltinWorkgroupAll; + break; + case eCLBuiltinWorkgroupAny: + MuxBuiltinID = eMuxBuiltinWorkgroupAny; + break; + case eCLBuiltinWorkgroupBroadcast: + MuxBuiltinID = eMuxBuiltinWorkgroupBroadcast; + break; + case eCLBuiltinWorkgroupReduceAdd: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupReduceAdd; + break; + case eCLBuiltinWorkgroupReduceMin: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupReduceUMin; + break; + case eCLBuiltinWorkgroupReduceMax: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupReduceUMax; + break; + case eCLBuiltinWorkgroupReduceMul: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupReduceMul; + break; + case eCLBuiltinWorkgroupReduceAnd: + MuxBuiltinID = eMuxBuiltinWorkgroupReduceAnd; + break; + case eCLBuiltinWorkgroupReduceOr: + MuxBuiltinID = eMuxBuiltinWorkgroupReduceOr; + break; + case eCLBuiltinWorkgroupReduceXor: + MuxBuiltinID = eMuxBuiltinWorkgroupReduceXor; + break; + case eCLBuiltinWorkgroupReduceLogicalAnd: + MuxBuiltinID = eMuxBuiltinWorkgroupReduceLogicalAnd; + break; + case eCLBuiltinWorkgroupReduceLogicalOr: + MuxBuiltinID = eMuxBuiltinWorkgroupReduceLogicalOr; + break; + case eCLBuiltinWorkgroupReduceLogicalXor: + MuxBuiltinID = eMuxBuiltinWorkgroupReduceLogicalXor; + break; + case eCLBuiltinWorkgroupScanAddInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanAddInclusive; + break; + case eCLBuiltinWorkgroupScanAddExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanAddExclusive; + break; + case eCLBuiltinWorkgroupScanMinInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanUMinInclusive; + break; + case eCLBuiltinWorkgroupScanMinExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanUMinExclusive; + break; + case eCLBuiltinWorkgroupScanMaxInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanUMaxInclusive; + break; + case eCLBuiltinWorkgroupScanMaxExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanUMaxExclusive; + break; + case eCLBuiltinWorkgroupScanMulInclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanMulInclusive; + break; + case eCLBuiltinWorkgroupScanMulExclusive: + RecheckOpType = true; + MuxBuiltinID = eMuxBuiltinWorkgroupScanMulExclusive; + break; + case eCLBuiltinWorkgroupScanAndInclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanAndInclusive; + break; + case eCLBuiltinWorkgroupScanAndExclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanAndExclusive; + break; + case eCLBuiltinWorkgroupScanOrInclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanOrInclusive; + break; + case eCLBuiltinWorkgroupScanOrExclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanOrExclusive; + break; + case eCLBuiltinWorkgroupScanXorInclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanXorInclusive; + break; + case eCLBuiltinWorkgroupScanXorExclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanXorExclusive; + break; + case eCLBuiltinWorkgroupScanLogicalAndInclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalAndInclusive; + break; + case eCLBuiltinWorkgroupScanLogicalAndExclusive: + MuxBuiltinID = 
eMuxBuiltinWorkgroupScanLogicalAndExclusive; + break; + case eCLBuiltinWorkgroupScanLogicalOrInclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalOrInclusive; + break; + case eCLBuiltinWorkgroupScanLogicalOrExclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalOrExclusive; + break; + case eCLBuiltinWorkgroupScanLogicalXorInclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalXorInclusive; + break; + case eCLBuiltinWorkgroupScanLogicalXorExclusive: + MuxBuiltinID = eMuxBuiltinWorkgroupScanLogicalXorExclusive; + break; + } + + if (RecheckOpType) { + // We've assumed (unsigned) integer operations, but we may actually have + // signed integer, or floating point, operations. Refine the builtin ID to + // the correct 'overload' now. + compiler::utils::NameMangler Mangler(&F->getContext()); + SmallVector ArgumentTypes; + SmallVector Qualifiers; + + Mangler.demangleName(F->getName(), ArgumentTypes, Qualifiers); + + assert(Qualifiers.size() == 1 && ArgumentTypes.size() == 1 && + "Unknown collective builtin"); + auto &Qual = Qualifiers[0]; + + bool IsSignedInt = false; + while (!IsSignedInt && Qual.getCount()) { + IsSignedInt |= Qual.pop_front() == compiler::utils::eTypeQualSignedInt; + } + + const bool IsFP = ArgumentTypes[0]->isFloatingPointTy(); + switch (MuxBuiltinID) { + default: + llvm_unreachable("unknown group operation for which to check the type"); + case eMuxBuiltinSubgroupReduceAdd: + if (IsFP) + MuxBuiltinID = eMuxBuiltinSubgroupReduceFAdd; + break; + case eMuxBuiltinSubgroupReduceMul: + if (IsFP) + MuxBuiltinID = eMuxBuiltinSubgroupReduceFMul; + break; + case eMuxBuiltinSubgroupReduceUMin: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinSubgroupReduceFMin; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinSubgroupReduceSMin; + } + break; + case eMuxBuiltinSubgroupReduceUMax: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinSubgroupReduceFMax; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinSubgroupReduceSMax; + } + break; + case eMuxBuiltinSubgroupScanAddInclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinSubgroupScanFAddInclusive; + break; + case eMuxBuiltinSubgroupScanAddExclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinSubgroupScanFAddExclusive; + break; + case eMuxBuiltinSubgroupScanMulInclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinSubgroupScanFMulInclusive; + break; + case eMuxBuiltinSubgroupScanMulExclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinSubgroupScanFMulExclusive; + break; + case eMuxBuiltinSubgroupScanUMinInclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinSubgroupScanFMinInclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinSubgroupScanSMinInclusive; + } + break; + case eMuxBuiltinSubgroupScanUMinExclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinSubgroupScanFMinExclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinSubgroupScanSMinExclusive; + } + break; + case eMuxBuiltinSubgroupScanUMaxInclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinSubgroupScanFMaxInclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinSubgroupScanSMaxInclusive; + } + break; + case eMuxBuiltinSubgroupScanUMaxExclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinSubgroupScanFMaxExclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinSubgroupScanSMaxExclusive; + } + break; + case eMuxBuiltinWorkgroupReduceAdd: + if (IsFP) + MuxBuiltinID = eMuxBuiltinWorkgroupReduceFAdd; + break; + case eMuxBuiltinWorkgroupReduceMul: + if (IsFP) + MuxBuiltinID = eMuxBuiltinWorkgroupReduceFMul; + break; + case eMuxBuiltinWorkgroupReduceUMin: + 
if (IsFP) { + MuxBuiltinID = eMuxBuiltinWorkgroupReduceFMin; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinWorkgroupReduceSMin; + } + break; + case eMuxBuiltinWorkgroupReduceUMax: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinWorkgroupReduceFMax; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinWorkgroupReduceSMax; + } + break; + case eMuxBuiltinWorkgroupScanAddInclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinWorkgroupScanFAddInclusive; + break; + case eMuxBuiltinWorkgroupScanAddExclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinWorkgroupScanFAddExclusive; + break; + case eMuxBuiltinWorkgroupScanMulInclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinWorkgroupScanFMulInclusive; + break; + case eMuxBuiltinWorkgroupScanMulExclusive: + if (IsFP) + MuxBuiltinID = eMuxBuiltinWorkgroupScanFMulExclusive; + break; + case eMuxBuiltinWorkgroupScanUMinInclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanFMinInclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanSMinInclusive; + } + break; + case eMuxBuiltinWorkgroupScanUMinExclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanFMinExclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanSMinExclusive; + } + break; + case eMuxBuiltinWorkgroupScanUMaxInclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanFMaxInclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanSMaxInclusive; + } + break; + case eMuxBuiltinWorkgroupScanUMaxExclusive: + if (IsFP) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanFMaxExclusive; + } else if (IsSignedInt) { + MuxBuiltinID = eMuxBuiltinWorkgroupScanSMaxExclusive; + } + break; + } + } + + const bool IsAnyAll = MuxBuiltinID == eMuxBuiltinSubgroupAny || + MuxBuiltinID == eMuxBuiltinSubgroupAll || + MuxBuiltinID == eMuxBuiltinWorkgroupAny || + MuxBuiltinID == eMuxBuiltinWorkgroupAll; + SmallVector OverloadInfo; + if (!IsAnyAll) { + OverloadInfo.push_back(CI.getOperand(0)->getType()); + } else { + OverloadInfo.push_back(IntegerType::getInt1Ty(M.getContext())); + } + + auto *const MuxBuiltinFn = + BIMuxImpl.getOrDeclareMuxBuiltin(MuxBuiltinID, M, OverloadInfo); + + assert(MuxBuiltinFn && "Missing mux builtin"); + auto *const SizeTy = getSizeType(M); + auto *const I32Ty = Type::getInt32Ty(M.getContext()); + + SmallVector Args; + if (MuxBuiltinID >= eFirstMuxWorkgroupCollectiveBuiltin && + MuxBuiltinID <= eLastMuxWorkgroupCollectiveBuiltin) { + // Work-group operations have a barrier ID first. + Args.push_back(ConstantInt::get(I32Ty, 0)); + } + // Then the arg itself + // If it's an any/all operation, we must first reduce to i1 because that's how + // the mux builtins expect their arguments. + auto *Val = CI.getOperand(0); + if (!IsAnyAll) { + Args.push_back(Val); + } else { + assert(Val->getType()->isIntegerTy()); + auto *NEZero = ICmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_NE, Val, + ConstantInt::getNullValue(Val->getType())); + NEZero->insertBefore(CI.getIterator()); + Args.push_back(NEZero); + } + + if (MuxBuiltinID == eMuxBuiltinSubgroupBroadcast) { + // Pass on the ID parameter + Args.push_back(CI.getOperand(1)); + } + if (MuxBuiltinID == eMuxBuiltinWorkgroupBroadcast) { + // The mux version always has three indices. Any missing ones are replaced + // with zeros + for (unsigned i = 0, e = CI.arg_size(); i != 3; i++) { + Args.push_back(1 + i < e ? 
CI.getOperand(1 + i) + : ConstantInt::getNullValue(SizeTy)); + } + } + + auto *const NewCI = CallInst::Create(MuxBuiltinFn, Args, CI.getName()); + NewCI->insertBefore(CI.getIterator()); + NewCI->takeName(&CI); + NewCI->setAttributes(MuxBuiltinFn->getAttributes()); + + if (!IsAnyAll) { + return NewCI; + } + // For any/all we need to recreate the original i32 return value. + auto *SExt = SExtInst::Create(Instruction::SExt, NewCI, CI.getType(), "sext"); + SExt->insertBefore(CI.getIterator()); + return SExt; +} + +Instruction * +CLBuiltinInfo::lowerAsyncBuiltinToMuxBuiltin(CallInst &CI, BuiltinID ID, + BIMuxInfoConcept &BIMuxImpl) { + assert((ID == eCLBuiltinAsyncWorkGroupCopy || + ID == eCLBuiltinAsyncWorkGroupStridedCopy || + ID == eCLBuiltinAsyncWorkGroupCopy2D2D || + ID == eCLBuiltinAsyncWorkGroupCopy3D3D) && + "Invalid ID"); + + IRBuilder<> B(&CI); + auto &M = *CI.getModule(); + LLVMContext &Ctx = M.getContext(); + const auto &DL = M.getDataLayout(); + + switch (ID) { + default: + llvm_unreachable("Unhandled builtin"); + case eCLBuiltinAsyncWorkGroupCopy: + case eCLBuiltinAsyncWorkGroupStridedCopy: { + NameMangler Mangler(&Ctx); + + // Do a full demangle to determine the pointer element type of the first + // argument. + SmallVector<Type *> BuiltinArgTypes, BuiltinArgPointeeTypes; + SmallVector<TypeQualifiers> BuiltinArgQuals; + + [[maybe_unused]] const StringRef BuiltinName = + Mangler.demangleName(CI.getCalledFunction()->getName(), BuiltinArgTypes, + BuiltinArgPointeeTypes, BuiltinArgQuals); + assert(!BuiltinName.empty() && BuiltinArgTypes[0]->isPointerTy() && + BuiltinArgPointeeTypes[0] && "Could not demangle async builtin"); + + auto *const DataTy = BuiltinArgPointeeTypes[0]; + const bool IsStrided = ID == eCLBuiltinAsyncWorkGroupStridedCopy; + + auto *const Dst = CI.getArgOperand(0); + auto *const Src = CI.getArgOperand(1); + auto *const NumElements = CI.getArgOperand(2); + auto *const EventIn = CI.getArgOperand(3 + IsStrided); + + // Find out which way the DMA is going and declare the appropriate mux + // builtin. + const bool IsRead = Dst->getType()->getPointerAddressSpace() == + compiler::utils::AddressSpace::Local; + const auto ElementTypeWidthInBytes = + DL.getTypeAllocSize(DataTy).getFixedValue(); + auto *const ElementSize = + ConstantInt::get(NumElements->getType(), ElementTypeWidthInBytes); + + auto *const WidthInBytes = + IsStrided ? ElementSize + : B.CreateMul(ElementSize, NumElements, "width.bytes"); + + const BuiltinID MuxBuiltinID = [&] { + if (IsRead) { + return IsStrided ? eMuxBuiltinDMARead2D : eMuxBuiltinDMARead1D; + } else { + return IsStrided ? eMuxBuiltinDMAWrite2D : eMuxBuiltinDMAWrite1D; + } + }(); + + auto *const MuxDMA = + BIMuxImpl.getOrDeclareMuxBuiltin(MuxBuiltinID, M, EventIn->getType()); + assert(MuxDMA && "Could not get/declare mux dma read/write"); + + CallInst *NewCI = nullptr; + if (!IsStrided) { + NewCI = B.CreateCall(MuxDMA, {Dst, Src, WidthInBytes, EventIn}, + "mux.out.event"); + } else { + // The stride from async_work_group_strided_copy is in elements, but the + // strides in the __mux builtins are in bytes, so we need to scale the + // value. + auto *const Stride = CI.getArgOperand(3); + auto *const StrideInBytes = + B.CreateMul(ElementSize, Stride, "stride.bytes"); + + // For async_work_group_strided_copy, the stride only applies to the + // global memory, as we are doing scatters/gathers. + auto *const DstStride = IsRead ? ElementSize : StrideInBytes; + auto *const SrcStride = IsRead ? 
StrideInBytes : ElementSize; + + NewCI = B.CreateCall( + MuxDMA, + {Dst, Src, WidthInBytes, DstStride, SrcStride, NumElements, EventIn}, + "mux.out.event"); + } + NewCI->setAttributes(MuxDMA->getAttributes()); + NewCI->takeName(&CI); + return NewCI; + } + case eCLBuiltinAsyncWorkGroupCopy2D2D: { + // Unpack the arguments for ease of access. + auto *const Dst = CI.getArgOperand(0); + auto *const DstOffset = CI.getArgOperand(1); + auto *const Src = CI.getArgOperand(2); + auto *const SrcOffset = CI.getArgOperand(3); + auto *const NumBytesPerEl = CI.getArgOperand(4); + auto *const NumElsPerLine = CI.getArgOperand(5); + auto *const NumLines = CI.getArgOperand(6); + auto *const SrcTotalLineLength = CI.getArgOperand(7); + auto *const DstTotalLineLength = CI.getArgOperand(8); + auto *const EventIn = CI.getArgOperand(9); + + // Find out which way the DMA is going and declare the appropriate mux + // builtin. + const bool IsRead = Dst->getType()->getPointerAddressSpace() == + compiler::utils::AddressSpace::Local; + auto *const MuxDMA = BIMuxImpl.getOrDeclareMuxBuiltin( + IsRead ? eMuxBuiltinDMARead2D : eMuxBuiltinDMAWrite2D, M, + EventIn->getType()); + assert(MuxDMA && "Could not get/declare mux dma read/write"); + + auto *const DstOffsetBytes = B.CreateMul(DstOffset, NumBytesPerEl); + auto *const SrcOffsetBytes = B.CreateMul(SrcOffset, NumBytesPerEl); + auto *const LineSizeBytes = B.CreateMul(NumElsPerLine, NumBytesPerEl); + auto *const ByteTy = B.getInt8Ty(); + auto *const DstWithOffset = B.CreateGEP(ByteTy, Dst, DstOffsetBytes); + auto *const SrcWithOffset = B.CreateGEP(ByteTy, Src, SrcOffsetBytes); + auto *const SrcStrideBytes = B.CreateMul(SrcTotalLineLength, NumBytesPerEl); + auto *const DstStrideBytes = B.CreateMul(DstTotalLineLength, NumBytesPerEl); + auto *const NewCI = B.CreateCall( + MuxDMA, {DstWithOffset, SrcWithOffset, LineSizeBytes, DstStrideBytes, + SrcStrideBytes, NumLines, EventIn}); + NewCI->setAttributes(MuxDMA->getAttributes()); + NewCI->takeName(&CI); + return NewCI; + } + case eCLBuiltinAsyncWorkGroupCopy3D3D: { + auto *const Dst = CI.getArgOperand(0); + auto *const DstOffset = CI.getArgOperand(1); + auto *const Src = CI.getArgOperand(2); + auto *const SrcOffset = CI.getArgOperand(3); + auto *const NumBytesPerEl = CI.getArgOperand(4); + auto *const NumElsPerLine = CI.getArgOperand(5); + auto *const NumLines = CI.getArgOperand(6); + auto *const NumPlanes = CI.getArgOperand(7); + auto *const SrcTotalLineLength = CI.getArgOperand(8); + auto *const SrcTotalPlaneArea = CI.getArgOperand(9); + auto *const DstTotalLineLength = CI.getArgOperand(10); + auto *const DstTotalPlaneArea = CI.getArgOperand(11); + auto *const EventIn = CI.getArgOperand(12); + + // Find out which way the DMA is going and declare the appropriate mux + // builtin. + const bool IsRead = Dst->getType()->getPointerAddressSpace() == + compiler::utils::AddressSpace::Local; + auto *const MuxDMA = BIMuxImpl.getOrDeclareMuxBuiltin( + IsRead ? 
eMuxBuiltinDMARead3D : eMuxBuiltinDMAWrite3D, M,
+        EventIn->getType());
+    assert(MuxDMA && "Could not get/declare mux dma read/write");
+
+    auto *const DstOffsetBytes = B.CreateMul(DstOffset, NumBytesPerEl);
+    auto *const SrcOffsetBytes = B.CreateMul(SrcOffset, NumBytesPerEl);
+    auto *const LineSizeBytes = B.CreateMul(NumElsPerLine, NumBytesPerEl);
+    auto *const ByteTy = B.getInt8Ty();
+    auto *const DstWithOffset = B.CreateGEP(ByteTy, Dst, DstOffsetBytes);
+    auto *const SrcWithOffset = B.CreateGEP(ByteTy, Src, SrcOffsetBytes);
+    auto *const SrcLineStrideBytes =
+        B.CreateMul(SrcTotalLineLength, NumBytesPerEl);
+    auto *const DstLineStrideBytes =
+        B.CreateMul(DstTotalLineLength, NumBytesPerEl);
+    auto *const SrcPlaneStrideBytes =
+        B.CreateMul(SrcTotalPlaneArea, NumBytesPerEl);
+    auto *const DstPlaneStrideBytes =
+        B.CreateMul(DstTotalPlaneArea, NumBytesPerEl);
+    auto *const NewCI = B.CreateCall(
+        MuxDMA, {DstWithOffset, SrcWithOffset, LineSizeBytes,
+                 DstLineStrideBytes, SrcLineStrideBytes, NumLines,
+                 DstPlaneStrideBytes, SrcPlaneStrideBytes, NumPlanes, EventIn});
+    NewCI->setAttributes(MuxDMA->getAttributes());
+    NewCI->takeName(&CI);
+    return NewCI;
+  }
+  }
+
+  return nullptr;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+Function *CLBuiltinLoader::materializeBuiltin(StringRef BuiltinName,
+                                              Module *DestM,
+                                              BuiltinMatFlags Flags) {
+  auto *const BuiltinModule = this->getBuiltinsModule();
+
+  // Retrieve it from the builtin module.
+  if (!BuiltinModule) {
+    return nullptr;
+  }
+  Function *SrcBuiltin = BuiltinModule->getFunction(BuiltinName);
+  if (!SrcBuiltin) {
+    return nullptr;
+  }
+
+  // The user only wants a declaration.
+  if (!(Flags & eBuiltinMatDefinition)) {
+    if (!DestM) {
+      return SrcBuiltin;
+    } else {
+      FunctionType *FT = SrcBuiltin->getFunctionType();
+      Function *BuiltinDecl = cast<Function>(
+          DestM->getOrInsertFunction(BuiltinName, FT).getCallee());
+      BuiltinDecl->copyAttributesFrom(SrcBuiltin);
+      BuiltinDecl->setCallingConv(SrcBuiltin->getCallingConv());
+      return BuiltinDecl;
+    }
+  }
+
+  // Materialize the builtin and its callees.
+  std::set<Function *> Callees;
+  std::vector<Function *> Worklist;
+  Worklist.push_back(SrcBuiltin);
+  while (!Worklist.empty()) {
+    // Materialize the first function in the work list.
+    Function *Current = Worklist.front();
+    Worklist.erase(Worklist.begin());
+    if (!Callees.insert(Current).second) {
+      continue;
+    }
+    if (Error Err = BuiltinModule->materialize(Current)) {
+      consumeError(std::move(Err));
+      return nullptr;
+    }
+
+    // Find any callees in the function and add them to the list.
+    for (BasicBlock &BB : *Current) {
+      for (Instruction &I : BB) {
+        CallInst *CI = dyn_cast<CallInst>(&I);
+        if (!CI) {
+          continue;
+        }
+        Function *callee = CI->getCalledFunction();
+        if (!callee) {
+          continue;
+        }
+        Worklist.push_back(callee);
+      }
+    }
+  }
+
+  if (!DestM) {
+    return SrcBuiltin;
+  }
+
+  // Copy builtin and callees to the target module if requested by the user.
+  ValueToValueMapTy ValueMap;
+  SmallVector<ReturnInst *, 8> Returns;
+  // Avoid linking errors.
+  const GlobalValue::LinkageTypes Linkage = GlobalValue::LinkOnceAnyLinkage;
+
+  // Declare the callees in the module if they don't already exist.
+  for (Function *Callee : Callees) {
+    Function *NewCallee = DestM->getFunction(Callee->getName());
+    if (!NewCallee) {
+      FunctionType *FT = Callee->getFunctionType();
+      NewCallee = Function::Create(FT, Linkage, Callee->getName(), DestM);
+    } else {
+      NewCallee->setLinkage(Linkage);
+    }
+    Function::arg_iterator NewArgI = NewCallee->arg_begin();
+    for (Argument &Arg : Callee->args()) {
+      NewArgI->setName(Arg.getName());
+      ValueMap[&Arg] = &*(NewArgI++);
+    }
+    NewCallee->copyAttributesFrom(Callee);
+    ValueMap[Callee] = NewCallee;
+  }
+
+  // Clone the callees' bodies into the module.
+  GlobalValueMaterializer Materializer(*DestM);
+  for (Function *Callee : Callees) {
+    if (Callee->isDeclaration()) {
+      continue;
+    }
+    Function *NewCallee = cast<Function>(ValueMap[Callee]);
+    assert(DestM);
+    const auto CloneType = DestM == Callee->getParent()
+                               ? CloneFunctionChangeType::LocalChangesOnly
+                               : CloneFunctionChangeType::DifferentModule;
+    CloneFunctionInto(NewCallee, Callee, ValueMap, CloneType, Returns, "",
+                      nullptr, nullptr, &Materializer);
+    Returns.clear();
+  }
+
+  // Clone global variable initializers.
+  for (GlobalVariable *var : Materializer.variables()) {
+    GlobalVariable *newVar = dyn_cast_or_null<GlobalVariable>(ValueMap[var]);
+    if (!newVar) {
+      return nullptr;
+    }
+    Constant *oldInit = var->getInitializer();
+    Constant *newInit = MapValue(oldInit, ValueMap);
+    newVar->setInitializer(newInit);
+  }
+
+  return cast<Function>(ValueMap[SrcBuiltin]);
+}
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
new file mode 100644
index 0000000000000..a176ace88c196
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/define_mux_builtins_pass.cpp
@@ -0,0 +1,69 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+#include
+#include
+#include
+
+#define DEBUG_TYPE "define-mux-builtins"
+
+using namespace llvm;
+
+PreservedAnalyses
+compiler::utils::DefineMuxBuiltinsPass::run(Module &M,
+                                            ModuleAnalysisManager &AM) {
+  bool Changed = false;
+  auto &BI = AM.getResult<BuiltinInfoAnalysis>(M);
+
+  auto functionNeedsDefining = [&BI](Function &F) {
+    if (F.isDeclaration() && !F.isIntrinsic()) {
+      if (auto B = BI.analyzeBuiltin(F)) {
+        return BI.isMuxBuiltinID(B->ID);
+      }
+    }
+    return false;
+  };
+
+  // Define all mux builtins
+  for (auto &F : M.functions()) {
+    if (!functionNeedsDefining(F)) {
+      continue;
+    }
+    LLVM_DEBUG(dbgs() << " Defining mux builtin: " << F.getName() << "\n";);
+
+    // Define the builtin. If it declares any new dependent builtins, those
+    // will be appended to the module's function list and so will be
+    // encountered by later iterations.
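+    // For example, defining __mux_get_global_id (see defineGetGlobalId in
+    // mux_builtin_info.cpp) declares __mux_get_group_id,
+    // __mux_get_global_offset, __mux_get_local_id and __mux_get_local_size;
+    // those declarations land at the end of M's function list and are
+    // reached by this same loop.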
+ auto Builtin = BI.analyzeBuiltin(F); + assert(Builtin && "Failed to analyze builtin"); + if (BI.defineMuxBuiltin(Builtin->ID, M, Builtin->mux_overload_info)) { + Changed = true; + } + } + + // While declaring any builtins should go to the end of the module's list of + // functions, it's not technically impossible for something else to happen. + // As such, assert that we are leaving the module in the state we are + // contractually obliged to: with all functions that need defining having + // been defined. + assert(all_of(M.functions(), + [&](Function &F) { + return F.isDeclaration() || !functionNeedsDefining(F); + }) && + "Did not define a function that requires it"); + + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp new file mode 100644 index 0000000000000..66cb934125195 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/dma.cpp @@ -0,0 +1,74 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace compiler { +namespace utils { + +llvm::Value *isThreadEQ(llvm::BasicBlock *bb, unsigned x, unsigned y, + unsigned z, llvm::Function &LocalIDFn) { + llvm::IRBuilder<> builder(bb); + LocalIDFn.setCallingConv(llvm::CallingConv::SPIR_FUNC); + auto *const indexType = LocalIDFn.arg_begin()->getType(); + llvm::Value *result = llvm::ConstantInt::getTrue(bb->getContext()); + + const std::array threadIDs{x, y, z}; + for (unsigned i = 0; i < threadIDs.size(); ++i) { + auto *const index = llvm::ConstantInt::get(indexType, i); + auto *const localID = builder.CreateCall(&LocalIDFn, index); + localID->setCallingConv(LocalIDFn.getCallingConv()); + + auto *thread = + llvm::ConstantInt::get(LocalIDFn.getReturnType(), threadIDs[i]); + auto *const cmp = builder.CreateICmpEQ(localID, thread); + result = (i == 0) ? cmp : builder.CreateAnd(result, cmp); + } + + return result; +} + +llvm::Value *isThreadZero(llvm::BasicBlock *BB, llvm::Function &LocalIDFn) { + return isThreadEQ(BB, 0, 0, 0, LocalIDFn); +} + +void buildThreadCheck(llvm::BasicBlock *entryBlock, llvm::BasicBlock *trueBlock, + llvm::BasicBlock *falseBlock, llvm::Function &LocalIDFn) { + // only thread 0 in the work group should execute the DMA. 
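+  // The generated structure is:
+  //
+  //   entryBlock:
+  //     %cmp = <local id == (0,0,0), computed by isThreadZero>
+  //     br i1 %cmp, label %trueBlock, label %falseBlock
+  //
+  // so only work-item (0,0,0) runs the DMA body in trueBlock; the caller is
+  // expected to rejoin both paths afterwards (typically at a barrier).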
+ llvm::IRBuilder<> entryBuilder(entryBlock); + entryBuilder.CreateCondBr(isThreadZero(entryBlock, LocalIDFn), trueBlock, + falseBlock); +} + +llvm::StructType *getOrCreateMuxDMAEventType(llvm::Module &m) { + if (auto *eventType = llvm::StructType::getTypeByName( + m.getContext(), MuxBuiltins::dma_event_type)) { + return eventType; + } + + return llvm::StructType::create(m.getContext(), MuxBuiltins::dma_event_type); +} +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp new file mode 100644 index 0000000000000..5b4db40b0d6be --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/encode_kernel_metadata_pass.cpp @@ -0,0 +1,54 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include + +using namespace llvm; + +PreservedAnalyses +compiler::utils::TransferKernelMetadataPass::run(Module &M, + ModuleAnalysisManager &) { + SmallVector Kernels; + populateKernelList(M, Kernels); + + for (const auto &Kernel : Kernels) { + if (auto *F = M.getFunction(Kernel.Name)) { + setOrigFnName(*F); + setIsKernelEntryPt(*F); + if (Kernel.ReqdWGSize) { + encodeLocalSizeMetadata(*F, *Kernel.ReqdWGSize); + } + } + } + + return PreservedAnalyses::all(); +} + +PreservedAnalyses +compiler::utils::EncodeKernelMetadataPass::run(Module &M, + ModuleAnalysisManager &) { + if (auto *F = M.getFunction(KernelName)) { + setOrigFnName(*F); + setIsKernelEntryPt(*F); + if (LocalSizes) { + encodeLocalSizeMetadata(*F, *LocalSizes); + } + } + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp new file mode 100644 index 0000000000000..ace34338bb5d5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/group_collective_helpers.cpp @@ -0,0 +1,71 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include + +using namespace llvm; +static llvm::Constant *getNeutralIdentityHelper(RecurKind Kind, Type *Ty, + bool UseNaN, bool UseFZero) { + switch (Kind) { + default: + return nullptr; + case RecurKind::And: + return ConstantInt::getAllOnesValue(Ty); + case RecurKind::Or: + case RecurKind::Add: + case RecurKind::Xor: + return ConstantInt::getNullValue(Ty); + case RecurKind::SMin: + return ConstantInt::get( + Ty, APInt::getSignedMaxValue(Ty->getScalarSizeInBits())); + case RecurKind::SMax: + return ConstantInt::get( + Ty, APInt::getSignedMinValue(Ty->getScalarSizeInBits())); + case RecurKind::UMin: + return ConstantInt::get(Ty, APInt::getMaxValue(Ty->getScalarSizeInBits())); + case RecurKind::UMax: + return ConstantInt::get(Ty, APInt::getMinValue(Ty->getScalarSizeInBits())); + case RecurKind::FAdd: + // -0.0 + 0.0 = 0.0 meaning -0.0 (not 0.0) is the neutral value for floats + // under addition. + return UseFZero ? ConstantFP::get(Ty, 0.0) : ConstantFP::get(Ty, -0.0); + case RecurKind::FMin: + return UseNaN ? ConstantFP::getQNaN(Ty, /*Negative*/ false) + : ConstantFP::getInfinity(Ty, /*Negative*/ false); + case RecurKind::FMax: + return UseNaN ? ConstantFP::getQNaN(Ty, /*Negative*/ true) + : ConstantFP::getInfinity(Ty, /*Negative*/ true); + case RecurKind::Mul: + return ConstantInt::get(Ty, 1); + case RecurKind::FMul: + return ConstantFP::get(Ty, 1.0); + } +} + +llvm::Constant *compiler::utils::getNeutralVal(RecurKind Kind, Type *Ty) { + return getNeutralIdentityHelper(Kind, Ty, /*UseNaN*/ true, + /*UseFZero*/ false); +} + +llvm::Constant *compiler::utils::getIdentityVal(RecurKind Kind, Type *Ty) { + return getNeutralIdentityHelper(Kind, Ty, /*UseNaN*/ false, /*UseFZero*/ + true); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp new file mode 100644 index 0000000000000..d31b3022c7eb5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mangling.cpp @@ -0,0 +1,889 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace compiler {
+namespace utils {
+using namespace llvm;
+
+NameMangler::NameMangler(LLVMContext *context) : Context(context) {}
+
+std::string NameMangler::mangleName(StringRef Name, ArrayRef<Type *> Tys,
+                                    ArrayRef<TypeQualifiers> Quals) {
+  std::string MangledName;
+  raw_string_ostream O(MangledName);
+  O << "_Z" << Name.size() << Name;
+  for (unsigned i = 0; i < Tys.size(); i++) {
+    const ArrayRef<Type *> PrevTys = Tys.slice(0, i);
+    const ArrayRef<TypeQualifiers> PrevQuals = Quals.slice(0, i);
+    if (!mangleType(O, Tys[i], Quals[i], PrevTys, PrevQuals)) {
+      return std::string();
+    }
+  }
+  O.flush();
+  return MangledName;
+}
+
+StringRef
+NameMangler::demangleName(StringRef Name, SmallVectorImpl<Type *> &Types,
+                          SmallVectorImpl<Type *> &PointerElementTypes,
+                          SmallVectorImpl<TypeQualifiers> &Quals) {
+  // Parse the name part.
+  Lexer L(Name);
+  Name = demangleName(L);
+  if (Name.empty()) {
+    return StringRef{};
+  }
+
+  // Parse the argument part.
+  while (L.Left() > 0) {
+    Type *ArgTy = nullptr;
+    Type *ArgEltTy = nullptr;
+    TypeQualifiers ArgQuals;
+    if (!demangleType(L, ArgTy, &ArgEltTy, ArgQuals, Types, Quals)) {
+      return StringRef{};
+    }
+    Types.push_back(ArgTy);
+    PointerElementTypes.push_back(ArgEltTy);
+    Quals.push_back(ArgQuals);
+  }
+  return Name;
+}
+
+StringRef NameMangler::demangleName(StringRef Name,
+                                    SmallVectorImpl<Type *> &Types,
+                                    SmallVectorImpl<TypeQualifiers> &Quals) {
+  SmallVector<Type *, 4> EltTys;
+  return demangleName(Name, Types, EltTys, Quals);
+}
+
+StringRef NameMangler::demangleName(StringRef Name) {
+  Lexer L(Name);
+  StringRef DemangledName = demangleName(L);
+  if (!DemangledName.empty()) {
+    return DemangledName;
+  }
+  return Name;
+}
+
+int NameMangler::resolveSubstitution(unsigned SubID,
+                                     SmallVectorImpl<Type *> &Tys,
+                                     SmallVectorImpl<TypeQualifiers> &Quals) {
+  unsigned CurrentSubID = 0;
+  int ResolvedID = -1;
+  for (unsigned i = 0; i < Tys.size(); i++) {
+    // Determine whether the type is a builtin or not.
+    // Builtin types cannot be substituted.
+    Type *Ty = Tys[i];
+    TypeQualifiers &TyQuals = Quals[i];
+    if (isTypeBuiltin(Ty, TyQuals)) {
+      continue;
+    }
+    if (CurrentSubID == SubID) {
+      ResolvedID = (int)i;
+      break;
+    }
+    CurrentSubID++;
+  }
+  return ResolvedID;
+}
+
+bool NameMangler::emitSubstitution(raw_ostream &O, Type *Ty,
+                                   TypeQualifiers Quals,
+                                   ArrayRef<Type *> PrevTys,
+                                   ArrayRef<TypeQualifiers> PrevQuals) {
+  if (isTypeBuiltin(Ty, Quals)) {
+    return false;
+  }
+
+  // Look for a previously-mangled non-builtin type we could use as a
+  // substitution.
+  int SubstitutionID = -1;
+  bool FoundMatch = false;
+  for (unsigned j = 0; j < PrevTys.size(); j++) {
+    Type *PrevTy = PrevTys[j];
+    TypeQualifiers PrevQual = PrevQuals[j];
+    if (!isTypeBuiltin(PrevTy, PrevQual)) {
+      SubstitutionID++;
+      if ((PrevTy == Ty) && (PrevQual == Quals)) {
+        FoundMatch = true;
+        break;
+      }
+    }
+  }
+  if (!FoundMatch) {
+    return false;
+  }
+
+  // Found a match, emit the substitution.
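+  // Substitutions follow the Itanium C++ ABI scheme: the first repeated
+  // non-builtin type is written "S_", the next "S0_", then "S1_", and so on.
+  // For example, foo(int *, int *) mangles as "_Z3fooPiS_", where "S_"
+  // refers back to the earlier "Pi".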
+ O << "S"; + if (SubstitutionID > 0) { + O << SubstitutionID; + } + O << "_"; + return true; +} + +bool NameMangler::isTypeBuiltin(Type *Ty, TypeQualifiers &Quals) { + (void)Quals; + switch (Ty->getTypeID()) { + default: + case Type::StructTyID: + case Type::ArrayTyID: + case Type::PointerTyID: + case Type::FixedVectorTyID: + return false; + case Type::VoidTyID: + case Type::HalfTyID: + case Type::FloatTyID: + case Type::DoubleTyID: + case Type::IntegerTyID: + return true; + } +} + +const char *NameMangler::mangleSimpleType(Type *Ty, TypeQualifier Qual) { + const bool IsSigned = (Qual & eTypeQualSignedInt); + switch (Ty->getTypeID()) { + default: + break; + case Type::VoidTyID: + return "v"; + case Type::HalfTyID: + return "Dh"; + case Type::FloatTyID: + return "f"; + case Type::DoubleTyID: + return "d"; + case Type::IntegerTyID: + switch (cast(Ty)->getBitWidth()) { + default: + break; + case 1: + return "b"; // bool + case 8: + return IsSigned ? "c" : "h"; + case 16: + return IsSigned ? "s" : "t"; + case 32: + return IsSigned ? "i" : "j"; + case 64: + return IsSigned ? "l" : "m"; + } + } + return nullptr; +} + +bool NameMangler::mangleType(raw_ostream &O, Type *Ty, TypeQualifiers Qual) { + return mangleType(O, Ty, Qual, ArrayRef(), + ArrayRef()); +} + +static void manglePointerQuals(raw_ostream &O, TypeQualifier Qual, + unsigned AddressSpace) { + if (Qual & eTypeQualPointerRestrict) { + O << 'r'; + } + if (Qual & eTypeQualPointerVolatile) { + O << 'V'; + } + if (Qual & eTypeQualPointerConst) { + O << 'K'; + } + if (AddressSpace > 0) { + O << "U3AS" << AddressSpace; + } +} + +bool NameMangler::mangleType(raw_ostream &O, Type *Ty, TypeQualifiers Quals, + ArrayRef PrevTys, + ArrayRef PrevQuals) { + if (emitSubstitution(O, Ty, Quals, PrevTys, PrevQuals)) { + return true; + } + + const TypeQualifier Qual = Quals.pop_front(); + if (const char *SimpleName = mangleSimpleType(Ty, Qual)) { + O << SimpleName; + return true; + } else if (isa(Ty)) { + std::string tmp; + raw_string_ostream Otmp(tmp); + auto *VecTy = cast(Ty); + Otmp << "nxv" + << multi_llvm::getVectorElementCount(VecTy).getKnownMinValue(); + if (!mangleType(Otmp, VecTy->getElementType(), Quals, PrevTys, PrevQuals)) { + return false; + } + O << "u" << tmp.size() << tmp; + return true; + } else if (Ty->isVectorTy()) { + auto *VecTy = cast(Ty); + O << "Dv" << VecTy->getNumElements() << "_"; + return mangleType(O, VecTy->getElementType(), Quals, PrevTys, PrevQuals); + } else if (Ty->isPointerTy()) { + PointerType *PtrTy = cast(Ty); + const unsigned AddressSpace = PtrTy->getAddressSpace(); + O << "u3ptr"; + manglePointerQuals(O, Qual, AddressSpace); + return true; + } else if (Ty->isTargetExtTy()) { + if (auto Name = mangleBuiltinType(Ty)) { + O << *Name; + return true; + } + return false; + } else { + return false; + } +} + +bool NameMangler::demangleSimpleType(Lexer &L, Type *&Ty, TypeQualifier &Qual) { + const int c = L.Current(); + Ty = nullptr; + Qual = eTypeQualNone; + if ((c < 0) || !Context) { + return false; + } + + switch (c) { + default: + return false; + case 'v': + Ty = llvm::Type::getVoidTy(*Context); + break; + case 'D': + if (!L.Consume("Dh")) { + return false; + } + Ty = llvm::Type::getHalfTy(*Context); + return true; + case 'f': + Ty = llvm::Type::getFloatTy(*Context); + break; + case 'd': + Ty = llvm::Type::getDoubleTy(*Context); + break; + case 'b': + Ty = llvm::Type::getInt1Ty(*Context); + break; + case 'c': + case 'h': + Ty = llvm::Type::getInt8Ty(*Context); + if (c == 'c') { + Qual = eTypeQualSignedInt; + } + break; 
+  case 's':
+  case 't':
+    Ty = llvm::Type::getInt16Ty(*Context);
+    if (c == 's') {
+      Qual = eTypeQualSignedInt;
+    }
+    break;
+  case 'i':
+  case 'j':
+    Ty = llvm::Type::getInt32Ty(*Context);
+    if (c == 'i') {
+      Qual = eTypeQualSignedInt;
+    }
+    break;
+  case 'l':
+  case 'm':
+    Ty = llvm::Type::getInt64Ty(*Context);
+    if (c == 'l') {
+      Qual = eTypeQualSignedInt;
+    }
+    break;
+  }
+  L.Consume();
+  return true;
+}
+
+std::optional<std::string> NameMangler::mangleBuiltinType(Type *Ty) {
+  auto *const TgtTy = cast<TargetExtType>(Ty);
+  const StringRef Name = TgtTy->getName();
+
+  if (Name == "spirv.Event") {
+    return "9ocl_event";
+  }
+
+  if (Name == "spirv.Sampler") {
+    return "11ocl_sampler";
+  }
+
+  if (Name != "spirv.Image") {
+    // FIXME: Some types don't have official target extension types.
+    // "opencl.clk_event_t" -> "12ocl_clkevent"
+    // "opencl.queue_t" -> "9ocl_queue"
+    // "opencl.ndrange_t" -> "11ocl_ndrange"
+    // "opencl.reserve_id_t" -> "13ocl_reserveid"
+    return std::nullopt;
+  }
+
+  auto Dim = TgtTy->getIntParameter(tgtext::ImageTyDimensionalityIdx);
+  auto Depth = TgtTy->getIntParameter(tgtext::ImageTyDepthIdx);
+  auto Arrayed = TgtTy->getIntParameter(tgtext::ImageTyArrayedIdx);
+  auto MS = TgtTy->getIntParameter(tgtext::ImageTyMSIdx);
+
+  std::string MangledName = "ocl_image";
+
+  switch (Dim) {
+  default:
+    return std::nullopt;
+  case tgtext::ImageDim1D:
+    MangledName += "1d";
+    break;
+  case tgtext::ImageDim2D:
+    MangledName += "2d";
+    break;
+  case tgtext::ImageDim3D:
+    MangledName += "3d";
+    break;
+  case tgtext::ImageDimBuffer:
+    MangledName += "1dbuffer";
+    break;
+  }
+
+  if (Arrayed == tgtext::ImageArrayed) {
+    MangledName += "array";
+  }
+
+  if (MS == tgtext::ImageMSMultiSampled) {
+    MangledName += "msaa";
+  }
+
+  if (Depth == tgtext::ImageDepth) {
+    MangledName += "depth";
+  }
+
+  return std::to_string(MangledName.size()) + MangledName;
+}
+
+bool NameMangler::demangleOpenCLBuiltinType(Lexer &L, Type *&Ty) {
+  if (L.Consume("12memory_scope") || L.Consume("12memory_order")) {
+    Ty = IntegerType::getInt32Ty(*Context);
+    return true;
+  }
+
+  if (auto *TargetExtTy = [this, &L]() -> Type * {
+        if (L.Consume("11ocl_image1d")) {
+          return compiler::utils::tgtext::getImage1DTy(*Context);
+        } else if (L.Consume("16ocl_image1darray")) {
+          return compiler::utils::tgtext::getImage1DArrayTy(*Context);
+        } else if (L.Consume("17ocl_image1dbuffer")) {
+          return compiler::utils::tgtext::getImage1DBufferTy(*Context);
+        } else if (L.Consume("11ocl_image2d")) {
+          return compiler::utils::tgtext::getImage2DTy(*Context);
+        } else if (L.Consume("16ocl_image2darray")) {
+          return compiler::utils::tgtext::getImage2DArrayTy(*Context);
+        } else if (L.Consume("16ocl_image2ddepth")) {
+          return compiler::utils::tgtext::getImage2DTy(*Context, /*Depth*/ true,
+                                                       /*MS*/ false);
+        } else if (L.Consume("21ocl_image2darraydepth")) {
+          return compiler::utils::tgtext::getImage2DArrayTy(
+              *Context, /*Depth*/ true, /*MS*/ false);
+        } else if (L.Consume("15ocl_image2dmsaa")) {
+          return compiler::utils::tgtext::getImage2DTy(
+              *Context, /*Depth*/ false, /*MS*/ true);
+        } else if (L.Consume("20ocl_image2darraymsaa")) {
+          return compiler::utils::tgtext::getImage2DArrayTy(
+              *Context, /*Depth*/ false, /*MS*/ true);
+        } else if (L.Consume("20ocl_image2dmsaadepth")) {
+          return compiler::utils::tgtext::getImage2DTy(*Context, /*Depth*/ true,
+                                                       /*MS*/ true);
+        } else if (L.Consume("25ocl_image2darraymsaadepth")) {
+          return compiler::utils::tgtext::getImage2DArrayTy(
+              *Context, /*Depth*/ true, /*MS*/ true);
+        } else if (L.Consume("11ocl_image3d")) {
+          return compiler::utils::tgtext::getImage3DTy(*Context);
+        } else if (L.Consume("11ocl_sampler")) {
+          return compiler::utils::tgtext::getSamplerTy(*Context);
+        } else if (L.Consume("9ocl_event")) {
+          return compiler::utils::tgtext::getEventTy(*Context);
+        }
+        return nullptr;
+      }()) {
+    Ty = TargetExtTy;
+    return true;
+  }
+
+  StringRef Name;
+  //
+  // TODO: Avoid hard-coded names. See redmine issue #8656.
+  //
+  if (L.Consume("11ocl_image1d")) {
+    Name = "opencl.image1d_t";
+  } else if (L.Consume("16ocl_image1darray")) {
+    Name = "opencl.image1d_array_t";
+  } else if (L.Consume("17ocl_image1dbuffer")) {
+    Name = "opencl.image1d_buffer_t";
+  } else if (L.Consume("11ocl_image2d")) {
+    Name = "opencl.image2d_t";
+  } else if (L.Consume("16ocl_image2darray")) {
+    Name = "opencl.image2d_array_t";
+  } else if (L.Consume("16ocl_image2ddepth")) {
+    Name = "opencl.image2d_depth_t";
+  } else if (L.Consume("21ocl_image2darraydepth")) {
+    Name = "opencl.image2d_array_depth_t";
+  } else if (L.Consume("15ocl_image2dmsaa")) {
+    Name = "opencl.image2d_msaa_t";
+  } else if (L.Consume("20ocl_image2darraymsaa")) {
+    Name = "opencl.image2d_array_msaa_t";
+  } else if (L.Consume("20ocl_image2dmsaadepth")) {
+    Name = "opencl.image2d_msaa_depth_t";
+  } else if (L.Consume("25ocl_image2darraymsaadepth")) {
+    Name = "opencl.image2d_array_msaa_depth_t";
+  } else if (L.Consume("11ocl_image3d")) {
+    Name = "opencl.image3d_t";
+  } else if (L.Consume("11ocl_sampler")) {
+    Name = "opencl.sampler_t";
+  } else if (L.Consume("9ocl_event")) {
+    Name = "opencl.event_t";
+  } else if (L.Consume("12ocl_clkevent")) {
+    Name = "opencl.clk_event_t";
+  } else if (L.Consume("9ocl_queue")) {
+    Name = "opencl.queue_t";
+  } else if (L.Consume("11ocl_ndrange")) {
+    Name = "opencl.ndrange_t";
+  } else if (L.Consume("13ocl_reserveid")) {
+    Name = "opencl.reserve_id_t";
+  } else {
+    return false;
+  }
+
+  if (auto *const OpenCLType =
+          llvm::StructType::getTypeByName(*Context, Name)) {
+    Ty = OpenCLType;
+  } else {
+    Ty = llvm::StructType::create(*Context, Name);
+  }
+
+  return true;
+}
+
+struct PointerASQuals {
+  unsigned AS;
+  TypeQualifier Qual;
+};
+
+static std::optional<PointerASQuals> demanglePointerQuals(Lexer &L) {
+  TypeQualifier PointerQual = eTypeQualNone;
+
+  // Parse the optional pointer qualifier.
+  if (L.Current() < 0) {
+    return std::nullopt;
+  }
+
+  // Parse the optional address space qualifier.
+  bool DemangledAS = false;
+  unsigned AddressSpace = 0;
+
+  if (L.Consume("U3AS")) {
+    if (!L.ConsumeInteger(AddressSpace)) {
+      return std::nullopt;
+    }
+    DemangledAS = true;
+  }
+
+  switch (L.Current()) {
+  default:
+    break;
+  case 'K':
+    PointerQual = eTypeQualPointerConst;
+    L.Consume();
+    break;
+  case 'r':
+    PointerQual = eTypeQualPointerRestrict;
+    L.Consume();
+    break;
+  case 'V':
+    PointerQual = eTypeQualPointerVolatile;
+    L.Consume();
+    break;
+  }
+
+  if (!DemangledAS && L.Consume("U3AS") && !L.ConsumeInteger(AddressSpace)) {
+    return std::nullopt;
+  }
+
+  return PointerASQuals{AddressSpace, PointerQual};
+}
+
+bool NameMangler::demangleType(Lexer &L, Type *&Ty, Type **PointerEltTy,
+                               TypeQualifiers &Quals,
+                               SmallVectorImpl<Type *> &CtxTypes,
+                               SmallVectorImpl<TypeQualifiers> &CtxQuals) {
+  Ty = nullptr;
+  if (L.Left() < 1) {
+    return false;
+  }
+
+  // Assume the element type is null, and set it if we find a pointer.
+  if (PointerEltTy) {
+    *PointerEltTy = nullptr;
+  }
+
+  // Match vector types.
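+  // e.g. "Dv4_f" denotes a vector of 4 floats (OpenCL float4).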
+ if (L.Consume("Dv")) { + const TypeQualifier VectorQual = eTypeQualNone; + unsigned NumElements = 0; + Quals.push_back(VectorQual); + if (!L.ConsumeInteger(NumElements) || !L.Consume("_")) { + return false; + } + + // Parse the vector element type. + Type *ElementType = nullptr; + if (!demangleType(L, ElementType, nullptr, Quals, CtxTypes, CtxQuals)) { + return false; + } + Ty = FixedVectorType::get(ElementType, NumElements); + return true; + } + + // Match opaque pointer types + if (L.Consume("u3ptr")) { + const auto QualsAS = demanglePointerQuals(L); + if (!QualsAS) { + return false; + } + Quals.push_back(QualsAS->Qual); + return llvm::PointerType::get(*Context, QualsAS->AS); + } + + // Match scalable vector types. + if (L.Consume("u")) { + unsigned TypeNameLength = 0; + if (!L.ConsumeInteger(TypeNameLength) || !L.Consume("nxv")) { + return false; + } + if (TypeNameLength > L.Left()) { + return false; + } + const TypeQualifier VectorQual = eTypeQualNone; + unsigned NumElements = 0; + Quals.push_back(VectorQual); + if (!L.ConsumeInteger(NumElements)) { + return false; + } + + // Parse the vector element type. + Type *ElementType = nullptr; + if (!demangleType(L, ElementType, nullptr, Quals, CtxTypes, CtxQuals)) { + return false; + } + Ty = llvm::VectorType::get(ElementType, + ElementCount::getScalable(NumElements)); + return true; + } + + // Match pointer types. + if (L.Consume("P")) { + const auto QualsAS = demanglePointerQuals(L); + if (!QualsAS) { + return false; + } + + Quals.push_back(QualsAS->Qual); + + // Parse the element type. + Type *ElementType = nullptr; + if (!demangleType(L, ElementType, nullptr, Quals, CtxTypes, CtxQuals)) { + return false; + } + assert(ElementType); + if (PointerEltTy) { + *PointerEltTy = ElementType; + } + Ty = llvm::PointerType::get(*Context, QualsAS->AS); + return true; + } + + // Match simple types. + TypeQualifier SimpleQual = eTypeQualNone; + if (demangleSimpleType(L, Ty, SimpleQual)) { + Quals.push_back(SimpleQual); + return true; + } + + // Handle substitutions. + if (L.Consume("S")) { + unsigned SubID = 0; + if (L.ConsumeInteger(SubID)) { + SubID++; + } + if (!L.Consume("_")) { + return false; + } + + // Resolve it, using a previous type and qualifier. 
+ const int entryIndex = resolveSubstitution(SubID, CtxTypes, CtxQuals); + if ((entryIndex < 0) || ((unsigned)entryIndex >= CtxTypes.size())) { + return false; + } + Ty = CtxTypes[entryIndex]; + Quals.push_back(CtxQuals[entryIndex]); + return true; + } + + if (demangleOpenCLBuiltinType(L, Ty)) { + return true; + } + + return false; +} + +StringRef NameMangler::demangleName(Lexer &L) { + unsigned NameLength = 0; + if (!L.Consume("_Z")) { + return StringRef(); + } else if (!L.ConsumeInteger(NameLength)) { + return StringRef(); + } else if (NameLength > L.Left()) { + return StringRef(); + } + StringRef Name = L.TextLeft().substr(0, NameLength); + L.Consume(NameLength); + return Name; +} + +//////////////////////////////////////////////////////////////////////////////// + +TypeQualifiers::TypeQualifiers() : storage_(0) {} + +TypeQualifiers::TypeQualifiers(TypeQualifier Qual) : storage_(0) { + push_back(Qual); +} + +TypeQualifiers::TypeQualifiers(TypeQualifier Qual1, TypeQualifier Qual2) + : storage_(0) { + push_back(Qual1); + push_back(Qual2); +} + +TypeQualifiers::TypeQualifiers(unsigned Qual) : storage_(0) { push_back(Qual); } + +TypeQualifiers::TypeQualifiers(unsigned Qual1, unsigned Qual2) : storage_(0) { + push_back(Qual1); + push_back(Qual2); +} + +TypeQualifiers::StorageT TypeQualifiers::getCount() const { + const StorageT Mask = ((1 << NumCountBits) - 1); + return storage_ & Mask; +} + +void TypeQualifiers::setCount(StorageT NewCount) { + const StorageT Mask = ((1 << NumCountBits) - 1); + // Clear the old count. + storage_ &= ~Mask; + // Set the new count. + storage_ |= ((NewCount << 0) & Mask); +} + +TypeQualifier TypeQualifiers::front() const { + const StorageT Size = getCount(); + if (Size == 0) { + return eTypeQualNone; + } + const unsigned Mask = ((1 << NumQualBits) - 1); + const unsigned Field = (storage_ >> NumCountBits) & Mask; + return (TypeQualifier)Field; +} + +TypeQualifier TypeQualifiers::pop_front() { + const TypeQualifier Qual = front(); + const StorageT Size = getCount(); + if (Size > 0) { + // Pop the field bits. + storage_ >>= NumQualBits; + // Set the new count, since the old one was overwritten. + setCount(Size - 1); + } + return Qual; +} + +TypeQualifier TypeQualifiers::at(unsigned Idx) const { + const StorageT Size = getCount(); + if (Idx >= Size) { + return eTypeQualNone; + } + const unsigned ShAmt = NumCountBits + (Idx * NumQualBits); + const unsigned Field = (storage_ >> ShAmt) & ((1 << NumQualBits) - 1); + return TypeQualifier(Field); +} + +bool TypeQualifiers::push_back(TypeQualifier Qual) { + const StorageT Size = getCount(); + if (Size == MaxSize) { + return false; + } + const unsigned Offset = NumCountBits + (Size * NumQualBits); + const unsigned Field = Qual & ((1 << NumQualBits) - 1); + storage_ |= (static_cast(Field) << Offset); + setCount(Size + 1); + return true; +} + +bool TypeQualifiers::push_back(unsigned Qual) { + return push_back((TypeQualifier)Qual); +} + +bool TypeQualifiers::push_back(TypeQualifiers Quals) { + while (Quals.getCount() > 0) { + if (!push_back(Quals.pop_front())) { + return false; + } + } + return true; +} + +//////////////////////////////////////////////////////////////////////////////// + +Lexer::Lexer(StringRef text) : Text(text), Pos(0) {} + +unsigned Lexer::Left() const { return Text.size() - Pos; } + +unsigned Lexer::CurrentPos() const { return Pos; } + +StringRef Lexer::TextLeft() const { return Text.substr(Pos); } + +int Lexer::Current() const { return Left() ? 
Text[Pos] : -1; } + +bool Lexer::Consume() { return Consume(1); } + +bool Lexer::Consume(unsigned Size) { + if (Left() < Size) { + return false; + } + Pos += Size; + return true; +} + +bool Lexer::Consume(StringRef Pattern) { + if (Left() < Pattern.size()) { + return false; + } else if (!TextLeft().starts_with(Pattern)) { + return false; + } + Pos += Pattern.size(); + return true; +} + +bool Lexer::ConsumeInteger(unsigned &Result) { + size_t NumDigits = 0; + size_t i = Pos; + while ((i < Text.size()) && isdigit(Text[i])) { + i++; + NumDigits++; + } + const StringRef NumText = Text.substr(Pos, NumDigits); + if (NumText.size() == 0) { + return false; + } + if (NumText.getAsInteger(10, Result)) { + return false; + } + Pos += NumDigits; + return true; +} + +bool Lexer::ConsumeSignedInteger(int &Result) { + size_t NumChars = 0; + size_t i = Pos; + if (Text[i] == '-') { + i++; + NumChars++; + } + while ((i < Text.size()) && isdigit(Text[i])) { + i++; + NumChars++; + } + const StringRef NumText = Text.substr(Pos, NumChars); + if (NumText.size() == 0) { + return false; + } + if (NumText.getAsInteger(10, Result)) { + return false; + } + Pos += NumChars; + return true; +} + +bool Lexer::ConsumeAlpha(StringRef &Result) { + size_t NumChars = 0; + size_t i = Pos; + while ((i < Text.size()) && isalpha(Text[i])) { + i++; + NumChars++; + } + if (NumChars == 0) { + return false; + } + Result = Text.substr(Pos, NumChars); + Pos += NumChars; + return true; +} + +bool Lexer::ConsumeAlphanumeric(StringRef &Result) { + size_t NumChars = 0; + size_t i = Pos; + while ((i < Text.size()) && isalnum(Text[i])) { + i++; + NumChars++; + } + if (NumChars == 0) { + return false; + } + Result = Text.substr(Pos, NumChars); + Pos += NumChars; + return true; +} + +bool Lexer::ConsumeUntil(char C, StringRef &Result) { + const size_t CPos = Text.find_first_of(C, Pos); + if (CPos == std::string::npos) { + Result = StringRef(); + return false; + } + Result = Text.substr(Pos, CPos - Pos); + Pos = CPos; + return true; +} + +bool Lexer::ConsumeWhitespace() { + bool consumed = false; + while (Pos < Text.size() && isspace(Text[Pos])) { + consumed = true; + ++Pos; + } + + return consumed; +} +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp new file mode 100644 index 0000000000000..985008873c7a8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/metadata.cpp @@ -0,0 +1,395 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include + +using namespace llvm; + +namespace compiler { +namespace utils { + +uint32_t getOpenCLVersion(const llvm::Module &m) { + if (auto *const md = m.getNamedMetadata("opencl.ocl.version")) { + if (md->getNumOperands() == 1) { + auto *const op = md->getOperand(0); + if (op->getNumOperands() == 2) { + const auto major = + mdconst::extract(op->getOperand(0))->getZExtValue(); + const auto minor = + mdconst::extract(op->getOperand(1))->getZExtValue(); + return (major * 100 + minor) * 1000; + } + } + } + return OpenCLC12; +} + +static constexpr const char *ReqdWGSizeMD = "reqd_work_group_size"; + +static MDTuple *encodeVectorizationInfo(const VectorizationInfo &info, + LLVMContext &Ctx) { + auto *const i32Ty = Type::getInt32Ty(Ctx); + + return MDTuple::get( + Ctx, + {ConstantAsMetadata::get( + ConstantInt::get(i32Ty, info.vf.getKnownMinValue())), + ConstantAsMetadata::get(ConstantInt::get(i32Ty, info.vf.isScalable())), + ConstantAsMetadata::get(ConstantInt::get(i32Ty, info.simdDimIdx)), + ConstantAsMetadata::get( + ConstantInt::get(i32Ty, info.IsVectorPredicated))}); +} + +static std::optional extractVectorizationInfo(MDTuple *md) { + if (md->getNumOperands() != 4) { + return std::nullopt; + } + auto *const widthMD = mdconst::extract(md->getOperand(0)); + auto *const isScalableMD = mdconst::extract(md->getOperand(1)); + auto *const simdDimIdxMD = mdconst::extract(md->getOperand(2)); + auto *const isVPMD = mdconst::extract(md->getOperand(3)); + + VectorizationInfo info; + + info.vf = llvm::ElementCount::get(widthMD->getZExtValue(), + isScalableMD->equalsInt(1)); + info.simdDimIdx = simdDimIdxMD->getZExtValue(); + info.IsVectorPredicated = isVPMD->equalsInt(1); + + return info; +} + +static std::optional parseVectorLinkMD(MDNode *mdnode) { + if (auto info = + extractVectorizationInfo(dyn_cast(mdnode->getOperand(0)))) { + // The Function may well be null. 
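+    // (If the vectorized function has since been erased, LLVM's value
+    // tracking resets the ValueAsMetadata operand to null rather than
+    // leaving it dangling.)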
+ Function *vecFn = mdconst::extract_or_null(mdnode->getOperand(1)); + return LinkMetadataResult(vecFn, *info); + } + return std::nullopt; +} + +void encodeVectorizationFailedMetadata(Function &f, + const VectorizationInfo &info) { + auto *veczInfo = encodeVectorizationInfo(info, f.getContext()); + f.addMetadata("codeplay_ca_vecz.base.fail", *veczInfo); +} + +void linkOrigToVeczFnMetadata(Function &origF, Function &vectorF, + const VectorizationInfo &info) { + auto *veczInfo = encodeVectorizationInfo(info, origF.getContext()); + auto *const mdTuple = MDTuple::get( + origF.getContext(), {veczInfo, ValueAsMetadata::get(&vectorF)}); + origF.addMetadata("codeplay_ca_vecz.base", *mdTuple); +} + +void linkVeczToOrigFnMetadata(Function &vectorizedF, Function &origF, + const VectorizationInfo &info) { + auto *veczInfo = encodeVectorizationInfo(info, vectorizedF.getContext()); + auto *const mdTuple = MDTuple::get(origF.getContext(), + {veczInfo, ValueAsMetadata::get(&origF)}); + vectorizedF.addMetadata("codeplay_ca_vecz.derived", *mdTuple); +} + +static bool parseVectorizedFunctionLinkMetadata( + Function &f, StringRef mdName, + SmallVectorImpl &results) { + SmallVector nodes; + + f.getMetadata(mdName, nodes); + if (nodes.empty()) { + return false; + } + results.reserve(results.size() + nodes.size()); + for (auto *mdnode : nodes) { + if (auto link = parseVectorLinkMD(mdnode)) { + results.emplace_back(*link); + } else { + return false; + } + } + return true; +} + +bool parseOrigToVeczFnLinkMetadata(Function &f, + SmallVectorImpl &VFs) { + return parseVectorizedFunctionLinkMetadata(f, "codeplay_ca_vecz.base", VFs); +} + +std::optional parseVeczToOrigFnLinkMetadata(Function &f) { + auto *mdnode = f.getMetadata("codeplay_ca_vecz.derived"); + if (!mdnode) { + return std::nullopt; + } + return parseVectorLinkMD(mdnode); +} + +void dropVeczOrigMetadata(Function &f) { + f.setMetadata("codeplay_ca_vecz.base", nullptr); +} + +void dropVeczDerivedMetadata(Function &f) { + f.setMetadata("codeplay_ca_vecz.derived", nullptr); +} + +void encodeWrapperFnMetadata(Function &f, const VectorizationInfo &mainInfo, + std::optional tailInfo) { + MDTuple *tailInfoMD = nullptr; + auto *mainInfoMD = encodeVectorizationInfo(mainInfo, f.getContext()); + + if (tailInfo) { + tailInfoMD = encodeVectorizationInfo(*tailInfo, f.getContext()); + } + + f.setMetadata("codeplay_ca_wrapper", + MDTuple::get(f.getContext(), {mainInfoMD, tailInfoMD})); +} + +std::optional>> +parseWrapperFnMetadata(Function &f) { + auto *const mdnode = f.getMetadata("codeplay_ca_wrapper"); + if (!mdnode || mdnode->getNumOperands() != 2) { + return std::nullopt; + } + + auto *const mainTuple = dyn_cast_or_null(mdnode->getOperand(0)); + if (!mainTuple) { + return std::nullopt; + } + + VectorizationInfo mainInfo; + std::optional tailInfo; + + if (auto info = extractVectorizationInfo(mainTuple)) { + mainInfo = *info; + } else { + return std::nullopt; + } + + if (auto *const tailTuple = + dyn_cast_or_null(mdnode->getOperand(1))) { + if (auto info = extractVectorizationInfo(tailTuple)) { + tailInfo = info; + } + } + + return std::make_pair(mainInfo, tailInfo); +} + +void copyFunctionMetadata(Function &fromF, Function &toF, bool includeDebug) { + if (includeDebug) { + toF.copyMetadata(&fromF, 0); + return; + } + // Copy the metadata into the new kernel ignoring any debug info. + SmallVector, 5> metadata; + fromF.getAllMetadata(metadata); + + // Iterate through the metadata and only add nodes to the new one if they + // are not debug info. 
+ for (const auto &pair : metadata) { + if (auto *nonDebug = dyn_cast_or_null(pair.second)) { + toF.setMetadata(pair.first, nonDebug); + } + } +} + +void encodeLocalSizeMetadata(Function &f, const std::array &size) { + // We may be truncating i64 to i32 but we don't expect local sizes to ever + // exceed 32 bits. + auto *const i32Ty = Type::getInt32Ty(f.getContext()); + auto *const mdTuple = + MDTuple::get(f.getContext(), + {ConstantAsMetadata::get(ConstantInt::get(i32Ty, size[0])), + ConstantAsMetadata::get(ConstantInt::get(i32Ty, size[1])), + ConstantAsMetadata::get(ConstantInt::get(i32Ty, size[2]))}); + f.setMetadata(ReqdWGSizeMD, mdTuple); +} + +std::optional> getLocalSizeMetadata(const Function &f) { + if (auto *md = f.getMetadata(ReqdWGSizeMD)) { + return std::array{ + mdconst::extract(md->getOperand(0))->getZExtValue(), + mdconst::extract(md->getOperand(1))->getZExtValue(), + mdconst::extract(md->getOperand(2))->getZExtValue()}; + } + return std::nullopt; +} + +static constexpr const char *MuxScheduledFnMD = "mux_scheduled_fn"; + +void dropSchedulingParameterMetadata(Function &f) { + f.setMetadata(MuxScheduledFnMD, nullptr); +} + +SmallVector getSchedulingParameterFunctionMetadata(const Function &f) { + SmallVector idxs; + if (auto *md = f.getMetadata(MuxScheduledFnMD)) { + for (auto &op : md->operands()) { + idxs.push_back(mdconst::extract(op)->getSExtValue()); + } + } + return idxs; +} + +void setSchedulingParameterFunctionMetadata(Function &f, ArrayRef idxs) { + if (idxs.empty()) { + return; + } + SmallVector mdOps; + auto *const i32Ty = Type::getInt32Ty(f.getContext()); + for (auto idx : idxs) { + mdOps.push_back(ConstantAsMetadata::get(ConstantInt::get(i32Ty, idx))); + } + auto *const mdOpsTuple = MDTuple::get(f.getContext(), mdOps); + f.setMetadata(MuxScheduledFnMD, mdOpsTuple); +} + +static constexpr const char *MuxSchedulingParamsMD = "mux-scheduling-params"; + +void setSchedulingParameterModuleMetadata(Module &m, + ArrayRef names) { + SmallVector paramDebugNames; + for (const auto &name : names) { + paramDebugNames.push_back(MDString::get(m.getContext(), name)); + } + auto *const md = m.getOrInsertNamedMetadata(MuxSchedulingParamsMD); + md->clearOperands(); + md->addOperand(MDNode::get(m.getContext(), paramDebugNames)); +} + +NamedMDNode *getSchedulingParameterModuleMetadata(const Module &m) { + return m.getNamedMetadata(MuxSchedulingParamsMD); +} + +std::optional isSchedulingParameter(const Function &f, unsigned idx) { + if (auto *md = f.getMetadata(MuxScheduledFnMD)) { + for (const auto &op : enumerate(md->operands())) { + auto paramIdx = mdconst::extract(op.value())->getSExtValue(); + if (paramIdx >= 0 && (unsigned)paramIdx == idx) { + return op.index(); + } + } + } + return std::nullopt; +} + +// Uses the format of a metadata node directly applied to a function. +std::optional> +parseRequiredWGSMetadata(const Function &f) { + if (auto mdnode = f.getMetadata(ReqdWGSizeMD)) { + std::array wgs = {0, 1, 1}; + assert(mdnode->getNumOperands() >= 1 && mdnode->getNumOperands() <= 3 && + "Unsupported number of operands in reqd_work_group_size"); + for (const auto &[idx, op] : enumerate(mdnode->operands())) { + wgs[idx] = mdconst::extract(op)->getZExtValue(); + } + return wgs; + } + return std::nullopt; +} + +// Uses the format of a metadata node that's a part of the opencl.kernels node. 
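+// For example:
+//   !opencl.kernels = !{!0}
+//   !0 = !{ptr @foo, !1}
+//   !1 = !{!"reqd_work_group_size", i32 8, i32 8, i32 1}
+// Operand 0 is the kernel function itself; each subsequent operand is a
+// tuple whose first element names a kernel attribute.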
+std::optional> +parseRequiredWGSMetadata(const MDNode &node) { + for (uint32_t i = 1; i < node.getNumOperands(); ++i) { + MDNode *const subNode = cast(node.getOperand(i)); + MDString *const operandName = cast(subNode->getOperand(0)); + if (operandName->getString() == ReqdWGSizeMD) { + auto *const op0 = mdconst::extract(subNode->getOperand(1)); + auto *const op1 = mdconst::extract(subNode->getOperand(2)); + auto *const op2 = mdconst::extract(subNode->getOperand(3)); + // KLOCWORK "UNINIT.STACK.ARRAY.MUST" possible false positive + // Initialization of looks like an uninitialized access to Klocwork + std::array wgs = { + {op0->getZExtValue(), op1->getZExtValue(), op2->getZExtValue()}}; + return wgs; + } + } + return std::nullopt; +} + +std::optional parseMaxWorkDimMetadata(const Function &f) { + if (auto *mdnode = f.getMetadata("max_work_dim")) { + auto *op0 = mdconst::extract(mdnode->getOperand(0)); + return op0->getZExtValue(); + } + + return std::nullopt; +} + +void populateKernelList(Module &m, SmallVectorImpl &results) { + // Construct list of kernels from metadata, if present. + if (auto *md = m.getNamedMetadata("opencl.kernels")) { + for (uint32_t i = 0, e = md->getNumOperands(); i < e; ++i) { + MDNode *const kernelNode = md->getOperand(i); + ValueAsMetadata *vmdKernel = + cast(kernelNode->getOperand(0)); + KernelInfo info{vmdKernel->getValue()->getName()}; + if (auto wgs = parseRequiredWGSMetadata(*kernelNode)) { + info.ReqdWGSize = *wgs; + } + results.push_back(info); + } + return; + } + + // No metadata - assume all functions with the SPIR_KERNEL calling + // convention are kernels. + for (auto &f : m) { + if (f.hasName() && f.getCallingConv() == CallingConv::SPIR_KERNEL) { + KernelInfo info(f.getName()); + if (auto wgs = parseRequiredWGSMetadata(f)) { + info.ReqdWGSize = *wgs; + } + results.push_back(info); + } + } +} + +void replaceKernelInOpenCLKernelsMetadata(Function &fromF, Function &toF, + Module &M) { + // update the kernel metadata + if (auto *const namedMD = M.getNamedMetadata("opencl.kernels")) { + for (auto *md : namedMD->operands()) { + if (md && md->getOperand(0) == ValueAsMetadata::get(&fromF)) { + md->replaceOperandWith(0, ValueAsMetadata::get(&toF)); + } + } + } +} + +static constexpr const char *ReqdSGSizeMD = "intel_reqd_sub_group_size"; + +void encodeReqdSubgroupSizeMetadata(Function &f, uint32_t size) { + auto *const i32Ty = Type::getInt32Ty(f.getContext()); + auto *const mdTuple = MDTuple::get( + f.getContext(), ConstantAsMetadata::get(ConstantInt::get(i32Ty, size))); + f.setMetadata(ReqdSGSizeMD, mdTuple); +} + +std::optional getReqdSubgroupSize(const Function &f) { + if (auto *md = f.getMetadata(ReqdSGSizeMD)) { + return mdconst::extract(md->getOperand(0))->getZExtValue(); + } + return std::nullopt; +} + +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp new file mode 100644 index 0000000000000..51268147e1345 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/mux_builtin_info.cpp @@ -0,0 +1,1319 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +using namespace llvm; + +namespace compiler { +namespace utils { + +namespace SchedParamIndices { +enum { + WI = 0, + WG = 1, + TOTAL = 2, +}; +} + +static Function *defineLocalWorkItemBuiltin(BIMuxInfoConcept &BI, BuiltinID ID, + Module &M) { + // Simple 'local' work-item getters and setters. + bool IsSetter = false; + bool HasRankArg = false; + std::optional WIFieldIdx; + switch (ID) { + default: + return nullptr; + case eMuxBuiltinSetLocalId: + IsSetter = true; + LLVM_FALLTHROUGH; + case eMuxBuiltinGetLocalId: + HasRankArg = true; + WIFieldIdx = WorkItemInfoStructField::local_id; + break; + case eMuxBuiltinSetSubGroupId: + IsSetter = true; + LLVM_FALLTHROUGH; + case eMuxBuiltinGetSubGroupId: + WIFieldIdx = WorkItemInfoStructField::sub_group_id; + break; + case eMuxBuiltinSetNumSubGroups: + IsSetter = true; + LLVM_FALLTHROUGH; + case eMuxBuiltinGetNumSubGroups: + WIFieldIdx = WorkItemInfoStructField::num_sub_groups; + break; + case eMuxBuiltinSetMaxSubGroupSize: + IsSetter = true; + LLVM_FALLTHROUGH; + case eMuxBuiltinGetMaxSubGroupSize: + WIFieldIdx = WorkItemInfoStructField::max_sub_group_size; + break; + } + + Function *F = M.getFunction(BuiltinInfo::getMuxBuiltinName(ID)); + assert(F && WIFieldIdx); + + // Gather up the list of scheduling parameters on this builtin + const auto &SchedParams = BI.getFunctionSchedulingParameters(*F); + assert(SchedParamIndices::WI < SchedParams.size()); + + // Grab the work-item info argument + const auto &SchedParam = SchedParams[SchedParamIndices::WI]; + auto *const StructTy = dyn_cast(SchedParam.ParamPointeeTy); + assert(SchedParam.ArgVal && StructTy == getWorkItemInfoStructTy(M) && + "Inconsistent scheduling parameter data"); + + if (IsSetter) { + populateStructSetterFunction(*F, *SchedParam.ArgVal, StructTy, *WIFieldIdx, + HasRankArg); + } else { + populateStructGetterFunction(*F, *SchedParam.ArgVal, StructTy, *WIFieldIdx, + HasRankArg); + } + + return F; +} + +static Function *defineLocalWorkGroupBuiltin(BIMuxInfoConcept &BI, BuiltinID ID, + Module &M) { + // Simple work-group getters + bool HasRankArg = true; + size_t DefaultVal = 0; + std::optional WGFieldIdx; + switch (ID) { + default: + return nullptr; + case eMuxBuiltinGetLocalSize: + DefaultVal = 1; + WGFieldIdx = WorkGroupInfoStructField::local_size; + break; + case eMuxBuiltinGetGroupId: + DefaultVal = 0; + WGFieldIdx = WorkGroupInfoStructField::group_id; + break; + case eMuxBuiltinGetNumGroups: + DefaultVal = 1; + WGFieldIdx = WorkGroupInfoStructField::num_groups; + break; + case eMuxBuiltinGetGlobalOffset: + DefaultVal = 0; + WGFieldIdx = WorkGroupInfoStructField::global_offset; + break; + case eMuxBuiltinGetWorkDim: + DefaultVal = 1; + HasRankArg = false; + WGFieldIdx = WorkGroupInfoStructField::work_dim; + break; + } + + Function *F = M.getFunction(BuiltinInfo::getMuxBuiltinName(ID)); + assert(F && WGFieldIdx); + + // Gather 
up the list of scheduling parameters on this builtin + const auto &SchedParams = BI.getFunctionSchedulingParameters(*F); + assert(SchedParamIndices::WG < SchedParams.size()); + + // Grab the work-group info argument + const auto &SchedParam = SchedParams[SchedParamIndices::WG]; + auto *const StructTy = dyn_cast(SchedParam.ParamPointeeTy); + assert(SchedParam.ArgVal && StructTy == getWorkGroupInfoStructTy(M) && + "Inconsistent scheduling parameter data"); + + populateStructGetterFunction(*F, *SchedParam.ArgVal, StructTy, *WGFieldIdx, + HasRankArg, DefaultVal); + return F; +} + +// FIXME: Assumes a sub-group size of 1. +static Function *defineSubGroupGroupOpBuiltin(Function &F, + GroupCollective GroupOp, + ArrayRef OverloadInfo) { + if (!GroupOp.isSubGroupScope()) { + return nullptr; + } + + auto *Arg = F.getArg(0); + + IRBuilder<> B(BasicBlock::Create(F.getContext(), "entry", &F)); + + [&] { + switch (GroupOp.Op) { + case GroupCollective::OpKind::Any: + case GroupCollective::OpKind::All: + case GroupCollective::OpKind::Broadcast: + case GroupCollective::OpKind::Reduction: + case GroupCollective::OpKind::ScanInclusive: + // In the trivial size=1 case, all of these operations just return the + // argument back again + B.CreateRet(Arg); + return; + case GroupCollective::OpKind::ScanExclusive: { + // In the trivial size=1 case, exclusive scans return the identity. + assert(!OverloadInfo.empty()); + auto *const IdentityVal = + getIdentityVal(GroupOp.Recurrence, OverloadInfo[0]); + assert(IdentityVal && "Unable to deduce identity val"); + B.CreateRet(IdentityVal); + return; + } + case GroupCollective::OpKind::Shuffle: + case GroupCollective::OpKind::ShuffleXor: + // In the trivial size=1 case, all of these operations just return the + // argument back again. Any computed shuffle index other than the only + // one in the sub-group would be out of bounds anyway. + B.CreateRet(Arg); + return; + case GroupCollective::OpKind::ShuffleUp: { + auto *const Prev = F.getArg(0); + auto *const Curr = F.getArg(1); + auto *const Delta = F.getArg(2); + // In the trivial size=1 case, negative delta is the desired index + // (since we're subtracting it from zero). If it's greater than zero and + // less than the size, we return 'current', else if it's less than zero + // and greater than or equal to the negative size, we return 'prev'. So + // if 'delta' is zero, return 'current', else return 'prev'. Anything + // else is out of bounds so we can simplify things here. + auto *const EqZero = B.CreateICmpEQ(Delta, B.getInt32(0), "eqzero"); + auto *const Sel = B.CreateSelect(EqZero, Curr, Prev, "sel"); + B.CreateRet(Sel); + return; + } + case GroupCollective::OpKind::ShuffleDown: { + auto *const Curr = F.getArg(0); + auto *const Next = F.getArg(1); + auto *const Delta = F.getArg(2); + // In the trivial size=1 case, the delta is the desired index (since + // we're adding it to zero). If it's less than the size, we return + // 'current', else if it's greater or equal to the size but less than + // twice the size, we return 'next'. So if 'delta' is zero, return + // 'current', else return 'next'. Anything else is out of bounds so we + // can simplify things here. 
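+      // Concretely, with a single-invocation sub-group:
+      //   sub_group_shuffle_down(curr, next, 0) -> curr
+      //   sub_group_shuffle_down(curr, next, 1) -> next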
+ auto *const EqZero = B.CreateICmpEQ(Delta, B.getInt32(0), "eqzero"); + auto *const Sel = B.CreateSelect(EqZero, Curr, Next, "sel"); + B.CreateRet(Sel); + return; + } + } + + llvm_unreachable("Unhandled group operation"); + }(); + + return &F; +} + +static Value *createCallHelper(IRBuilder<> &B, Function &F, + ArrayRef Args) { + auto *const CI = B.CreateCall(&F, Args); + CI->setAttributes(F.getAttributes()); + CI->setCallingConv(F.getCallingConv()); + return CI; +} + +void BIMuxInfoConcept::setDefaultBuiltinAttributes(Function &F, + bool AlwaysInline) { + // Many of our mux builtin functions are marked alwaysinline (unless they're + // already marked noinline) + if (AlwaysInline && !F.hasFnAttribute(Attribute::NoInline)) { + F.addFnAttr(Attribute::AlwaysInline); + } + // We never use exceptions + F.addFnAttr(Attribute::NoUnwind); + // Recursion is not supported in ComputeMux + F.addFnAttr(Attribute::NoRecurse); +} + +Function *BIMuxInfoConcept::defineGetGlobalId(Module &M) { + Function *F = + M.getFunction(BuiltinInfo::getMuxBuiltinName(eMuxBuiltinGetGlobalId)); + assert(F); + setDefaultBuiltinAttributes(*F); + F->setLinkage(GlobalValue::InternalLinkage); + + // Create an IR builder with a single basic block in our function + IRBuilder<> B(BasicBlock::Create(M.getContext(), "entry", F)); + + auto *const MuxGetGroupIdFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetGroupId, M); + auto *const MuxGetGlobalOffsetFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetGlobalOffset, M); + auto *const MuxGetLocalIdFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalId, M); + auto *const MuxGetLocalSizeFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalSize, M); + assert(MuxGetGroupIdFn && MuxGetGlobalOffsetFn && MuxGetLocalIdFn && + MuxGetLocalSizeFn); + + // Pass on all arguments through to dependent builtins. We expect that each + // function has identical prototypes, regardless of whether scheduling + // parameters have been added + const SmallVector Args(make_pointer_range(F->args())); + + auto *const GetGroupIdCall = createCallHelper(B, *MuxGetGroupIdFn, Args); + auto *const GetGlobalOffsetCall = + createCallHelper(B, *MuxGetGlobalOffsetFn, Args); + auto *const GetLocalIdCall = createCallHelper(B, *MuxGetLocalIdFn, Args); + auto *const GetLocalSizeCall = createCallHelper(B, *MuxGetLocalSizeFn, Args); + + // (get_group_id(i) * get_local_size(i)) + auto *Ret = B.CreateMul(GetGroupIdCall, GetLocalSizeCall); + // (get_group_id(i) * get_local_size(i)) + get_local_id(i) + Ret = B.CreateAdd(Ret, GetLocalIdCall); + // get_global_id(i) = (get_group_id(i) * get_local_size(i)) + + // get_local_id(i) + get_global_offset(i) + Ret = B.CreateAdd(Ret, GetGlobalOffsetCall); + + // ... and return our result + B.CreateRet(Ret); + return F; +} + +// FIXME: Assumes a sub-group size of 1. +Function *BIMuxInfoConcept::defineGetSubGroupSize(Function &F) { + setDefaultBuiltinAttributes(F); + F.setLinkage(GlobalValue::InternalLinkage); + + IRBuilder<> B(BasicBlock::Create(F.getContext(), "entry", &F)); + + assert(F.getReturnType() == B.getInt32Ty()); + B.CreateRet(B.getInt32(1)); + + return &F; +} + +// FIXME: Assumes a sub-group size of 1. 
+Function *BIMuxInfoConcept::defineGetSubGroupLocalId(Function &F) { + setDefaultBuiltinAttributes(F); + F.setLinkage(GlobalValue::InternalLinkage); + + IRBuilder<> B(BasicBlock::Create(F.getContext(), "entry", &F)); + + assert(F.getReturnType() == B.getInt32Ty()); + B.CreateRet(B.getInt32(0)); + + return &F; +} + +Function *BIMuxInfoConcept::defineGetGlobalSize(Module &M) { + Function *F = + M.getFunction(BuiltinInfo::getMuxBuiltinName(eMuxBuiltinGetGlobalSize)); + assert(F); + setDefaultBuiltinAttributes(*F); + F->setLinkage(GlobalValue::InternalLinkage); + + auto *const MuxGetNumGroupsFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetNumGroups, M); + auto *const MuxGetLocalSizeFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalSize, M); + assert(MuxGetNumGroupsFn && MuxGetLocalSizeFn); + + // create an IR builder with a single basic block in our function + IRBuilder<> B(BasicBlock::Create(M.getContext(), "", F)); + + // Pass on all arguments through to dependent builtins. We expect that each + // function has identical prototypes, regardless of whether scheduling + // parameters have been added + const SmallVector Args(make_pointer_range(F->args())); + + // call get_num_groups + auto *const GetNumGroupsCall = createCallHelper(B, *MuxGetNumGroupsFn, Args); + + // call get_local_size + auto *const GetLocalSizeCall = createCallHelper(B, *MuxGetLocalSizeFn, Args); + + // get_global_size(i) = get_num_groups(i) * get_local_size(i) + auto *const Ret = B.CreateMul(GetNumGroupsCall, GetLocalSizeCall); + + // and return our result + B.CreateRet(Ret); + return F; +} + +Function *BIMuxInfoConcept::defineGetLocalLinearId(Module &M) { + Function *F = M.getFunction( + BuiltinInfo::getMuxBuiltinName(eMuxBuiltinGetLocalLinearId)); + assert(F); + setDefaultBuiltinAttributes(*F); + F->setLinkage(GlobalValue::InternalLinkage); + + auto *const MuxGetLocalIdFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalId, M); + auto *const MuxGetLocalSizeFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalSize, M); + assert(MuxGetLocalIdFn && MuxGetLocalSizeFn); + + // Create a call to all the required builtins. + IRBuilder<> B(BasicBlock::Create(M.getContext(), "", F)); + + // Pass on all arguments through to dependent builtins. Ignoring the index + // parameters we'll add, we expect that each function has identical + // prototypes, regardless of whether scheduling parameters have been added + SmallVector Args(make_pointer_range(F->args())); + + SmallVector Idx0Args = {B.getInt32(0)}; + append_range(Idx0Args, Args); + SmallVector Idx1Args = {B.getInt32(1)}; + append_range(Idx1Args, Args); + SmallVector Idx2Args = {B.getInt32(2)}; + append_range(Idx2Args, Args); + + auto *const GetLocalIDXCall = createCallHelper(B, *MuxGetLocalIdFn, Idx0Args); + auto *const GetLocalIDYCall = createCallHelper(B, *MuxGetLocalIdFn, Idx1Args); + auto *const GetLocalIDZCall = createCallHelper(B, *MuxGetLocalIdFn, Idx2Args); + + auto *const GetLocalSizeXCall = + createCallHelper(B, *MuxGetLocalSizeFn, Idx0Args); + auto *const GetLocalSizeYCall = + createCallHelper(B, *MuxGetLocalSizeFn, Idx1Args); + + // get_local_id(2) * get_local_size(1). + auto *ZTerm = B.CreateMul(GetLocalIDZCall, GetLocalSizeYCall); + // get_local_id(2) * get_local_size(1) * get_local_size(0). + ZTerm = B.CreateMul(ZTerm, GetLocalSizeXCall); + + // get_local_id(1) * get_local_size(0). + auto *const YTerm = B.CreateMul(GetLocalIDYCall, GetLocalSizeXCall); + + // get_local_id(2) * get_local_size(1) * get_local_size(0) + + // get_local_id(1) * get_local_size(0). 
+ auto *Ret = B.CreateAdd(ZTerm, YTerm); + // get_local_id(2) * get_local_size(1) * get_local_size(0) + + // get_local_id(1) * get_local_size(0) + get_local_id(0). + Ret = B.CreateAdd(Ret, GetLocalIDXCall); + + B.CreateRet(Ret); + return F; +} + +Function *BIMuxInfoConcept::defineGetGlobalLinearId(Module &M) { + Function *F = M.getFunction( + BuiltinInfo::getMuxBuiltinName(eMuxBuiltinGetGlobalLinearId)); + assert(F); + setDefaultBuiltinAttributes(*F); + F->setLinkage(GlobalValue::InternalLinkage); + + auto *const MuxGetGlobalIdFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetGlobalId, M); + auto *const MuxGetGlobalOffsetFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetGlobalOffset, M); + auto *const MuxGetGlobalSizeFn = + getOrDeclareMuxBuiltin(eMuxBuiltinGetGlobalSize, M); + assert(MuxGetGlobalIdFn && MuxGetGlobalOffsetFn && MuxGetGlobalSizeFn); + + // Create a call to all the required builtins. + IRBuilder<> B(BasicBlock::Create(M.getContext(), "", F)); + + // Pass on all arguments through to dependent builtins. Ignoring the index + // parameters we'll add, we expect that each function has identical + // prototypes, regardless of whether scheduling parameters have been added + SmallVector Args(make_pointer_range(F->args())); + + SmallVector Idx0Args = {B.getInt32(0)}; + append_range(Idx0Args, Args); + SmallVector Idx1Args = {B.getInt32(1)}; + append_range(Idx1Args, Args); + SmallVector Idx2Args = {B.getInt32(2)}; + append_range(Idx2Args, Args); + + auto *const GetGlobalIDXCall = + createCallHelper(B, *MuxGetGlobalIdFn, Idx0Args); + auto *const GetGlobalIDYCall = + createCallHelper(B, *MuxGetGlobalIdFn, Idx1Args); + auto *const GetGlobalIDZCall = + createCallHelper(B, *MuxGetGlobalIdFn, Idx2Args); + + auto *const GetGlobalOffsetXCall = + createCallHelper(B, *MuxGetGlobalOffsetFn, Idx0Args); + auto *const GetGlobalOffsetYCall = + createCallHelper(B, *MuxGetGlobalOffsetFn, Idx1Args); + auto *const GetGlobalOffsetZCall = + createCallHelper(B, *MuxGetGlobalOffsetFn, Idx2Args); + + auto *const GetGlobalSizeXCall = + createCallHelper(B, *MuxGetGlobalSizeFn, Idx0Args); + auto *const GetGlobalSizeYCall = + createCallHelper(B, *MuxGetGlobalSizeFn, Idx1Args); + + // global linear id is calculated as follows: + // get_global_linear_id() = + // (get_global_id(2) - get_global_offset(2)) * get_global_size(1) * + // get_global_size(0) + (get_global_id(1) - get_global_offset(1)) * + // get_global_size(0) + get_global_id(0) - get_global_offset(0). + // = + // ((get_global_id(2) - get_global_offset(2)) * get_global_size(1) + + // get_global_id(1) - get_global_offset(1)) * get_global_size(0) + + // get_global_id(0) - get_global_offset(0). + + auto *ZTerm = B.CreateSub(GetGlobalIDZCall, GetGlobalOffsetZCall); + // (get_global_id(2) - get_global_offset(2)) * get_global_size(1). + ZTerm = B.CreateMul(ZTerm, GetGlobalSizeYCall); + + // get_global_id(1) - get_global_offset(1). + auto *const YTerm = B.CreateSub(GetGlobalIDYCall, GetGlobalOffsetYCall); + + // (get_global_id(2) - get_global_offset(2)) * get_global_size(1) + + // get_global_id(1) - get_global_offset(1) + auto *YZTermsCombined = B.CreateAdd(ZTerm, YTerm); + + // ((get_global_id(2) - get_global_offset(2)) * get_global_size(1) + + // get_global_id(1) - get_global_offset(1)) * get_global_size(0). + YZTermsCombined = B.CreateMul(YZTermsCombined, GetGlobalSizeXCall); + + // get_global_id(0) - get_global_offset(0). 
+  auto *const XTerm = B.CreateSub(GetGlobalIDXCall, GetGlobalOffsetXCall);
+
+  // ((get_global_id(2) - get_global_offset(2)) * get_global_size(1) +
+  // get_global_id(1) - get_global_offset(1)) * get_global_size(0) +
+  // get_global_id(0) - get_global_offset(0).
+  auto *const Ret = B.CreateAdd(XTerm, YZTermsCombined);
+
+  B.CreateRet(Ret);
+  return F;
+}
+
+Function *BIMuxInfoConcept::defineGetEnqueuedLocalSize(Module &M) {
+  Function *F = M.getFunction(
+      BuiltinInfo::getMuxBuiltinName(eMuxBuiltinGetEnqueuedLocalSize));
+  assert(F);
+  setDefaultBuiltinAttributes(*F);
+  F->setLinkage(GlobalValue::InternalLinkage);
+
+  auto *const MuxGetLocalSizeFn =
+      getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalSize, M);
+  assert(MuxGetLocalSizeFn);
+
+  IRBuilder<> B(BasicBlock::Create(M.getContext(), "", F));
+
+  // Pass on all arguments through to dependent builtins. We expect that each
+  // function has identical prototypes, regardless of whether scheduling
+  // parameters have been added
+  const SmallVector Args(make_pointer_range(F->args()));
+
+  // Since we don't support non-uniform subgroups,
+  // get_enqueued_local_size(x) == get_local_size(x).
+  auto *const GetLocalSize = createCallHelper(B, *MuxGetLocalSizeFn, Args);
+
+  B.CreateRet(GetLocalSize);
+  return F;
+}
+
+Function *BIMuxInfoConcept::defineMemBarrier(Function &F, unsigned,
+                                             unsigned SemanticsIdx) {
+  // FIXME: We're ignoring some operands here. We're dropping the 'scope' but
+  // our default set of targets can't make use of anything but a
+  // single-threaded fence. We're also ignoring the kind of memory being
+  // controlled by the barrier.
+  auto &M = *F.getParent();
+  setDefaultBuiltinAttributes(F);
+  F.setLinkage(GlobalValue::InternalLinkage);
+  IRBuilder<> B(BasicBlock::Create(M.getContext(), "", &F));
+
+  // Grab the semantics argument.
+  Value *Semantics = F.getArg(SemanticsIdx);
+  // Mask out only the memory ordering value.
+  Semantics = B.CreateAnd(Semantics, B.getInt32(MemSemanticsMask));
+
+  // Don't insert this exit block just yet
+  auto *const ExitBB = BasicBlock::Create(M.getContext(), "exit");
+
+  auto *const DefaultBB =
+      BasicBlock::Create(M.getContext(), "case.default", &F);
+  auto *const Switch = B.CreateSwitch(Semantics, DefaultBB);
+
+  const struct {
+    StringRef Name;
+    unsigned SwitchVal;
+    AtomicOrdering Ordering;
+  } Data[4] = {
+      {"case.acquire", MemSemanticsAcquire, AtomicOrdering::Acquire},
+      {"case.release", MemSemanticsRelease, AtomicOrdering::Release},
+      {"case.acq_rel", MemSemanticsAcquireRelease,
+       AtomicOrdering::AcquireRelease},
+      {"case.seq_cst", MemSemanticsSequentiallyConsistent,
+       AtomicOrdering::SequentiallyConsistent},
+  };
+
+  for (const auto &D : Data) {
+    auto *const BB = BasicBlock::Create(M.getContext(), D.Name, &F);
+
+    Switch->addCase(B.getInt32(D.SwitchVal), BB);
+    B.SetInsertPoint(BB);
+    B.CreateFence(D.Ordering, SyncScope::SingleThread);
+    B.CreateBr(ExitBB);
+  }
+
+  // The default case assumes a 'relaxed' ordering and emits no fence
+  // whatsoever.
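+  // (Illustrative example: a semantics operand carrying Acquire ordering
+  // alongside any memory-kind bits is masked down to MemSemanticsAcquire
+  // above, reaches case.acquire, and emits
+  //   fence syncscope("singlethread") acquire
+  // while a relaxed ordering reaches this default case.)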
+ B.SetInsertPoint(DefaultBB); + B.CreateBr(ExitBB); + + ExitBB->insertInto(&F); + B.SetInsertPoint(ExitBB); + B.CreateRetVoid(); + + return &F; +} + +static BasicBlock *copy1D(Module &M, BasicBlock &ParentBB, Value *DstPtr, + Value *SrcPtr, Value *NumBytes) { + Type *const I8Ty = IntegerType::get(M.getContext(), 8); + + assert(SrcPtr->getType()->isPointerTy() && + "Mux DMA builtins are always byte-accessed"); + assert(DstPtr->getType()->isPointerTy() && + "Mux DMA builtins are always byte-accessed"); + + compiler::utils::CreateLoopOpts opts; + opts.IVs = {SrcPtr, DstPtr}; + opts.loopIVNames = {"dma.src", "dma.dst"}; + + // This is a simple loop copy a byte at a time from SrcPtr to DstPtr. + BasicBlock *ExitBB = compiler::utils::createLoop( + &ParentBB, nullptr, ConstantInt::get(getSizeType(M), 0), NumBytes, opts, + [&](BasicBlock *BB, Value *X, ArrayRef IVsCurr, + MutableArrayRef IVsNext) { + IRBuilder<> B(BB); + Value *const CurrentDmaSrcPtr1DPhi = IVsCurr[0]; + Value *const CurrentDmaDstPtr1DPhi = IVsCurr[1]; + Value *load = B.CreateLoad(I8Ty, CurrentDmaSrcPtr1DPhi); + B.CreateStore(load, CurrentDmaDstPtr1DPhi); + IVsNext[0] = B.CreateGEP(I8Ty, CurrentDmaSrcPtr1DPhi, + ConstantInt::get(X->getType(), 1)); + IVsNext[1] = B.CreateGEP(I8Ty, CurrentDmaDstPtr1DPhi, + ConstantInt::get(X->getType(), 1)); + return BB; + }); + + return ExitBB; +} + +static BasicBlock *copy2D(Module &M, BasicBlock &ParentBB, Value *DstPtr, + Value *SrcPtr, Value *LineSizeBytes, + Value *LineStrideDst, Value *LineStrideSrc, + Value *NumLines) { + Type *const I8Ty = IntegerType::get(M.getContext(), 8); + + assert(SrcPtr->getType()->isPointerTy() && + "Mux DMA builtins are always byte-accessed"); + assert(DstPtr->getType()->isPointerTy() && + "Mux DMA builtins are always byte-accessed"); + + compiler::utils::CreateLoopOpts opts; + opts.IVs = {SrcPtr, DstPtr}; + opts.loopIVNames = {"dma.src", "dma.dst"}; + + // This is a loop over the range of lines, calling a 1D copy on each line + BasicBlock *ExitBB = compiler::utils::createLoop( + &ParentBB, nullptr, ConstantInt::get(getSizeType(M), 0), NumLines, opts, + [&](BasicBlock *block, Value *, ArrayRef IVsCurr, + MutableArrayRef IVsNext) { + IRBuilder<> loopIr(block); + Value *CurrentDmaSrcPtrPhi = IVsCurr[0]; + Value *CurrentDmaDstPtrPhi = IVsCurr[1]; + + IVsNext[0] = loopIr.CreateGEP(I8Ty, CurrentDmaSrcPtrPhi, LineStrideSrc); + IVsNext[1] = loopIr.CreateGEP(I8Ty, CurrentDmaDstPtrPhi, LineStrideDst); + return copy1D(M, *block, CurrentDmaDstPtrPhi, CurrentDmaSrcPtrPhi, + LineSizeBytes); + }); + + return ExitBB; +} + +Function *BIMuxInfoConcept::defineDMA1D(Function &F) { + Argument *const ArgDstPtr = F.getArg(0); + Argument *const ArgSrcPtr = F.getArg(1); + Argument *const ArgWidth = F.getArg(2); + Argument *const ArgEvent = F.getArg(3); + + auto &M = *F.getParent(); + auto &Ctx = F.getContext(); + auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F); + auto *const LoopEntryBB = BasicBlock::Create(Ctx, "loop_entry", &F, ExitBB); + auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F, LoopEntryBB); + + auto *const GetLocalIDFn = getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalId, M); + compiler::utils::buildThreadCheck(EntryBB, LoopEntryBB, ExitBB, + *GetLocalIDFn); + + BasicBlock *const LoopExitBB = + copy1D(M, *LoopEntryBB, ArgDstPtr, ArgSrcPtr, ArgWidth); + IRBuilder<> LoopIRB(LoopExitBB); + LoopIRB.CreateBr(ExitBB); + + IRBuilder<> ExitIRB(ExitBB); + ExitIRB.CreateRet(ArgEvent); + + return &F; +} + +Function *BIMuxInfoConcept::defineDMA2D(Function &F) { + Argument 
*const ArgDstPtr = F.getArg(0);
+  Argument *const ArgSrcPtr = F.getArg(1);
+  Argument *const ArgWidth = F.getArg(2);
+  Argument *const ArgDstStride = F.getArg(3);
+  Argument *const ArgSrcStride = F.getArg(4);
+  Argument *const ArgNumLines = F.getArg(5);
+  Argument *const ArgEvent = F.getArg(6);
+
+  auto &M = *F.getParent();
+  auto &Ctx = F.getContext();
+  auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F);
+  auto *const LoopEntryBB = BasicBlock::Create(Ctx, "loop_entry", &F, ExitBB);
+  auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F, LoopEntryBB);
+
+  auto *const GetLocalIDFn = getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalId, M);
+  compiler::utils::buildThreadCheck(EntryBB, LoopEntryBB, ExitBB,
+                                    *GetLocalIDFn);
+
+  // Create a loop around 1D DMA memcpy, adding strides each time.
+  BasicBlock *const LoopExitBB =
+      copy2D(M, *LoopEntryBB, ArgDstPtr, ArgSrcPtr, ArgWidth, ArgDstStride,
+             ArgSrcStride, ArgNumLines);
+
+  IRBuilder<> LoopIRB(LoopExitBB);
+  LoopIRB.CreateBr(ExitBB);
+
+  IRBuilder<> ExitIRB(ExitBB);
+  ExitIRB.CreateRet(ArgEvent);
+
+  return &F;
+}
+
+Function *BIMuxInfoConcept::defineDMA3D(Function &F) {
+  Argument *const ArgDstPtr = F.getArg(0);
+  Argument *const ArgSrcPtr = F.getArg(1);
+  Argument *const ArgLineSize = F.getArg(2);
+  Argument *const ArgDstLineStride = F.getArg(3);
+  Argument *const ArgSrcLineStride = F.getArg(4);
+  Argument *const ArgNumLinesPerPlane = F.getArg(5);
+  Argument *const ArgDstPlaneStride = F.getArg(6);
+  Argument *const ArgSrcPlaneStride = F.getArg(7);
+  Argument *const ArgNumPlanes = F.getArg(8);
+  Argument *const ArgEvent = F.getArg(9);
+
+  auto &M = *F.getParent();
+  auto &Ctx = F.getContext();
+  Type *const I8Ty = IntegerType::get(Ctx, 8);
+
+  auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F);
+  auto *const LoopEntryBB = BasicBlock::Create(Ctx, "loop_entry", &F, ExitBB);
+  auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F, LoopEntryBB);
+
+  auto *const GetLocalIDFn = getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalId, M);
+  compiler::utils::buildThreadCheck(EntryBB, LoopEntryBB, ExitBB,
+                                    *GetLocalIDFn);
+
+  assert(ArgSrcPtr->getType()->isPointerTy() &&
+         "Mux DMA builtins are always byte-accessed");
+  assert(ArgDstPtr->getType()->isPointerTy() &&
+         "Mux DMA builtins are always byte-accessed");
+
+  compiler::utils::CreateLoopOpts opts;
+  opts.IVs = {ArgSrcPtr, ArgDstPtr};
+  opts.loopIVNames = {"dma.src", "dma.dst"};
+
+  // Create a loop around the 2D DMA memcpy, adding the plane strides each
+  // time.
+  BasicBlock *LoopExitBB = compiler::utils::createLoop(
+      LoopEntryBB, nullptr, ConstantInt::get(getSizeType(M), 0), ArgNumPlanes,
+      opts,
+      [&](BasicBlock *BB, Value *, ArrayRef IVsCurr,
+          MutableArrayRef IVsNext) {
+        IRBuilder<> loopIr(BB);
+        Value *CurrentDmaPlaneSrcPtrPhi = IVsCurr[0];
+        Value *CurrentDmaPlaneDstPtrPhi = IVsCurr[1];
+
+        IVsNext[0] =
+            loopIr.CreateGEP(I8Ty, CurrentDmaPlaneSrcPtrPhi, ArgSrcPlaneStride);
+        IVsNext[1] =
+            loopIr.CreateGEP(I8Ty, CurrentDmaPlaneDstPtrPhi, ArgDstPlaneStride);
+
+        return copy2D(M, *BB, CurrentDmaPlaneDstPtrPhi,
+                      CurrentDmaPlaneSrcPtrPhi, ArgLineSize, ArgDstLineStride,
+                      ArgSrcLineStride, ArgNumLinesPerPlane);
+      });
+
+  IRBuilder<> LoopExitIRB(LoopExitBB);
+  LoopExitIRB.CreateBr(ExitBB);
+
+  IRBuilder<> ExitIRB(ExitBB);
+  ExitIRB.CreateRet(ArgEvent);
+
+  return &F;
+}
+
+Function *BIMuxInfoConcept::defineDMAWait(Function &F) {
+  // By default this function is a simple return-void.
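+  // The copy loops built for the 1D/2D/3D builtins above run to completion
+  // before returning, so on this reference path there is nothing left to
+  // wait on and the event values are simply passed through. A target with
+  // genuinely asynchronous DMA would be expected to override these hooks.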
+ IRBuilder<> B(BasicBlock::Create(F.getContext(), "entry", &F)); + B.CreateRetVoid(); + + return &F; +} + +Function *BIMuxInfoConcept::defineMuxBuiltin(BuiltinID ID, Module &M, + ArrayRef OverloadInfo) { + assert(BuiltinInfo::isMuxBuiltinID(ID) && "Only handling mux builtins"); + Function *F = M.getFunction(BuiltinInfo::getMuxBuiltinName(ID, OverloadInfo)); + // FIXME: We'd ideally want to declare it here to reduce pass + // inter-dependencies. + assert(F && "Function should have been pre-declared"); + if (!F->isDeclaration()) { + return F; + } + + switch (ID) { + default: + break; + case eMuxBuiltinGetGlobalId: + return defineGetGlobalId(M); + case eMuxBuiltinGetGlobalSize: + return defineGetGlobalSize(M); + case eMuxBuiltinGetLocalLinearId: + return defineGetLocalLinearId(M); + case eMuxBuiltinGetGlobalLinearId: + return defineGetGlobalLinearId(M); + case eMuxBuiltinGetEnqueuedLocalSize: + return defineGetEnqueuedLocalSize(M); + // Just handle the memory synchronization requirements of any barrier + // builtin. We assume that the control requirements of work-group and + // sub-group control barriers have been handled by earlier passes. + case eMuxBuiltinMemBarrier: + return defineMemBarrier(*F, 0, 1); + case eMuxBuiltinSubGroupBarrier: + case eMuxBuiltinWorkGroupBarrier: + return defineMemBarrier(*F, 1, 2); + case eMuxBuiltinDMARead1D: + case eMuxBuiltinDMAWrite1D: + return defineDMA1D(*F); + case eMuxBuiltinDMARead2D: + case eMuxBuiltinDMAWrite2D: + return defineDMA2D(*F); + case eMuxBuiltinDMARead3D: + case eMuxBuiltinDMAWrite3D: + return defineDMA3D(*F); + case eMuxBuiltinDMAWait: + return defineDMAWait(*F); + case eMuxBuiltinGetSubGroupSize: + return defineGetSubGroupSize(*F); + case eMuxBuiltinGetSubGroupLocalId: + return defineGetSubGroupLocalId(*F); + } + + if (auto *const NewF = defineLocalWorkItemBuiltin(*this, ID, M)) { + return NewF; + } + + if (auto *const NewF = defineLocalWorkGroupBuiltin(*this, ID, M)) { + return NewF; + } + + if (auto GroupOp = BuiltinInfo::isMuxGroupCollective(ID)) { + if (auto *const NewF = + defineSubGroupGroupOpBuiltin(*F, *GroupOp, OverloadInfo)) { + return NewF; + } + } + + return nullptr; +} + +bool BIMuxInfoConcept::requiresSchedulingParameters(BuiltinID ID) { + switch (ID) { + default: + return false; + case eMuxBuiltinGetLocalId: + case eMuxBuiltinSetLocalId: + case eMuxBuiltinGetSubGroupId: + case eMuxBuiltinSetSubGroupId: + case eMuxBuiltinGetNumSubGroups: + case eMuxBuiltinSetNumSubGroups: + case eMuxBuiltinGetMaxSubGroupSize: + case eMuxBuiltinSetMaxSubGroupSize: + case eMuxBuiltinGetLocalLinearId: + // Work-item struct only + return true; + case eMuxBuiltinGetWorkDim: + case eMuxBuiltinGetGroupId: + case eMuxBuiltinGetNumGroups: + case eMuxBuiltinGetGlobalSize: + case eMuxBuiltinGetLocalSize: + case eMuxBuiltinGetGlobalOffset: + case eMuxBuiltinGetEnqueuedLocalSize: + // Work-group struct only + return true; + case eMuxBuiltinGetGlobalId: + case eMuxBuiltinGetGlobalLinearId: + // Work-item and work-group structs + return true; + } +} + +Type *BIMuxInfoConcept::getRemappedTargetExtTy(Type *Ty, Module &M) { + // We only map target extension types + assert(Ty && Ty->isTargetExtTy() && "Only expecting target extension types"); + auto &Ctx = Ty->getContext(); + auto *TgtExtTy = cast(Ty); + + // Samplers are replaced by default with size_t. + if (TgtExtTy == compiler::utils::tgtext::getSamplerTy(Ctx)) { + return getSizeType(M); + } + + // Events are replaced by default with size_t. 
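+  // (So, for example, a kernel parameter of type target("spirv.Event") is
+  // remapped to i64 on a 64-bit target, and any image parameter below simply
+  // becomes an opaque pointer.)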
+ if (TgtExtTy == compiler::utils::tgtext::getEventTy(Ctx)) { + return getSizeType(M); + } + + // *All* images are replaced by default with a pointer in the default address + // space to the same structure type (i.e., regardless of image dimensions, + // etc.) + if (TgtExtTy->getName() == "spirv.Image") { + return PointerType::getUnqual(Ctx); + } + + return nullptr; +} + +Function * +BIMuxInfoConcept::getOrDeclareMuxBuiltin(BuiltinID ID, Module &M, + ArrayRef OverloadInfo) { + assert(BuiltinInfo::isMuxBuiltinID(ID) && "Only handling mux builtins"); + auto FnName = BuiltinInfo::getMuxBuiltinName(ID, OverloadInfo); + if (auto *const F = M.getFunction(FnName)) { + return F; + } + auto &Ctx = M.getContext(); + AttrBuilder AB(Ctx); + auto *const SizeTy = getSizeType(M); + auto *const Int32Ty = Type::getInt32Ty(Ctx); + auto *const VoidTy = Type::getVoidTy(Ctx); + + Type *RetTy = nullptr; + SmallVector ParamTys; + SmallVector ParamNames; + + switch (ID) { + // Ranked Getters + case eMuxBuiltinGetLocalId: + case eMuxBuiltinGetGlobalId: + case eMuxBuiltinGetLocalSize: + case eMuxBuiltinGetGlobalSize: + case eMuxBuiltinGetGlobalOffset: + case eMuxBuiltinGetNumGroups: + case eMuxBuiltinGetGroupId: + case eMuxBuiltinGetEnqueuedLocalSize: + ParamTys.push_back(Int32Ty); + ParamNames.push_back("idx"); + LLVM_FALLTHROUGH; + // Unranked Getters + case eMuxBuiltinGetWorkDim: + case eMuxBuiltinGetSubGroupId: + case eMuxBuiltinGetNumSubGroups: + case eMuxBuiltinGetSubGroupSize: + case eMuxBuiltinGetMaxSubGroupSize: + case eMuxBuiltinGetSubGroupLocalId: + case eMuxBuiltinGetLocalLinearId: + case eMuxBuiltinGetGlobalLinearId: { + // Some builtins return uint, others return size_t + RetTy = + (ID == eMuxBuiltinGetWorkDim || ID == eMuxBuiltinGetSubGroupId || + ID == eMuxBuiltinGetNumSubGroups || ID == eMuxBuiltinGetSubGroupSize || + ID == eMuxBuiltinGetMaxSubGroupSize || + ID == eMuxBuiltinGetSubGroupLocalId) + ? Int32Ty + : SizeTy; + // All of our mux getters are readonly - they may never write data + AB.addMemoryAttr(MemoryEffects::readOnly()); + break; + } + // Ranked Setters + case eMuxBuiltinSetLocalId: + ParamTys.push_back(Int32Ty); + ParamNames.push_back("idx"); + LLVM_FALLTHROUGH; + // Unranked Setters + case eMuxBuiltinSetSubGroupId: + case eMuxBuiltinSetNumSubGroups: + case eMuxBuiltinSetMaxSubGroupSize: { + RetTy = VoidTy; + ParamTys.push_back(ID == eMuxBuiltinSetLocalId ? SizeTy : Int32Ty); + ParamNames.push_back("val"); + break; + } + case eMuxBuiltinMemBarrier: { + RetTy = VoidTy; + for (auto PName : {"scope", "semantics"}) { + ParamTys.push_back(Int32Ty); + ParamNames.push_back(PName); + } + AB.addAttribute(Attribute::NoMerge); + AB.addAttribute(Attribute::NoDuplicate); + AB.addAttribute(Attribute::Convergent); + break; + } + case eMuxBuiltinSubGroupBarrier: + case eMuxBuiltinWorkGroupBarrier: { + RetTy = VoidTy; + for (auto PName : {"id", "scope", "semantics"}) { + ParamTys.push_back(Int32Ty); + ParamNames.push_back(PName); + } + AB.addAttribute(Attribute::NoMerge); + AB.addAttribute(Attribute::NoDuplicate); + AB.addAttribute(Attribute::Convergent); + break; + } + case eMuxBuiltinDMAWait: + RetTy = VoidTy; + // Num events + ParamTys.push_back(Int32Ty); + ParamNames.push_back("num_events"); + // The events list + ParamTys.push_back(PointerType::getUnqual(Ctx)); + ParamNames.push_back("events"); + AB.addAttribute(Attribute::Convergent); + break; + case eMuxBuiltinDMARead1D: + case eMuxBuiltinDMAWrite1D: { + // We need to be told the target event type to declare this builtin. 
+ assert(!OverloadInfo.empty() && "Missing event type"); + auto *const EventTy = OverloadInfo[0]; + RetTy = EventTy; + const bool IsRead = ID == eMuxBuiltinDMARead1D; + + PointerType *const LocalPtrTy = PointerType::get(Ctx, AddressSpace::Local); + PointerType *const GlobalPtrTy = + PointerType::get(Ctx, AddressSpace::Global); + + ParamTys.push_back(IsRead ? LocalPtrTy : GlobalPtrTy); + ParamNames.push_back("dst"); + + ParamTys.push_back(IsRead ? GlobalPtrTy : LocalPtrTy); + ParamNames.push_back("src"); + + ParamTys.push_back(SizeTy); + ParamNames.push_back("num_bytes"); + + ParamTys.push_back(EventTy); + ParamNames.push_back("event"); + break; + } + case eMuxBuiltinDMARead2D: + case eMuxBuiltinDMAWrite2D: { + // We need to be told the target event type to declare this builtin. + assert(!OverloadInfo.empty() && "Missing event type"); + auto *const EventTy = OverloadInfo[0]; + RetTy = EventTy; + const bool IsRead = ID == eMuxBuiltinDMARead2D; + + PointerType *const LocalPtrTy = PointerType::get(Ctx, AddressSpace::Local); + PointerType *const GlobalPtrTy = + PointerType::get(Ctx, AddressSpace::Global); + + ParamTys.push_back(IsRead ? LocalPtrTy : GlobalPtrTy); + ParamNames.push_back("dst"); + + ParamTys.push_back(IsRead ? GlobalPtrTy : LocalPtrTy); + ParamNames.push_back("src"); + + for (auto &P : {"num_bytes", "dst_stride", "src_stride", "height"}) { + ParamTys.push_back(SizeTy); + ParamNames.push_back(P); + } + + ParamTys.push_back(EventTy); + ParamNames.push_back("event"); + break; + } + case eMuxBuiltinDMARead3D: + case eMuxBuiltinDMAWrite3D: { + // We need to be told the target event type to declare this builtin. + assert(!OverloadInfo.empty() && "Missing event type"); + auto *const EventTy = OverloadInfo[0]; + RetTy = EventTy; + const bool IsRead = ID == eMuxBuiltinDMARead3D; + + PointerType *const LocalPtrTy = PointerType::get(Ctx, AddressSpace::Local); + PointerType *const GlobalPtrTy = + PointerType::get(Ctx, AddressSpace::Global); + + ParamTys.push_back(IsRead ? LocalPtrTy : GlobalPtrTy); + ParamNames.push_back("dst"); + + ParamTys.push_back(IsRead ? GlobalPtrTy : LocalPtrTy); + ParamNames.push_back("src"); + + for (auto &P : {"num_bytes", "dst_line_stride", "src_line_stride", "height", + "dst_plane_stride", "src_plane_stride", "depth"}) { + ParamTys.push_back(SizeTy); + ParamNames.push_back(P); + } + + ParamTys.push_back(EventTy); + ParamNames.push_back("event"); + break; + } + default: + // Group builtins are more easily found using this helper rather than + // explicitly enumerating each switch case. 
+ if (auto Group = BuiltinInfo::isMuxGroupCollective(ID)) { + RetTy = OverloadInfo.front(); + AB.addAttribute(Attribute::Convergent); + switch (Group->Op) { + default: + ParamTys.push_back(RetTy); + ParamNames.push_back("val"); + break; + case GroupCollective::OpKind::Broadcast: + ParamTys.push_back(RetTy); + ParamNames.push_back("val"); + // Broadcasts additionally add ID parameters + if (Group->isSubGroupScope()) { + ParamTys.push_back(Int32Ty); + ParamNames.push_back("lid"); + } else { + ParamTys.push_back(SizeTy); + ParamNames.push_back("lidx"); + ParamTys.push_back(SizeTy); + ParamNames.push_back("lidy"); + ParamTys.push_back(SizeTy); + ParamNames.push_back("lidz"); + } + break; + case GroupCollective::OpKind::Shuffle: + ParamTys.push_back(RetTy); + ParamNames.push_back("val"); + ParamTys.push_back(Int32Ty); + ParamNames.push_back("lid"); + break; + case GroupCollective::OpKind::ShuffleXor: + ParamTys.push_back(RetTy); + ParamNames.push_back("val"); + ParamTys.push_back(Int32Ty); + ParamNames.push_back("xor_val"); + break; + case GroupCollective::OpKind::ShuffleUp: + ParamTys.push_back(RetTy); + ParamNames.push_back("prev"); + ParamTys.push_back(RetTy); + ParamNames.push_back("curr"); + ParamTys.push_back(Int32Ty); + ParamNames.push_back("delta"); + break; + case GroupCollective::OpKind::ShuffleDown: + ParamTys.push_back(RetTy); + ParamNames.push_back("curr"); + ParamTys.push_back(RetTy); + ParamNames.push_back("next"); + ParamTys.push_back(Int32Ty); + ParamNames.push_back("delta"); + break; + } + // All work-group operations have a 'barrier id' operand as their first + // parameter. + if (Group->isWorkGroupScope()) { + ParamTys.insert(ParamTys.begin(), Int32Ty); + ParamNames.insert(ParamNames.begin(), "id"); + } + } else { + // Unknown mux builtin + return nullptr; + } + } + + assert(RetTy); + assert(ParamTys.size() == ParamNames.size()); + + SmallVector SchedParamIdxs; + // Fill up the scalar parameters with the default attributes. + SmallVector ParamAttrs(ParamTys.size(), AttributeSet()); + + if (requiresSchedulingParameters(ID) && + getSchedulingParameterModuleMetadata(M)) { + for (const auto &P : getMuxSchedulingParameters(M)) { + ParamTys.push_back(P.ParamTy); + ParamNames.push_back(P.ParamName); + ParamAttrs.push_back(P.ParamAttrs); + SchedParamIdxs.push_back(ParamTys.size() - 1); + } + } + + auto *const FnTy = FunctionType::get(RetTy, ParamTys, /*isVarArg*/ false); + auto *const F = Function::Create(FnTy, Function::ExternalLinkage, FnName, &M); + F->addFnAttrs(AB); + + // Add some extra attributes we know are always true. 
+ setDefaultBuiltinAttributes(*F); + + for (unsigned i = 0, e = ParamNames.size(); i != e; i++) { + F->getArg(i)->setName(ParamNames[i]); + auto AB = AttrBuilder(Ctx, ParamAttrs[i]); + F->getArg(i)->addAttrs(AB); + } + + setSchedulingParameterFunctionMetadata(*F, SchedParamIdxs); + + return F; +} + +// By default we use two parameters: +// * one structure containing local work-group data +// * one structure containing non-local work-group data +SmallVector +BIMuxInfoConcept::getMuxSchedulingParameters(Module &M) { + auto &Ctx = M.getContext(); + auto &DL = M.getDataLayout(); + AttributeSet DefaultAttrs; + DefaultAttrs = DefaultAttrs.addAttribute(Ctx, Attribute::NonNull); + DefaultAttrs = DefaultAttrs.addAttribute(Ctx, Attribute::NoAlias); + + BuiltinInfo::SchedParamInfo WIInfo; + { + auto *const WIInfoS = getWorkItemInfoStructTy(M); + WIInfo.ID = SchedParamIndices::WI; + WIInfo.ParamPointeeTy = WIInfoS; + WIInfo.ParamTy = PointerType::get(Ctx, /*AddressSpace=*/0); + WIInfo.ParamName = "wi-info"; + WIInfo.ParamDebugName = WIInfoS->getStructName().str(); + WIInfo.PassedExternally = false; + + auto AB = AttrBuilder(Ctx, DefaultAttrs); + AB.addAlignmentAttr(DL.getABITypeAlign(WIInfoS)); + AB.addDereferenceableAttr(DL.getTypeAllocSize(WIInfoS)); + WIInfo.ParamAttrs = AttributeSet::get(Ctx, AB); + } + + BuiltinInfo::SchedParamInfo WGInfo; + { + auto *const WGInfoS = getWorkGroupInfoStructTy(M); + WGInfo.ID = SchedParamIndices::WG; + WGInfo.ParamPointeeTy = WGInfoS; + WGInfo.ParamTy = PointerType::get(Ctx, /*AddressSpace=*/0); + WGInfo.ParamName = "wg-info"; + WGInfo.ParamDebugName = WGInfoS->getStructName().str(); + WGInfo.PassedExternally = true; + + auto AB = AttrBuilder(Ctx, DefaultAttrs); + AB.addAlignmentAttr(DL.getABITypeAlign(WGInfoS)); + AB.addDereferenceableAttr(DL.getTypeAllocSize(WGInfoS)); + WGInfo.ParamAttrs = AttributeSet::get(Ctx, AB); + } + + return {WIInfo, WGInfo}; +} + +SmallVector +BIMuxInfoConcept::getFunctionSchedulingParameters(Function &F) { + // Query function metadata to determine whether this function has scheduling + // parameters + auto ParamIdxs = getSchedulingParameterFunctionMetadata(F); + if (ParamIdxs.empty()) { + return {}; + } + + auto SchedParamInfo = getMuxSchedulingParameters(*F.getParent()); + // We don't allow a function to have a subset of the global scheduling + // parameters. + assert(ParamIdxs.size() >= SchedParamInfo.size()); + // Set the concrete argument values on each of the scheduling parameter data. + for (auto it : zip(SchedParamInfo, ParamIdxs)) { + // Some scheduling parameters may not be present (returning an index of + // -1), in which case skip their concrete argument values. + if (std::get<1>(it) >= 0) { + std::get<0>(it).ArgVal = F.getArg(std::get<1>(it)); + } + } + + return SchedParamInfo; +} + +Value *BIMuxInfoConcept::initializeSchedulingParamForWrappedKernel( + const BuiltinInfo::SchedParamInfo &Info, IRBuilder<> &B, Function &IntoF, + Function &) { + // We only expect to have to initialize the work-item info. The work-group + // info is straight passed through. 
+ (void)IntoF; + assert(!Info.PassedExternally && Info.ID == SchedParamIndices::WI && + Info.ParamName == "wi-info" && + Info.ParamPointeeTy == getWorkItemInfoStructTy(*IntoF.getParent())); + return B.CreateAlloca(Info.ParamPointeeTy, + /*ArraySize*/ nullptr, Info.ParamName); +} + +std::optional BIMuxInfoConcept::getBuiltinRange( + llvm::CallInst &CI, BuiltinID ID, + std::array, 3> MaxLocalSizes, + std::array, 3> MaxGlobalSizes) const { + assert(CI.getCalledFunction() && CI.getType()->isIntegerTy() && + "Unexpected builtin"); + + auto Bits = CI.getType()->getIntegerBitWidth(); + // Assume we're indexing the global sizes array. + std::array, 3> *SizesPtr = &MaxGlobalSizes; + + switch (ID) { + default: + return std::nullopt; + case eMuxBuiltinGetWorkDim: + return ConstantRange::getNonEmpty(APInt(Bits, 1), APInt(Bits, 4)); + case eMuxBuiltinGetLocalId: + case eMuxBuiltinGetLocalSize: + case eMuxBuiltinGetEnqueuedLocalSize: + // Use the local sizes array, and fall through to common handling. + SizesPtr = &MaxLocalSizes; + [[fallthrough]]; + case eMuxBuiltinGetGlobalSize: { + auto *DimIdx = CI.getOperand(0); + if (!isa(DimIdx)) { + return std::nullopt; + } + const uint64_t DimVal = cast(DimIdx)->getZExtValue(); + if (DimVal >= SizesPtr->size()) { + return std::nullopt; + } + const std::optional Size = (*SizesPtr)[DimVal]; + if (!Size) { + return std::nullopt; + } + // ID builtins range [0,size) (exclusive), and size builtins [1,size] + // (inclusive). Thus offset the range by 1 at each low/high end when + // returning the range for a size builtin. + const int SizeAdjust = ID == eMuxBuiltinGetLocalSize || + ID == eMuxBuiltinGetEnqueuedLocalSize || + ID == eMuxBuiltinGetGlobalSize; + return ConstantRange::getNonEmpty(APInt(Bits, SizeAdjust), + APInt(Bits, Size.value() + SizeAdjust)); + } + } +} + +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp new file mode 100644 index 0000000000000..f735b1d1e6b8f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/optimal_builtin_replacement_pass.cpp @@ -0,0 +1,319 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// This pass replaces builtin functions with optimal equivalents. 
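+//
+// The default replacements registered below rewrite abacus library calls
+// into cheaper LLVM equivalents, roughly:
+//   __abacus_clz(x)       -> llvm.ctlz(x, false)
+//   __abacus_mul_hi(a, b) -> trunc((ext(a) * ext(b)) >> BitWidth)
+//   __abacus_fmin(a, b)   -> llvm.minnum(a, b)  (and fmax -> llvm.maxnum)
+// subject to the per-target restrictions checked in each helper.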
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define DEBUG_TYPE "ca-optimal-builtins"
+
+using namespace llvm;
+
+namespace {
+
+void removeCallSite(CallBase &CB, LazyCallGraph &CG) {
+  Function *Caller = CB.getCaller();
+  Function *Callee = CB.getCalledFunction();
+  auto CallerNode = CG.get(*Caller);
+  auto CalleeNode = CG.get(*Callee);
+  if (auto *CallerRef = CG.lookupRefSCC(CallerNode)) {
+    CallerRef->removeOutgoingEdge(CallerNode, CalleeNode);
+  }
+}
+
+} // namespace
+
+namespace compiler {
+namespace utils {
+
+Value *OptimalBuiltinReplacementPass::replaceAbacusCLZ(
+    CallBase &CB, StringRef BaseName, const SmallVectorImpl &,
+    const SmallVectorImpl &) {
+  if (BaseName != "__abacus_clz") {
+    return nullptr;
+  }
+  Module *M = CB.getModule();
+  SmallVector Args(CB.args());
+  // Get the declaration for the intrinsic
+  auto *const ArgTy = Args[0]->getType();
+  auto *const Intrinsic =
+      llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::ctlz, ArgTy);
+  // If we didn't find the intrinsic or the return type isn't what we
+  // expect, skip this optimization
+  Function *Callee = CB.getCalledFunction();
+  assert(Callee);
+  if (!Intrinsic || Intrinsic->getReturnType() != Callee->getReturnType()) {
+    return nullptr;
+  }
+
+  // On 32-bit ARM, the llvm.ctlz intrinsic on 64-bit types is expanded using
+  // compiler-rt. Without online linking, we can't support that.
+  const Triple TT(CB.getModule()->getTargetTriple());
+  if (TT.getArch() == Triple::arm && ArgTy->isIntOrIntVectorTy(64)) {
+    return nullptr;
+  }
+
+  // LLVM's ctlz has a second argument to specify that zeroes in the first
+  // argument produces a defined result.
+  LLVMContext &Ctx = M->getContext();
+  Args.push_back(ConstantInt::getFalse(Ctx));
+
+  auto *Call = CallInst::Create(Intrinsic, Args);
+  Call->insertBefore(CB.getIterator());
+  return Call;
+}
+
+Value *OptimalBuiltinReplacementPass::replaceAbacusMulhi(
+    CallBase &CB, StringRef BaseName, const SmallVectorImpl &,
+    const SmallVectorImpl &Quals) {
+  if (BaseName != "__abacus_mul_hi") {
+    return nullptr;
+  }
+  IRBuilder<> B(&CB);
+
+  auto I = CB.arg_begin();
+  Value *const LHS = *I++;
+  Value *const RHS = *I++;
+
+  const auto BitWidth = LHS->getType()->getScalarType()->getIntegerBitWidth();
+
+  // Don't perform this optimization on 64-bit types as 128-bit types aren't
+  // generally well supported.
+  if (BitWidth == 64) {
+    return nullptr;
+  }
+
+  unsigned VecWidth = 1;
+  if (const auto *VecTy = dyn_cast(LHS->getType())) {
+    VecWidth = multi_llvm::getVectorNumElements(VecTy);
+  }
+
+  Type *UpTy = B.getIntNTy(BitWidth * 2);
+  if (VecWidth != 1) {
+    UpTy = FixedVectorType::get(UpTy, VecWidth);
+  }
+
+  bool SrcIsSigned = false;
+  for (unsigned i = 0, e = Quals[0].getCount(); i != e; i++) {
+    if (Quals[0].at(i) == eTypeQualSignedInt) {
+      SrcIsSigned = true;
+      break;
+    }
+  }
+
+  const auto CastOp = SrcIsSigned ? Instruction::SExt : Instruction::ZExt;
+
+  auto *const UpLHS = B.CreateCast(CastOp, LHS, UpTy);
+  auto *const UpRHS = B.CreateCast(CastOp, RHS, UpTy);
+
+  auto *const Mul = B.CreateMul(UpLHS, UpRHS);
+
+  Constant *ShiftAmt = B.getIntN(BitWidth * 2, BitWidth);
+  if (VecWidth != 1) {
+    ShiftAmt = ConstantDataVector::getSplat(VecWidth, ShiftAmt);
+  }
+
+  auto *const Shift = B.CreateAShr(Mul, ShiftAmt);
+
+  return B.CreateTrunc(Shift, LHS->getType());
+}
+
+Value *OptimalBuiltinReplacementPass::replaceAbacusFMinFMax(
+    CallBase &CB, StringRef BaseName, const SmallVectorImpl &,
+    const SmallVectorImpl &) {
+  const bool IsFMin = BaseName == "__abacus_fmin";
+  if (!IsFMin && BaseName != "__abacus_fmax") {
+    return nullptr;
+  }
+
+  const Triple TT(CB.getModule()->getTargetTriple());
+  // minnum/maxnum intrinsics fail CTS on arm targets. See
+  // https://llvm.org/PR27363.
+  if (TT.getArch() == Triple::arm || TT.getArch() == Triple::aarch64) {
+    return nullptr;
+  }
+
+  IRBuilder<> B(&CB);
+
+  auto I = CB.arg_begin();
+  Value *LHS = *I++;
+  Value *RHS = *I++;
+
+  const auto *LHSTy = LHS->getType();
+  const auto *RHSTy = RHS->getType();
+
+  if (LHSTy->isVectorTy() != RHSTy->isVectorTy()) {
+    auto VectorEC =
+        multi_llvm::getVectorElementCount(LHSTy->isVectorTy() ? LHSTy : RHSTy);
+    if (!LHS->getType()->isVectorTy()) {
+      LHS = B.CreateVectorSplat(VectorEC, LHS);
+    }
+    if (!RHS->getType()->isVectorTy()) {
+      RHS = B.CreateVectorSplat(VectorEC, RHS);
+    }
+  }
+  return B.CreateBinaryIntrinsic(IsFMin ? Intrinsic::minnum : Intrinsic::maxnum,
+                                 LHS, RHS);
+}
+
+OptimalBuiltinReplacementPass::OptimalBuiltinReplacementPass() {
+  replacements.emplace_back(replaceAbacusCLZ);
+  replacements.emplace_back(replaceAbacusMulhi);
+  replacements.emplace_back(replaceAbacusFMinFMax);
+}
+
+Value *
+OptimalBuiltinReplacementPass::replaceBuiltinWithInlineIR(CallBase &CB) const {
+  auto *M = CB.getModule();
+  NameMangler mangler(&M->getContext());
+
+  SmallVector Types;
+  SmallVector Quals;
+  Function *Callee = CB.getCalledFunction();
+  assert(Callee);
+  const StringRef BaseName =
+      mangler.demangleName(Callee->getName(), Types, Quals);
+
+  for (const auto &replace_fn : replacements) {
+    if (replace_fn) {
+      if (auto *V = replace_fn(CB, BaseName, Types, Quals)) {
+        return V;
+      }
+    }
+  }
+
+  return nullptr;
+}
+
+PreservedAnalyses OptimalBuiltinReplacementPass::run(LazyCallGraph::SCC &C,
+                                                     CGSCCAnalysisManager &AM,
+                                                     LazyCallGraph &CG,
+                                                     CGSCCUpdateResult &) {
+  // Without the possibility of recursion, we can expect all meaningful
+  // OpenCL/ComputeMux programs to be contained within a single SCC serving
+  // as the entry point. We use this as the root.
+  if (C.size() != 1) {
+    return PreservedAnalyses::all();
+  }
+  Module &M = *C.begin()->getFunction().getParent();
+
+  // Check that at least one node in this graph is a kernel.
+  if (none_of(C, [](const LazyCallGraph::Node &N) {
+        return N.getFunction().getCallingConv() == CallingConv::SPIR_KERNEL;
+      })) {
+    return PreservedAnalyses::all();
+  }
+
+  const auto &MAMProxy = AM.getResult(C, CG);
+  if (auto *BI = MAMProxy.getCachedResult(M)) {
+    replacements.emplace_back([BI](CallBase &CB, StringRef,
+                                   const SmallVectorImpl &,
+                                   const SmallVectorImpl &)
+                                  -> Value * {
+      if (Function *Callee = CB.getCalledFunction()) {
+        if (const auto Builtin = BI->analyzeBuiltin(*Callee)) {
+          if (Builtin->properties & eBuiltinPropertyCanEmitInline) {
+            IRBuilder<> B(&CB);
+            const SmallVector Args(CB.args());
+            if (Value *Impl = BI->emitBuiltinInline(Callee, B, Args)) {
+              assert(Impl->getType() == CB.getType() &&
+                     "The inlined function type must match that of the "
+                     "original function");
+              return Impl;
+            }
+          }
+        }
+      }
+      return nullptr;
+    });
+  }
+
+  if (adjustReplacements) {
+    adjustReplacements(replacements);
+  }
+
+  // If there are no replacements to run, for whatever reason, we can bail
+  // early.
+  if (replacements.empty()) {
+    return PreservedAnalyses::all();
+  }
+
+  SmallVector ToDelete;
+  // The SmallPriorityWorklist prioritises nodes which have been inserted
+  // multiple times, and avoids duplication of already-inserted items, but
+  // *not* ones already visited and popped off.
+  SmallPriorityWorklist Worklist;
+  // Assuming we only have one node to begin with (see above), start off with
+  // that.
+  Worklist.insert(&*C.begin());
+  // While the worklist above prevents re-insertion, we might end up visiting
+  // the same function again after it has already been popped off the
+  // worklist. So we still have to keep track of recursion.
+  SmallPtrSet Visited;
+
+  // Now visit all nodes in this "root" graph in order. We will visit
+  // outer-most functions (kernels) first before descending the call graph.
+  // This gives precedence to "outer-most" replacements.
+  while (!Worklist.empty()) {
+    LazyCallGraph::Node *N = Worklist.pop_back_val();
+    LLVM_DEBUG(dbgs() << "OptimalBuiltinReplacement: visiting " << *N << "\n");
+    for (Instruction &I : instructions(N->getFunction())) {
+      if (auto *CB = dyn_cast(&I)) {
+        if (CB->getCalledFunction() && !isa(I)) {
+          if (Value *New = replaceBuiltinWithInlineIR(*CB)) {
+            LLVM_DEBUG(dbgs()
+                       << "\tOptimalBuiltinReplacement: replacing call to "
+                       << CB->getCalledFunction()->getName() << "\n");
+            ToDelete.push_back(CB);
+            removeCallSite(*CB, CG);
+            // Assume that replacements don't introduce new calls, and we can
+            // simply mark this one as gone and move on.
+            CB->replaceAllUsesWith(New);
+          } else if (auto *CalledN = CG.lookup(*CB->getCalledFunction())) {
+            if (Visited.insert(CalledN).second) {
+              Worklist.insert(CalledN);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  const bool Modified = !ToDelete.empty();
+
+  // Clean up any dead calls.
+  while (!ToDelete.empty()) {
+    Instruction *I = ToDelete.pop_back_val();
+    I->eraseFromParent();
+  }
+
+  return Modified ?
PreservedAnalyses::none() : PreservedAnalyses::all(); +} +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp new file mode 100644 index 0000000000000..d1e46ee67b290 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_functions.cpp @@ -0,0 +1,739 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +llvm::AnalysisKey compiler::utils::DeviceInfoAnalysis::Key; + +namespace compiler { +namespace utils { + +uint64_t computeApproximatePrivateMemoryUsage(const llvm::Function &fn) { + const llvm::Module *module = fn.getParent(); + const auto &layout = module->getDataLayout(); + uint64_t bytes = 0; + + // BarrierPass asserts that `allocas` only exist in the entry block + for (auto &inst : fn.getEntryBlock()) { + if (!llvm::isa(inst)) { + continue; + } + const auto &alloca_inst = llvm::cast(inst); + const auto *type = alloca_inst.getType(); + if (type->getAddressSpace() != AddressSpace::Private) { + continue; + } + auto *alloc_type = alloca_inst.getAllocatedType(); + const auto alloc_size = layout.getTypeAllocSize(alloc_type); + if (alloca_inst.isArrayAllocation()) { + auto *arr_size_val = alloca_inst.getArraySize(); + auto *const_int = llvm::dyn_cast(arr_size_val); + assert(const_int != nullptr && "Array Allocation of dynamic size"); + const uint64_t arr_size = const_int->getUniqueInteger().getLimitedValue(); + bytes += arr_size * alloc_size; + + } else { + bytes += alloc_size; + } + } + return bytes; +} + +static llvm::SmallVector +getNewOps(llvm::Constant *constant, llvm::Constant *from, llvm::Constant *to) { + llvm::SmallVector newOps; + // iterate through the constant and create a vector of old and new + // ones + for (unsigned i = 0, e = constant->getNumOperands(); i != e; ++i) { + auto op = constant->getOperand(i); + if (op == from) { + newOps.push_back(to); + } else { + newOps.push_back(llvm::cast(op)); + } + } + return newOps; +} + +void remapConstantArray(llvm::ConstantArray *arr, llvm::Constant *from, + llvm::Constant *to) { + const llvm::SmallVector newOps = getNewOps(arr, from, to); + // Create a new array with the list of operands and replace all uses with + llvm::Constant *newConstant = + llvm::ConstantArray::get(arr->getType(), newOps); + arr->replaceAllUsesWith(newConstant); + arr->destroyConstant(); +} + +void remapConstantExpr(llvm::ConstantExpr *expr, llvm::Constant *from, + llvm::Constant *to) { + const llvm::SmallVector newOps = getNewOps(expr, from, to); + // Create a new expression with the 
list of operands, and replace all uses with it
+  llvm::Constant *newConstant = expr->getWithOperands(newOps);
+  expr->replaceAllUsesWith(newConstant);
+  expr->destroyConstant();
+}
+
+bool funcContainsDebugMetadata(const llvm::Function &func,
+                               llvm::ValueToValueMapTy &vmap) {
+  // Check if function references debug info
+  bool foundDI = false;
+
+  // Function has a DISubprogram entry attached
+  if (auto DISubprogram = func.getSubprogram()) {
+    vmap.MD()[DISubprogram].reset(DISubprogram);
+    foundDI = true;
+  }
+
+  for (auto &BB : func) {
+    for (auto &Inst : BB) {
+      if (const auto &DL = Inst.getDebugLoc()) {
+        llvm::DILocation *loc = DL.get();
+        vmap.MD()[loc].reset(loc);
+        foundDI = true;
+      }
+    }
+  }
+
+  return foundDI;
+}
+
+void replaceConstantExpressionWithInstruction(llvm::Constant *const constant) {
+  // remove all dead constant users (sometimes these are left over by previous
+  // passes)
+  constant->removeDeadConstantUsers();
+
+  // Only handle constants which are ConstantExpr, ConstantVector or
+  // ConstantArray
+  assert((llvm::isa(constant) ||
+          llvm::isa(constant) ||
+          llvm::isa(constant)) &&
+         "Unsupported constant type in IR");
+
+  // For each user of a constant we will check to see if they in turn are
+  // constants. If they are, convert them to instructions first (still
+  // referencing this constant). We are then clear to convert the current
+  // constant to an instruction as the only users left are instructions.
+
+  llvm::SmallVector users;
+  // Create the list of users of this constant. We don't want duplicates here,
+  // which often happens with ConstantVectors, as we only want to convert them
+  // to an instruction once. We want determinism here so use a vector to
+  // maintain order.
+  for (auto *constantUser : constant->users()) {
+    if (std::find(users.begin(), users.end(), constantUser) == users.end()) {
+      users.push_back(constantUser);
+    }
+  }
+
+  for (auto *constantUser : users) {
+    if (llvm::isa(constantUser)) {
+      // instructions are our best case, do nothing!
+    } else if (llvm::Constant *subConstant =
+                   llvm::dyn_cast(constantUser)) {
+      replaceConstantExpressionWithInstruction(subConstant);
+    } else {
+      constantUser->print(llvm::errs());
+      llvm_unreachable("Constant user is not a constant or instruction!!");
+    }
+  }
+
+  // we record each use
+  llvm::SmallVector uses;
+
+  for (auto &use : constant->uses()) {
+    uses.push_back(&use);
+  }
+
+  for (auto *use : uses) {
+    // get the instruction that is the user of the use
+    auto inst = llvm::cast(use->getUser());
+
+    // get the function for this use
+    auto useFunc = inst->getFunction();
+
+    llvm::Instruction *newInst = nullptr;
+    // create a new instruction that matches the constant expression
+    if (llvm::ConstantExpr *constantExpr =
+            llvm::dyn_cast(constant)) {
+      newInst = constantExpr->getAsInstruction();
+      // insert the instruction at the beginning of the entry block
+      newInst->insertBefore(useFunc->getEntryBlock().getFirstNonPHIIt());
+    } else if (llvm::ConstantVector *constantVec =
+                   llvm::dyn_cast(constant)) {
+      // If it is a ConstantVector then only handle the case where it is
+      // a single splatted value. This is the only kind generated at present.
+      auto splatVal = constantVec->getSplatValue();
+      assert(splatVal &&
+             "ConstantVector does not contain identical constants so cannot "
+             "be splatted!");
+      // Take the splatted Value and create two instructions. The first is
+      // InsertElement to place it in a new vector and the second is a
+      // ShuffleVector to duplicate the value across the vector.
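+      // (E.g. a <4 x i32> splat of the constant 7 is materialised as:
+      //   %ins = insertelement <4 x i32> poison, i32 7, i32 0
+      //   %splat = shufflevector <4 x i32> %ins, <4 x i32> poison,
+      //                          <4 x i32> zeroinitializer
+      // at the top of the entry block.)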
+ auto numEls = constantVec->getNumOperands(); + llvm::Value *poison = llvm::PoisonValue::get( + llvm::FixedVectorType::get(splatVal->getType(), numEls)); + llvm::Type *i32Ty = llvm::Type::getInt32Ty(constant->getContext()); + auto insert = llvm::InsertElementInst::Create( + poison, splatVal, llvm::ConstantInt::get(i32Ty, 0)); + insert->insertBefore(useFunc->getEntryBlock().getFirstNonPHIIt()); + llvm::Value *zeros = llvm::ConstantAggregateZero::get( + llvm::FixedVectorType::get(i32Ty, numEls)); + newInst = new llvm::ShuffleVectorInst(insert, poison, zeros); + newInst->insertAfter(insert); + } else if (llvm::ConstantArray *constantArr = + llvm::dyn_cast(constant)) { + auto numEls = constantArr->getNumOperands(); + llvm::Value *poison = llvm::PoisonValue::get(constantArr->getType()); + llvm::Instruction *insertedIns = nullptr; + for (unsigned int i = 0; i < numEls; i++) { + auto *insertNext = + llvm::InsertValueInst::Create(insertedIns ? insertedIns : poison, + constantArr->getOperand(i), {i}); + if (insertedIns) { + insertNext->insertAfter(insertedIns); + } else { + insertNext->insertBefore(useFunc->getEntryBlock().getFirstNonPHIIt()); + } + insertedIns = insertNext; + } + newInst = insertedIns; + } + + // replace the use of the constant with the instruction + use->set(newInst); + } + + // lastly, destroy the constant we just replaced + constant->destroyConstant(); +} + +llvm::AttributeList getCopiedFunctionAttrs(const llvm::Function &oldFn, + int numParams) { + const unsigned numParamsToCopy = + numParams < 0 ? oldFn.arg_size() : (unsigned)numParams; + llvm::SmallVector newArgAttrs(numParamsToCopy); + const llvm::AttributeList oldAttrs = oldFn.getAttributes(); + // clone any argument attributes we're copying over. Note we can't simply + // call Function::copyAttributes as not all arguments are present in the new + // function. 
+ for (unsigned i = 0, e = numParamsToCopy; i != e; i++) { + newArgAttrs[i] = oldAttrs.getParamAttrs(i); + } + + return llvm::AttributeList::get(oldFn.getContext(), oldAttrs.getFnAttrs(), + oldAttrs.getRetAttrs(), newArgAttrs); +} + +void copyFunctionAttrs(const llvm::Function &oldFn, llvm::Function &newFn, + int numParams) { + newFn.setAttributes(getCopiedFunctionAttrs(oldFn, numParams)); +} + +bool cloneFunctionsAddArg( + llvm::Module &module, + std::function paramTypeFunc, + std::function + toBeCloned, + const UpdateMDCallbackFn &updateMetaDataCallback) { + // mapping from new -> old function + llvm::ValueMap newToOldMap; + + // Preserve the value map across all function clones + llvm::ValueToValueMapTy vmap; + + const ParamTypeAttrsPair paramInfo = paramTypeFunc(module); + + // For each function we run the function toBeCloned to set the bools + // doCloneNoBody and doCloneWithBody + // first, run through our functions and make copies of all functions that: + // 1) are not declarations (these will be builtins we expand later) or + // doCloneNoBody is set (don't clone but flesh out) + // 2) are not new functions that we just added + // 3) Functions marked by doCloneWithBody + for (auto &func : module.functions()) { + bool doCloneWithBody = false; + bool doCloneNoBody = false; + + toBeCloned(func, doCloneWithBody, doCloneNoBody); + const bool isDecl = func.isDeclaration(); + bool processFunc = (0 == newToOldMap.count(&func)); + + if (!isDecl) { + processFunc = processFunc && doCloneWithBody; + } else { + processFunc = processFunc && doCloneNoBody; + } + + if (processFunc) { + auto funcTy = func.getFunctionType(); + + const unsigned numParams = funcTy->getNumParams(); + llvm::SmallVector newParamTypes(numParams + 1); + + // add each param from the original function to the new one + for (unsigned i = 0; i < numParams; i++) { + newParamTypes[i] = funcTy->getParamType(i); + } + // and lastly add our extra arg as the last param + newParamTypes[numParams] = paramInfo.first; + + auto newFuncTy = llvm::FunctionType::get(funcTy->getReturnType(), + newParamTypes, false); + + // create our new function, using the linkage from the old one + auto newFunc = + llvm::Function::Create(newFuncTy, func.getLinkage(), "", &module); + + // set the correct calling convention + newFunc->setCallingConv(func.getCallingConv()); + + // take the name of the old function + newFunc->takeName(&func); + + // Copy names over for the parameters + llvm::Function::arg_iterator DestI = newFunc->arg_begin(); + for (const auto &I : func.args()) { + (*DestI).setName(I.getName()); // Copy the name over... + DestI++; + } + + if (isDecl) { + // copy debug info for function over; CloneFunctionInto takes care of + // this if this function has a body + newFunc->setSubprogram(func.getSubprogram()); + // copy the metadata into the new function, ignoring any debug info. + copyFunctionMetadata(func, *newFunc); + } else { + // map all original function arguments to the new function arguments + for (auto iter = func.arg_begin(), iter_end = func.arg_end(), + new_iter = newFunc->arg_begin(); + iter != iter_end; ++iter, ++new_iter) { + vmap[(&*iter)] = (&*new_iter); + } + + llvm::SmallVector returns; + + // we have module changes if our function contains any debug info + assert(newFunc->getParent() && + "assumed newFunc has an associated module"); + const bool hasDbgMetadata = funcContainsDebugMetadata(func, vmap); + const bool differentModules = newFunc->getParent() != func.getParent(); + auto changeType = differentModules + ? 
+        if (hasDbgMetadata) {
+          changeType = std::max(changeType,
+                                llvm::CloneFunctionChangeType::GlobalChanges);
+        }
+        CloneFunctionInto(newFunc, &func, vmap, changeType, returns);
+      }
+
+      // Add in the new parameter attributes here, because CloneFunctionInto
+      // wipes out pre-existing attributes on newFunc which aren't in oldFunc.
+      newFunc->addParamAttrs(
+          numParams, llvm::AttrBuilder(newFunc->getContext(), paramInfo.second));
+
+      // map new func -> old func
+      newToOldMap[newFunc] = &func;
+
+      // remove the body of the old function that we are going to delete
+      // anyway, so that none of its callsites get processed in the remainder
+      // of this pass
+      func.deleteBody();
+    }
+  }
+
+  // next, remap all callsites that would have called the old function, to the
+  // new function we just created
+  for (auto pair : newToOldMap) {
+    llvm::Function *newFunc = pair.first;
+    llvm::Function *oldFunc = pair.second;
+
+    remapClonedCallsites(*oldFunc, *newFunc, true);
+
+    // next, let the caller update any metadata.
+    if (updateMetaDataCallback) {
+      updateMetaDataCallback(*oldFunc, *newFunc,
+                             newFunc->getFunctionType()->getNumParams() - 1);
+    }
+  }
+
+  // lastly, remove all the old functions we no longer need
+  for (auto pair : newToOldMap) {
+    // the old function, no longer used
+    llvm::Function *const oldFunc = pair.second;
+
+    // then destroy the function
+    oldFunc->eraseFromParent();
+  }
+
+  return true;
+}
+
+void remapClonedCallsites(llvm::Function &oldFunc, llvm::Function &newFunc,
+                          bool extraArg) {
+  // list of calls we need to erase at the end
+  llvm::SmallVector<llvm::CallInst *> callsToErase;
+
+  // for everything that uses our old function
+  for (auto *user : oldFunc.users()) {
+    // if the user calls our old function
+    if (auto ci = llvm::dyn_cast<llvm::CallInst>(user)) {
+      // store the name from the old call
+      const std::string name = ci->getName().str();
+
+      // get the number of args at the old callsite
+      const unsigned numArgs = ci->arg_size();
+
+      // the number of args at the new callsite. If we're adding an extra
+      // argument this is incremented.
+      const unsigned newNumArgs = extraArg ? numArgs + 1 : numArgs;
+
+      // create a buffer for our args
+      llvm::SmallVector<llvm::Value *> args(newNumArgs);
+
+      // set all the new call args to be the old call args
+      for (unsigned i = 0; i < numArgs; i++) {
+        args[i] = ci->getArgOperand(i);
+      }
+
+      // if we're adding an extra param it's always the last
+      // argument, so propagate the value on from the parent
+      if (extraArg) {
+        llvm::Function *parentFunc = ci->getFunction();
+        llvm::Argument *lastArg = getLastArgument(parentFunc);
+        args[numArgs] = lastArg;
+      }
+
+      // create our new call instruction to replace the old one
+      auto newCi = llvm::CallInst::Create(&newFunc, args, name);
+      newCi->insertBefore(ci->getIterator());
+
+      // use the debug location from the old call (if any)
+      newCi->setDebugLoc(ci->getDebugLoc());
+
+      // set the calling convention for our new call the same as the old one
+      newCi->setCallingConv(ci->getCallingConv());
+
+      // replace anything that uses the old call with the new one
+      ci->replaceAllUsesWith(newCi);
+
+      // and remember to erase the old callsite
+      callsToErase.push_back(ci);
+    } else if (llvm::ConstantExpr *constant =
+                   llvm::dyn_cast<llvm::ConstantExpr>(user)) {
+      remapConstantExpr(constant, &oldFunc, &newFunc);
+    } else {
+      llvm_unreachable(
+          "UNHANDLED user for Function not a CallInst or ConstantExpr\n");
+    }
+  }
+
+  // remove all the old instructions we no longer need
+  for (llvm::CallInst *ci : callsToErase) {
+    // then destroy the call
+    ci->eraseFromParent();
+  }
+}
+
+llvm::BasicBlock *createLoop(llvm::BasicBlock *entry, llvm::BasicBlock *exit,
+                             llvm::Value *indexStart, llvm::Value *indexEnd,
+                             const CreateLoopOpts &opts,
+                             CreateLoopBodyFn body) {
+  // If the index increment is null, we default to 1 as our index.
+  auto indexInc = opts.indexInc
+                      ? opts.indexInc
+                      : llvm::ConstantInt::get(indexStart->getType(), 1);
+
+  llvm::LLVMContext &ctx = entry->getContext();
+
+  llvm::SmallVector<llvm::Value *> currIVs(opts.IVs.begin(), opts.IVs.end());
+  llvm::SmallVector<llvm::Value *> nextIVs(opts.IVs.size());
+
+  // the basic block that will link into our loop
+  llvm::IRBuilder<> entryIR(entry);
+
+  // the basic block that will form the start of our loop
+  llvm::IRBuilder<> loopIR(
+      llvm::BasicBlock::Create(ctx, opts.headerName, entry->getParent()));
+
+  // branch into our loop to begin executing
+  entryIR.CreateBr(loopIR.GetInsertBlock());
+
+  // first thing in the loop is our phi node for the loop counter
+  auto phi = loopIR.CreatePHI(indexInc->getType(), 2);
+
+  // and make the phi node equal the start index when coming from our entry
+  phi->addIncoming(indexStart, entryIR.GetInsertBlock());
+
+  // Set up all of our user PHIs
+  for (unsigned i = 0, e = currIVs.size(); i != e; i++) {
+    // For convenience to callers, permit nullptr and skip over it.
+    if (!currIVs[i])
+      continue;
+
+    auto *const phi = loopIR.CreatePHI(currIVs[i]->getType(), 2);
+    llvm::cast<llvm::PHINode>(phi)->addIncoming(currIVs[i],
+                                                entryIR.GetInsertBlock());
+    // Set IV names if they've been given to us.
+    if (i < opts.loopIVNames.size()) {
+      phi->setName(opts.loopIVNames[i]);
+    }
+    currIVs[i] = phi;
+  }
+
+  // run the lambda for the loop body, storing the block it finished at
+  llvm::BasicBlock *const latch =
+      body(loopIR.GetInsertBlock(), phi, currIVs, nextIVs);
+  llvm::IRBuilder<> bodyIR(latch);
+
+  // add to the phi node to increment our loop counter
+  auto *const add = bodyIR.CreateAdd(phi, indexInc);
+
+  // and set that if we loop back around, the phi node will be the increment
+  phi->addIncoming(add, latch);
+
+  // Update all of our PHIs
+  for (unsigned i = 0, e = currIVs.size(); i != e; i++) {
+    if (!currIVs[i])
+      continue;
+    llvm::cast<llvm::PHINode>(currIVs[i])->addIncoming(nextIVs[i], latch);
+  }
+
+  if (!exit) {
+    // the basic block to exit our loop when we are done
+    const llvm::IRBuilder<> exitIR(
+        llvm::BasicBlock::Create(ctx, "exitIR", entry->getParent()));
+    exit = exitIR.GetInsertBlock();
+  }
+
+  // last, branch condition either to the exit, or for another loop iteration
+  auto *const termBR = bodyIR.CreateCondBr(bodyIR.CreateICmpULT(add, indexEnd),
+                                           loopIR.GetInsertBlock(), exit);
+
+  if (opts.disableVectorize) {
+    auto *const vecDisable = llvm::MDNode::get(
+        ctx, {llvm::MDString::get(ctx, "llvm.loop.vectorize.enable"),
+              llvm::ConstantAsMetadata::get(
+                  llvm::ConstantInt::get(llvm::Type::getInt1Ty(ctx), false))});
+    // LLVM loop metadata -- for legacy reasons -- must have a reference to
+    // itself as its first operand. See
+    // https://llvm.org/docs/LangRef.html#llvm-loop.
+    auto *loopID = llvm::MDNode::get(ctx, {nullptr, vecDisable});
+    loopID->replaceOperandWith(0, loopID);
+    termBR->setMetadata(llvm::LLVMContext::MD_loop, loopID);
+  }
+
+  // we stopped executing in the exit block, so return that
+  return exit;
+}
+
+llvm::Argument *getLastArgument(llvm::Function *f) {
+  assert(!f->arg_empty() &&
+         "Can't get last argument if there are no arguments");
+  return f->arg_end() - 1;
+}
+
+unsigned getSizeTypeBytes(const llvm::Module &m) {
+  return m.getDataLayout().getPointerSize(0);
+}
+
+llvm::IntegerType *getSizeType(const llvm::Module &m) {
+  const llvm::DataLayout &dataLayout = m.getDataLayout();
+  return llvm::IntegerType::get(m.getContext(),
+                                dataLayout.getPointerSizeInBits(0));
+}
+
+static llvm::Function *
+createKernelWrapperFunctionImpl(llvm::Function &F, llvm::Function &NewFunction,
+                                llvm::StringRef Suffix,
+                                llvm::StringRef OldSuffix) {
+  // Make sure we take a copy of the basename as we're going to change the
+  // original function's name from underneath the StringRef.
+  const std::string baseName = getOrSetBaseFnName(NewFunction, F).str();
+
+  if (!OldSuffix.empty()) {
+    if (getBaseFnName(F).empty()) {
+      setBaseFnName(F, F.getName());
+    }
+    F.setName(F.getName() + OldSuffix);
+  }
+
+  NewFunction.setName(baseName + Suffix);
+
+  // we don't use exceptions
+  NewFunction.addFnAttr(llvm::Attribute::NoUnwind);
+
+  // copy the calling convention from the old function
+  NewFunction.setCallingConv(F.getCallingConv());
+
+  // and remove spir_kernel from the old function
+  if (F.getCallingConv() == llvm::CallingConv::SPIR_KERNEL) {
+    F.setCallingConv(llvm::CallingConv::SPIR_FUNC);
+  }
+
+  // copy the metadata into the new kernel ignoring any debug info.
+  copyFunctionMetadata(F, NewFunction);
+
+  // drop kernel (+ entry point) information from the old function: we've
+  // copied it over to the new one.
+  dropIsKernel(F);
+
+  // copy debug info for function over
+  if (auto *SP = F.getSubprogram()) {
+    const llvm::DIBuilder DIB(*F.getParent());
+    llvm::DISubprogram *const NewSP = DIB.createArtificialSubprogram(SP);
+    // Wipe the list of retained nodes, as this new function is a wrapper over
+    // the old one and does not itself contain/retain any of the wrapped
+    // function's nodes.
+    NewSP->replaceRetainedNodes({});
+    NewFunction.setSubprogram(NewSP);
+  }
+
+  // set the function to always inline: 'noinline' takes precedence, though
+  if (!F.hasFnAttribute(llvm::Attribute::NoInline)) {
+    F.addFnAttr(llvm::Attribute::AlwaysInline);
+  }
+
+  // lastly set the linkage to internal
+  F.setLinkage(llvm::GlobalValue::InternalLinkage);
+
+  return &NewFunction;
+}
+
+llvm::Function *createKernelWrapperFunction(llvm::Function &F,
+                                            llvm::StringRef Suffix,
+                                            llvm::StringRef OldSuffix) {
+  // Create our new function
+  llvm::Function *const NewFunction = llvm::Function::Create(
+      F.getFunctionType(), llvm::Function::ExternalLinkage, "", F.getParent());
+
+  // copy over function attributes, including parameter attributes
+  copyFunctionAttrs(F, *NewFunction);
+
+  // Copy over parameter names
+  for (auto it : zip(NewFunction->args(), F.args())) {
+    std::get<0>(it).setName(std::get<1>(it).getName());
+  }
+
+  return createKernelWrapperFunctionImpl(F, *NewFunction, Suffix, OldSuffix);
+}
+
+llvm::Function *
+createKernelWrapperFunction(llvm::Module &M, llvm::Function &F,
+                            llvm::ArrayRef<llvm::Type *> ArgTypes,
+                            llvm::StringRef Suffix, llvm::StringRef OldSuffix) {
+  llvm::FunctionType *NewFunctionType =
+      llvm::FunctionType::get(F.getReturnType(), ArgTypes, false);
+
+  // create our new function
+  llvm::Function *const NewFunction = llvm::Function::Create(
+      NewFunctionType, llvm::Function::ExternalLinkage, "", &M);
+
+  // copy over function attributes, ignoring all parameter attributes - we
+  // don't know what the parameter mapping is.
+  copyFunctionAttrs(F, *NewFunction, 0);
+
+  return createKernelWrapperFunctionImpl(F, *NewFunction, Suffix, OldSuffix);
+}
+
+llvm::CallInst *createCallToWrappedFunction(
+    llvm::Function &WrappedF, const llvm::SmallVectorImpl<llvm::Value *> &Args,
+    llvm::BasicBlock *BB, llvm::BasicBlock::iterator InsertPt,
+    llvm::StringRef Name) {
+  auto *const CI =
+      llvm::CallInst::Create(WrappedF.getFunctionType(), &WrappedF, Args);
+
+  CI->setName(Name);
+  CI->setCallingConv(WrappedF.getCallingConv());
+  CI->setAttributes(getCopiedFunctionAttrs(WrappedF));
+
+  if (BB) {
+    CI->insertInto(BB, InsertPt);
+
+    if (auto *const ParentF = BB->getParent()) {
+      // An inlinable function call in a function with debug info *must* be
+      // given a debug location.
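+      // Use an artificial 0:0 location; the synthetic call has no meaningful
+      // source position of its own.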
+      if (auto *const SP = ParentF->getSubprogram()) {
+        auto *const DbgLoc = llvm::DILocation::get(ParentF->getContext(),
+                                                   /*line*/ 0, /*col*/ 0, SP);
+        CI->setDebugLoc(DbgLoc);
+      }
+    }
+  }
+
+  return CI;
+}
+
+llvm::Value *createBinOpForRecurKind(llvm::IRBuilderBase &B, llvm::Value *LHS,
+                                     llvm::Value *RHS, llvm::RecurKind Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unexpected Kind");
+  case llvm::RecurKind::None:
+    return nullptr;
+  case llvm::RecurKind::Add:
+    return B.CreateAdd(LHS, RHS);
+  case llvm::RecurKind::Mul:
+    return B.CreateMul(LHS, RHS);
+  case llvm::RecurKind::Or:
+    return B.CreateOr(LHS, RHS);
+  case llvm::RecurKind::And:
+    return B.CreateAnd(LHS, RHS);
+  case llvm::RecurKind::Xor:
+    return B.CreateXor(LHS, RHS);
+  case llvm::RecurKind::SMin:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::smin, LHS, RHS);
+  case llvm::RecurKind::UMin:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::umin, LHS, RHS);
+  case llvm::RecurKind::SMax:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::smax, LHS, RHS);
+  case llvm::RecurKind::UMax:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::umax, LHS, RHS);
+  case llvm::RecurKind::FAdd:
+    return B.CreateFAdd(LHS, RHS);
+  case llvm::RecurKind::FMul:
+    return B.CreateFMul(LHS, RHS);
+  case llvm::RecurKind::FMin:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::minnum, LHS, RHS);
+  case llvm::RecurKind::FMax:
+    return B.CreateBinaryIntrinsic(llvm::Intrinsic::maxnum, LHS, RHS);
+  }
+}
+
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
new file mode 100644
index 0000000000000..c9d66624db7ef
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/pass_machinery.cpp
@@ -0,0 +1,134 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+
+using namespace llvm;
+
+namespace compiler {
+namespace utils {
+// Note that Clang has three on/off options for debugging pass managers:
+// `-fdebug-pass-manager`, `-fdebug-pass-structure`, and
+// `-fdebug-pass-arguments`.
+// LLVM's `opt` tool combines them all into one:
+//   --debug-pass-manager (Normal)
+//   --debug-pass-manager=verbose (Verbose)
+//   --debug-pass-manager=quiet (Quiet)
+// However, the mapping is not one-to-one:
+// opt:
+//   PrintPassOptions PrintPassOpts;
+//   PrintPassOpts.Verbose = DebugPM == DebugLogging::Verbose;
+//   PrintPassOpts.SkipAnalyses = DebugPM == DebugLogging::Quiet;
+//   StandardInstrumentations SI(DebugPM != DebugLogging::None, VerifyEachPass,
+//                               PrintPassOpts);
+// clang:
+//   bool DebugPassStructure = CodeGenOpts.DebugPass == "Structure";
+//   PrintPassOptions PrintPassOpts;
+//   PrintPassOpts.Indent = DebugPassStructure;
+//   PrintPassOpts.SkipAnalyses = DebugPassStructure;
+//   StandardInstrumentations SI(CodeGenOpts.DebugPassManager ||
+//                                   DebugPassStructure,
+//                               false, PrintPassOpts);
+// While clang also pushes `mdebug-pass` onto LLVM, it only works for the
+// legacy pass manager, and so we choose to only support and model the
+// `debug-pass-manager` form.
+DebugLogging DebugPasses;
+static cl::opt<DebugLogging, true> DebugPM(
+    "debug-pass-manager", cl::location(DebugPasses), cl::Hidden,
+    cl::ValueOptional, cl::desc("Print pass management debugging information"),
+    cl::init(DebugLogging::None),
+    cl::values(
+        clEnumValN(DebugLogging::Normal, "", ""),
+        clEnumValN(DebugLogging::Quiet, "quiet",
+                   "Skip printing info about analyses"),
+        clEnumValN(
+            DebugLogging::Verbose, "verbose",
+            "Print extra information about adaptors and pass managers")));
+
+bool VerifyEachIsEnabled;
+static cl::opt<bool, true> VerifyEach("verify-each",
+                                      cl::location(VerifyEachIsEnabled),
+                                      cl::desc("Verify after each transform"));
+
+PassMachinery::PassMachinery(LLVMContext &Ctx, TargetMachine *TM,
+                             bool VerifyEach, DebugLogging debugLogLevel)
+    : TM(TM) {
+  llvm::PrintPassOptions PrintPassOpts;
+  PrintPassOpts.Verbose = DebugPM == DebugLogging::Verbose;
+  PrintPassOpts.SkipAnalyses = DebugPM == DebugLogging::Quiet;
+  PrintPassOpts.Indent = debugLogLevel != DebugLogging::None;
+  SI = std::make_unique<StandardInstrumentations>(
+      Ctx, debugLogLevel != DebugLogging::None, VerifyEach, PrintPassOpts);
+}
+
+PassMachinery::~PassMachinery() {}
+
+void PassMachinery::initializeStart(PipelineTuningOptions PTO) {
+  const std::optional<PGOOptions> PGOOpt;
+  PB = PassBuilder(TM, PTO, PGOOpt, &PIC);
+}
+
+void PassMachinery::registerPasses() {
+  buildDefaultAAPipeline();
+  registerLLVMAnalyses();
+}
+
+void PassMachinery::initializeFinish() {
+  // Register LLVM analyses now, with the knowledge that users have had the
+  // chance to register their own versions of LLVM analyses first.
+  registerPasses();
+  // With all passes registered, cross-register all the proxies
+  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
+
+  // Allow registration of callbacks and instrumentation machinery
+  addClassToPassNames();
+  registerPassCallbacks();
+
+  // Register pass instrumentation
+  SI->registerCallbacks(PIC, &MAM);
+}
+
+void PassMachinery::buildDefaultAAPipeline() {
+  FAM.registerPass([&] { return PB.buildDefaultAAPipeline(); });
+}
+
+void PassMachinery::registerLLVMAnalyses() {
+  // Register standard analyses
+  PB.registerModuleAnalyses(MAM);
+  PB.registerCGSCCAnalyses(CGAM);
+  PB.registerFunctionAnalyses(FAM);
+  PB.registerLoopAnalyses(LAM);
+}
+
+} // namespace utils
+} // namespace compiler
+
+namespace compiler {
+namespace utils {
+/// Helper functions for printing
+void printPassName(StringRef PassName, raw_ostream &OS) {
+  OS << " " << PassName << "\n";
+}
+
+void printPassName(StringRef PassName, StringRef Params, raw_ostream &OS) {
+  OS << " " << PassName << "<" << Params << ">\n";
+}
+
+} // namespace utils
+} // namespace compiler
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
new file mode 100644
index 0000000000000..32d9feb5b41bd
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/prepare_barriers_pass.cpp
@@ -0,0 +1,131 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ca-barriers"
+
+PreservedAnalyses
+compiler::utils::PrepareBarriersPass::run(Module &M,
+                                          ModuleAnalysisManager &AM) {
+  SmallPtrSet<Function *, 4> Kernels;
+  auto &BI = AM.getResult<BuiltinInfoAnalysis>(M);
+  for (auto &F : M.functions()) {
+    if (isKernelEntryPt(F)) {
+      Kernels.insert(&F);
+    }
+  }
+
+  SmallPtrSet<Function *, 4> FuncsWithBarriers;
+
+  for (Function &F : M) {
+    const auto B = BI.analyzeBuiltin(F);
+    // If the function does not have a barrier ID.
+    if (!B || !BI.isMuxBuiltinWithBarrierID(B->ID)) {
+      continue;
+    }
+
+    for (User *U : F.users()) {
+      if (auto *const CI = dyn_cast<CallInst>(U)) {
+        auto *const Callee = CI->getFunction();
+
+        // If it's one of our kernels don't inline it, and definitely don't
+        // delete it either. No need to inline already dead functions, either!
+        if (!Callee->isDefTriviallyDead() && !Kernels.contains(Callee)) {
+          FuncsWithBarriers.insert(Callee);
+        }
+      }
+    }
+  }
+
+  bool Changed = false;
+
+  // Walk the users of the barrier.
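+  // Iteratively inline every non-kernel function that contains a barrier into
+  // its callers; callers which gain a barrier this way are processed too.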
+  while (!FuncsWithBarriers.empty()) {
+    auto *F = *FuncsWithBarriers.begin();
+    FuncsWithBarriers.erase(F);
+
+    // Make a copy of the users of the function to be inlined since
+    // InlineFunction modifies the state of ci/F which affects
+    // the range being iterated over, resulting in use-after-free.
+    const SmallVector<User *, 4> Users{F->user_begin(), F->user_end()};
+
+    // Inline the function into each of its call sites.
+    for (User *U : Users) {
+      // Skip any users that are not call instructions.
+      if (!isa<CallBase>(U)) {
+        continue;
+      }
+
+      auto *const InfoF = cast<CallBase>(U)->getFunction();
+      InlineFunctionInfo IFI;
+      auto InlineResult =
+          InlineFunction(*cast<CallBase>(U), IFI, /*MergeAttributes*/ false,
+                         /*CalleeAAR*/ nullptr, /*InsertLifetime*/ true,
+                         /*ForwardVarArgsTo*/ nullptr);
+      if (InlineResult.isSuccess()) {
+        Changed = true;
+
+        // The function we inlined into now contains a barrier, so add it
+        // to the set.
+        if (!InfoF->isDefTriviallyDead() && !Kernels.contains(InfoF)) {
+          FuncsWithBarriers.insert(InfoF);
+        }
+      } else {
+        LLVM_DEBUG(dbgs() << "Could not inline: " << *U << '\n';);
+      }
+    }
+
+    // Delete the now-dead inlined function
+    if (F->isDefTriviallyDead()) {
+      F->eraseFromParent();
+    }
+  }
+
+  // Assign all barriers a unique ID
+  unsigned ID = 0U;
+  auto &Ctx = M.getContext();
+  auto *const I32Ty = IntegerType::get(Ctx, 32);
+  for (auto *F : Kernels) {
+    for (BasicBlock &BB : *F) {
+      for (Instruction &I : BB) {
+        // Check call instructions for barrier.
+        if (auto *const CI = dyn_cast<CallInst>(&I)) {
+          if (Function *Callee = CI->getCalledFunction()) {
+            if (auto B = BI.analyzeBuiltin(*Callee)) {
+              if (BI.isMuxBuiltinWithBarrierID(B->ID)) {
+                CI->setOperand(0, ConstantInt::get(I32Ty, ID++));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
new file mode 100644
index 0000000000000..396bc347f7fa1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/replace_local_module_scope_variables_pass.cpp
@@ -0,0 +1,767 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+using namespace llvm;
+
+#define DEBUG_TYPE "replace-module-scope-vars"
+
+namespace {
+using AlignIntTy = uint64_t;
+
+// Creates and returns a new GEP instruction, inserted before input parameter
+// 'inst'. This GEP points to the element at 'index' of the struct living at
+// the final argument of each function.
+GetElementPtrInst *generateStructGEP(Instruction &inst,
+                                     StructType *funcsStructTy,
+                                     unsigned index) {
+  // find the function the instruction is in
+  auto func = inst.getFunction();
+
+  // the local module-scope variables struct we added to each function
+  auto funcsStruct = compiler::utils::getLastArgument(func);
+
+  assert(funcsStruct->getType()->isPointerTy());
+
+  // the type with which to index into our struct type
+  auto indexTy = Type::getInt32Ty(inst.getModule()->getContext());
+
+  // create a new GEP just before the instruction
+  auto GEP = GetElementPtrInst::CreateInBounds(
+      funcsStructTy, funcsStruct,
+      {ConstantInt::get(indexTy, 0), ConstantInt::get(indexTy, index)});
+  GEP->insertBefore(inst.getIterator());
+  return GEP;
+}
+
+// Given the type of a __local variable about to be added to the struct,
+// calculates and returns the alignment of the type.
+AlignIntTy calculateTypeAlign(Type *type, const DataLayout &layout) {
+  // Get underlying type if variable is an array
+  while (type->isArrayTy()) {
+    type = type->getArrayElementType();
+  }
+
+  // 3 component wide vectors have the size of 4 components according to the
+  // OpenCL spec section 6.1.5 'Alignment of Types'
+  unsigned int vectorWidth =
+      type->isVectorTy() ? multi_llvm::getVectorNumElements(type) : 1;
+  if (3 == vectorWidth) {
+    vectorWidth = 4;
+  }
+
+  // if we have a pointer type return the size of a pointer on the target
+  if (type->isPointerTy()) {
+    return layout.getPointerSize();
+  }
+
+  // size of member in bytes - at least 8 bits to avoid zero alignment on
+  // integer types smaller than i8.
+  const unsigned int vectorSize =
+      (std::max(type->getScalarSizeInBits(), 8u) * vectorWidth) / 8;
+
+  return vectorSize;
+}
+
+// Variables in the local address space not passed as arguments can only be
+// declared in the outermost scope of a kernel function. Here we find the
+// kernel function the local address space global resides in.
+Function *determineKernel(GlobalVariable &global) {
+  auto global_user = *(global.user_begin());
+  if (auto instruction = dyn_cast<Instruction>(global_user)) {
+    return instruction->getFunction();
+  } else if (ConstantVector *cv = dyn_cast<ConstantVector>(global_user)) {
+    User *cv_user = *(cv->user_begin());
+    auto instruction = cast<Instruction>(cv_user);
+    return instruction->getFunction();
+  } else if (global_user) {
+    global_user->print(errs());
+    llvm_unreachable("Unknown user used the local module-scope variable!");
+  }
+  return nullptr;
+}
+
+// Information associated with a local address space module-scope variable
+// that is needed to update its debug info metadata
+struct GlobalVarDebugInfoWrapper final {
+  // Byte offset into struct of replacement variables
+  unsigned offset;
+  // Associated debug info metadata entry
+  DIGlobalVariable *DIGlobal;
+  // Kernel function variable was defined in
+  Function *function;
+};
+
+// Check if a user is an instruction and if so add it to the Visited, Worklist
+// and FuncsToClone. If it's not an instruction, repeat for all its users.
+void checkUsersForInstructions(
+    User *user, llvm::SmallPtrSet<llvm::Function *, 4> &Visited,
+    llvm::SmallVector<llvm::Function *, 4> &FuncsToClone,
+    llvm::SmallPriorityWorklist<llvm::Function *, 4> &Worklist) {
+  if (auto *I = dyn_cast<Instruction>(user)) {
+    auto *F = I->getFunction();
+    if (Visited.insert(F).second) {
+      Worklist.insert(F);
+      FuncsToClone.push_back(F);
+      LLVM_DEBUG(
+          dbgs() << "Function '" << F->getName()
+                 << "' requires additional local module struct parameter\n");
+    }
+  } else {
+    for (auto *user_of_user : user->users()) {
+      checkUsersForInstructions(user_of_user, Visited, FuncsToClone, Worklist);
+    }
+  }
+}
+
+/// @brief Clone all required functions in a module, appending an extra
+/// parameter to them if they are part of the call graph required for access to
+/// local variables.
+///
+/// @param module llvm module containing the functions
+/// @param newParamType Type of the parameter to be added
+/// @param newParamAttrs Parameter attributes of the parameter to be added
+/// @return bool if the module has changed (currently always true)
+///
+/// This recurses through all the users of the local variables to look for any
+/// functions which use them, as well as assuming that the top level kernels
+/// must have them.
+bool addParamToAllRequiredFunctions(llvm::Module &module,
+                                    llvm::Type *const newParamType,
+                                    const llvm::AttributeSet &newParamAttrs) {
+  llvm::SmallPtrSet<llvm::Function *, 4> Visited;
+  llvm::SmallVector<llvm::Function *, 4> FuncsToClone;
+  llvm::SmallPriorityWorklist<llvm::Function *, 4> Worklist;
+
+  // Iterate through the top level functions checking if they are kernels.
+  for (auto &F : module.functions()) {
+    // Kernel entry points must present a consistent ABI to external users
+    if (compiler::utils::isKernelEntryPt(F)) {
+      Visited.insert(&F);
+      Worklist.insert(&F);
+      FuncsToClone.push_back(&F);
+      LLVM_DEBUG(
+          dbgs() << "Function '" << F.getName()
+                 << "' requires additional local module struct parameter\n");
+      continue;
+    }
+  }
+
+  // Check each global's users if they are instructions or recurse up the user
+  // chain if not. If an Instruction is found we add it to the functions to
+  // clone.
+  for (auto &global : module.globals()) {
+    for (auto *user : global.users()) {
+      checkUsersForInstructions(user, Visited, FuncsToClone, Worklist);
+    }
+  }
+
+  // Iterate over the functions that require local struct parameters and
+  // recursively register all callers of those functions as needing local
+  // struct parameters too.
+  while (!Worklist.empty()) {
+    Function *F = Worklist.pop_back_val();
+    for (auto *U : F->users()) {
+      if (auto *CB = dyn_cast<CallBase>(U)) {
+        auto *Caller = CB->getFunction();
+        if (Visited.insert(Caller).second) {
+          Worklist.insert(Caller);
+          FuncsToClone.push_back(Caller);
+          LLVM_DEBUG(dbgs() << "Function '" << Caller->getName()
+                            << "' requires local struct parameters\n");
+        }
+      } else {
+        report_fatal_error("unhandled user type");
+      }
+    }
+  }
+
+  // Ideally cloneFunctionsAddArg() would take a list of functions, but it
+  // currently takes a std::function, so we search the created vector of
+  // functions.
+ return compiler::utils::cloneFunctionsAddArg( + module, + [newParamType, newParamAttrs](llvm::Module &) { + return compiler::utils::ParamTypeAttrsPair{newParamType, newParamAttrs}; + }, + [&FuncsToClone](const llvm::Function &func, bool &ClonedWithBody, + bool &ClonedNoBody) { + ClonedWithBody = llvm::is_contained(FuncsToClone, &func); + ClonedNoBody = false; + }, + nullptr /*updateMetaDataCallback*/); +} + +} // namespace + +PreservedAnalyses compiler::utils::ReplaceLocalModuleScopeVariablesPass::run( + Module &M, ModuleAnalysisManager &) { + // the element types of the struct of replacement local module-scope + // variables we are replacing + SmallVector structElementTypes; + + // ordered list of kernel names which are used to find cached function + // types. StringRef is safe here because the names will be taken over from + // the old functions to the new ones. + SmallVector names; + + // unmodified function types of functions in the module + DenseMap functionTypes; + + for (auto &F : M.functions()) { + if (isKernel(F)) { + names.push_back(F.getName()); + functionTypes[F.getName()] = F.getFunctionType(); + } + } + + // a map from the original global variable to the index into + // structElementTypes + ValueMap index_map; + + // the global variables we need to process and remove + SmallVector globals; + + // maps variables in `globals` we're processing to helper information + // needed for updating debug info + DenseMap debug_info_map; + + // __local address space automatic variables are represented in the LLVM + // module as global variables with address space 3. + // + // This pass identifies these variables and places them into a struct + // allocated in a newly created wrapper function. A pointer to the struct + // is then passed via a parameter to the original kernel. + for (auto &global : M.globals()) { + // get the type of the global variable + const auto type = global.getType(); + + if (global.use_empty()) { + continue; + } + + if (type->isPointerTy() && + type->getPointerAddressSpace() == AddressSpace::Local) { + // and save that this is a global we care about + globals.push_back(&global); + } + } + + // if we found no local module-scope variables to be replaced... + if (globals.empty()) { + // ... then we're done! + return PreservedAnalyses::all(); + } + + // Pad struct so that members are aligned. + // + // Unlike x86, ARM architecture alignment can be different from the + // member size. So that __local alignment is OpenCL conformant + // we need to manually pad our struct. + // + // To do this we keep track of each local module-scope elements + // offset in the struct, and ensure that it is a multiple of + // that elements alignment. Finally we then align the whole struct + // to the largest alignment found out of all our __local members. + + // track largest member alignment found so far. + unsigned int maxAlignment = 0; + // byte offset in struct of current member + unsigned int offset = 0; + const auto &dl = M.getDataLayout(); + for (auto &global : globals) { + auto memberType = global->getValueType(); + + // alignment of the new struct member, in the case where we can't + // calculate this, e.g. struct types, use the alignment of the llvm + // global. This is also needed if '__attribute__(aligned)' was used to + // set a specific alignment. 
+ const unsigned int alignment = + std::max(global->getAlignment(), calculateTypeAlign(memberType, dl)); + assert(alignment > 0 && "'0' is an impossible alignment"); + + // check if this is the largest alignment seen so far + maxAlignment = std::max(alignment, maxAlignment); + + // check if member is not already aligned + const unsigned int remainder = offset % alignment; + if (0 != remainder) { + // calculate number of padding bytes + const unsigned int padding = alignment - remainder; + + // Use a byte array to pad struct rather than trying to create + // an arbitrary intNTy, since this may not be supported by the backend. + const auto padByteType = Type::getInt8Ty(M.getContext()); + const auto padByteArrayType = ArrayType::get(padByteType, padding); + structElementTypes.push_back(padByteArrayType); + + // bump offset by padding size + offset += padding; + } + + // we need the byte-offset when generating debug info + debug_info_map[global] = {offset, nullptr, nullptr}; + + // map the global variable to its index in structElementTypes + index_map[global] = structElementTypes.size(); + + // then add our element type to the struct + structElementTypes.push_back(memberType); + + // update the offset based on the type's size + auto allocSize = dl.getTypeAllocSize(memberType); + if (dl.getTypeAllocSize(memberType).isScalable()) { + // Not an assert because this can happen in user-supplied IR + report_fatal_error("Scalable types in local memory are not supported"); + } + const unsigned int totalSize = allocSize.getFixedValue(); + offset += totalSize; + } + + // create a struct containing all the local module-scope variables + auto structTy = StructType::create(structElementTypes, "localVarTypes"); + + // change all our functions to take a pointer to the new structTy we created + const AttributeSet defaultAttrs; + addParamToAllRequiredFunctions( + M, PointerType::get(M.getContext(), /*AddressSpace=*/0), defaultAttrs); + + // Check if we have debug info, if so we need to fix it up to turn global + // variable entries into local variable ones. + if (const auto NMD = M.getNamedMetadata("llvm.dbg.cu")) { + const DIBuilder DIB(M, /*AllowUnresolved*/ false); + + for (auto *CUOp : NMD->operands()) { + // Find module compilation unit + DICompileUnit *CU = cast(CUOp); + + // Check if there are any debug info global variables, as the DMA + // pass can create global variables without debug metadata attached. 
+ auto DIGlobalVariables = CU->getGlobalVariables(); + if (DIGlobalVariables.empty()) { + continue; + } + // Updated list of global debug info variables so that it no longer + // contains entries we will later replace with DILocalVariable metadata + SmallVector CU_DIExprs; + for (auto &global : M.globals()) { + // Get debug info expression for global variable + SmallVector Global_DIExprs; + global.getDebugInfo(Global_DIExprs); + + if (Global_DIExprs.empty()) { + continue; + } + + if (globals.end() == find(globals, &global)) { + // This is not a __local address space variable we will + // replace, so retain its debug info in the CU MDNode + CU_DIExprs.append(Global_DIExprs.begin(), Global_DIExprs.end()); + } else { + // We will replace this debug info variable later + assert(Global_DIExprs.size() == 1 && + "Only expecting a single debug info variable"); + debug_info_map[&global].DIGlobal = Global_DIExprs[0]->getVariable(); + } + } + CU->replaceGlobalVariables(MDTuple::get(M.getContext(), CU_DIExprs)); + } + } + + for (auto &global : globals) { + const SmallVector users(global->users()); + + for (auto *user : users) { + // if we have a constant expression, we need to force it back to a + // normal instruction, as we are removing the constant that the + // constant expression was associated with (we are removing the global + // variable), we can't use a constant expression to calculate the + // result. + if (auto *constant = dyn_cast(user)) { + replaceConstantExpressionWithInstruction(constant); + } + } + } + + for (auto &global : globals) { + if (debug_info_map[global].DIGlobal) { + // If global variable has debug info, find out what kernel the __local + // variable was defined in so we can use that information later. + debug_info_map[global].function = determineKernel(*global); + assert(debug_info_map[global].function); + } + + // For each user that matches a specific kind of instruction, we do 3 + // different things: + // 1) Create a GEP instruction to retrieve the address of the local + // version of 'global' in the newly created local struct. + // 2) We create a cast instruction to cast the type of the GEP created + // in 1) to the type of the global instruction. + // 3) Replace the use of the global instruction with the instruction + // created in 2). + const SmallVector users(global->users()); + for (auto *user : users) { + // if we have a GEP instruction... 
+ if (GetElementPtrInst *gep = dyn_cast(user)) { + auto local = generateStructGEP(*gep, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(gep->getIterator()); + + gep->setOperand(0, castedLocal); + gep->setIsInBounds(); + } else if (CastInst *cast = dyn_cast(user)) { + auto local = generateStructGEP(*cast, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(cast->getIterator()); + + cast->setOperand(0, castedLocal); + } else if (LoadInst *load = dyn_cast(user)) { + auto local = generateStructGEP(*load, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(load->getIterator()); + + load->setOperand(0, castedLocal); + } else if (StoreInst *store = dyn_cast(user)) { + auto local = generateStructGEP(*store, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(store->getIterator()); + + // global could be pointer or value operand of the store + if (store->getValueOperand() == global) { + store->setOperand(0, castedLocal); + } else { + store->setOperand(1, castedLocal); + } + } else if (ConstantVector *cv = dyn_cast(user)) { + // Because 'cv' is not an instruction, we have to iterate over all its + // users and do the work for all of them individually. + for (auto cvIt = cv->user_begin(); cvIt != cv->user_end();) { + auto cvUser = *cvIt++; + auto inst = ::cast(cvUser); + auto local = generateStructGEP(*inst, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(inst->getIterator()); + + auto indexTy = Type::getInt32Ty(M.getContext()); + Value *newCv = PoisonValue::get(cv->getType()); + + // We can't simply 'setOperand' in a 'ConstantVector'. We have to + // recreate it from scratch. + for (unsigned i = 0; i < cv->getNumOperands(); ++i) { + Instruction *newCvInst; + if (cv->getOperand(i) == global) { + newCvInst = InsertElementInst::Create( + newCv, castedLocal, ConstantInt::get(indexTy, i)); + } else { + newCvInst = InsertElementInst::Create( + newCv, cv->getOperand(i), ConstantInt::get(indexTy, i)); + } + newCvInst->insertBefore(inst->getIterator()); + newCv = newCvInst; + } + + // And don't forget to replace 'cv' by 'newCv'. + inst->replaceUsesOfWith(cv, newCv); + } + } else if (PHINode *phi = dyn_cast(user)) { + // Because we can't create 1) before a phi node, we have to create it + // before the terminator of the incoming block. 
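+        // Each incoming edge gets its own GEP and cast so that the new value
+        // dominates the phi's use on that edge.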
+ for (unsigned i = 0; i < phi->getNumIncomingValues(); ++i) { + if (phi->getIncomingValue(i) == global) { + auto incomingBlock = phi->getIncomingBlock(i); + auto incomingBlockT = incomingBlock->getTerminator(); + auto local = + generateStructGEP(*incomingBlockT, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(incomingBlockT->getIterator()); + + phi->setIncomingValue(i, castedLocal); + } + } + } else if (AtomicRMWInst *atomic = dyn_cast(user)) { + auto local = generateStructGEP(*atomic, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(atomic->getIterator()); + + // global could be pointer or value operand of the atomic + if (atomic->getPointerOperand() == global) { + atomic->setOperand(0, castedLocal); + } else { + atomic->setOperand(1, castedLocal); + } + } else if (auto *atomic = dyn_cast(user)) { + const auto local = + generateStructGEP(*atomic, structTy, index_map[global]); + const auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(atomic->getIterator()); + + // global could be the pointer + if (atomic->getPointerOperand() == global) { + atomic->setOperand(0, castedLocal); + } + // the comparison value + if (atomic->getCompareOperand() == global) { + atomic->setOperand(1, castedLocal); + } + // the new value + if (atomic->getNewValOperand() == global) { + atomic->setOperand(2, castedLocal); + } + } else if (SelectInst *select = dyn_cast(user)) { + auto local = generateStructGEP(*select, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(select->getIterator()); + + // global could be the true or false value of the select + if (select->getTrueValue() == global) { + select->setOperand(1, castedLocal); + } else { + select->setOperand(2, castedLocal); + } + } else if (CallInst *call = dyn_cast(user)) { + auto local = generateStructGEP(*call, structTy, index_map[global]); + + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(call->getIterator()); + + unsigned i = 0; + for (; i < call->getNumOperands(); ++i) { + if (call->getOperand(i) == global) { + call->setOperand(i, castedLocal); + } + } + } else if (InsertElementInst *insertIns = + dyn_cast(user)) { + auto local = generateStructGEP(*insertIns, structTy, index_map[global]); + auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(insertIns->getIterator()); + + // Update middle operand as the others are the vector and index + insertIns->setOperand(1, castedLocal); + } else if (auto *cmpIns = dyn_cast(user)) { + const auto local = + generateStructGEP(*cmpIns, structTy, index_map[global]); + const auto castedLocal = + CastInst::CreatePointerCast(local, global->getType()); + castedLocal->insertBefore(cmpIns->getIterator()); + + // global could be either side of the compare + if (cmpIns->getOperand(0) == global) { + cmpIns->setOperand(0, castedLocal); + } + if (cmpIns->getOperand(1) == global) { + cmpIns->setOperand(1, castedLocal); + } + } else { + user->print(errs()); + llvm_unreachable("Unknown user used the local module-scope variable!"); + } + } + } + + // lastly, we create a wrapper function with the original kernel signature + // of each kernel, which will alloca the struct for the remapped local + // module-scope 
variables + for (const auto &name : names) { + // the original kernel function + auto *kernelFunc = M.getFunction(name); + + // the original kernel function type, saved earlier + auto kernelFuncTy = functionTypes[name]; + + auto newFunc = + Function::Create(kernelFuncTy, kernelFunc->getLinkage(), "", &M); + + // copy over function parameter names + for (unsigned i = 0, e = newFunc->arg_size(); i != e; i++) { + newFunc->getArg(i)->setName(kernelFunc->getArg(i)->getName()); + } + // copy over function/parameter/ret attributes + copyFunctionAttrs(*kernelFunc, *newFunc, newFunc->arg_size()); + + auto baseName = getOrSetBaseFnName(*newFunc, *kernelFunc); + newFunc->setName(baseName + ".mux-local-var-wrapper"); + + // copy over function metadata + copyFunctionMetadata(*kernelFunc, *newFunc); + // drop the old function's kernel information - we've stolen it. + dropIsKernel(*kernelFunc); + + // copy the calling convention too + newFunc->setCallingConv(kernelFunc->getCallingConv()); + + // and clear spir_kernel from the original function + if (kernelFunc->getCallingConv() == llvm::CallingConv::SPIR_KERNEL) { + kernelFunc->setCallingConv(llvm::CallingConv::SPIR_FUNC); + } + + // we don't use exceptions + newFunc->addFnAttr(Attribute::NoUnwind); + + // next, set the function to always inline unless it has a noinline + // attribute. + if (!kernelFunc->hasFnAttribute(Attribute::NoInline)) { + kernelFunc->addFnAttr(Attribute::AlwaysInline); + } + + // lastly set the linkage to internal + kernelFunc->setLinkage(GlobalValue::InternalLinkage); + + // move debug info for function over + newFunc->setSubprogram(kernelFunc->getSubprogram()); + kernelFunc->setSubprogram(nullptr); + + // create an irbuilder and basic block for our new function + IRBuilder<> ir(BasicBlock::Create(newFunc->getContext(), "", newFunc)); + + // stack allocate the local module-scope variables struct + auto alloca = ir.CreateAlloca(structTy); + alloca->setAlignment(MaybeAlign(maxAlignment).valueOrOne()); + + // Generate debug info metadata for the globals we have replaced + // which previously had debug info attached + for (auto global : globals) { + auto debug_info_wrapper = debug_info_map[global]; + auto DIGlobal = debug_info_wrapper.DIGlobal; + if (!DIGlobal) { + // No debug info for GlobalVariable + continue; + } + + // Expression for byte offset in newly allocated struct where our + // replacement variable lives + const unsigned offset = debug_info_wrapper.offset; + const uint64_t dwPlusOp = dwarf::DW_OP_plus_uconst; + DIBuilder DIB(M, /*AllowUnresolved*/ false); + auto offset_expr = + DIB.createExpression(ArrayRef{dwPlusOp, offset}); + + // enqueued_kernel_scope is true if the variable was originally defined + // in kernelFunc, the kernel being enqueued by the user, rather than + // another kernel function being called by kernelFunc. + auto func = debug_info_wrapper.function; + const bool enqueued_kernel_scope = !func->getSubprogram(); + auto DISubprogram = enqueued_kernel_scope ? newFunc->getSubprogram() + : func->getSubprogram(); + + // We can't guarantee a subprogram for all functions. + // FIXME: Should we be able to? Do we need to clone subprograms somehow? + if (!DISubprogram) { + continue; + } + + // Create replacement debug metadata entry representing the global + // as a DILocalVariable in the kernel function scope. 
+ auto DILocal = DIB.createAutoVariable( + DISubprogram, DIGlobal->getName(), DIGlobal->getFile(), + DIGlobal->getLine(), dyn_cast(DIGlobal->getType())); + + // Insert debug declare intrinsic pointing to the location of + // the variable in our allocated struct + auto *location = + DILocation::get(DISubprogram->getContext(), DIGlobal->getLine(), + /*Column*/ 0, DISubprogram); + if (enqueued_kernel_scope) { + DIB.insertDeclare(alloca, DILocal, offset_expr, location, + alloca->getParent()); + } else { + // A pointer to our struct is passed as the last argument to each + // function, use this argument if the global came from another kernel + // function which is called by kernelFunc. + auto last_arg = func->arg_end() - 1; + DIB.insertDeclare(last_arg, DILocal, offset_expr, location, + func->getEntryBlock().getFirstNonPHIOrDbg()); + } + } + + // create a buffer for our args + SmallVector args; + + for (auto &arg : newFunc->args()) { + args.push_back(&arg); + } + + // add the new alloca for the local module-scope variables struct + args.push_back(alloca); + + // call the original function + auto ci = ir.CreateCall(kernelFunc, args); + ci->setCallingConv(kernelFunc->getCallingConv()); + ci->setAttributes(getCopiedFunctionAttrs(*kernelFunc)); + + // and return void + ir.CreateRetVoid(); + } + + // erase all the global variables that we have removed all uses for + for (auto global : globals) { + // Vecz generates constant vector with global variable with local scope. + // In this case, if we try to remove the global variable, llvm generates + // assert because there are still uses with constant vector in + // LLVMContext. As a result, if constant vector uses global variable with + // local scope, keep it. + bool keepIt = false; + for (auto *user : global->users()) { + if (isa(user)) { + keepIt = true; + break; + } + } + + if (!keepIt) { + global->eraseFromParent(); + } + } + + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp new file mode 100644 index 0000000000000..a05ff3e077c80 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/scheduling.cpp @@ -0,0 +1,155 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +namespace compiler { +namespace utils { + +static constexpr const char *WorkItemParamName = "MuxWorkItemInfo"; +static constexpr const char *WorkGroupParamName = "MuxWorkGroupInfo"; + +StructType *getWorkItemInfoStructTy(llvm::Module &M) { + LLVMContext &ctx = M.getContext(); + // Check whether this struct has previously been defined. 
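+  // (StructType::create below registers the name in the context, so repeated
+  // calls return the cached type instead of creating a suffixed duplicate.)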
+ if (auto *ty = StructType::getTypeByName(ctx, WorkItemParamName)) { + return ty; + } + auto *uint_type = Type::getInt32Ty(ctx); + auto *size_type = getSizeType(M); + auto *array_type = ArrayType::get(size_type, 3); + + SmallVector elements( + WorkItemInfoStructField::total); + + elements[WorkItemInfoStructField::local_id] = array_type; + elements[WorkItemInfoStructField::sub_group_id] = uint_type; + elements[WorkItemInfoStructField::num_sub_groups] = uint_type; + elements[WorkItemInfoStructField::max_sub_group_size] = uint_type; + + return StructType::create(elements, WorkItemParamName); +} + +StructType *getWorkGroupInfoStructTy(llvm::Module &M) { + LLVMContext &ctx = M.getContext(); + // Check whether this struct has previously been defined. + if (auto *ty = StructType::getTypeByName(ctx, WorkGroupParamName)) { + return ty; + } + auto *uint_type = Type::getInt32Ty(ctx); + auto *size_type = getSizeType(M); + auto *array_type = ArrayType::get(size_type, 3); + + SmallVector elements( + WorkGroupInfoStructField::total); + + elements[WorkGroupInfoStructField::group_id] = array_type; + elements[WorkGroupInfoStructField::num_groups] = array_type; + elements[WorkGroupInfoStructField::global_offset] = array_type; + elements[WorkGroupInfoStructField::local_size] = array_type; + elements[WorkGroupInfoStructField::work_dim] = uint_type; + + return StructType::create(elements, WorkGroupParamName); +} + +void populateStructSetterFunction(Function &F, Argument &structPtrArg, + StructType *const structTy, + uint32_t structFieldIdx, bool hasRankArg) { + assert(F.isDeclaration() && "Scrubbing existing function"); + + F.addFnAttr(Attribute::AlwaysInline); + F.setLinkage(GlobalValue::InternalLinkage); + + auto argIter = F.arg_begin(); + + Value *const indexArg = hasRankArg ? argIter++ : nullptr; + + Value *const valueArg = argIter++; + + IRBuilder<> ir(BasicBlock::Create(F.getContext(), "", &F)); + + SmallVector gep_indices{ir.getInt32(0), + ir.getInt32(structFieldIdx)}; + + if (hasRankArg) { + gep_indices.push_back(indexArg); + } + + assert(structPtrArg.getType()->isPointerTy() && + "Assuming a pointer type as the last argument"); + + Value *gep = ir.CreateGEP(structTy, &structPtrArg, gep_indices); + + ir.CreateStore(valueArg, gep); + + ir.CreateRetVoid(); +} + +void populateStructGetterFunction(llvm::Function &F, Argument &structPtrArg, + llvm::StructType *const structTy, + uint32_t structFieldIdx, bool hasRankArg, + size_t defaultValue) { + assert(F.isDeclaration() && "Scrubbing existing function"); + F.addFnAttr(Attribute::AlwaysInline); + F.setLinkage(GlobalValue::InternalLinkage); + + auto *indexArg = hasRankArg ? 
F.arg_begin() : nullptr; + + assert(structPtrArg.getType()->isPointerTy() && + "Assuming a pointer type as the last argument"); + + IRBuilder<> ir(BasicBlock::Create(F.getContext(), "", &F)); + + SmallVector gep_indices{ir.getInt32(0), + ir.getInt32(structFieldIdx)}; + + Value *ret = nullptr; + Value *cmp = nullptr; + + if (hasRankArg) { + // we have 3 dimensions; x, y & z + auto *maxValidIndex = ir.getInt32(3); + + cmp = ir.CreateICmp(CmpInst::ICMP_ULT, indexArg, maxValidIndex); + + auto *sel = ir.CreateSelect(cmp, indexArg, ir.getInt32(0)); + + gep_indices.push_back(sel); + } + + auto gep = ir.CreateGEP(structTy, &structPtrArg, gep_indices); + + ret = ir.CreateLoad(F.getReturnType(), gep); + + if (hasRankArg) { + ret = ir.CreateSelect(cmp, ret, + ConstantInt::get(F.getReturnType(), defaultValue)); + } + + ir.CreateRet(ret); +} + +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp new file mode 100644 index 0000000000000..8b421ccaf4c30 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/sub_group_analysis.cpp @@ -0,0 +1,172 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +namespace compiler { +namespace utils { + +GlobalSubgroupInfo::GlobalSubgroupInfo(Module &M, BuiltinInfo &BI) : BI(BI) { + SmallPtrSet UsesSubgroups; + SmallPriorityWorklist Worklist; + + for (auto &F : M) { + if (F.isDeclaration()) { + continue; + } + auto SGI = std::make_unique(); + + // Assume the 'mux-no-subgroups' attribute is correct. If a pass introduces + // the use of sub-groups, then it should remove the attribute itself! + if (hasNoExplicitSubgroups(F)) { + FunctionMap.insert({&F, std::move(SGI)}); + continue; + } + + for (auto &BB : F) { + for (const auto &I : BB) { + if (auto *const CI = dyn_cast(&I)) { + if (auto SGBuiltin = isMuxSubgroupBuiltin(CI->getCalledFunction())) { + // Only add each function to the worklist once + if (UsesSubgroups.insert(&F).second) { + Worklist.insert(&F); + } + // Track this function's use of this builtin + SGI->UsedSubgroupBuiltins.insert(SGBuiltin->ID); + } + } + } + } + FunctionMap.insert({&F, std::move(SGI)}); + } + + // Collect all functions that contain sub-group calls, including calls to + // other functions in the module that contain sub-group calls. + while (!Worklist.empty()) { + auto *const F = Worklist.pop_back_val(); + const auto &FSubgroups = FunctionMap[F]->UsedSubgroupBuiltins; + // Track which unique call-graph edges we've traversed, in case F ends up + // calling the same function multiple times. 
The set of builtins used by + // this item isn't going to change while we're working on it. + SmallPtrSet AlreadyUnioned; + for (auto *const U : F->users()) { + if (auto *const CI = dyn_cast(U)) { + auto *const CallerF = CI->getFunction(); + // If we haven't seen this function before, we need to process it and + // propagate its users. + if (UsesSubgroups.insert(CallerF).second) { + Worklist.insert(CallerF); + } + // If we've recorded that CallerF calls F for the first time in this + // loop, CallerF's set of used builtins gains all the builtins used by + // F. + if (AlreadyUnioned.insert(CallerF).second) { + auto &CallerSubgroups = FunctionMap[CallerF]->UsedSubgroupBuiltins; + // If the set union produces a new set... + if (set_union(CallerSubgroups, FSubgroups)) { + // ... we might have previously visited CallerF when it had fewer + // registered uses of sub-groups. Thus we need to stick it back on + // the worklist to propagate these to its users. + Worklist.insert(CallerF); + } + } + } + } + } +} + +bool GlobalSubgroupInfo::usesSubgroups(const llvm::Function &F) const { + auto I = FunctionMap.find(&F); + assert(I != FunctionMap.end() && "Missing entry for function"); + return !I->second->UsedSubgroupBuiltins.empty(); +} + +std::optional +GlobalSubgroupInfo::isMuxSubgroupBuiltin(const Function *F) const { + if (!F) { + return std::nullopt; + } + auto SGBuiltin = BI.analyzeBuiltin(*F); + if (!SGBuiltin) { + return std::nullopt; + } + + switch (SGBuiltin->ID) { + default: + break; + case eMuxBuiltinSubGroupBarrier: + case eMuxBuiltinGetSubGroupSize: + case eMuxBuiltinGetMaxSubGroupSize: + case eMuxBuiltinGetNumSubGroups: + case eMuxBuiltinGetSubGroupId: + case eMuxBuiltinGetSubGroupLocalId: + return SGBuiltin; + } + + if (auto GroupOp = BI.isMuxGroupCollective(SGBuiltin->ID); + GroupOp && GroupOp->isSubGroupScope()) { + return SGBuiltin; + } + + return std::nullopt; +} + +AnalysisKey SubgroupAnalysis::Key; + +SubgroupAnalysis::Result SubgroupAnalysis::run(Module &M, + ModuleAnalysisManager &AM) { + return GlobalSubgroupInfo(M, AM.getResult(M)); +} + +PreservedAnalyses SubgroupAnalysisPrinterPass::run(Module &M, + ModuleAnalysisManager &AM) { + const auto &Info = AM.getResult(M); + + for (auto &F : M) { + if (F.isDeclaration()) { + continue; + } + OS << "Function '" << F.getName() << "' uses"; + if (!Info.usesSubgroups(F)) { + OS << " no sub-group builtins\n"; + continue; + } + auto *FInfo = Info[&F]; + assert(FInfo && "Missing function info"); + const auto &UsedBuiltins = FInfo->UsedSubgroupBuiltins; + // Note: this output isn't stable and shouldn't be relied upon. It's mostly + // for developer analysis. + OS << " " << UsedBuiltins.size() << " sub-group builtin" + << (UsedBuiltins.size() == 1 ? 
"" : "s") << ": " + << static_cast(*UsedBuiltins.begin()); + for (auto B : + make_range(std::next(UsedBuiltins.begin()), UsedBuiltins.end())) { + OS << "," << static_cast(B); + } + OS << "\n"; + } + + return PreservedAnalyses::all(); +} +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp new file mode 100644 index 0000000000000..1b6f0de967602 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/target_extension_types.cpp @@ -0,0 +1,103 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include + +using namespace compiler::utils; +using namespace llvm; + +namespace compiler { +namespace utils { +namespace tgtext { + +Type *getEventTy(LLVMContext &Ctx) { + return TargetExtType::get(Ctx, "spirv.Event"); +} + +Type *getSamplerTy(LLVMContext &Ctx) { + return TargetExtType::get(Ctx, "spirv.Sampler"); +} + +[[maybe_unused]] static Type * +getImageTyHelper(LLVMContext &Ctx, ImageTyDimensionalityParam Dim, + ImageTyDepthParam Depth, ImageTyArrayedParam Arrayed, + ImageTyMSParam MS, ImageTySampledParam Sampled, + ImageTyAccessQualParam AccessQual) { + unsigned IntParams[7]; + IntParams[ImageTyDimensionalityIdx] = Dim; + IntParams[ImageTyDepthIdx] = Depth; + IntParams[ImageTyArrayedIdx] = Arrayed; + IntParams[ImageTyMSIdx] = MS; + IntParams[ImageTySampledIdx] = Sampled; + IntParams[ImageTyFormatIdx] = /*Unknown*/ 0; + IntParams[ImageTyAccessQualIdx] = AccessQual; + return TargetExtType::get(Ctx, "spirv.Image", Type::getVoidTy(Ctx), + IntParams); +} + +[[maybe_unused]] static Type * +getOpenCLImageTyHelper(LLVMContext &Ctx, ImageTyDimensionalityParam Dim, + ImageTyArrayedParam Arrayed, ImageTyDepthParam Depth, + ImageTyMSParam MS, ImageTyAccessQualParam AccessQual) { + return getImageTyHelper(Ctx, Dim, Depth, Arrayed, MS, ImageSampledRuntime, + AccessQual); +} + +[[maybe_unused]] static Type * +getOpenCLImageTyHelper(LLVMContext &Ctx, ImageTyDimensionalityParam Dim, + ImageTyArrayedParam Arrayed, + ImageTyAccessQualParam AccessQual) { + return getOpenCLImageTyHelper(Ctx, Dim, Arrayed, ImageDepthNone, + ImageMSSingleSampled, AccessQual); +} + +Type *getImage1DTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) { + return getOpenCLImageTyHelper(Ctx, ImageDim1D, ImageNonArrayed, AccessQual); +} + +Type *getImage1DArrayTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) { + return getOpenCLImageTyHelper(Ctx, ImageDim1D, ImageArrayed, AccessQual); +} + +Type *getImage1DBufferTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) { + return getOpenCLImageTyHelper(Ctx, ImageDimBuffer, ImageNonArrayed, + AccessQual); +} + +Type *getImage2DTy(LLVMContext &Ctx, bool 
Depth, bool MS, + ImageTyAccessQualParam AccessQual) { + return getOpenCLImageTyHelper( + Ctx, ImageDim2D, ImageNonArrayed, Depth ? ImageDepth : ImageDepthNone, + MS ? ImageMSMultiSampled : ImageMSSingleSampled, AccessQual); +} + +Type *getImage2DArrayTy(LLVMContext &Ctx, bool Depth, bool MS, + ImageTyAccessQualParam AccessQual) { + return getOpenCLImageTyHelper( + Ctx, ImageDim2D, ImageArrayed, Depth ? ImageDepth : ImageDepthNone, + MS ? ImageMSMultiSampled : ImageMSSingleSampled, AccessQual); +} + +Type *getImage3DTy(LLVMContext &Ctx, ImageTyAccessQualParam AccessQual) { + return getOpenCLImageTyHelper(Ctx, ImageDim3D, ImageNonArrayed, AccessQual); +} + +} // namespace tgtext +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp new file mode 100644 index 0000000000000..dafbd1484f3c8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/unique_opaque_structs_pass.cpp @@ -0,0 +1,284 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Defines the RenameStructsPass. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace compiler::utils; +using namespace llvm; + +/// @brief Indicates whether a function needs to be cloned. +/// +/// There are a few ways the undesirable types can exist in a function: +/// * As a return type. +/// * As a parameter type. +/// * As a call to a function returning undesirable type. +/// * The result of an alloca. +/// * Result of a cast of some type. +/// * Reference to a global. +/// +/// @param[in] StructTypeRemapper Map from suffixed opaque structs to +/// unsuffixed opaque structs. +/// @param[in] Function function to be checked for cloning. +/// +/// @return Whether function should be cloned. +/// @retval true if function should be cloned. +/// @retval false otherwise. +static bool shouldClone(compiler::utils::StructTypeRemapper &StructTypeRemapper, + const Function &Func) { + // First check the return type. + if (StructTypeRemapper.isRemapped(Func.getReturnType())) { + return true; + } + + // Then the arguments. + for (const Argument &Arg : Func.args()) { + if (StructTypeRemapper.isRemapped(Arg.getType())) { + return true; + } + } + + // Now look for specific instructions that could introduce the type. + for (auto &BB : Func) { + for (auto &I : BB) { + // We can catch any instruction that produces an undesirable type by + // just checking its type. + if (StructTypeRemapper.isRemapped(I.getType())) { + return true; + } + } + } + + // TODO: Check globals. + + // If an instruction makes use of a type but + // isn't of that type e.g. 
a cast, it will necessarily be caught by + // the above case, since it is a use of something which produced that + // type. + + // If we've got here, we've checked all the cases, so no need to clone. + return false; +} + +/// @brief Constructs a map of suffixed opaque structure types to their +/// unsuffixed versions. +/// +/// If a module references opaque structs that have identical names up to a +/// suffix within the context, e.g. opencl.event_t and opencl.event_t.0, this +/// function will return a map mapping the suffixed versions to the unsuffixed +/// versions e.g. map[opencl.event_t.0] = opencl.event_t. +/// +/// @param module Module referencing the types in the context. +/// +/// @return The map of suffixed structures to the unsuffixed structures. +static compiler::utils::StructMap +uniqueOpaqueSuffixedStructs(llvm::Module &module) { + StructMap map; + for (auto *structTy : module.getIdentifiedStructTypes()) { + if (!structTy->isOpaque()) { + continue; + } + + // Look up each struct in the module by name. + auto structName = structTy->getName(); + const char *Suffix = ".0123456789"; + + // Check whether there is a type in the context with the same name minus a + // suffix. + if (auto *ctxStructTy = llvm::StructType::getTypeByName( + module.getContext(), structName.rtrim(Suffix))) { + // Make sure it is also opaque. + if (!ctxStructTy->isOpaque()) { + continue; + } + + // If it isn't the same type as the first, map the suffixed + // type to the unsuffixed type. + if (ctxStructTy != structTy) { + map[structTy] = ctxStructTy; + } + } + } + return map; +} + +/// @brief Populates list of functions that need to be cloned. +/// +/// @param[in] Module module containing the functions to be inspected. +/// @param[in] StructTypeRemapper Map from suffixed opaque structs to +/// unsuffixed opaque structs. +/// @param[out] WorkList vector of functions that need to be processed. +static void +populateWorkList(Module &Module, + compiler::utils::StructTypeRemapper &StructTypeRemapper, + SmallVectorImpl<Function *> &WorkList) { + for (auto &Function : Module) { + // We don't need to touch intrinsics. + if (Function.isIntrinsic()) { + continue; + } + + // Check the function for undesirable types. + if (shouldClone(StructTypeRemapper, Function)) { + WorkList.push_back(&Function); + } + } +} + +static void removeOldFunctions(const SmallVectorImpl<Function *> &OldFuncs) { + // First we have to delete the bodies of the functions, otherwise we will + // get issues about uses missing their defs. + for (auto &OldFunc : OldFuncs) { + OldFunc->deleteBody(); + } + + // Now we can delete the actual functions. + for (auto &OldFunc : OldFuncs) { + OldFunc->eraseFromParent(); + } +} + +/// @brief Clones a list of functions, updating types within each function. +/// +/// Clones a list of functions updating the types of any instances of the +/// undesirable types according to the map that was passed to this pass. A new +/// call graph is constructed and the old functions' names are taken by the +/// new functions. +/// +/// @param[in] StructTypeRemapper Map from suffixed opaque structs to +/// unsuffixed opaque structs. +/// @param[in] OldFuncs list of functions to clone and update. +static void +replaceRemappedTypeRefs(compiler::utils::StructTypeRemapper &StructTypeRemapper, + const SmallVectorImpl<Function *> &OldFuncs) { + // Maps the old functions to their new versions with updated types.
+ // Note: it is important we do this before cloning to catch the case that + // functions A and B both need updating, but function A calls function B and + // A is processed before B, otherwise function calls won't be updated during + // the clone. + SmallDenseMap<Function *, Function *> FFMap; + for (auto &OldFunc : OldFuncs) { + auto *OldFuncTy = OldFunc->getFunctionType(); + // First map the return type. + auto *RetTy = StructTypeRemapper.remapType(OldFuncTy->getReturnType()); + + // Then map the parameter types. + SmallVector<Type *> ParamTys; + for (auto ParamTy : OldFuncTy->params()) { + ParamTys.push_back(StructTypeRemapper.remapType(ParamTy)); + } + + // Create the new function with updated types. + auto *NewFuncTy = FunctionType::get(RetTy, ParamTys, OldFuncTy->isVarArg()); + auto *NewFunc = Function::Create(NewFuncTy, OldFunc->getLinkage(), "", + OldFunc->getParent()); + NewFunc->setCallingConv(OldFunc->getCallingConv()); + + FFMap[OldFunc] = NewFunc; + } + + // Here we actually do the cloning. + for (auto &OldFunc : OldFuncs) { + // We construct a new value map on each iteration to avoid entries in the + // value map potentially being overwritten during cloning, which would then + // be used by subsequent loop iterations. + ValueToValueMapTy ValueMap; + for (auto &pair : FFMap) { + ValueMap[pair.getFirst()] = pair.getSecond(); + } + auto *NewFunc = FFMap[OldFunc]; + auto NewArgIterator = NewFunc->arg_begin(); + for (llvm::Argument &Arg : OldFunc->args()) { + NewArgIterator->setName(Arg.getName()); + ValueMap[&Arg] = &*(NewArgIterator++); + } + NewFunc->takeName(OldFunc); + + if (OldFunc->isDeclaration()) { + // Everything that follows requires a body. + continue; + } + + SmallVector<ReturnInst *> Returns; + CloneFunctionInto(NewFunc, OldFunc, ValueMap, + CloneFunctionChangeType::GlobalChanges, Returns, "", + /* CodeInfo */ nullptr, &StructTypeRemapper); + Returns.clear(); + + // It's possible we still have references to the old types in our + // new function; this can happen via allocas and casts as well as + // references to global variables. + for (auto &BB : *NewFunc) { + for (auto &I : BB) { + // Anything that defines an undesirable instance will get caught + // here. + I.mutateType(StructTypeRemapper.remapType(I.getType())); + + // GEP instructions need to be handled separately. + if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { + if (StructTypeRemapper.isRemapped(GEP->getSourceElementType())) { + GEP->setSourceElementType( + StructTypeRemapper.remapType(GEP->getSourceElementType())); + } + } + } + } + } + + // We can now remove any of the misnamed types and any functions that used + // them. + removeOldFunctions(OldFuncs); +} + +namespace compiler { +namespace utils { +PreservedAnalyses UniqueOpaqueStructsPass::run(Module &Module, + ModuleAnalysisManager &) { + // Find the opaque types in the module that have suffixes and map them to + // their unsuffixed versions. + auto StructMap = uniqueOpaqueSuffixedStructs(Module); + StructTypeRemapper StructTypeRemapper(StructMap); + + // Build the list of functions we need to process. + SmallVector<Function *> WorkList; + populateWorkList(Module, StructTypeRemapper, WorkList); + + // If the set is empty we have no work and can exit early. + if (WorkList.empty()) { + return PreservedAnalyses::all(); + } + + // Otherwise, clone the functions, updating the types. + replaceRemappedTypeRefs(StructTypeRemapper, WorkList); + + // We definitely cloned something by this point, so the module has been + // modified.
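+  // Conservatively report that no analyses are preserved: functions have been recreated and types rewritten.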
+ return PreservedAnalyses::none(); +} +} // namespace utils +} // namespace compiler diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp new file mode 100644 index 0000000000000..4569df09a5495 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/compiler_pipeline/source/work_item_loops_pass.cpp @@ -0,0 +1,1927 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "work-item-loops" + +namespace compiler { +namespace utils { + +/// @brief A subclass of the generic Barrier which is used by the +/// WorkItemLoopsPass. +/// +/// It adds additional fields used when creating wrapper kernels. +class BarrierWithLiveVars : public Barrier { +public: + BarrierWithLiveVars(llvm::Module &m, llvm::Function &f, + VectorizationInfo vf_info, bool IsDebug) + : Barrier(m, f, IsDebug), vf_info(vf_info) {} + + VectorizationInfo getVFInfo() const { return vf_info; } + + AllocaInst *getMemSpace() const { return mem_space; } + void setMemSpace(AllocaInst *ai) { mem_space = ai; } + + void setSize0(Value *v) { size0 = v; } + Value *getSize0() const { return size0; } + + void setTotalSize(Value *v) { totalSize = v; } + Value *getTotalSize() const { return totalSize; } + + Value *getStructSize() const { return structSize; } + void setStructSize(Value *v) { structSize = v; } + + AllocaInst *getDebugAddr() const { return debug_addr; } + void setDebugAddr(AllocaInst *ai) { debug_addr = ai; } + +private: + VectorizationInfo vf_info; + + // Alloca representing the memory for the live variables for a given kernel, + // with enough space for each individual work-item in a work-group to have + // its own view. + // + // This is typically used to hold Z*Y*(X/vec_width) individual instances of + // the live-variables structure. + AllocaInst *mem_space = nullptr; + + // Alloca holding the address of the live vars struct for the + // currently executing work item. 
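+  // The debug declares recreated by the pass dereference this alloca, so a debugger can find the current item's copy of each source variable inside the live-variables struct.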
+ AllocaInst *debug_addr = nullptr; + + // The number of items along the primary dimension + Value *size0 = nullptr; + + // The total number of items + Value *totalSize = nullptr; + + /// @brief The size of the struct in bytes, if the barrier contains + /// scalables + Value *structSize = nullptr; +}; + +} // namespace utils +} // namespace compiler + +namespace { + +struct ScheduleGenerator { + ScheduleGenerator(Module &m, + const compiler::utils::BarrierWithLiveVars &barrierMain, + const compiler::utils::BarrierWithLiveVars *barrierTail, + compiler::utils::BuiltinInfo &BI) + : module(m), context(m.getContext()), barrierMain(barrierMain), + barrierTail(barrierTail), BI(BI), i32Ty(Type::getInt32Ty(context)) { + set_local_id = + BI.getOrDeclareMuxBuiltin(compiler::utils::eMuxBuiltinSetLocalId, m); + set_subgroup_id = + BI.getOrDeclareMuxBuiltin(compiler::utils::eMuxBuiltinSetSubGroupId, m); + assert(set_local_id && set_subgroup_id && "Missing mux builtins"); + } + Module &module; + LLVMContext &context; + const compiler::utils::BarrierWithLiveVars &barrierMain; + const compiler::utils::BarrierWithLiveVars *barrierTail; + compiler::utils::BuiltinInfo &BI; + + SmallVector args; + Function *set_local_id = nullptr; + Function *set_subgroup_id = nullptr; + Type *const i32Ty; + + uint32_t workItemDim0 = 0; + uint32_t workItemDim1 = 1; + uint32_t workItemDim2 = 2; + Value *localSizeDim[3]; + + AllocaInst *nextID = nullptr; + Value *mainLoopLimit = nullptr; + Value *peel = nullptr; + bool noExplicitSubgroups = false; + bool emitTail = true; + bool wrapperHasMain = false; + bool wrapperHasTail = false; + + DILocation *wrapperDbgLoc = nullptr; + + Value * + createLinearLiveVarsPtr(const compiler::utils::BarrierWithLiveVars &barrier, + IRBuilder<> &ir, Value *index) { + Value *const mem_space = barrier.getMemSpace(); + if (!mem_space) { + return nullptr; + } + + // Calculate the offset for where the live variables of the current + // work item (within the nested loops) are stored. + // Loop i,j,k --> ((i * dim1) + j) * size0 + k + // memory access pattern should not depend on the vectorization + // dimension + + Value *live_var_ptr; + if (!barrier.getStructSize()) { + Value *const live_var_mem_idxs[] = {index}; + live_var_ptr = ir.CreateInBoundsGEP(barrier.getLiveVarsType(), mem_space, + live_var_mem_idxs); + } else { + // index into the byte buffer + auto *const byteOffset = ir.CreateMul(index, barrier.getStructSize()); + Value *const live_var_mem_idxs[] = {byteOffset}; + live_var_ptr = + ir.CreateInBoundsGEP(ir.getInt8Ty(), mem_space, live_var_mem_idxs); + } + + return live_var_ptr; + } + + Value *createLiveVarsPtr(const compiler::utils::BarrierWithLiveVars &barrier, + IRBuilder<> &ir, Value *dim_0, Value *dim_1, + Value *dim_2, Value *VF = nullptr) { + Value *const mem_space = barrier.getMemSpace(); + if (!mem_space) { + return nullptr; + } + + // Calculate the offset for where the live variables of the current + // work item (within the nested loops) are stored. + // Loop i,j,k --> ((i * dim1) + j) * size0 + k + // memory access pattern should not depend on the vectorization + // dimension + auto *const i_offset = ir.CreateMul(dim_2, localSizeDim[workItemDim1]); + auto *const j_offset = + ir.CreateMul(ir.CreateAdd(i_offset, dim_1), barrier.getSize0()); + auto *const k_offset = VF ? 
ir.CreateUDiv(dim_0, VF) : dim_0; + auto *const offset = ir.CreateAdd(j_offset, k_offset); + + return createLinearLiveVarsPtr(barrier, ir, offset); + } + + void + recreateDebugIntrinsics(const compiler::utils::BarrierWithLiveVars &barrier, + BasicBlock *block, StoreInst *SI) { + DIBuilder DIB(module, /*AllowUnresolved*/ false); + auto RecreateDebugIntrinsic = [&](DILocalVariable *const old_var, + const unsigned live_var_offset) { + const uint64_t dwPlusOp = dwarf::DW_OP_plus_uconst; + // Use a DWARF expression to point to byte offset in struct where + // the variable lives. This involves dereferencing the pointer + // stored in `live_vars_debug_addr` to get the start of the live + // vars struct, then using a byte offset into the struct for the + // particular variable. + auto expr = DIB.createExpression( + ArrayRef{dwarf::DW_OP_deref, dwPlusOp, live_var_offset}); + // Remap this debug variable to its new scope. + auto *new_var = DIB.createAutoVariable( + block->getParent()->getSubprogram(), old_var->getName(), + old_var->getFile(), old_var->getLine(), old_var->getType(), + /*AlwaysPreserve=*/false, DINode::FlagZero, + old_var->getAlignInBits()); + + // Create intrinsic + +#if LLVM_VERSION_LESS(21, 0) + assert(module.IsNewDbgInfoFormat && + "Modules should be using the new debug info format"); +#endif + auto *const DVR = + static_cast(cast(DIB.insertDeclare( + barrier.getDebugAddr(), new_var, expr, wrapperDbgLoc, block))); + + // This is nasty, but LLVM errors out on trailing debug info, we need a + // subsequent instruction even if we delete it immediately afterwards. + auto *DummyInst = new UnreachableInst(module.getContext(), block); + + // Bit of a HACK to produce the same debug output as the Mem2Reg + // pass used to do. + ConvertDebugDeclareToDebugValue(DVR, SI, DIB); + + DummyInst->eraseFromParent(); + }; + for (auto debug_pair : barrier.getDebugDbgVariableRecords()) { + RecreateDebugIntrinsic(debug_pair.first->getVariable(), + debug_pair.second); + } + } + + void + createWorkItemLoopBody(const compiler::utils::BarrierWithLiveVars &barrier, + IRBuilder<> &ir, BasicBlock *block, unsigned i, + Value *dim_0, Value *dim_1, Value *dim_2, + Value *accumulator = nullptr, Value *VF = nullptr, + Value *offset = nullptr) { + auto new_kernel_args = args; + if (accumulator) { + new_kernel_args.push_back(accumulator); + } + + // If the work item ID is a nullptr we take it to mean this barrier region + // doesn't need to use the barrier struct. + if (dim_0) { + assert(dim_1 && dim_2 && "unexpected null Work item IDs"); + + // set our local id + auto *const local_id = offset ? ir.CreateAdd(offset, dim_0) : dim_0; + ir.CreateCall(set_local_id, + {ConstantInt::get(i32Ty, workItemDim0), local_id}) + ->setCallingConv(set_local_id->getCallingConv()); + + auto *const live_var_ptr = + createLiveVarsPtr(barrier, ir, dim_0, dim_1, dim_2, VF); + if (live_var_ptr) { + new_kernel_args.push_back(live_var_ptr); + + if (auto *debug_addr = barrier.getDebugAddr()) { + // Update the alloca holding the address of the live vars struct for + // currently executing work item. + auto *const live_var_ptr_cast = + ir.CreatePointerBitCastOrAddrSpaceCast( + live_var_ptr, debug_addr->getAllocatedType()); + auto *const SI = ir.CreateStore(live_var_ptr_cast, debug_addr); + + // Recreate all the debug intrinsics pointing at location in live + // variables struct. We only need to do this once before the first + // barrier. 
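+          // All the declares reference the single debug_addr alloca, and the store above refreshes its contents per work-item, so later barrier regions need no further declares.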
+ if (i == compiler::utils::kBarrier_FirstID) { + recreateDebugIntrinsics(barrier, block, SI); + } + } + } + } + + auto &subkernel = *barrier.getSubkernel(i); + + // call the original function now we've setup all the info! + CallInst *ci = ir.CreateCall(&subkernel, new_kernel_args); + // add a debug location for this call so that later inlining correctly + // updates the debug metadata of all inlined instructions. + if (wrapperDbgLoc) { + ci->setDebugLoc(wrapperDbgLoc); + } + ci->setCallingConv(subkernel.getCallingConv()); + ci->setAttributes(compiler::utils::getCopiedFunctionAttrs(subkernel)); + + // And update the location of where we need to go to next (if we need to) + const auto &successors = barrier.getSuccessorIds(i); + if (successors.size() > 1) { + ir.CreateStore(ci, nextID); + } + } + + // Create a 1D loop to execute all the work items in a 'barrier', reducing + // across an accumulator. + std::pair + makeReductionLoop(const compiler::utils::BarrierWithLiveVars &barrier, + const compiler::utils::GroupCollective &WGC, + BasicBlock *block, Value *op, Value *accumulator) { + auto *const accTy = accumulator->getType(); + Function *const func = block->getParent(); + + // Induction variables + auto *const totalSize = barrier.getTotalSize(); + + compiler::utils::CreateLoopOpts inner_opts; + inner_opts.IVs = {accumulator}; + inner_opts.disableVectorize = true; + + BasicBlock *preheader = block; + BasicBlock *exitBlock = nullptr; + PHINode *resultPhi = nullptr; + + auto *const zero = + Constant::getNullValue(compiler::utils::getSizeType(module)); + + if (auto *const loopLimitConst = dyn_cast(totalSize)) { + if (loopLimitConst->isZeroValue()) { + // No iterations at all! + return {block, accumulator}; + } + preheader = block; + } else { + preheader = + BasicBlock::Create(context, "ca_work_group_reduce_preheader", func); + + exitBlock = + BasicBlock::Create(context, "ca_work_group_reduce_exit", func); + preheader->moveAfter(block); + exitBlock->moveAfter(preheader); + + auto *const needLoop = CmpInst::Create( + Instruction::ICmp, CmpInst::ICMP_NE, zero, totalSize, "", block); + + BranchInst::Create(preheader, exitBlock, needLoop, block); + + resultPhi = PHINode::Create(accTy, 2, "WGC_reduce", exitBlock); + resultPhi->addIncoming(accumulator, block); + } + + BasicBlock *latchBlock = nullptr; + + // linearly looping through the work items + exitBlock = compiler::utils::createLoop( + preheader, exitBlock, zero, totalSize, inner_opts, + [&](BasicBlock *block, Value *index, ArrayRef ivs, + MutableArrayRef ivsNext) -> BasicBlock * { + IRBuilder<> ir(block); + auto *const liveVars = createLinearLiveVarsPtr(barrier, ir, index); + compiler::utils::Barrier::LiveValuesHelper live_values(barrier, block, + liveVars); + + IRBuilder<> ir_load(block); + auto *const itemOp = + live_values.getReload(op, ir_load, "_load", /*reuse*/ true); + + // Do the reduction here.. 
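+          // For example, an integer add reduction emits acc.next = add(acc, item): the running value arrives in ivs[0] and is fed back through ivsNext[0].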
+ accumulator = compiler::utils::createBinOpForRecurKind( + ir, ivs[0], itemOp, WGC.Recurrence); + ivsNext[0] = accumulator; + latchBlock = block; + + return block; + }); + + if (!resultPhi) { + assert(exitBlock != latchBlock && "createLoop didn't create a loop"); + resultPhi = PHINode::Create(accTy, 1, "WGC_reduce", exitBlock); + } + resultPhi->addIncoming(accumulator, latchBlock); + return {exitBlock, resultPhi}; + } + + void getUniformValues(BasicBlock *block, + const compiler::utils::BarrierWithLiveVars &barrier, + MutableArrayRef values) { + auto *const zero = + Constant::getNullValue(compiler::utils::getSizeType(module)); + IRBuilder<> ir(block); + auto *const barrier0 = ir.CreateInBoundsGEP(barrier.getLiveVarsType(), + barrier.getMemSpace(), {zero}); + compiler::utils::Barrier::LiveValuesHelper live_values(barrier, block, + barrier0); + for (auto &value : values) { + value = live_values.getReload(value, ir, "_load", true); + } + } + + std::optional + getBarrierGroupCollective(const compiler::utils::BarrierWithLiveVars &Barrier, + unsigned BarrierID) { + auto *const BarrierCall = Barrier.getBarrierCall(BarrierID); + if (!BarrierCall) { + return std::nullopt; + } + + auto Builtin = BI.analyzeBuiltin(*BarrierCall->getCalledFunction()); + assert(Builtin && "Barrier call must be a known builtin"); + return BI.isMuxGroupCollective(Builtin->ID); + } + + std::tuple> + makeWorkGroupCollectiveLoops(BasicBlock *block, unsigned barrierID) { + auto *const groupCall = barrierMain.getBarrierCall(barrierID); + if (!groupCall) { + return {block, nullptr, std::nullopt}; + } + + auto Info = getBarrierGroupCollective(barrierMain, barrierID); + if (!Info || !Info->isWorkGroupScope()) { + return {block, nullptr, std::nullopt}; + } + + switch (Info->Op) { + case compiler::utils::GroupCollective::OpKind::Reduction: + case compiler::utils::GroupCollective::OpKind::All: + case compiler::utils::GroupCollective::OpKind::Any: { + auto *const ty = groupCall->getType(); + auto *const accumulator = + compiler::utils::getNeutralVal(Info->Recurrence, ty); + auto [loop_exit_block, accum] = makeReductionLoop( + barrierMain, *Info, block, groupCall->getOperand(1), accumulator); + if (barrierTail) { + auto *const groupTailInst = barrierTail->getBarrierCall(barrierID); + std::tie(loop_exit_block, accum) = + makeReductionLoop(*barrierTail, *Info, loop_exit_block, + groupTailInst->getOperand(1), accum); + } + if (groupCall->hasName()) { + accum->takeName(groupCall); + } + return std::make_tuple(loop_exit_block, accum, Info); + } + case compiler::utils::GroupCollective::OpKind::ScanInclusive: + case compiler::utils::GroupCollective::OpKind::ScanExclusive: { + auto *const ty = groupCall->getType(); + auto *const accumulator = + compiler::utils::getIdentityVal(Info->Recurrence, ty); + return {block, accumulator, Info}; + } + case compiler::utils::GroupCollective::OpKind::Broadcast: { + // First we need to get the item ID values from the barrier struct. + // These should be uniform but they may still be variables. It should + // be safe to get them from the barrier struct at index zero. + auto *const zero = + Constant::getNullValue(compiler::utils::getSizeType(module)); + + Function *const func = block->getParent(); + BasicBlock *mainUniformBlock = block; + BasicBlock *tailUniformBlock = nullptr; + + auto *const totalSize = barrierMain.getTotalSize(); + if (auto *const loopLimitConst = dyn_cast(totalSize)) { + // If we know for a fact that the main struct has at least one item, + // we can just use that. 
Otherwise, we need to use the tail struct. + if (loopLimitConst->isZeroValue()) { + mainUniformBlock = nullptr; + if (barrierTail) { + tailUniformBlock = block; + } + } + } else if (barrierTail) { + // If we have a variable number of main items, it could be zero at + // runtime, so we need an alternative way to get the values. + mainUniformBlock = + BasicBlock::Create(context, "ca_main_uniform_load", func); + tailUniformBlock = + BasicBlock::Create(context, "ca_tail_uniform_load", func); + + auto *const needTail = CmpInst::Create( + Instruction::ICmp, CmpInst::ICMP_EQ, totalSize, zero, "", block); + BranchInst::Create(tailUniformBlock, mainUniformBlock, needTail, block); + } + + if (!mainUniformBlock && !tailUniformBlock) { + return {block, nullptr, std::nullopt}; + } + + Value *idsMain[] = {zero, zero, zero}; + Value *idsTail[] = {zero, zero, zero}; + if (mainUniformBlock) { + idsMain[0] = groupCall->getOperand(2); + idsMain[1] = groupCall->getOperand(3); + idsMain[2] = groupCall->getOperand(4); + getUniformValues(mainUniformBlock, barrierMain, idsMain); + } + + if (tailUniformBlock) { + auto *const tailGroupCall = barrierTail->getBarrierCall(barrierID); + assert(tailGroupCall && + "No corresponding work group broadcast in tail kernel"); + idsTail[0] = tailGroupCall->getOperand(2); + idsTail[1] = tailGroupCall->getOperand(3); + idsTail[2] = tailGroupCall->getOperand(4); + getUniformValues(tailUniformBlock, *barrierTail, idsTail); + + if (mainUniformBlock) { + // If both barrier structs had to be used, we need to merge the + // result. + block = BasicBlock::Create(context, "ca_merge_uniform_load", func); + BranchInst::Create(block, tailUniformBlock); + BranchInst::Create(block, mainUniformBlock); + + for (size_t i = 0; i != 3; ++i) { + auto *mergePhi = PHINode::Create(idsMain[i]->getType(), 2, + "uniform_merge", block); + mergePhi->addIncoming(idsMain[i], mainUniformBlock); + mergePhi->addIncoming(idsTail[i], tailUniformBlock); + idsMain[i] = mergePhi; + } + } else { + // Otherwise we can use the tail. + for (size_t i = 0; i != 3; ++i) { + idsMain[i] = idsTail[i]; + } + } + } + + IRBuilder<> ir(block); + auto *const op = groupCall->getOperand(1); + + // Compute the address of the value in the main barrier struct + auto *const VF = ir.CreateElementCount( + compiler::utils::getSizeType(module), barrierMain.getVFInfo().vf); + auto *const liveVars = createLiveVarsPtr(barrierMain, ir, idsMain[0], + idsMain[1], idsMain[2], VF); + compiler::utils::Barrier::LiveValuesHelper live_values(barrierMain, block, + liveVars); + auto *const GEPmain = live_values.getGEP(op); + assert(GEPmain && "Could not get broadcasted value"); + + if (barrierTail) { + const bool VP = barrierTail->getVFInfo().IsVectorPredicated; + + // Compute the address of the value in the tail barrier struct + auto *const offsetDim0 = ir.CreateSub(idsMain[0], mainLoopLimit); + auto *const liveVarsTail = + createLiveVarsPtr(*barrierTail, ir, offsetDim0, idsMain[1], + idsMain[2], VP ? 
VF : nullptr); + compiler::utils::Barrier::LiveValuesHelper live_values( + *barrierTail, block, liveVarsTail); + + auto *const opTail = + barrierTail->getBarrierCall(barrierID)->getOperand(1); + auto *const GEPtail = live_values.getGEP(opTail); + assert(GEPtail && "Could not get tail-broadcasted value"); + + // Select the main GEP or the tail GEP to load from + auto *const cond = ir.CreateICmpUGE(idsMain[0], mainLoopLimit); + + auto *const select = ir.CreateSelect(cond, GEPtail, GEPmain); + + auto *const result = ir.CreateLoad(op->getType(), select); + result->takeName(groupCall); + + return {block, result, Info}; + } else { + auto *const result = ir.CreateLoad(op->getType(), GEPmain); + result->takeName(groupCall); + return {block, result, Info}; + } + } + default: + break; + } + return {block, nullptr, std::nullopt}; + } + + // Create loops to execute all the main work items, and then all the + // left-over tail work items at the end. + BasicBlock *makeWorkItemLoops(BasicBlock *block, unsigned barrierID) { + Value *accum = nullptr; + std::optional collective; + std::tie(block, accum, collective) = + makeWorkGroupCollectiveLoops(block, barrierID); + + // Work-group scans should be using linear work-item loops. + assert((!collective || !collective->isScan()) && "No support for scans"); + + auto *const zero = + Constant::getNullValue(compiler::utils::getSizeType(module)); + auto *const i32Zero = Constant::getNullValue(i32Ty); + auto *const func = block->getParent(); + + // The subgroup induction variable, set to the value of the subgroup ID at + // the end of the last loop (i.e. beginning of the next loop) + Value *nextSubgroupIV = i32Zero; + + // looping through num groups in the first (innermost) + // dimension + BasicBlock *mainPreheaderBB = block; + BasicBlock *mainExitBB = nullptr; + + // We need to ensure any subgroup IV is defined on the path in which + // the vector loop is skipped. + PHINode *subgroupMergePhi = nullptr; + + // If we are emitting a tail, we might need to bypass the vector loop (if + // the local size is less than the vector width). + if (emitTail) { + if (auto *const loopLimitConst = dyn_cast(mainLoopLimit)) { + if (loopLimitConst->isZeroValue()) { + // No vector iterations at all! 
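+            // The scalar tail loops below will cover the entire local range instead.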
+ mainPreheaderBB = nullptr; + mainExitBB = block; + } + } else { + mainPreheaderBB = BasicBlock::Create( + context, "ca_work_item_x_vector_preheader", func); + + mainExitBB = + BasicBlock::Create(context, "ca_work_item_x_vector_exit", func); + mainPreheaderBB->moveAfter(block); + mainExitBB->moveAfter(mainPreheaderBB); + + if (!noExplicitSubgroups) { + subgroupMergePhi = PHINode::Create(i32Ty, 2, "", mainExitBB); + subgroupMergePhi->addIncoming(i32Zero, block); + } + + auto *const needMain = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, zero, + mainLoopLimit, "", block); + + BranchInst::Create(mainPreheaderBB, mainExitBB, needMain, block); + } + } + + assert((mainPreheaderBB || !wrapperHasMain) && + "Vector loops in one barrier block but not another?"); + + if (mainPreheaderBB) { + wrapperHasMain = true; + // Subgroup induction variables + compiler::utils::CreateLoopOpts outer_opts; + if (!noExplicitSubgroups) { + outer_opts.IVs = {i32Zero}; + } + + // looping through num groups in the third (outermost) dimension + mainExitBB = compiler::utils::createLoop( + mainPreheaderBB, mainExitBB, zero, localSizeDim[workItemDim2], + outer_opts, + [&](BasicBlock *block, Value *dim_2, ArrayRef ivs2, + MutableArrayRef ivsNext2) -> BasicBlock * { + // if we need to set the local id, do so here. + IRBuilder<> ir(block); + ir.CreateCall(set_local_id, + {ConstantInt::get(i32Ty, workItemDim2), dim_2}) + ->setCallingConv(set_local_id->getCallingConv()); + + compiler::utils::CreateLoopOpts middle_opts; + middle_opts.IVs = ivs2.vec(); + + // looping through num groups in the second dimension + BasicBlock *exit1 = compiler::utils::createLoop( + block, nullptr, zero, localSizeDim[workItemDim1], middle_opts, + [&](BasicBlock *block, Value *dim_1, ArrayRef ivs1, + MutableArrayRef ivsNext1) -> BasicBlock * { + IRBuilder<> ir(block); + ir.CreateCall(set_local_id, + {ConstantInt::get(i32Ty, workItemDim1), dim_1}) + ->setCallingConv(set_local_id->getCallingConv()); + + // Materialize the scale factor at the beginning of the + // preheader + IRBuilder<> irph(mainPreheaderBB, + mainPreheaderBB->getFirstInsertionPt()); + auto *VF = irph.CreateElementCount( + compiler::utils::getSizeType(module), + barrierMain.getVFInfo().vf); + + compiler::utils::CreateLoopOpts inner_opts; + inner_opts.indexInc = VF; + inner_opts.IVs = ivs1.vec(); + + BasicBlock *exit0 = compiler::utils::createLoop( + block, nullptr, zero, mainLoopLimit, inner_opts, + [&](BasicBlock *block, Value *dim_0, + ArrayRef ivs0, + MutableArrayRef ivsNext0) -> BasicBlock * { + IRBuilder<> ir(block); + + if (!noExplicitSubgroups) { + // set our subgroup id + ir.CreateCall(set_subgroup_id, {ivs0[0]}) + ->setCallingConv( + set_subgroup_id->getCallingConv()); + } + + createWorkItemLoopBody(barrierMain, ir, block, + barrierID, dim_0, dim_1, dim_2, + accum, VF); + + if (!noExplicitSubgroups) { + nextSubgroupIV = + ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1)); + ivsNext0[0] = nextSubgroupIV; + } + + return block; + }); + + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. + ivsNext1[0] = nextSubgroupIV; + } + + return exit0; + }); + + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. 
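+            // The sub-group ID is threaded through the y and z loop phis, so it keeps incrementing across the whole work-group rather than resetting per row.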
+ ivsNext2[0] = nextSubgroupIV; + + if (subgroupMergePhi) { + subgroupMergePhi->addIncoming(nextSubgroupIV, exit1); + } + } + + return exit1; + }); + } + + // looping through num groups in the first + // (innermost) dimension + BasicBlock *tailPreheaderBB = mainExitBB; + BasicBlock *tailExitBB = nullptr; + + if (emitTail && peel) { + // We might need to bypass the tail loop. + if (auto *const peelConst = dyn_cast(peel)) { + if (peelConst->isZeroValue()) { + // No tail iterations at all! + tailPreheaderBB = nullptr; + tailExitBB = mainExitBB; + } + } else { + tailPreheaderBB = BasicBlock::Create( + context, "ca_work_item_x_scalar_preheader", func); + + tailExitBB = + BasicBlock::Create(context, "ca_work_item_x_scalar_exit", func); + tailPreheaderBB->moveAfter(mainExitBB); + tailExitBB->moveAfter(tailPreheaderBB); + + auto *const needPeeling = CmpInst::Create( + Instruction::ICmp, CmpInst::ICMP_NE, zero, peel, "", mainExitBB); + + BranchInst::Create(tailPreheaderBB, tailExitBB, needPeeling, + mainExitBB); + } + } else { + tailPreheaderBB = nullptr; + tailExitBB = mainExitBB; + } + + assert((tailPreheaderBB || !wrapperHasTail) && + "Tail loops in one barrier block but not another?"); + + if (tailPreheaderBB) { + assert(barrierTail); + wrapperHasTail = true; + // Subgroup induction variables + compiler::utils::CreateLoopOpts outer_opts; + if (!noExplicitSubgroups) { + outer_opts.IVs = {subgroupMergePhi ? subgroupMergePhi : nextSubgroupIV}; + } + + // looping through num groups in the third (outermost) dimension + tailExitBB = compiler::utils::createLoop( + tailPreheaderBB, tailExitBB, zero, localSizeDim[workItemDim2], + outer_opts, + [&](BasicBlock *block, Value *dim_2, ArrayRef ivs2, + MutableArrayRef ivsNext2) -> BasicBlock * { + // set the local id + IRBuilder<> ir(block); + ir.CreateCall(set_local_id, + {ConstantInt::get(i32Ty, workItemDim2), dim_2}) + ->setCallingConv(set_local_id->getCallingConv()); + + compiler::utils::CreateLoopOpts middle_opts; + middle_opts.IVs = ivs2.vec(); + + // looping through num groups in the second dimension + BasicBlock *exit1 = compiler::utils::createLoop( + block, nullptr, zero, localSizeDim[workItemDim1], middle_opts, + [&](BasicBlock *block, Value *dim_1, ArrayRef ivs1, + MutableArrayRef ivsNext1) -> BasicBlock * { + IRBuilder<> ir(block); + ir.CreateCall(set_local_id, + {ConstantInt::get(i32Ty, workItemDim1), dim_1}) + ->setCallingConv(set_local_id->getCallingConv()); + + compiler::utils::CreateLoopOpts inner_opts; + inner_opts.IVs = ivs1.vec(); + inner_opts.disableVectorize = true; + + BasicBlock *exit0 = compiler::utils::createLoop( + block, nullptr, zero, peel, inner_opts, + [&](BasicBlock *block, Value *dim_0, + ArrayRef ivs0, + MutableArrayRef ivsNext0) -> BasicBlock * { + IRBuilder<> ir(block); + + if (!noExplicitSubgroups) { + // set our subgroup id + ir.CreateCall(set_subgroup_id, {ivs0[0]}) + ->setCallingConv( + set_subgroup_id->getCallingConv()); + } + + createWorkItemLoopBody( + *barrierTail, ir, block, barrierID, dim_0, dim_1, + dim_2, accum, /*VF*/ nullptr, mainLoopLimit); + + if (!noExplicitSubgroups) { + nextSubgroupIV = + ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1)); + ivsNext0[0] = nextSubgroupIV; + } + + return block; + }); + + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. + ivsNext1[0] = nextSubgroupIV; + } + + return exit0; + }); + + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. 
+ ivsNext2[0] = nextSubgroupIV; + } + + return exit1; + }); + } + return tailExitBB; + } + + // Create loops to execute all work items in local linear ID order. + BasicBlock *makeLinearWorkItemLoops(BasicBlock *block, unsigned barrierID) { + Value *accum = nullptr; + std::optional collective; + std::tie(block, accum, collective) = + makeWorkGroupCollectiveLoops(block, barrierID); + + bool isScan = collective && collective->isScan(); + bool isExclusiveScan = + isScan && collective->Op == + compiler::utils::GroupCollective::OpKind::ScanExclusive; + // The scan types can differ between 'main' and 'tail' kernels. + bool isTailExclusiveScan = false; + if (isScan && barrierTail) { + const auto tailInfo = getBarrierGroupCollective(*barrierTail, barrierID); + assert(tailInfo && "No corresponding work group scan in tail kernel"); + isTailExclusiveScan = + tailInfo->Op == + compiler::utils::GroupCollective::OpKind::ScanExclusive; + } + + auto *const zero = + Constant::getNullValue(compiler::utils::getSizeType(module)); + auto *const i32Zero = Constant::getNullValue(i32Ty); + auto *const func = block->getParent(); + + // The subgroup induction variable, set to the value of the subgroup ID at + // the end of the last loop (i.e. beginning of the next loop) + Value *nextSubgroupIV = noExplicitSubgroups ? nullptr : i32Zero; + + // The work-group scan induction variable, set to the current scan value at + // the end of the last loop (i.e. beginning of the next loop) + Value *nextScanIV = isScan ? accum : nullptr; + + // We need to ensure any subgroup IV is defined on the path in which + // the vector loop is skipped. + PHINode *subgroupMergePhi = nullptr; + // Same with the scan IV + PHINode *scanMergePhi = nullptr; + + compiler::utils::CreateLoopOpts outer_opts; + outer_opts.IVs = {nextSubgroupIV, nextScanIV}; + outer_opts.loopIVNames = {"sg.z", "scan.z"}; + + // looping through num groups in the third (outermost) dimension + return compiler::utils::createLoop( + block, nullptr, zero, localSizeDim[workItemDim2], outer_opts, + [&](BasicBlock *block, Value *dim_2, ArrayRef ivs2, + MutableArrayRef ivsNext2) -> BasicBlock * { + // set the local id + IRBuilder<> ir(block); + ir.CreateCall(set_local_id, + {ConstantInt::get(i32Ty, workItemDim2), dim_2}) + ->setCallingConv(set_local_id->getCallingConv()); + + compiler::utils::CreateLoopOpts middle_opts; + middle_opts.IVs = ivs2.vec(); + middle_opts.loopIVNames = {"sg.y", "scan.y"}; + + // looping through num groups in the second dimension + BasicBlock *exit1 = compiler::utils::createLoop( + block, nullptr, zero, localSizeDim[workItemDim1], middle_opts, + [&](BasicBlock *block, Value *dim_1, ArrayRef ivs1, + MutableArrayRef ivsNext1) -> BasicBlock * { + IRBuilder<> ir(block); + ir.CreateCall(set_local_id, + {ConstantInt::get(i32Ty, workItemDim1), dim_1}) + ->setCallingConv(set_local_id->getCallingConv()); + + // looping through num groups in the first (innermost) + // dimension + BasicBlock *mainPreheaderBB = block; + BasicBlock *mainExitBB = nullptr; + + // If we are emitting a tail, we might need to bypass the + // main loop (if the local size is less than the main loop + // width). + if (emitTail) { + if (auto *const loopLimitConst = + dyn_cast(mainLoopLimit)) { + if (loopLimitConst->isZeroValue()) { + // No main iterations at all! 
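+                        // Fall straight through to the tail loops, carrying the incoming sub-group and scan IVs forward unchanged.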
+ mainPreheaderBB = nullptr; + mainExitBB = block; + if (!noExplicitSubgroups) { + nextSubgroupIV = ivs1[0]; + } + if (isScan) { + nextScanIV = ivs1[1]; + } + } + } else { + mainPreheaderBB = BasicBlock::Create( + context, "ca_work_item_x_main_preheader", func); + + mainExitBB = BasicBlock::Create( + context, "ca_work_item_x_main_exit", func); + mainPreheaderBB->moveAfter(block); + mainExitBB->moveAfter(mainPreheaderBB); + + if (!noExplicitSubgroups) { + subgroupMergePhi = + PHINode::Create(i32Ty, 2, "sg.merge", mainExitBB); + subgroupMergePhi->addIncoming(ivs1[0], block); + } + + if (isScan) { + scanMergePhi = PHINode::Create(accum->getType(), 2, + "scan.merge", mainExitBB); + scanMergePhi->addIncoming(ivs1[1], block); + } + + auto *const needMain = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, + zero, mainLoopLimit, "", block); + + BranchInst::Create(mainPreheaderBB, mainExitBB, needMain, + block); + } + } + + assert((mainPreheaderBB || !wrapperHasMain) && + "Main loops in one barrier block but not another?"); + + if (mainPreheaderBB) { + wrapperHasMain = true; + BasicBlock *mainLoopBB = nullptr; + + // Materialize the scale factor at the beginning of the + // preheader + IRBuilder<> irph(mainPreheaderBB, + mainPreheaderBB->getFirstInsertionPt()); + auto *VF = irph.CreateElementCount( + compiler::utils::getSizeType(module), + barrierMain.getVFInfo().vf); + + compiler::utils::CreateLoopOpts inner_vf_opts; + inner_vf_opts.indexInc = VF; + inner_vf_opts.IVs = ivs1.vec(); + inner_vf_opts.loopIVNames = {"sg.x.main", "scan.x.main"}; + + mainExitBB = compiler::utils::createLoop( + mainPreheaderBB, mainExitBB, zero, mainLoopLimit, + inner_vf_opts, + [&](BasicBlock *block, Value *dim_0, + ArrayRef ivs0, + MutableArrayRef ivsNext0) -> BasicBlock * { + IRBuilder<> ir(block); + + if (!noExplicitSubgroups) { + // set our subgroup id + ir.CreateCall(set_subgroup_id, {ivs0[0]}) + ->setCallingConv( + set_subgroup_id->getCallingConv()); + } + + if (isScan) { + auto *const barrierCall = + barrierMain.getBarrierCall(barrierID); + auto *const liveVars = createLiveVarsPtr( + barrierMain, ir, dim_0, dim_1, dim_2, VF); + compiler::utils::Barrier::LiveValuesHelper + live_values(barrierMain, block, liveVars); + auto *const itemOp = live_values.getReload( + barrierCall->getOperand(1), ir, "_load", + /*reuse*/ true); + nextScanIV = compiler::utils::createBinOpForRecurKind( + ir, ivs0[1], itemOp, collective->Recurrence); + accum = isExclusiveScan ? ivs0[1] : nextScanIV; + ivsNext0[1] = nextScanIV; + } + + createWorkItemLoopBody(barrierMain, ir, block, + barrierID, dim_0, dim_1, dim_2, + accum, VF); + + if (!noExplicitSubgroups) { + nextSubgroupIV = + ir.CreateAdd(ivs0[0], ConstantInt::get(i32Ty, 1), + "sg.x.main.inc"); + ivsNext0[0] = nextSubgroupIV; + } + + // Move the exit after the loop block, as it reads more + // logically. + mainLoopBB = block; + if (mainExitBB) { + mainExitBB->moveAfter(mainLoopBB); + } + + return block; + }); + + if (subgroupMergePhi) { + subgroupMergePhi->addIncoming(nextSubgroupIV, mainLoopBB); + nextSubgroupIV = subgroupMergePhi; + } + + if (scanMergePhi) { + scanMergePhi->addIncoming(nextScanIV, mainLoopBB); + nextScanIV = scanMergePhi; + } + } + assert(mainExitBB && "didn't create a loop exit block!"); + + // looping through num groups in the first + // (innermost) dimension + BasicBlock *tailPreheaderBB = mainExitBB; + BasicBlock *tailExitBB = nullptr; + + if (emitTail && peel) { + // We might need to bypass the tail loop. 
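+                    // A peel known to be zero drops the tail entirely; a peel only known at runtime gets the conditional branch below.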
+ if (auto *const peelConst = dyn_cast(peel)) { + if (peelConst->isZeroValue()) { + // No tail iterations at all! + tailPreheaderBB = nullptr; + tailExitBB = mainExitBB; + } + } else { + tailPreheaderBB = BasicBlock::Create( + context, "ca_work_item_x_tail_preheader", func); + + tailExitBB = BasicBlock::Create( + context, "ca_work_item_x_tail_exit", func); + tailPreheaderBB->moveAfter(mainExitBB); + tailExitBB->moveAfter(tailPreheaderBB); + + auto *const needPeeling = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, + zero, peel, "", mainExitBB); + + BranchInst::Create(tailPreheaderBB, tailExitBB, needPeeling, + mainExitBB); + } + } else { + tailPreheaderBB = nullptr; + tailExitBB = mainExitBB; + } + + assert((tailPreheaderBB || !wrapperHasTail) && + "Tail loops in one barrier block but not another?"); + + if (tailPreheaderBB) { + assert(barrierTail); + wrapperHasTail = true; + // Subgroup induction variables + SmallVector subgroupIVs0 = {nextSubgroupIV, + nextScanIV}; + + BasicBlock *tailLoopBB = nullptr; + if (barrierTail->getVFInfo().IsVectorPredicated) { + IRBuilder<> ir(tailPreheaderBB); + if (!noExplicitSubgroups) { + // set our subgroup id + ir.CreateCall(set_subgroup_id, {subgroupIVs0[0]}) + ->setCallingConv(set_subgroup_id->getCallingConv()); + } + + if (isScan) { + assert(barrierTail); + auto *const barrierCall = + barrierTail->getBarrierCall(barrierID); + auto *const liveVars = createLiveVarsPtr( + *barrierTail, ir, zero, dim_1, dim_2, nullptr); + compiler::utils::Barrier::LiveValuesHelper live_values( + *barrierTail, tailPreheaderBB, liveVars); + auto *const itemOp = live_values.getReload( + barrierCall->getOperand(1), ir, "_load", + /*reuse*/ true); + nextScanIV = compiler::utils::createBinOpForRecurKind( + ir, subgroupIVs0[1], itemOp, collective->Recurrence); + accum = + isTailExclusiveScan ? subgroupIVs0[1] : nextScanIV; + } + + createWorkItemLoopBody(*barrierTail, ir, tailPreheaderBB, + barrierID, zero, dim_1, dim_2, accum, + /*VF*/ nullptr, mainLoopLimit); + + if (!noExplicitSubgroups) { + nextSubgroupIV = ir.CreateAdd(subgroupIVs0[0], + ConstantInt::get(i32Ty, 1), + "sg.x.tail.inc"); + } + + assert(tailExitBB); + ir.CreateBr(tailExitBB); + tailLoopBB = tailPreheaderBB; + } else { + compiler::utils::CreateLoopOpts inner_scalar_opts; + inner_scalar_opts.disableVectorize = true; + inner_scalar_opts.IVs.assign(subgroupIVs0.begin(), + subgroupIVs0.end()); + inner_scalar_opts.loopIVNames = {"sg.x.tail", + "scan.x.tail"}; + + tailExitBB = compiler::utils::createLoop( + tailPreheaderBB, tailExitBB, zero, peel, + inner_scalar_opts, + [&](BasicBlock *block, Value *dim_0, + ArrayRef ivs0, + MutableArrayRef ivsNext0) -> BasicBlock * { + IRBuilder<> ir(block); + + if (!noExplicitSubgroups) { + // set our subgroup id + ir.CreateCall(set_subgroup_id, {ivs0[0]}) + ->setCallingConv( + set_subgroup_id->getCallingConv()); + } + + if (isScan) { + assert(barrierTail); + auto *const barrierCall = + barrierTail->getBarrierCall(barrierID); + auto *const liveVars = createLiveVarsPtr( + *barrierTail, ir, dim_0, dim_1, dim_2, nullptr); + compiler::utils::Barrier::LiveValuesHelper + live_values(*barrierTail, block, liveVars); + auto *const itemOp = live_values.getReload( + barrierCall->getOperand(1), ir, "_load", + /*reuse*/ true); + nextScanIV = + compiler::utils::createBinOpForRecurKind( + ir, ivs0[1], itemOp, + collective->Recurrence); + accum = isTailExclusiveScan ? 
ivs0[1] : nextScanIV; + ivsNext0[1] = nextScanIV; + } + + createWorkItemLoopBody( + *barrierTail, ir, block, barrierID, dim_0, dim_1, + dim_2, accum, /*VF*/ nullptr, mainLoopLimit); + + if (!noExplicitSubgroups) { + nextSubgroupIV = ir.CreateAdd( + ivs0[0], ConstantInt::get(i32Ty, 1), + "sg.x.tail.inc"); + ivsNext0[0] = nextSubgroupIV; + } + + tailLoopBB = block; + // Move the exit after the loop block, as it reads + // more logically. + if (tailExitBB) { + tailExitBB->moveAfter(tailLoopBB); + } + + return block; + }); + } + + // Merge the main and tail subgroup IVs together in the + // tail exit, since we may have skipped either main or + // tail loops. + if (subgroupMergePhi) { + auto *scalarSubgroupIV = nextSubgroupIV; + nextSubgroupIV = PHINode::Create( + i32Ty, 2, "sg.main.tail.merge", tailExitBB); + cast(nextSubgroupIV) + ->addIncoming(scalarSubgroupIV, tailLoopBB); + cast(nextSubgroupIV) + ->addIncoming(subgroupMergePhi, mainExitBB); + } + + if (scanMergePhi) { + auto *scalarScanIV = nextScanIV; + nextScanIV = + PHINode::Create(accum->getType(), 2, + "scan.main.tail.merge", tailExitBB); + cast(nextScanIV) + ->addIncoming(scalarScanIV, tailLoopBB); + cast(nextScanIV) + ->addIncoming(scanMergePhi, mainExitBB); + } + } + + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. + ivsNext1[0] = nextSubgroupIV; + } + + if (isScan) { + // ... or the scan IV phi. + ivsNext1[1] = nextScanIV; + } + + return tailExitBB; + }); + + if (!noExplicitSubgroups) { + // Don't forget to update the subgroup IV phi. + ivsNext2[0] = nextSubgroupIV; + } + if (isScan) { + // ... or the scan IV phi. + ivsNext2[1] = nextScanIV; + } + + return exit1; + }); + } + + // It executes only the first work item in the work group + BasicBlock *makeRunOneWorkItem(BasicBlock *block, unsigned barrierID) { + // "Once" scheduled barriers shouldn't need the local id set. + IRBuilder<> ir(block); + createWorkItemLoopBody(barrierTail ? *barrierTail : barrierMain, ir, block, + barrierID, nullptr, nullptr, nullptr, nullptr); + return block; + } +}; + +// Emits code to set up the storage allocated to a live-vars structure. +// +// Allocates enough space for sizeZ * sizeY * sizeX work-items. Note that Z/Y/X +// here corresponds to the current outermost to innermost vectorized +// dimensions, rather than in their absolutist sense. +void setUpLiveVarsAlloca(compiler::utils::BarrierWithLiveVars &barrier, + IRBuilder<> &B, Value *const sizeZ, Value *const sizeY, + Value *const sizeX, StringRef name, bool isDebug) { + barrier.setSize0(sizeX); + Value *const live_var_size = B.CreateMul(sizeX, B.CreateMul(sizeY, sizeZ)); + barrier.setTotalSize(live_var_size); + AllocaInst *live_var_mem_space; + auto &m = *B.GetInsertBlock()->getModule(); + auto *const size_ty = compiler::utils::getSizeType(m); + const auto scalablesSize = barrier.getLiveVarMemSizeScalable(); + if (scalablesSize == 0) { + live_var_mem_space = + B.CreateAlloca(barrier.getLiveVarsType(), live_var_size, name); + live_var_mem_space->setAlignment( + MaybeAlign(barrier.getLiveVarMaxAlignment()).valueOrOne()); + barrier.setMemSpace(live_var_mem_space); + } else { + const auto fixedSize = barrier.getLiveVarMemSizeFixed(); + // We ensure that the VFs are the same between the main and tail. 
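+    // Each work-item's slot occupies structSize = fixedSize + vscale * scalablesSize bytes, so the buffer allocated below totals structSize * (sizeX * sizeY * sizeZ) bytes.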
+ auto *const vscale = + B.CreateElementCount(size_ty, ElementCount::getScalable(scalablesSize)); + auto *const structSize = + B.CreateAdd(vscale, ConstantInt::get(size_ty, fixedSize)); + auto *const buffer_size = B.CreateMul(structSize, live_var_size); + + live_var_mem_space = B.CreateAlloca(B.getInt8Ty(), buffer_size, name); + live_var_mem_space->setAlignment( + MaybeAlign(barrier.getLiveVarMaxAlignment()).valueOrOne()); + barrier.setMemSpace(live_var_mem_space); + barrier.setStructSize(structSize); + } + + if (isDebug) { + barrier.setDebugAddr(B.CreateAlloca(live_var_mem_space->getType(), nullptr, + "live_vars_peel_dbg")); + } +} + +} // namespace + +Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction( + BarrierWithLiveVars &barrierMain, BarrierWithLiveVars *barrierTail, + StringRef baseName, Module &M, compiler::utils::BuiltinInfo &BI) { + Function &mainF = barrierMain.getFunc(); + + // The reference function is that which we expect to hold the reference + // version of various pieces of data, such as metadata. It's the tail + // function if one exists, else it's the main function. + Function &refF = barrierTail ? barrierTail->getFunc() : barrierMain.getFunc(); + + const bool emitTail = barrierTail != nullptr; + + auto mainInfo = barrierMain.getVFInfo(); + auto tailInfo = + emitTail ? barrierTail->getVFInfo() : std::optional(); + + const auto workItemDim0 = 0; + const auto workItemDim1 = 1; + const auto workItemDim2 = 2; + + LLVMContext &context = M.getContext(); + + Function *new_wrapper = + createKernelWrapperFunction(mainF, ".mux-barrier-wrapper"); + + new_wrapper->setName(baseName + ".mux-barrier-wrapper"); + // Ensure the base name is recorded + setBaseFnName(*new_wrapper, baseName); + + // An inlinable function call in a function with debug info *must* be given + // a debug location. + DILocation *wrapperDbgLoc = nullptr; + if (new_wrapper->getSubprogram()) { + wrapperDbgLoc = DILocation::get(context, /*line*/ 0, /*col*/ 0, + new_wrapper->getSubprogram()); + } + + IRBuilder<> entryIR(BasicBlock::Create(context, "entry", new_wrapper)); + + auto *const i32Ty = Type::getInt32Ty(context); + + auto sizeTyBytes = getSizeTypeBytes(M); + + auto *VF = entryIR.CreateElementCount(compiler::utils::getSizeType(M), + barrierMain.getVFInfo().vf); + Value *localSizeDim[3]; + + if (auto wgs = parseRequiredWGSMetadata(refF)) { + localSizeDim[0] = entryIR.getIntN(8 * sizeTyBytes, (*wgs)[0]); + localSizeDim[1] = entryIR.getIntN(8 * sizeTyBytes, (*wgs)[1]); + localSizeDim[2] = entryIR.getIntN(8 * sizeTyBytes, (*wgs)[2]); + } else { + const uint32_t max_work_dim = parseMaxWorkDimMetadata(refF).value_or(3); + + // Fill out a default local size of 1x1x1. 
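+    // Only dimensions below max_work_dim are queried with __mux_get_local_size; the remaining entries keep this default.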
+ std::fill(std::begin(localSizeDim), std::end(localSizeDim), + entryIR.getIntN(8 * sizeTyBytes, 1)); + + auto *const get_local_size = + BI.getOrDeclareMuxBuiltin(eMuxBuiltinGetLocalSize, M); + assert(get_local_size && "Missing __mux_get_local_size"); + + auto ci0 = + entryIR.CreateCall(get_local_size, entryIR.getInt32(0), "local_size.x"); + ci0->setCallingConv(get_local_size->getCallingConv()); + localSizeDim[0] = ci0; + + if (max_work_dim > 1) { + auto ci1 = entryIR.CreateCall(get_local_size, entryIR.getInt32(1), + "local_size.y"); + ci1->setCallingConv(get_local_size->getCallingConv()); + localSizeDim[1] = ci1; + } + + if (max_work_dim > 2) { + auto ci2 = entryIR.CreateCall(get_local_size, entryIR.getInt32(2), + "local_size.z"); + ci2->setCallingConv(get_local_size->getCallingConv()); + localSizeDim[2] = ci2; + } + } + + // Assume that local sizes are never zero. This prevents LLVM "saving" our + // loops by inserting llvm.umax (or its equivalent) to stop the loops we're + // about to create from causing headaches: + // %iv.next = add i64 nuw %iv, 1 + // %exit = icmp eq i64 %iv.next, %localsizeY + // br i1 %exit, label %exit.the.loop, label %continue.the.loop + // If LLVM doesn't know that %localsizeY is never zero, it rightly determines + // that a zero size would cause problems, since we'd have to overflow our i64 + // to exit the loop, but we've marked the increment as 'nuw'. So it inserts + // an llvm.umax to ensure the size is at least 1. Since we know our local + // sizes are never zero, an llvm.assume intrinsic prevents this from + // happening. + // We want to insert a call to __mux_set_max_sub_group_size after these + // assumptions, to keep track of the last one we've inserted. + for (auto i = 0; i < 3; i++) { + auto *const nonZero = entryIR.CreateICmpNE( + localSizeDim[i], ConstantInt::get(localSizeDim[i]->getType(), 0)); + entryIR.CreateAssumption(nonZero); + } + + // There are four cases: + // + // 1. If !emitTail: in this case, only the main function will be called. The + // main function may be a scalar function, may be a predicated vector + // function, or may be an unpredicated vector function where the local size is + // known to be a multiple of the vectorization factor. + // + // 2. Otherwise, if tailInfo->IsVectorPredicated: in this case, the main + // function will be unpredicated and will be called for any multiples of vf, + // and one tail call will handle any remainder. vf of the main function and + // the tail function are the same. + // + // 3. Otherwise, if hasNoExplicitSubgroups(refF): in this case, the main + // function will be unpredicated and will be called for any multiples of vf, + // and one tail loop will handle any remainder. vf of the main function is + // used. + // + // 4. Otherwise: if local_size_x is a multiple of the main function's vf, the + // main function will handle the full loop and the main function's vf is used, + // else the tail function will handle the full loop and the tail function's vf + // is used. + // + // Unless hasNoExplicitSubgroups(refF), the subgroups are calculated as + // + // get_max_sub_group_size() = min(vf, local_size_x) + // get_num_sub_groups() = ((local_size_x + vf - 1) / vf) + // * local_size_y * local_size_z + // + // If hasNoExplicitSubgroups(refF) (even for cases 1 and 2), the subgroups are + // not calculated.
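+ // As a worked example of the formulae above (illustrative numbers only): + // with vf == 8 and a 19 x 2 x 1 local size, + // get_max_sub_group_size() = min(8, 19) = 8 + // get_num_sub_groups() = ((19 + 8 - 1) / 8) * 2 * 1 = 3 * 2 = 6, + // i.e., each row of 19 work-items is two full sub-groups of 8 plus one + // remainder sub-group of 3.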
+ + const bool noExplicitSubgroups = hasNoExplicitSubgroups(refF); + + Value *mainLoopLimit = localSizeDim[workItemDim0]; + Value *peel = nullptr; + + Value *effectiveVF = VF; + + if (emitTail) { + auto *const rem = entryIR.CreateSRem(mainLoopLimit, VF, "rem"); + if (tailInfo->IsVectorPredicated || noExplicitSubgroups) { + peel = rem; + } else { + // We must have no more than one iteration with a subgroup size below the + // maximum subgroup size. To meet this requirement, if the tail is scalar + // and the vector size does not divide the workgroup size, do not use the + // vectorized kernel at all. + auto *const remcond = entryIR.CreateICmpNE( + rem, Constant::getNullValue(rem->getType()), "remcond"); + peel = entryIR.CreateSelect( + remcond, mainLoopLimit, + Constant::getNullValue(mainLoopLimit->getType()), "peel"); + effectiveVF = + entryIR.CreateSelect(remcond, + entryIR.CreateElementCount( + VF->getType(), barrierTail->getVFInfo().vf), + VF); + } + mainLoopLimit = entryIR.CreateSub(mainLoopLimit, peel, "mainLoopLimit"); + } + + // Set the subgroup maximum size and number of subgroups in this kernel + // wrapper. + if (!noExplicitSubgroups) { + auto setMaxSubgroupSizeFn = + BI.getOrDeclareMuxBuiltin(eMuxBuiltinSetMaxSubGroupSize, M); + assert(setMaxSubgroupSizeFn && "Missing __mux_set_max_sub_group_size"); + auto setNumSubgroupsFn = + BI.getOrDeclareMuxBuiltin(eMuxBuiltinSetNumSubGroups, M); + assert(setNumSubgroupsFn && "Missing __mux_set_num_sub_groups"); + auto *const localSizeInVecDim = localSizeDim[workItemDim0]; + auto *const localSizeInNonVecDim = entryIR.CreateMul( + localSizeDim[workItemDim1], localSizeDim[workItemDim2], "wg.yz"); + auto *maxSubgroupSize = entryIR.CreateBinaryIntrinsic( + Intrinsic::umin, localSizeInVecDim, effectiveVF, {}, "sg.x"); + entryIR.CreateCall(setMaxSubgroupSizeFn, + {entryIR.CreateTrunc(maxSubgroupSize, i32Ty)}); + auto *const numSubgroupsInVecDim = entryIR.CreateUDiv( + entryIR.CreateAdd( + localSizeInVecDim, + entryIR.CreateSub(effectiveVF, + ConstantInt::get(effectiveVF->getType(), 1))), + effectiveVF, "sgs.x"); + auto *const numSubgroups = + entryIR.CreateMul(numSubgroupsInVecDim, localSizeInNonVecDim, "sgs"); + entryIR.CreateCall(setNumSubgroupsFn, + {entryIR.CreateTrunc(numSubgroups, i32Ty)}); + } + + if (barrierMain.hasLiveVars()) { + // The size in the first dimension is divided by the vectorization factor. + // When vector-predicated, this result is rounded up: (LIM + VF - 1) / VF. + // This catches cases where we need two loop iterations, e.g., VF=4 and + // size=7, where rounding down would give one. + Value *numerator = mainLoopLimit; + if (mainInfo.IsVectorPredicated) { + Value *const vf_minus_1 = + entryIR.CreateSub(VF, ConstantInt::get(VF->getType(), 1)); + numerator = entryIR.CreateAdd(mainLoopLimit, vf_minus_1); + } + Value *const size0 = entryIR.CreateUDiv(numerator, VF); + + setUpLiveVarsAlloca(barrierMain, entryIR, localSizeDim[workItemDim2], + localSizeDim[workItemDim1], size0, "live_variables", + IsDebug); + } + + // Amazingly, it's possible for the tail kernel to have live vars in its + // barriers, even when the main kernel does not. + if (emitTail && barrierTail->hasLiveVars()) { + Value *size0 = peel; + if (tailInfo->IsVectorPredicated) { + // If the tail is predicated, it will only have a single (vectorized) item + // along the X axis, or none. 
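+ // For instance (hypothetical values): a local size of 19 with VF 8 leaves + // peel == 3, so hasLeftover below is true and size0 becomes 1 (a single + // predicated iteration); a local size of 16 leaves peel == 0 and size0 == 0.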
+ auto *const hasLeftover = entryIR.CreateICmp( + CmpInst::ICMP_NE, peel, ConstantInt::get(peel->getType(), 0), + "tail.has.vp"); + size0 = entryIR.CreateZExt(hasLeftover, peel->getType()); + } + setUpLiveVarsAlloca(*barrierTail, entryIR, localSizeDim[workItemDim2], + localSizeDim[workItemDim1], size0, + "live_variables_peel", IsDebug); + } + + // next means the next barrier id. This variable is uninitialized to begin + // with, and is set by the first pass below. + IntegerType *index_type = i32Ty; + AllocaInst *nextID = + entryIR.CreateAlloca(index_type, nullptr, "next_barrier_id"); + + std::map<unsigned, BasicBlock *> bbs; + // The vectorized kernel has been further optimized and may have removed + // unreachable barriers that are still present in the scalar kernel. But if + // they are unreachable, we know they must also be unreachable in the scalar + // kernel even if we have not yet detected that. + + for (auto &[i, subkernel] : barrierMain.getSubkernels()) { + bbs[i] = BasicBlock::Create(context, "sw.bb", new_wrapper); + } + + ScheduleGenerator schedule(M, barrierMain, barrierTail, BI); + schedule.workItemDim0 = workItemDim0; + schedule.workItemDim1 = workItemDim1; + schedule.workItemDim2 = workItemDim2; + schedule.localSizeDim[0] = localSizeDim[0]; + schedule.localSizeDim[1] = localSizeDim[1]; + schedule.localSizeDim[2] = localSizeDim[2]; + schedule.wrapperDbgLoc = wrapperDbgLoc; + schedule.nextID = nextID; + schedule.mainLoopLimit = mainLoopLimit; + schedule.noExplicitSubgroups = noExplicitSubgroups; + schedule.emitTail = emitTail; + schedule.peel = peel; + + // Make the call instruction for the first new kernel, following the wrapper + // function's parameters. + for (auto &arg : new_wrapper->args()) { + schedule.args.push_back(&arg); + } + + // Branch directly into the first basic block. + entryIR.CreateBr(bbs[kBarrier_FirstID]); + + for (auto &[i_, subkernel_] : barrierMain.getSubkernels()) { + auto i = i_; + + // Keep it linear + BasicBlock *const block = bbs[i]; + block->moveAfter(&new_wrapper->back()); + + if (i == kBarrier_EndID) { + // This basic block breaks us out of our function, thus we return! + ReturnInst::Create(context, block); + } else { + // Re-issue the barrier's memory fence before the work-item loops + if (auto *const CI = barrierMain.getBarrierCall(i)) { + auto *const callee = CI->getCalledFunction(); + const auto builtin = BI.analyzeBuiltin(*callee); + if (builtin && + builtin->ID == compiler::utils::eMuxBuiltinWorkGroupBarrier) { + IRBuilder<> B(block); + auto *MemBarrier = + BI.getOrDeclareMuxBuiltin(eMuxBuiltinMemBarrier, M); + assert(MemBarrier); + Value *Ops[2] = {CI->getOperand(1), CI->getOperand(2)}; + + auto *const Call = B.CreateCall(MemBarrier, Ops); + + // Patch up any operands that were non-constants by fetching them from + // the barrier struct. We do this after creating the call because we + // need an instruction to function as an insert point. + if (!isa<Constant>(Ops[0]) || !isa<Constant>(Ops[1])) { + // We expect these values to be uniform so it should be safe to get + // from the barrier struct at index zero. Barriers are convergent, + // so there should be no chance that the value does not exist.
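+ // E.g. if a memory-scope operand was computed rather than constant, + // every work-item stored the same (uniform) value to its live-vars + // slot before the barrier, so reloading work-item (0,0,0)'s copy + // below yields the value for the whole work-group.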
+ auto *const zero = + Constant::getNullValue(compiler::utils::getSizeType(M)); + IRBuilder<> ir(Call); + auto *const barrier0 = + ir.CreateInBoundsGEP(barrierMain.getLiveVarsType(), + barrierMain.getMemSpace(), {zero}); + + Barrier::LiveValuesHelper live_values(barrierMain, Call, barrier0); + + size_t op_index = 0; + for (auto *const op : Ops) { + if (!isa<Constant>(op)) { + auto *const new_op = + live_values.getReload(op, ir, "_load", /*reuse*/ true); + Call->setArgOperand(op_index, new_op); + } + ++op_index; + } + } + Call->setDebugLoc(wrapperDbgLoc); + } + } + + auto *const exitBlock = [&]() { + switch (barrierMain.getSchedule(i)) { + case BarrierSchedule::Unordered: + case BarrierSchedule::ScalarTail: + if (tailInfo && tailInfo->IsVectorPredicated) { + return schedule.makeLinearWorkItemLoops(block, i); + } + return schedule.makeWorkItemLoops(block, i); + + case BarrierSchedule::Once: + return schedule.makeRunOneWorkItem(block, i); + + case BarrierSchedule::Linear: + return schedule.makeLinearWorkItemLoops(block, i); + } + + llvm_unreachable("Unexpected barrier schedule enum"); + }(); + + // the last basic block in our function! + IRBuilder<> exitIR(exitBlock); + + const auto &successors = barrierMain.getSuccessorIds(i); + const auto num_succ = successors.size(); + + if (num_succ == 1) { + // If there is only one successor, we can branch directly to it + exitIR.CreateBr(bbs.find(successors.front())->second); + } else if (num_succ == 2) { + // If there are exactly two successors, we can use a conditional branch + auto *const bb_id = ConstantInt::get(index_type, successors[0]); + auto *const br_block = + BasicBlock::Create(context, "barrier.branch", new_wrapper); + auto *const ld_next_id = new LoadInst(index_type, nextID, "", br_block); + auto *const cmp_id = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, ld_next_id, + bb_id, "", br_block); + BranchInst::Create(bbs.find(successors[0])->second, + bbs.find(successors[1])->second, cmp_id, br_block); + + exitIR.CreateBr(br_block); + } else if (num_succ == 0) { + // If a barrier region has no successor, we just emit a call to + // llvm.trap and unreachable. A barrier region can have zero successors + // if all its terminators end in unreachable. Since there are no + // successors, it is not possible to continue and therefore we emit an + // unreachable here. + + // TODO: we should be flagging up unreachables sooner, so that we avoid + // wrapping barrier regions with no successors with work item loops, + // and we should also make sure that the barrier region has no + // successors because of all its terminators ending in unreachable. + // If it's not the case we may want to handle that differently.
+ auto trap = + M.getOrInsertFunction("llvm.trap", Type::getVoidTy(context)); + exitIR.CreateCall(trap); + exitIR.CreateUnreachable(); + } else { + // Make a basic block with a switch to jump to the next subkernel + auto *const switch_body = + BasicBlock::Create(context, "barrier.switch", new_wrapper); + LoadInst *const ld_next_id = + new LoadInst(index_type, nextID, "", switch_body); + SwitchInst *const sw = SwitchInst::Create( + ld_next_id, bbs.find(successors[0])->second, num_succ, switch_body); + for (const auto i : successors) { + sw->addCase(ConstantInt::get(index_type, i), bbs.find(i)->second); + } + exitIR.CreateBr(switch_body); + } + } + } + + bbs[kBarrier_EndID]->moveAfter(&new_wrapper->back()); + bbs[kBarrier_EndID]->setName("kernel.exit"); + + // Remap any constant expression which takes a reference to the old function + // FIXME: What about the main function? + for (auto *user : make_early_inc_range(refF.users())) { + if (ConstantExpr *constant = dyn_cast<ConstantExpr>(user)) { + remapConstantExpr(constant, &refF, new_wrapper); + } else if (ConstantArray *ca = dyn_cast<ConstantArray>(user)) { + remapConstantArray(ca, &refF, new_wrapper); + } else if (!isa<CallInst>(user)) { + llvm_unreachable( + "Cannot handle user of function being anything other than a " + "ConstantExpr, ConstantArray or CallInst"); + } + } + // We output the number of uses here to lit test that the number of uses was + // not increased by the remap functions. + LLVM_DEBUG(dbgs() << "Uses of " << refF.getName() << ": " << refF.getNumUses() + << "\n"); + + // Forcibly disable the tail info if we know we've omitted it. + if (!schedule.wrapperHasMain || !schedule.wrapperHasTail) { + // If we're missing a main loop then the tail loop becomes the main from + // the perspective of the metadata: have that steal the tail loop info. We + // should always have a main loop with an optional tail. + if (!schedule.wrapperHasMain) { + if (schedule.wrapperHasTail && tailInfo) { + mainInfo = *tailInfo; + } else { + // If we have neither a main nor a tail (which may happen at kernel + // compile time but we should never actually execute such a kernel - + // we already assume the local sizes are never zero, see elsewhere in + // this pass) then encode a token info metadata of 1. + mainInfo = VectorizationInfo{ElementCount::getFixed(1), workItemDim0, + /*isVectorPredicated*/ false}; + } + } + tailInfo = std::nullopt; + } + + encodeWrapperFnMetadata(*new_wrapper, mainInfo, tailInfo); + + // The subkernels can be marked as internal since their external uses have + // been superseded by this wrapper. This will help them get DCE'd once + // inlined. Any existing calls to this subkernel (e.g., another kernel + // calling this kernel) will prevent it from being removed unnecessarily. + barrierMain.getFunc().setLinkage(Function::InternalLinkage); + if (barrierTail) { + barrierTail->getFunc().setLinkage(Function::InternalLinkage); + } + + return new_wrapper; +} + +struct BarrierWrapperInfo { + StringRef BaseName; + // Information about the 'main' kernel + Function *MainF; + compiler::utils::VectorizationInfo MainInfo; + // Optional information about the 'tail' kernel + Function *TailF = nullptr; + std::optional<compiler::utils::VectorizationInfo> TailInfo = std::nullopt; + // A 'tail' kernel which was explicitly omitted. + Function *SkippedTailF = nullptr; +}; + +PreservedAnalyses +compiler::utils::WorkItemLoopsPass::run(Module &M, ModuleAnalysisManager &MAM) { + // Cache the functions we're interested in as this pass introduces new ones + // which we don't want to run over.
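+ // For example (hypothetical module): a kernel "foo" vectorized by 8 + // produces a pair whose MainF is the vectorized function and whose TailF is + // the original scalar "foo", recovered from the vecz link metadata; a kernel + // with no vectorization metadata becomes a pair with scalar main info and no + // tail.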
SmallVector<BarrierWrapperInfo> MainTailPairs; + const auto &GSGI = MAM.getResult<SubgroupAnalysis>(M); + + for (auto &F : M.functions()) { + if (!isKernelEntryPt(F)) { + continue; + } + + const auto BaseName = getBaseFnNameOrFnName(F); + auto VeczToOrigFnData = parseVeczToOrigFnLinkMetadata(F); + + const auto WorkItemDim0 = 0; + + const VectorizationInfo scalarTailInfo{ElementCount::getFixed(1), + WorkItemDim0, + /*IsVectorPredicated*/ false}; + + if (!VeczToOrigFnData) { + // If there was no vectorization metadata, it's a scalar kernel. + MainTailPairs.push_back({BaseName, &F, scalarTailInfo}); + continue; + } + + // If we got a vectorized kernel, wrap it using the vectorization factor. + const auto MainInfo = VeczToOrigFnData->second; + + // Start out assuming scalar tail, which is the default behaviour... + auto TailInfo = scalarTailInfo; + auto *TailFunc = VeczToOrigFnData->first; + // ... and search for a linked vector-predicated tail, which we prefer. + if (!MainInfo.IsVectorPredicated && TailFunc) { + SmallVector<LinkMetadataResult, 4> LinkedFns; + parseOrigToVeczFnLinkMetadata(*TailFunc, LinkedFns); + for (const auto &Link : LinkedFns) { + // Restrict our option to strict VF==VF matches. + if (Link.first != &F && Link.second.vf == MainInfo.vf && + Link.second.IsVectorPredicated) { + TailFunc = Link.first; + TailInfo = Link.second; + break; + } + } + } + + std::optional<uint64_t> LocalSizeInVecDim; + if (auto WGS = parseRequiredWGSMetadata(F)) { + LocalSizeInVecDim = (*WGS)[WorkItemDim0]; + } + + // We can skip the tail in the following circumstances: + // * If we have no tail function (trusting that this is okay) + // * Vector-predicated kernels handle their own tails + // * The user has explicitly forced us to omit tails + // * We can prove that the vectorization factor fits the required/known + // local work-group size + if (!TailFunc || MainInfo.IsVectorPredicated || ForceNoTail || + (LocalSizeInVecDim && !MainInfo.vf.isScalable() && + *LocalSizeInVecDim % MainInfo.vf.getKnownMinValue() == 0)) { + MainTailPairs.push_back({BaseName, &F, MainInfo, /*TailF*/ nullptr, + /*TailInfo*/ std::nullopt, + /*SkippedTailF*/ TailFunc}); + } else { + // Else, emit a tail using the tail function. + MainTailPairs.push_back({BaseName, &F, MainInfo, TailFunc, TailInfo}); + } + } + + if (MainTailPairs.empty()) { + return PreservedAnalyses::all(); + } + + // Prune redundant wrappers we don't want to create for the sake of compile + // time. + SmallPtrSet<Function *, 4> RedundantMains; + for (const auto &P : MainTailPairs) { + // If we're creating a wrapper with a skipped 'tail' or a scalar 'tail', we + // don't want to create another wrapper where the scalar tail is the + // 'main', unless that tail is useful as a fallback sub-group kernel. A + // fallback sub-group kernel is one for which: + // * The 'main' has a required sub-group size that isn't the scalar size. + // * The 'main' and 'tail' kernels both make use of sub-group builtins. If + // neither do, there's no need for the fallback. + // * The 'main' kernel uses sub-groups but the 'main' vectorization factor + // cleanly divides the known local work-group size. + if (P.SkippedTailF || (P.TailInfo && P.TailInfo->vf.isScalar())) { + const auto *TailF = P.SkippedTailF ?
P.SkippedTailF : P.TailF; + if (getReqdSubgroupSize(*P.MainF).value_or(1) != 1 || + (!GSGI.usesSubgroups(*P.MainF) && !GSGI.usesSubgroups(*TailF))) { + RedundantMains.insert(TailF); + } else if (auto wgs = parseRequiredWGSMetadata(*P.MainF)) { + const uint64_t local_size_x = wgs.value()[0]; + if (!P.MainInfo.IsVectorPredicated && + !(local_size_x % P.MainInfo.vf.getKnownMinValue())) { + RedundantMains.insert(TailF); + } + } + } + // If we're creating a wrapper with a VP 'tail', we don't want to create + // another wrapper where the VP is the 'main'. + if (!P.MainInfo.IsVectorPredicated && P.TailInfo && + P.TailInfo->IsVectorPredicated) { + RedundantMains.insert(P.TailF); + } + } + + MainTailPairs.erase( + std::remove_if(MainTailPairs.begin(), MainTailPairs.end(), + [&RedundantMains](const BarrierWrapperInfo &I) { + return RedundantMains.contains(I.MainF); + }), + MainTailPairs.end()); + + SmallPtrSet<Function *, 4> Wrappers; + auto &BI = MAM.getResult<BuiltinInfoAnalysis>(M); + + for (const auto &P : MainTailPairs) { + assert(P.MainF && "Missing main function"); + // Construct the main barrier + BarrierWithLiveVars MainBarrier(M, *P.MainF, P.MainInfo, IsDebug); + MainBarrier.Run(MAM); + + // Tail kernels are optional + if (!P.TailF) { + Wrappers.insert( + makeWrapperFunction(MainBarrier, nullptr, P.BaseName, M, BI)); + } else { + // Construct the tail barrier + assert(P.TailInfo && "Missing tail info"); + BarrierWithLiveVars TailBarrier(M, *P.TailF, *P.TailInfo, IsDebug); + TailBarrier.Run(MAM); + + Wrappers.insert( + makeWrapperFunction(MainBarrier, &TailBarrier, P.BaseName, M, BI)); + } + } + + // At this point we mandate that any kernels that haven't been wrapped with + // work-item loops can't be kernels, nor entry points. + for (auto &F : M) { + if (isKernelEntryPt(F) && !Wrappers.contains(&F)) { + dropIsKernel(F); + // FIXME: Also mark them as internal in case they contain symbols we + // haven't resolved as part of the work-item loop wrapping process. We + // rely on GlobalOptPass to remove such functions; this is the same root + // issue as some mux builtins requiring DCE for correctness.
+ F.setLinkage(GlobalValue::InternalLinkage); + } + } + + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/CMakeLists.txt new file mode 100644 index 0000000000000..7aa151998effa --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/CMakeLists.txt @@ -0,0 +1,135 @@ +set(VECZ_PUBLIC_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include) +set(VECZ_PRIVATE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/source) +set(VECZ_PRIVATE_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/source/include) + +set(COMMON_SRCS + ${VECZ_PUBLIC_INCLUDE_DIR}/vecz/pass.h + ${VECZ_PUBLIC_INCLUDE_DIR}/vecz/vecz_choices.h + ${VECZ_PUBLIC_INCLUDE_DIR}/vecz/vecz_target_info.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/control_flow_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/divergence_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/instantiation_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/liveness_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/packetization_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/simd_width_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/stride_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/uniform_value_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/vectorizable_function_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/analysis/vectorization_unit_analysis.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/common_gep_elimination_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/control_flow_conversion_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/inline_post_vectorization_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/instantiation_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/interleaved_group_combine_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/packetization_helpers.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/packetization_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/packetizer.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/passes.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/printf_scalarizer.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/scalarization_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/scalarizer.h + ${VECZ_PRIVATE_INCLUDE_DIR}/transform/ternary_transform_pass.h + ${VECZ_PRIVATE_INCLUDE_DIR}/control_flow_boscc.h + ${VECZ_PRIVATE_INCLUDE_DIR}/control_flow_roscc.h + ${VECZ_PRIVATE_INCLUDE_DIR}/debugging.h + ${VECZ_PRIVATE_INCLUDE_DIR}/ir_cleanup.h + ${VECZ_PRIVATE_INCLUDE_DIR}/llvm_helpers.h + ${VECZ_PRIVATE_INCLUDE_DIR}/memory_operations.h + ${VECZ_PRIVATE_INCLUDE_DIR}/offset_info.h + ${VECZ_PRIVATE_INCLUDE_DIR}/reachability.h + ${VECZ_PRIVATE_INCLUDE_DIR}/simd_packet.h + ${VECZ_PRIVATE_INCLUDE_DIR}/vectorization_context.h + ${VECZ_PRIVATE_INCLUDE_DIR}/vectorization_helpers.h + ${VECZ_PRIVATE_INCLUDE_DIR}/vectorization_heuristics.h + ${VECZ_PRIVATE_INCLUDE_DIR}/vectorization_unit.h + ${VECZ_PRIVATE_INCLUDE_DIR}/vectorizer.h + ${VECZ_PRIVATE_INCLUDE_DIR}/vecz_pass_builder.h + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/control_flow_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/divergence_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/instantiation_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/liveness_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/packetization_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/simd_width_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/stride_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/uniform_value_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/analysis/vectorizable_function_analysis.cpp + 
${VECZ_PRIVATE_SOURCE_DIR}/analysis/vectorization_unit_analysis.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/basic_mem2reg_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/builtin_inlining_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/common_gep_elimination_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/control_flow_conversion_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/inline_post_vectorization_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/loop_rotate_custom_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/instantiation_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/interleaved_group_combine_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/packetization_helpers.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/packetization_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/packetizer.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/passes.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/pre_linearize_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/printf_scalarizer.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/remove_intptr_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/scalarization_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/scalarizer.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/simplify_infinite_loop_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/squash_small_vectors_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/ternary_transform_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/transform/uniform_reassociation_pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/control_flow_boscc.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/control_flow_roscc.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/debugging.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/ir_cleanup.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/llvm_helpers.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/memory_operations.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/offset_info.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/pass.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/reachability.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/simd_packet.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vector_target_info.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vector_target_info_arm.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vector_target_info_riscv.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vectorization_choices.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vectorization_context.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vectorization_helpers.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vectorization_heuristics.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vectorization_unit.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vectorizer.cpp + ${VECZ_PRIVATE_SOURCE_DIR}/vecz_pass_builder.cpp +) + +if(MSVC) + # Disable: unreferenced formal parameter. 
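+ # VECZ_COMPILE_OPTIONS may already carry -we4100 (which promotes C4100 to an + # error) from flags inherited by an including scope; the integrated sources + # do not build cleanly under it, so demote it to a plain -wd4100 disable + # below.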
+ list(REMOVE_ITEM VECZ_COMPILE_OPTIONS -we4100) + list(APPEND VECZ_COMPILE_OPTIONS -wd4100) +endif() + +add_llvm_component_library(LLVMNativeCPUVecz + ${COMMON_SRCS} + LINK_COMPONENTS + NativeCPUPipeline + support + core + analysis + instcombine + aggressiveinstcombine + transformutils + scalaropts + ipo + passes + ) + +target_include_directories(LLVMNativeCPUVecz + PUBLIC $<BUILD_INTERFACE:${VECZ_PUBLIC_INCLUDE_DIR}> + PRIVATE $<BUILD_INTERFACE:${VECZ_PRIVATE_INCLUDE_DIR}> +) +target_compile_options(LLVMNativeCPUVecz PRIVATE ${VECZ_COMPILE_OPTIONS}) +target_compile_definitions(LLVMNativeCPUVecz PRIVATE + ${VECZ_COMPILE_DEFINITIONS}) + +# Currently disabled by default; these allow us to run lit tests using veczc +# with the target check-sycl-vecz. +set(NATIVE_CPU_BUILD_VECZ_TEST_TOOLS OFF CACHE BOOL "Build vecz test and tools") +if (NATIVE_CPU_BUILD_VECZ_TEST_TOOLS) + add_subdirectory(tools) + add_subdirectory(test) +endif() diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h new file mode 100644 index 0000000000000..d7e59337fc261 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/pass.h @@ -0,0 +1,150 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Vecz passes header. + +#ifndef VECZ_PASS_H +#define VECZ_PASS_H + +#include +#include +#include + +#include +#include + +#include "vecz/vecz_choices.h" + +namespace llvm { +class ModulePass; +class StringRef; +class Module; +class TargetMachine; +} // namespace llvm + +namespace compiler { +namespace utils { +class BuiltinInfo; +} // namespace utils +} // namespace compiler + +namespace vecz { +/// @addtogroup vecz +/// @{ + +struct VeczPassOptions { + /// @brief boolean choices such as double support, partial scalarization + vecz::VectorizationChoices choices; + + /// @brief vectorization factor, including known min and scalable flag + llvm::ElementCount factor = llvm::ElementCount::getFixed(1); + + /// @brief automatically work out factor + bool vecz_auto = false; + + /// @brief Index of vectorization dimension to use (0 => x, 1 => y, 2 => z). + uint32_t vec_dim_idx = 0; + + /// @brief local_size Value specifying the local size for the function (0 + /// means unknown) + uint64_t local_size = 0; +}; + +/// @brief Returns the vectorization options that would vectorize the provided +/// function to its required sub-group size. +std::optional<VeczPassOptions> getReqdSubgroupSizeOpts(llvm::Function &); + +/// @brief Returns the vectorization options that would vectorize the provided +/// function to its required sub-group size (if set) or one of the device's +/// sub-group sizes. +/// +/// Only returns options if the function uses sub-group operations, as +/// determined by the SubGroupAnalysis pass.
+/// +/// Tries to find a good fit that produces one of the device's sub-group sizes, +/// preferring ones which fit the known local work-group size and powers of +/// two. The device's sub-group sizes can be sorted such that preferable sizes +/// are placed towards the front. +std::optional<VeczPassOptions> +getAutoSubgroupSizeOpts(llvm::Function &, llvm::ModuleAnalysisManager &); + +/// @brief Analysis pass which determines on which functions @ref RunVeczPass +/// should operate. +class VeczPassOptionsAnalysis + : public llvm::AnalysisInfoMixin<VeczPassOptionsAnalysis> { + using VeczPassOptionsCallbackFn = + std::function<bool(llvm::Function &, llvm::ModuleAnalysisManager &, + llvm::SmallVectorImpl<VeczPassOptions> &)>; + friend AnalysisInfoMixin<VeczPassOptionsAnalysis>; + static llvm::AnalysisKey Key; + VeczPassOptionsCallbackFn queryFunc = + [](llvm::Function &F, llvm::ModuleAnalysisManager &, + llvm::SmallVectorImpl<VeczPassOptions> &Opts) -> bool { + if (F.getCallingConv() != llvm::CallingConv::SPIR_KERNEL) { + return false; + } + // TODO what are our defaults, here? + Opts.emplace_back(); + return true; + }; + +public: + VeczPassOptionsAnalysis() = default; + /// @brief Explicit constructor which uses the given callback to determine + /// whether vectorization should be performed on the passed function. If the + /// default constructor is used, all functions with a SPIR calling convention + /// will be vectorized. + explicit VeczPassOptionsAnalysis(VeczPassOptionsCallbackFn queryFunc) + : queryFunc(queryFunc) {} + using Result = VeczPassOptionsCallbackFn; + Result run(llvm::Module &, llvm::ModuleAnalysisManager &) { + return queryFunc; + } +}; + +/// @brief A helper pass which can be used to inspect and test the +/// vectorization options set on a per-function basis. +class VeczPassOptionsPrinterPass + : public llvm::PassInfoMixin<VeczPassOptionsPrinterPass> { + llvm::raw_ostream &OS; + +public: + explicit VeczPassOptionsPrinterPass(llvm::raw_ostream &OS) : OS(OS) {} + + llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &); +}; + +/// @brief A new-style module pass that provides a wrapper for using the +/// ComputeAorta IR vectorizer. This vectorizes kernels +/// to the vectorization factor specified when the pass is created. In our case +/// this is typically the local size in the first dimension but there are other +/// factors to consider when picking the vectorization factor, like being a +/// power of 2. This pass queries the @ref `VeczPassOptionsAnalysis`, so +/// if you do not wish all kernels to be vectorized, you must ensure your pass +/// manager's ModuleAnalysisManager is configured with a custom @ref +/// `VeczPassOptionsAnalysis` +class RunVeczPass : public llvm::PassInfoMixin<RunVeczPass> { +public: + /// @brief llvm's entry point for the PassManager + llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &); +}; + +/// @} +} // namespace vecz + +#endif // VECZ_PASS_H diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h new file mode 100644 index 0000000000000..64ed72c120d98 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_choices.h @@ -0,0 +1,294 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Internal Vecz Choices header. + +#ifndef VECZ_VECZ_CHOICES_H_INCLUDED +#define VECZ_VECZ_CHOICES_H_INCLUDED + +#include +#include +#include + +// Forward declaration +namespace llvm { +class StringRef; +class Twine; +} // namespace llvm + +namespace vecz { + +/// @brief Describes and holds various Vecz choices. +/// +/// These choices can affect the code generated and are usually optimization +/// related. Since they are not always the best choice for a given target, they +/// are controlled at runtime by this class. +class VectorizationChoices { +public: + VectorizationChoices(); + ~VectorizationChoices() = default; + + /// @brief Enumeration with the available choices for Vecz. + /// + /// These are choices that can affect the code generated, often for + /// optimization reasons. The Choices are prefixed by an `e<Prefix>` prefix, + /// where `<Prefix>` is an arbitrary string to help document the intention + /// of the Choice. For example, optimizations are prefixed with + /// `eOptimization`. + /// + /// @note Each Choice has to be uniquely named without taking into account + /// its prefix, i.e. there shouldn't be any Choices sharing the same name + /// but with different prefixes. Also, Choices names must not start with + /// `"no"`, although different capitalizations (e.g. `"No"`) are allowed. + /// Additionally, Choices' names should contain only alphanumeric characters. + /// These restrictions are in place to allow for a `Choices` string to be + /// parsable easily. See, for example, `parseChoicesString` . If you add a + /// new Choice here, please also update the parseChoicesString function, as + /// well as the two relevant `cl::opt` in `vectorizer.cpp`. + enum Choice { + /// @brief An invalid Choice ID, useful for error checking etc. Equals 0. + eInvalid = 0, + /// @brief Packetize uniform instructions instead of using a vector splat. + /// + /// When going through the packetization process, the default behaviour when + /// encountering a uniform instruction is creating a vector splat + /// with its value and stopping the packetization there. This option changes + /// that behaviour, and instead makes the packetizer packetize even the + /// uniform instructions, provided that they are used by a varying + /// instruction. + eOptimizationPacketizeUniform, + /// @brief Packetize uniform instructions, but only in loops. + /// + /// This is similar to eOptimizationPacketizeUniform, with the difference + /// that it only affects uniform values used inside loops. + eOptimizationPacketizeUniformInLoops, + /// @brief Emit loops for instantiated call instructions + /// + /// This will emit instantiated call instruction in loops instead of + /// actually instantiating them. It only works when the call instruction has + /// no users. + eOptimizationInstantiateCallsInLoops, + /// @brief Use the BOSCC linearization algorithm during Control-Flow + // Conversion. + // + // @note This optimization retains uniform branches by duplicating pieces + // of the code.
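+ // (BOSCC is short for 'branch on superword condition code': roughly, a + // duplicated, unpredicated copy of a region is kept, and a runtime check + // on the controlling predicate branches to it when all lanes agree, + // falling back to the linearized copy otherwise.)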
+ eLinearizeBOSCC, + /// @brief Turn on full scalarization in the Scalarization pass + // + // This is useful for testing the scalarizer, and isn't intended to deliver + // any performance benefits. + eFullScalarization, + /// @brief Treat division operations as being able to throw CPU exceptions + /// + /// @note This choice must be enabled for strict correctness on targets that + /// support hardware exceptions on division by zero/division overflow, which + /// require extra code to prevent traps on inactive vector lanes during + /// linearization. However, any trapping behaviour of the input IR may be + /// preserved (that is, on positively-executed code paths); it is left to + /// the front end to conform to OpenCL spec in this regard. + eDivisionExceptions, + /// @brief Generate a vector-predicated kernel such that no work-items + /// (vector elements) with side effects with IDs beyond the local workgroup + /// size are enabled. + /// + /// @note The exact semantics concerning which operations are + /// masked/unmasked are not defined. The guarantee is that the vectorized + /// kernel will be safe to execute on workgroups with sizes smaller than + /// the vector width. Some architectures may want to predicate beyond that + /// remit for performance reasons, even if the vector-predicated operations + /// are safe to execute on any input. + eVectorPredication, + /// @brief Force a default vectorization width, made without + /// target-specific knowledge. + /// + /// @note This is most-commonly used in testing. Packetization may make + /// decisions based on the target, which can make testing more difficult. + /// This choice forces the default vector register width. + eTargetIndependentPacketization, + }; + + /// @brief Check if a choice is enabled or not + /// @param C The choice to check for + /// @return true if the choice is enabled, false otherwise + bool isEnabled(Choice C) const { return Enabled.contains(C); } + + /// @brief Enable a choice + /// @param C The choice to enable + /// @return true if the choice was already enabled, false otherwise + bool enable(Choice C) { + auto res = Enabled.insert(C); + return res.second; + } + + /// @brief Disable a choice + /// @param C The choice to disable + /// @return true if the choice was enabled, false otherwise + bool disable(Choice C) { return Enabled.erase(C); } + + /// @brief Parse a semicolon-separated list of Choices to enable or disable + /// + /// This function accepts a string of Choices, separated by semicolons, and + /// enables or disables them. The Choices are parsed according to the + /// following rules: + /// - The Choices are separated by a semicolon (';') character + /// - Only one separator is allowed between each Choice. + /// - Trailing separators are ignored (but only one is allowed still). + /// - Choices are specified as they are in their enumerations, without the + /// "e" prefix. + /// - Choices can be prefixed with the "no" prefix (without any whitespace), + /// which specifies that the Choice needs to be disabled instead of being + /// enabled. + /// - The "no" prefix only applies to the Choice it is attached to and not to + /// any following Choices. + /// - Whitespace between the Choices and the separators, as well as leading + /// and trailing whitespace at the beginning and end of the string, is + /// ignored.
+ /// + /// Examples: + /// - "PacketizeUniform" + /// - "PacketizeUniform;InstantiateCallsInLoops" + /// - "PacketizeUniform ; noInstantiateCallsInLoops \n" + /// - " noPacketizeUniform;noInstantiateCallsInLoops; " + /// + /// @param[in] Str The string containing the Choices to enable/disable + /// @return true on success, false if the parsing failed + bool parseChoicesString(llvm::StringRef Str); + + /// @brief Convert a Choice name from a string to the matching Choice value + /// + /// The choices are matched without their e prefix. + /// + /// @param[in] Str The string with the Choice name + /// @return The Choice name, or eInvalid in case of error + static Choice fromString(llvm::StringRef Str); + + // + // Specific getters and setters for the most commonly used choices + // + + /// @brief Check if the eOptimizationPacketizeUniform choice is set + /// @return true if the choice is set, false otherwise + bool packetizeUniform() const { + return isEnabled(eOptimizationPacketizeUniform); + } + /// @brief Enable the eOptimizationPacketizeUniform choice + /// @return true if eOptimizationPacketizeUniform was already enabled + bool enablePacketizeUniform() { + return enable(eOptimizationPacketizeUniform); + } + /// @brief Disable the eOptimizationPacketizeUniform choice + /// @return true if eOptimizationPacketizeUniform was enabled + bool disablePacketizeUniform() { + return disable(eOptimizationPacketizeUniform); + } + + /// @brief Check if the eOptimizationPacketizeUniformInLoops choice is set + /// @return true if the choice is set, false otherwise + bool packetizeUniformInLoops() const { + return isEnabled(eOptimizationPacketizeUniformInLoops); + } + /// @brief Enable the eOptimizationPacketizeUniformInLoops choice + /// @return true if eOptimizationPacketizeUniformInLoops was already enabled + bool enablePacketizeUniformInLoops() { + return enable(eOptimizationPacketizeUniformInLoops); + } + /// @brief Disable the eOptimizationPacketizeUniformInLoops choice + /// @return true if eOptimizationPacketizeUniformInLoops was enabled + bool disablePacketizeUniformInLoops() { + return disable(eOptimizationPacketizeUniformInLoops); + } + + /// @brief Check if the eOptimizationInstantiateCallsInLoops choice is set + /// @return true if the choice is set, false otherwise + bool instantiateCallsInLoops() const { + return isEnabled(eOptimizationInstantiateCallsInLoops); + } + /// @brief Enable the eOptimizationInstantiateCallsInLoops choice + /// @return true if eOptimizationInstantiateCallsInLoops was already enabled + bool enableInstantiateCallsInLoops() { + return enable(eOptimizationInstantiateCallsInLoops); + } + /// @brief Disable the eOptimizationInstantiateCallsInLoops choice + /// @return true if eOptimizationInstantiateCallsInLoops was enabled + bool disableInstantiateCallsInLoops() { + return disable(eOptimizationInstantiateCallsInLoops); + } + + /// @brief Check if the eLinearizeBOSCC choice is set + /// @return true if the choice is set, false otherwise + bool linearizeBOSCC() const { return isEnabled(eLinearizeBOSCC); } + /// @brief Enable the eLinearizeBOSCC choice + /// @return true if eLinearizeBOSCC was already enabled + bool enableLinearizeBOSCC() { return enable(eLinearizeBOSCC); } + /// @brief Disable the eLinearizeBOSCC choice + /// @return true if eLinearizeBOSCC was enabled + bool disableLinearizeBOSCC() { return disable(eLinearizeBOSCC); } + + /// @brief Check if the eVectorPredication choice is set + /// @return true if the choice is set, false otherwise + bool 
vectorPredication() const { return isEnabled(eVectorPredication); } + /// @brief Enable the eVectorPredication choice + /// @return true if eVectorPredication was already enabled + bool enableVectorPredication() { return enable(eVectorPredication); } + /// @brief Disable the eVectorPredication choice + /// @return true if eVectorPredication was enabled + bool disableVectorPredication() { return disable(eVectorPredication); } + + /// @brief Check if the eTargetIndependentPacketization choice is set + /// @return true if the choice is set, false otherwise + bool targetIndependentPacketization() const { + return isEnabled(eTargetIndependentPacketization); + } + /// @brief Enable the eTargetIndependentPacketization choice + /// @return true if eTargetIndependentPacketization was already enabled + bool enableTargetIndependentPacketization() { + return enable(eTargetIndependentPacketization); + } + /// @brief Disable the eTargetIndependentPacketization choice + /// @return true if eTargetIndependentPacketization was enabled + bool disableTargetIndependentPacketization() { + return disable(eTargetIndependentPacketization); + } + + struct ChoiceInfo { + llvm::StringLiteral name; + Choice number; + llvm::StringLiteral desc; + }; + + static llvm::ArrayRef<ChoiceInfo> queryAvailableChoices(); + +private: + /// @brief All the choices enabled + llvm::SmallSet<Choice, 16> Enabled; + + /// @brief Print an error message, used by parseChoicesString + /// + /// The error message will contain the message given as well as the Choices + /// string being parsed and the position that the error occurred. + // + /// @param[in] Input The Choices string being parsed + /// @param[in] Position The position where the parsing error occurred + /// @param[in] Msg The accompanying error message + static void printChoicesParseError(llvm::StringRef Input, unsigned Position, + llvm::Twine Msg); +}; + +} // namespace vecz +#endif // VECZ_VECZ_CHOICES_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h new file mode 100644 index 0000000000000..490247e70c995 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/include/vecz/vecz_target_info.h @@ -0,0 +1,716 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief External vecz header. Contains the API to the vectorizer. + +#ifndef VECZ_VECZ_TARGET_INFO_H_INCLUDED +#define VECZ_VECZ_TARGET_INFO_H_INCLUDED + +#include +#include +#include +#include + +namespace llvm { +class TargetMachine; +class TargetTransformInfo; +class Type; +} // namespace llvm + +namespace vecz { +class VectorizationContext; + +/// @addtogroup vecz +/// @{ + +/// @brief Kinds of interleaved memory operations. +enum InterleavedOperation : int { + /// @brief Invalid memory operation.
+ eInterleavedInvalid = 0, + /// @brief Store memory operation. + eInterleavedStore, + /// @brief Load memory operation. + eInterleavedLoad, + /// @brief Masked Store memory operation. + eMaskedInterleavedStore, + /// @brief Masked Load memory operation. + eMaskedInterleavedLoad +}; + +/// @brief Used by the vectorizer to query for target capabilities and +/// materialize memory intrinsics. +class TargetInfo { +public: + /// @brief Create a new vector target info instance. + /// @param[in] tm LLVM target machine that will be used for compilation, can + /// be NULL if no target data is available. + TargetInfo(llvm::TargetMachine *tm); + + virtual ~TargetInfo() = default; + + /// @brief Return the target machine. + llvm::TargetMachine *getTargetMachine() const { return TM_; } + + /// @brief Create a vector load. If a stride greater than one is used, the + /// load will be interleaved, i.e. lanes are loaded from non-contiguous + /// memory. + /// + /// @note ptr refers to the unwidened element type, not the wide type. + /// ptr needs to be 'element aligned'. The element can itself be a + /// vector. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] ty Value type to load from memory. + /// @param[in] ptr Memory address to load a vector value from. + /// @param[in] stride Distance in elements between two lanes in memory. + /// A stride of one represents a contiguous load. + /// @param[in] alignment The alignment of the load, in bytes + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If null, the operation is unpredicated: + /// it is executed on all lanes. + /// + /// @return IR value that results from the vector load. + virtual llvm::Value *createLoad(llvm::IRBuilder<> &builder, llvm::Type *ty, + llvm::Value *ptr, llvm::Value *stride, + unsigned alignment, + llvm::Value *evl = nullptr) const; + + /// @brief Create a vector store. If a stride greater than one is used, the + /// store will be interleaved, i.e. lanes are stored to non-contiguous memory. + /// + /// @note ptr refers to the unwidened element type, not the wide type. + /// ptr needs to be 'element aligned'. The element can itself be a + /// vector. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] data Vector value to store to memory. + /// @param[in] ptr Memory address to store a vector value to. + /// @param[in] stride Distance in elements between two lanes in memory. + /// A stride of one represents a contiguous store. + /// @param[in] alignment The alignment of the store, in bytes + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If null, the operation is unpredicated: + /// it is executed on all lanes. + /// + /// @return IR value that results from the vector store. + virtual llvm::Value *createStore(llvm::IRBuilder<> &builder, + llvm::Value *data, llvm::Value *ptr, + llvm::Value *stride, unsigned alignment, + llvm::Value *evl = nullptr) const; + + /// @brief Create a masked vector load. + /// Only lanes with a non-zero mask will be loaded from the address. + /// Other lanes will contain undefined data. + /// + /// @note ptr refers to the unwidened element type, not the wide type. + /// ptr needs to be 'element aligned'. The element can itself be a + /// vector. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] ty Value type to load from memory. + /// @param[in] ptr Memory address to load a vector value from. 
+ /// @param[in] mask Vector mask used to disable loading certain lanes. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes but obeys the mask parameter. + /// @param[in] alignment Alignment of the load. + /// + /// @return IR value that results from the masked vector load. + virtual llvm::Value *createMaskedLoad(llvm::IRBuilder<> &builder, + llvm::Type *ty, llvm::Value *ptr, + llvm::Value *mask, llvm::Value *evl, + unsigned alignment) const; + + /// @brief Create a masked vector store. + /// Only lanes with a non-zero mask will be stored to the address. + /// + /// @note ptr refers to the unwidened element type, not the wide type. + /// ptr needs to be 'element aligned'. The element can itself be a + /// vector. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] data Vector value to store to memory. + /// @param[in] ptr Memory address to store a vector value to. + /// @param[in] mask Vector mask used to disable storing certain lanes. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes but obeys the mask parameter. + /// @param[in] alignment Alignment of the store. + /// + /// @return IR value that results from the masked vector store. + virtual llvm::Value *createMaskedStore(llvm::IRBuilder<> &builder, + llvm::Value *data, llvm::Value *ptr, + llvm::Value *mask, llvm::Value *evl, + unsigned alignment) const; + + /// @brief Create an interleaved vector load. + /// + /// @note Pointers are scalar and need to be 'scalar aligned'. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] ty Value type to load from memory. + /// @param[in] ptr Memory address to load a vector value from. + /// @param[in] stride Stride for interleaved memory operation. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes. + /// @param[in] alignment Alignment of the load. + /// + /// @return IR value that results from the interleaved load. + virtual llvm::Value *createInterleavedLoad(llvm::IRBuilder<> &builder, + llvm::Type *ty, llvm::Value *ptr, + llvm::Value *stride, + llvm::Value *evl, + unsigned alignment) const; + + /// @brief Create an interleaved vector store. + /// + /// @note Pointers are scalar and need to be 'scalar aligned'. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] data Vector value to store to memory. + /// @param[in] ptr Memory address to store a vector value to. + /// @param[in] stride Stride for interleaved memory operation. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes. + /// @param[in] alignment Alignment of the store. + /// + /// @return IR value that results from the interleaved vector store. + virtual llvm::Value * + createInterleavedStore(llvm::IRBuilder<> &builder, llvm::Value *data, + llvm::Value *ptr, llvm::Value *stride, + llvm::Value *evl, unsigned alignment) const; + + /// @brief Create a masked interleaved vector load.
+ /// Only lanes with a non-zero mask will be loaded from the address. + /// + /// @note Pointers are scalar and need to be 'scalar aligned'. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] ty Value type to load from memory. + /// @param[in] ptr Memory address to load a vector value from. + /// @param[in] mask Vector mask used to disable loading certain lanes. + /// @param[in] stride Stride for interleaved memory operation. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes but obeys the mask parameter. + /// @param[in] alignment Alignment of the load. + /// + /// @return IR value that results from the masked interleaved vector load. + virtual llvm::Value * + createMaskedInterleavedLoad(llvm::IRBuilder<> &builder, llvm::Type *ty, + llvm::Value *ptr, llvm::Value *mask, + llvm::Value *stride, llvm::Value *evl, + unsigned alignment) const; + + /// @brief Create a masked interleaved vector store. + /// Only lanes with a non-zero mask will be stored to the address. + /// + /// @note Pointers are scalar and need to be 'scalar aligned'. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] data Vector value to store to memory. + /// @param[in] ptr Memory address to store a vector value to. + /// @param[in] mask Vector mask used to disable storing certain lanes. + /// @param[in] stride Stride for interleaved memory operation. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes but obeys the mask parameter. + /// @param[in] alignment Alignment of the store. + /// + /// @return IR value that results from the masked interleaved vector store. + virtual llvm::Value * + createMaskedInterleavedStore(llvm::IRBuilder<> &builder, llvm::Value *data, + llvm::Value *ptr, llvm::Value *mask, + llvm::Value *stride, llvm::Value *evl, + unsigned alignment) const; + + /// @brief Create a gather vector load. + /// Vector lanes are loaded from different memory addresses. + /// + /// @note Pointers are scalar and need to be 'scalar aligned'. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] ty Value type to load from memory. + /// @param[in] ptr Memory address to load a vector value from. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes. + /// @param[in] alignment Alignment of the load. + /// + /// @return IR value that results from the gather vector load. + virtual llvm::Value *createGatherLoad(llvm::IRBuilder<> &builder, + llvm::Type *ty, llvm::Value *ptr, + llvm::Value *evl, + unsigned alignment) const; + + /// @brief Create a scatter vector store. + /// Vector lanes are stored to different memory addresses. + /// + /// @note Pointers are scalar and need to be 'scalar aligned'. + /// + /// @param[in] builder Builder used to create IR. + /// @param[in] data Vector value to store to memory. + /// @param[in] ptr Memory address to store a vector value to. + /// @param[in] evl 'effective vector length' of the operation. Must be + /// pre-scaled for vector operations. If evl is null, the operation is not + /// length-predicated: it executes on all lanes.
+ /// @param[in] alignment Alignment of the store.
+ ///
+ /// @return IR value that results from the scatter vector store.
+ virtual llvm::Value *createScatterStore(llvm::IRBuilder<> &builder,
+ llvm::Value *data, llvm::Value *ptr,
+ llvm::Value *evl,
+ unsigned alignment) const;
+
+ /// @brief Create a masked gather vector load.
+ /// Only lanes with a non-zero mask will be loaded from different
+ /// addresses.
+ /// Other lanes will contain undefined data.
+ ///
+ /// @note Pointers are scalar and need to be 'scalar aligned'.
+ ///
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] ty Value type to load from memory.
+ /// @param[in] ptr Memory address to load a vector value from.
+ /// @param[in] mask Vector mask used to disable loading certain lanes.
+ /// @param[in] evl 'effective vector length' of the operation. Must be
+ /// pre-scaled for vector operations. If evl is null, the operation is not
+ /// length-predicated: it executes on all lanes but obeys the mask parameter.
+ /// @param[in] alignment Alignment of the load.
+ ///
+ /// @return IR value that results from the masked gather vector load.
+ virtual llvm::Value *createMaskedGatherLoad(llvm::IRBuilder<> &builder,
+ llvm::Type *ty, llvm::Value *ptr,
+ llvm::Value *mask,
+ llvm::Value *evl,
+ unsigned alignment) const;
+
+ /// @brief Create a masked scatter vector store.
+ /// Only lanes with a non-zero mask will be stored to the address.
+ ///
+ /// @note Pointers are scalar and need to be 'scalar aligned'.
+ ///
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] data Vector value to store to memory.
+ /// @param[in] ptr Memory address to store a vector value to.
+ /// @param[in] mask Vector mask used to disable storing certain lanes.
+ /// @param[in] evl 'effective vector length' of the operation. Must be
+ /// pre-scaled for vector operations. If evl is null, the operation is not
+ /// length-predicated: it executes on all lanes but obeys the mask parameter.
+ /// @param[in] alignment Alignment of the store.
+ ///
+ /// @return IR value that results from the masked scatter vector store.
+ virtual llvm::Value *
+ createMaskedScatterStore(llvm::IRBuilder<> &builder, llvm::Value *data,
+ llvm::Value *ptr, llvm::Value *mask,
+ llvm::Value *evl, unsigned alignment) const;
+
+ /// @brief Create a scalable extractelement instruction. Note that the
+ /// operands are expected to have been pre-packetized before passing to this
+ /// function.
+ ///
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] Ctx Vectorization context.
+ /// @param[in] extract The original pre-packetized extractelement Instruction
+ /// @param[in] narrowTy Narrowed type of @a extract.
+ /// @param[in] src The packetized source vector
+ /// @param[in] index The packetized extraction index
+ /// @param[in] evl 'Effective vector length' of the operation. Must be
+ /// pre-scaled for vector operations. If evl is null, the operation is not
+ /// length-predicated: it executes on all lanes.
+ ///
+ /// @return A value identical to the requested extractelement
+ virtual llvm::Value *createScalableExtractElement(
+ llvm::IRBuilder<> &builder, vecz::VectorizationContext &Ctx,
+ llvm::Instruction *extract, llvm::Type *narrowTy, llvm::Value *src,
+ llvm::Value *index, llvm::Value *evl) const;
+
+ /// @brief Create an outer broadcast of a vector.
An outer broadcast is one
+ /// where a vector with length V is replicated in its entirety N times across
+ /// the lanes of a larger vector with length L x V. The broadcast factor is
+ /// expected to be scalable:
+ ///
+ /// outer_broadcast(<A,B>, vscale x 1) -> <A,B, A,B, ..., A,B>
+ ///
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] vector Vector to broadcast.
+ /// @param[in] VL Vector length.
+ /// @param[in] factor Broadcast factor.
+ virtual llvm::Value *
+ createOuterScalableBroadcast(llvm::IRBuilder<> &builder, llvm::Value *vector,
+ llvm::Value *VL,
+ llvm::ElementCount factor) const;
+
+ /// @brief Create an inner broadcast of a vector. An inner broadcast is one
+ /// where a vector with length V has its lanes individually and sequentially
+ /// replicated N times to fill a larger vector with length L x V. The
+ /// broadcast factor is expected to be a fixed amount:
+ ///
+ /// inner_broadcast(<A,B>, 2) -> <A,A,B,B>
+ ///
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] vector Vector to broadcast.
+ /// @param[in] VL Vector length.
+ /// @param[in] factor Broadcast factor.
+ virtual llvm::Value *
+ createInnerScalableBroadcast(llvm::IRBuilder<> &builder, llvm::Value *vector,
+ llvm::Value *VL,
+ llvm::ElementCount factor) const;
+
+ /// @brief Utility function for packetizing an insertelement instruction by a
+ /// scalable factor. Note that the operands are expected to have been
+ /// pre-packetized before passing to this function.
+ ///
+ /// @param[in] builder the builder to create the needed instructions
+ /// @param[in] Ctx Vectorization context.
+ /// @param[in] insert the original pre-packetized insertelement Instruction
+ /// @param[in] elt the packetized element to insert
+ /// @param[in] into the packetized source vector
+ /// @param[in] index the packetized insertion index
+ /// @param[in] evl 'Effective vector length' of the operation. Must be
+ /// pre-scaled for vector operations. If evl is null, the operation is not
+ /// length-predicated: it executes on all lanes.
+ ///
+ /// @return a value identical to the requested insertelement
+ virtual llvm::Value *createScalableInsertElement(
+ llvm::IRBuilder<> &builder, vecz::VectorizationContext &Ctx,
+ llvm::Instruction *insert, llvm::Value *elt, llvm::Value *into,
+ llvm::Value *index, llvm::Value *evl) const;
+
+ /// @brief Function allowing targets to customize the insertion of
+ /// instructions to calculate the vector-predicated kernel width.
+ ///
+ /// Note that this must return an i32 expression equivalent to:
+ /// umin(%factor, %remainingIters)
+ /// This is the expression computed if this function returns nullptr.
+ ///
+ /// @param[in] builder the builder to create the needed instructions
+ /// @param[in] remainingIters the remaining number of work-items being
+ /// executed in the work-group in the dimension being vectorized.
+ /// @param[in] widestEltTy an optimization hint indicating the widest (vector
+ /// element) type in the kernel. Must not be relied on for correctness.
+ /// @param[in] factor the vectorization width.
+ virtual llvm::Value *createVPKernelWidth(llvm::IRBuilder<> &builder,
+ llvm::Value *remainingIters,
+ unsigned widestEltTy,
+ llvm::ElementCount factor) const {
+ (void)builder;
+ (void)remainingIters;
+ (void)widestEltTy;
+ (void)factor;
+ return nullptr;
+ }
+
+ /// @brief Create a single-source vector shuffle with a general shuffle mask.
+ /// Can work with dynamic shuffle masks and scalable vectors, and can return
+ /// vectors of a different length to the source.
+ ///
+ /// @param[in] builder the builder to create the needed instructions
+ /// @param[in] src the source vector
+ /// @param[in] mask the shuffle mask
+ /// @param[in] evl 'Effective vector length' of the operation. Must be
+ /// pre-scaled for vector operations. If evl is null, the operation is not
+ /// length-predicated: it executes on all lanes.
+ ///
+ /// @return the result of the shuffle operation
+ virtual llvm::Value *createVectorShuffle(llvm::IRBuilder<> &builder,
+ llvm::Value *src, llvm::Value *mask,
+ llvm::Value *evl) const;
+
+ /// @brief Create a vector slide-up operation that moves all vector elements
+ /// up by one place, with the specified element inserted into the zeroth
+ /// position.
+ ///
+ /// @param[in] builder the builder to create the needed instructions
+ /// @param[in] src the source vector
+ /// @param[in] insert the value to slide into the vacant position
+ /// @param[in] evl 'Effective vector length' of the operation. Must be
+ /// pre-scaled for vector operations. If evl is null, the operation is not
+ /// length-predicated: it executes on all lanes.
+ ///
+ /// @return the result of the slide-up operation
+ virtual llvm::Value *createVectorSlideUp(llvm::IRBuilder<> &builder,
+ llvm::Value *src,
+ llvm::Value *insert,
+ llvm::Value *evl) const;
+
+ /// @brief Determine whether the specified group of interleaved memory
+ /// instructions can be optimized or not.
+ ///
+ /// @param[in] val Memory access operation.
+ /// @param[in] kind Kind of interleaved instructions.
+ /// @param[in] stride Stride of the interleaved memory operations.
+ /// @param[in] groupSize Number of interleaved operations in the group.
+ ///
+ /// @return true if the interleaved group can be optimized, false otherwise.
+ virtual bool canOptimizeInterleavedGroup(const llvm::Instruction &val,
+ InterleavedOperation kind,
+ int stride,
+ unsigned groupSize) const;
+
+ /// @brief Try to optimize a group of consecutive interleaved vector memory
+ /// instructions. These instructions collectively access a consecutive chunk
+ /// of memory and are sorted by increasing address.
+ ///
+ /// @note Pointers are scalar and need to be 'scalar aligned'.
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] Kind Kind of interleaved group to look for.
+ /// @param[in] group List of interleaved operations.
+ /// @param[in] masks List of mask operands.
+ /// @param[in] baseAddress Base pointer for the memory operation.
+ /// @param[in] stride Stride of the interleaved memory operations.
+ ///
+ /// @return true if the interleaved group was optimized, false otherwise.
+ virtual bool optimizeInterleavedGroup(llvm::IRBuilder<> &builder,
+ InterleavedOperation Kind,
+ llvm::ArrayRef<llvm::Value *> group,
+ llvm::ArrayRef<llvm::Value *> masks,
+ llvm::Value *baseAddress,
+ int stride) const;
+
+ /// @brief (De-)interleave a list of vectors.
+ ///
+ /// @param[in] builder Builder used to generate new instructions.
+ /// @param[in,out] vectors List of vectors to (de-)interleave.
+ /// @param[in] forward true to interleave, false to deinterleave.
+ ///
+ /// @return true if the vectors were (de-)interleaved, false otherwise.
+ virtual bool interleaveVectors(llvm::IRBuilder<> &builder,
+ llvm::MutableArrayRef<llvm::Value *> vectors,
+ bool forward) const;
+
+ /// @brief Estimates the widest SIMD width that will fit into registers for a
+ /// given set of values.
+ ///
+ /// @param[in] TTI the Target Transform Info
+ /// @param[in] vals Set of values to fit into registers
+ /// @param[in] width the widest SIMD width to consider
+ /// @return the widest SIMD width that is expected to fit into registers, or
+ /// zero if the set can never fit into registers.
+ virtual unsigned
+ estimateSimdWidth(const llvm::TargetTransformInfo &TTI,
+ const llvm::ArrayRef<const llvm::Value *> vals,
+ unsigned width) const;
+
+ /// @brief Get the preferred vector width for the given scalar type
+ ///
+ /// @param[in] TTI the Target Transform Info
+ /// @param[in] Ty the scalar type to get the width for
+ /// @return the preferred vector width
+ virtual unsigned getVectorWidthForType(const llvm::TargetTransformInfo &TTI,
+ const llvm::Type &Ty) const;
+
+ /// @brief Return whether the value can be packetized by the given width.
+ ///
+ /// @param[in] Val The value to be packetized
+ /// @param[in] Width The vectorization factor by which to packetize Val
+ /// @return true if the value can be packetized, false otherwise.
+ virtual bool canPacketize(const llvm::Value *Val,
+ llvm::ElementCount Width) const;
+
+ /// @return Whether a given vector type would be legal as the result of a
+ /// binary vp intrinsic.
+ virtual bool isVPVectorLegal(const llvm::Function &F, llvm::Type *Ty) const;
+
+protected:
+ /// @brief This type indicates legality of a VP/Masked memory operation in a
+ /// target.
+ class VPMemOpLegality {
+ public:
+ constexpr VPMemOpLegality() = default;
+ constexpr VPMemOpLegality(bool VPLegal, bool MaskLegal)
+ : VPLegal(VPLegal), MaskLegal(MaskLegal) {}
+
+ /// @brief States whether the operation is legal or not as a VP intrinsic.
+ void setVPLegality(bool Legal) { VPLegal = Legal; }
+
+ /// @brief States whether the operation is legal or not as a masked memory
+ /// operation.
+ void setMaskLegality(bool Legal) { MaskLegal = Legal; }
+
+ /// @brief Tests whether the operation is legal as a VP intrinsic.
+ constexpr bool isVPLegal() const { return VPLegal; }
+
+ /// @brief Tests whether the operation is legal as a masked memory
+ /// operation.
+ constexpr bool isMaskLegal() const { return MaskLegal; }
+
+ private:
+ bool VPLegal = false;
+ bool MaskLegal = false;
+ };
+
+ /// @brief Create an indices vector to be used in createScalableBroadcast()
+ ///
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] ty Type of the indices vector.
+ /// @param[in] factor Vectorization factor.
+ /// @param[in] URem Whether to broadcast a fixed-length vector to a scalable
+ /// one or a scalable-vector by a fixed amount.
+ /// @param[in] N Name of the value to produce.
+ static llvm::Value *createBroadcastIndexVector(llvm::IRBuilder<> &builder,
+ llvm::Type *ty,
+ llvm::ElementCount factor,
+ bool URem,
+ const llvm::Twine &N = "");
+
+ /// @return A VPMemOpLegality enum stating whether we can create a vp.load or
+ /// a masked.load intrinsic.
+ ///
+ /// @param[in] F The function in which the instruction will be created.
+ /// @param[in] Ty Type of the vector to load.
+ /// @param[in] Alignment Alignment of the operation.
+ /// @param[in] AddrSpace Address space of the operation.
+ virtual VPMemOpLegality isVPLoadLegal(const llvm::Function *F, llvm::Type *Ty,
+ unsigned Alignment,
+ unsigned AddrSpace) const;
+
+ /// @return A VPMemOpLegality enum stating whether we can create a vp.store or
+ /// a masked.store intrinsic.
+ ///
+ /// @param[in] F The function in which the instruction will be created.
+ /// @param[in] Ty Type of the vector to store.
+ /// @param[in] Alignment Alignment of the operation.
+ /// @param[in] AddrSpace Address space of the operation.
+ virtual VPMemOpLegality isVPStoreLegal(const llvm::Function *F,
+ llvm::Type *Ty, unsigned Alignment,
+ unsigned AddrSpace) const;
+
+ /// @return A VPMemOpLegality enum stating whether we can create a vp.gather
+ /// or a masked.gather intrinsic.
+ ///
+ /// @param[in] F The function in which the instruction will be created.
+ /// @param[in] Ty Type of the vector to gather.
+ /// @param[in] Alignment Alignment of the operation.
+ /// @param[in] AddrSpace Address space of the operation.
+ virtual VPMemOpLegality isVPGatherLegal(const llvm::Function *F,
+ llvm::Type *Ty, unsigned Alignment,
+ unsigned AddrSpace) const;
+
+ /// @return A VPMemOpLegality enum stating whether we can create a vp.scatter
+ /// or a masked.scatter intrinsic.
+ ///
+ /// @param[in] F The function in which the instruction will be created.
+ /// @param[in] Ty Type of the vector to scatter.
+ /// @param[in] Alignment Alignment of the operation.
+ /// @param[in] AddrSpace Address space of the operation.
+ virtual VPMemOpLegality isVPScatterLegal(const llvm::Function *F,
+ llvm::Type *Ty, unsigned Alignment,
+ unsigned AddrSpace) const;
+
+ /// @brief Function to check whether a given type is valid as the element type
+ /// of a scalable vector used in a VP intrinsic.
+ ///
+ /// @param[in] Ty The type to be checked.
+ virtual bool isLegalVPElementType(llvm::Type *Ty) const;
+
+ /// @brief LLVM target machine that will be used for compilation.
+ llvm::TargetMachine *TM_;
+
+private:
+ /// @brief Helper function to check legality of memory operations.
+ ///
+ /// @return Illegal in LLVM < 13; checks legality in LLVM >= 13.
+ VPMemOpLegality
+ checkMemOpLegality(const llvm::Function *F,
+ llvm::function_ref Checker,
+ llvm::Type *Ty, unsigned Alignment,
+ unsigned AddrSpace) const;
+
+ /// @brief Create a broadcast of a vector.
+ ///
+ /// @param[in] builder Builder used to create IR.
+ /// @param[in] vector Vector to broadcast.
+ /// @param[in] VL Vector length.
+ /// @param[in] factor Vectorization factor.
+ /// @param[in] URem Whether to broadcast a fixed-length vector to a scalable
+ /// one or a scalable-vector by a fixed amount.
+ llvm::Value *createScalableBroadcast(llvm::IRBuilder<> &builder,
+ llvm::Value *vector, llvm::Value *VL,
+ llvm::ElementCount factor,
+ bool URem) const;
+};
+
+/// @brief Caches and returns the TargetInfo for a Module.
+class TargetInfoAnalysis : public llvm::AnalysisInfoMixin<TargetInfoAnalysis> {
+ friend AnalysisInfoMixin<TargetInfoAnalysis>;
+
+public:
+ struct Result {
+ Result(std::unique_ptr<TargetInfo> &&I) : Info(std::move(I)) {}
+ /// Handle the invalidation of this information.
+ ///
+ /// When used as a result of TargetInfoAnalysis this method will be called
+ /// when the function this was computed for changes. When it returns false,
+ /// the information is preserved across those changes.
+ bool invalidate(llvm::Module &, const llvm::PreservedAnalyses &,
+ llvm::ModuleAnalysisManager::Invalidator &) {
+ return false;
+ }
+
+ operator TargetInfo *() { return Info.get(); }
+ operator const TargetInfo *() const { return Info.get(); }
+
+ std::unique_ptr<TargetInfo> Info;
+ };
+
+ using CallbackFn = std::function<Result(llvm::Module &)>;
+
+ TargetInfoAnalysis();
+
+ TargetInfoAnalysis(llvm::TargetMachine *TM);
+
+ TargetInfoAnalysis(CallbackFn TICallback) : TICallback(TICallback) {}
+
+ /// @brief Retrieve the TargetInfo for the requested module.
+ Result run(llvm::Module &M, llvm::ModuleAnalysisManager &) {
+ return TICallback(M);
+ }
+
+ /// @brief Return the name of the pass.
+ static llvm::StringRef name() { return "TargetInfo analysis"; }
+
+private:
+ /// @brief Unique pass identifier.
+ static llvm::AnalysisKey Key;
+
+ /// @brief Callback function producing a TargetInfo on demand.
+ CallbackFn TICallback;
+};
+
+std::unique_ptr<TargetInfo> createTargetInfoArm(llvm::TargetMachine *tm);
+
+std::unique_ptr<TargetInfo> createTargetInfoAArch64(llvm::TargetMachine *tm);
+
+std::unique_ptr<TargetInfo> createTargetInfoRISCV(llvm::TargetMachine *tm);
+
+/// @brief Create a new vector target info instance.
+/// @param[in] tm LLVM target machine that will be used for compilation, can
+/// be NULL if no target data is available.
+/// @return The new TargetInfo instance.
+std::unique_ptr<TargetInfo>
+createTargetInfoFromTargetMachine(llvm::TargetMachine *tm);
+
+/// @}
+} // namespace vecz
+
+#endif // VECZ_VECZ_TARGET_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp
new file mode 100644
index 0000000000000..cdeb01e71d77c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/control_flow_analysis.cpp
@@ -0,0 +1,99 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "analysis/control_flow_analysis.h"
+
+#include <llvm/ADT/PostOrderIterator.h>
+#include <llvm/Analysis/CFG.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Instructions.h>
+
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+
+#define DEBUG_TYPE "vecz-cf"
+
+using namespace llvm;
+using namespace vecz;
+
+////////////////////////////////////////////////////////////////////////////////
+
+llvm::AnalysisKey CFGAnalysis::Key;
+
+CFGResult CFGAnalysis::run(llvm::Function &F,
+ llvm::FunctionAnalysisManager &AM) {
+ CFGResult Res;
+
+ LLVM_DEBUG(dbgs() << "CONTROL FLOW ANALYSIS\n");
+
+ const UniformValueResult &UVR = AM.getResult<UniformValueAnalysis>(F);
+
+ bool mayDiverge = false;
+ for (BasicBlock &BB : F) {
+ // Update diverge information for a block which has varying branch.
+ auto *term = BB.getTerminator();
+ if (isa<ReturnInst>(term) || isa<UnreachableInst>(term)) {
+ // an "unreachable" terminator may be generated from an "optimization"
+ // of undefined behaviour in the IR; where a "trap" call has been
+ // introduced, the end of the Basic Block will never be reached.
+ // This should still be regarded as an exit block for our purposes.
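+ // (Illustrative: a block ending in
+ //   call void @llvm.trap()
+ //   unreachable
+ // has no successors and is treated here just like a 'ret' block.)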
+ if (Res.exitBB) {
+ emitVeczRemarkMissed(&F, &F,
+ "CFG should not have more than one exit block.");
+ Res.setFailed(true);
+ return Res;
+ }
+ Res.exitBB = &BB;
+ LLVM_DEBUG(dbgs() << BB.getName() << " returns\n");
+ } else if (BranchInst *B = dyn_cast<BranchInst>(term)) {
+ if (B->isConditional()) {
+ auto *const cond = B->getCondition();
+ if (cond && UVR.isVarying(cond)) {
+ mayDiverge = true;
+ }
+ }
+ } else if (isa<SwitchInst>(term)) {
+ // Control Flow Conversion Pass is not able to handle switch instructions.
+ emitVeczRemarkMissed(&F, &F, "Unexpected Switch instruction.");
+ Res.setFailed(true);
+ return Res;
+ }
+ }
+
+ if (!Res.getExitBlock()) {
+ emitVeczRemarkMissed(&F, &F, "Non-terminating CFG in");
+ Res.setFailed(true);
+ return Res;
+ }
+
+ const LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+ using RPOTraversal = ReversePostOrderTraversal<const Function *>;
+ const RPOTraversal FuncRPOT(&F);
+ if (containsIrreducibleCFG(FuncRPOT, LI)) {
+ emitVeczRemarkMissed(&F, &F, "Irreducible loop detected in");
+ Res.setFailed(true);
+ return Res;
+ }
+
+ if (mayDiverge) {
+ Res.setConversionNeeded(true);
+ }
+
+ return Res;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
new file mode 100644
index 0000000000000..39c78b01a2bf4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/divergence_analysis.cpp
@@ -0,0 +1,808 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "analysis/divergence_analysis.h"
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/PostOrderIterator.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Analysis/PostDominators.h>
+#include <llvm/IR/Dominators.h>
+#include <llvm/IR/Function.h>
+
+#include <algorithm>
+#include <memory>
+
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+using RPOT = ReversePostOrderTraversal<Function *>;
+} // namespace
+
+BlockQueue::BlockQueue(const DivergenceResult &dr,
+ const DenseSet<BasicBlock *> &blocks)
+ : DR(dr) {
+ indices.reserve(blocks.size());
+ for (auto *const BB : blocks) {
+ indices.push_back(DR.getTagIndex(BB));
+ }
+
+ // Note that make_heap builds a Max heap, so we use `std::greater` to get a
+ // Min heap.
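+ // (Illustrative: if tag indices {5, 2, 7} are pushed, pop() yields 2, then
+ // 5, then 7, so blocks come out in increasing dominance-compact order.)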
+ std::make_heap(indices.begin(), indices.end(), std::greater<size_t>());
+}
+
+const BasicBlockTag &BlockQueue::pop() {
+ assert(!indices.empty() && "Trying to pop from an empty BlockQueue");
+ std::pop_heap(indices.begin(), indices.end(), std::greater<size_t>());
+ const auto popped_index = indices.back();
+ indices.pop_back();
+
+ return DR.getBlockTag(popped_index);
+}
+
+void BlockQueue::push(size_t index) {
+ indices.push_back(index);
+ std::push_heap(indices.begin(), indices.end(), std::greater<size_t>());
+}
+
+void BlockQueue::push(const BasicBlock *bb) {
+ indices.push_back(DR.getTagIndex(bb));
+ std::push_heap(indices.begin(), indices.end(), std::greater<size_t>());
+}
+
+DivergenceResult::DivergenceResult(Function &F, FunctionAnalysisManager &AM)
+ : F(F), AM(AM) {}
+
+size_t DivergenceResult::getTagIndex(const llvm::BasicBlock *BB) const {
+ assert(BB && "Trying to get the tag of a null BasicBlock");
+ auto iter = BBMap.find(BB);
+ assert(iter != BBMap.end() && "BasicBlock tag is not defined");
+ return iter->second;
+}
+
+BasicBlockTag &DivergenceResult::getOrCreateTag(BasicBlock *BB) {
+ assert(BB && "Trying to get the tag of a null BasicBlock");
+ const auto &result = BBMap.try_emplace(BB, basicBlockTags.size());
+ if (result.second) {
+ // It's a new map entry, so create the new tag and return it.
+ basicBlockTags.emplace_back();
+ auto &tag = basicBlockTags.back();
+ tag.BB = BB;
+ return tag;
+ }
+ // Return the indexed tag.
+ return basicBlockTags[result.first->second];
+}
+
+LoopTag &DivergenceResult::getTag(const Loop *L) const {
+ assert(L && "Trying to get the tag of a null loop");
+ auto iter = LMap.find(L);
+ assert(iter != LMap.end() && "Loop tag is not defined");
+ return *iter->second;
+}
+
+LoopTag &DivergenceResult::getOrCreateTag(Loop *L) {
+ assert(L && "Trying to get or create the tag of a null loop");
+ auto &tag = LMap[L];
+ if (!tag) {
+ tag = std::make_unique<LoopTag>();
+ tag->loop = L;
+ }
+ return *tag;
+}
+
+bool DivergenceResult::hasFlag(const BasicBlock &BB,
+ BlockDivergenceFlag F) const {
+ return (getTag(&BB).divergenceFlag & F) == F;
+}
+
+BlockDivergenceFlag DivergenceResult::getFlag(const BasicBlock &BB) const {
+ return getTag(&BB).divergenceFlag;
+}
+
+void DivergenceResult::setFlag(const BasicBlock &BB, BlockDivergenceFlag F) {
+ auto &tag = getTag(&BB);
+ tag.divergenceFlag = static_cast<BlockDivergenceFlag>(tag.divergenceFlag | F);
+}
+
+void DivergenceResult::clearFlag(const BasicBlock &BB, BlockDivergenceFlag F) {
+ auto &tag = getTag(&BB);
+ tag.divergenceFlag =
+ static_cast<BlockDivergenceFlag>(tag.divergenceFlag & ~F);
+}
+
+bool DivergenceResult::isDivCausing(const BasicBlock &BB) const {
+ return (hasFlag(BB, BlockDivergenceFlag::eBlockHasDivergentBranch) ||
+ hasFlag(BB, BlockDivergenceFlag::eBlockHasDivergentBranchFake));
+}
+
+bool DivergenceResult::isDivergent(const BasicBlock &BB) const {
+ return hasFlag(BB, BlockDivergenceFlag::eBlockIsDivergent);
+}
+
+bool DivergenceResult::isOptional(const BasicBlock &BB) const {
+ return !isDivergent(BB);
+}
+
+bool DivergenceResult::isByAll(const BasicBlock &BB) const {
+ return hasFlag(BB, BlockDivergenceFlag::eBlockIsByAll);
+}
+
+bool DivergenceResult::isBlend(const BasicBlock &BB) const {
+ return hasFlag(BB, BlockDivergenceFlag::eBlockIsBlend);
+}
+
+bool DivergenceResult::isUniform(const BasicBlock &BB) const {
+ return hasFlag(BB, BlockDivergenceFlag::eBlockIsUniform);
+}
+
+bool DivergenceResult::hasFlag(const Loop &L, LoopDivergenceFlag F) const {
+ return (getTag(&L).divergenceFlag & F) == F;
+}
+
+LoopDivergenceFlag DivergenceResult::getFlag(const Loop
&L) const {
+ return getTag(&L).divergenceFlag;
+}
+
+void DivergenceResult::setFlag(const Loop &L, LoopDivergenceFlag F) {
+ auto &tag = getTag(&L);
+ tag.divergenceFlag = static_cast<LoopDivergenceFlag>(tag.divergenceFlag | F);
+}
+
+void DivergenceResult::clearFlag(const Loop &L, LoopDivergenceFlag F) {
+ auto &tag = getTag(&L);
+ tag.divergenceFlag = static_cast<LoopDivergenceFlag>(tag.divergenceFlag & ~F);
+}
+
+bool DivergenceResult::computeBlockOrdering(DominatorTree &DT) {
+ LLVM_DEBUG(dbgs() << "Divergence Analysis: COMPUTE BLOCK ORDERING\n");
+
+ // The DCBI (Dominance Compact Block Indexing) is a topological ordering of
+ // the basic blocks that is also dominance compact, that is, an ordering such
+ // that for any block A, every block that A dominates follows in a contiguous
+ // subsequence in the ordering. To construct this, we gather a reverse post-
+ // order traversal over the CFG, and then a depth-first traversal over the
+ // dominator tree, ordering each node's children according to the previously
+ // calculated reverse post-order. We need to take special care of loop exits,
+ // however, since where a loop exits from some block other than a latch,
+ // the dominator tree traversal can erroneously order it inside of the loop.
+ // To prevent this, we store up exit blocks until we have processed all
+ // the blocks at the current loop level.
+
+ struct DCnode {
+ BasicBlock *BB;
+ unsigned depth = 0;
+ };
+ std::vector<DCnode> graph;
+ llvm::DenseMap<const BasicBlock *, unsigned> indexMap;
+
+ indexMap.reserve(F.size());
+ {
+ // Note that a post-order traversal of the CFG does not include any blocks
+ // with no predecessors, other than the entry block.
+ unsigned index = 0;
+ for (auto *const BB : RPOT(&F)) {
+ indexMap[BB] = index++;
+ graph.emplace_back();
+ graph.back().BB = BB;
+
+ if (const auto *const LTag = getTag(BB).loop) {
+ graph.back().depth = LTag->loop->getLoopDepth();
+ }
+ }
+ }
+
+ // Do a depth-first traversal of the dominator tree
+ SmallVector<unsigned, 16> stack;
+ stack.push_back(0);
+ uint32_t pos = 0;
+ const SmallVector<unsigned, 16> children;
+ SmallVector<unsigned, 16> loopExits;
+ while (!stack.empty()) {
+ const auto u = stack.pop_back_val();
+ const auto &uNode = graph[u];
+
+ getTag(uNode.BB).pos = pos++;
+
+ // Children in the same loop or subloops get added back to the stack.
+ // Children outside of the current loop get stored up until we have
+ // processed everything in this loop. Note that we can accumulate exit
+ // blocks from multiple points within the loop, and across multiple depth
+ // levels.
+ auto *const DTNode = DT.getNode(uNode.BB);
+ unsigned stacked = 0;
+ for (auto *const childNode : make_range(DTNode->begin(), DTNode->end())) {
+ const auto child = indexMap[childNode->getBlock()];
+ auto &cNode = graph[child];
+ if (cNode.depth >= uNode.depth) {
+ stack.push_back(child);
+ ++stacked;
+ } else {
+ // Note that we can exit across more than one loop level, so we need to
+ // find the right place to insert it.
+ auto insert = loopExits.end();
+ while (insert != loopExits.begin()) {
+ auto scan = insert - 1;
+ if (cNode.depth < graph[*scan].depth) {
+ insert = scan;
+ } else {
+ break;
+ }
+ }
+ loopExits.insert(insert, child);
+ }
+ }
+ // Sort any children added to the stack into post-order
+ std::sort(stack.end() - stacked, stack.end(), std::greater<unsigned>());
+
+ if (!loopExits.empty()) {
+ const unsigned curDepth = stack.empty() ?
0 : graph[stack.back()].depth;
+ const unsigned depth = std::max(curDepth, graph[loopExits.back()].depth);
+ unsigned count = 0;
+ while (!loopExits.empty() && depth == graph[loopExits.back()].depth) {
+ stack.push_back(loopExits.pop_back_val());
+ ++count;
+ }
+
+ // Sort the loop exits into post-order
+ std::sort(stack.end() - count, stack.end(), std::greater<unsigned>());
+ }
+ }
+ assert(pos == graph.size() && "Incomplete DCBI");
+
+ reorderTags(pos);
+ return true;
+}
+
+void DivergenceResult::reorderTags(size_t n) {
+ numOrderedBlocks = n;
+
+ // This is a Cycle Sort. It re-orders the tags in the tag vector according to
+ // their calculated block index. Despite the two nested loops, it is O(n).
+ // Out-of-range indices (pos >= n) will be left where they are, but a later
+ // ordered tag might move them afterwards.
+ for (size_t i = 0, n = basicBlockTags.size(); i != n; ++i) {
+ auto &tag = basicBlockTags[i];
+ while (tag.pos < n && tag.pos != i) {
+ std::swap(tag, basicBlockTags[tag.pos]);
+ }
+ }
+
+ // Rebuild the index map after sorting. Note that we can't absorb this into
+ // the above loop, since an unordered tag might not be in its final position
+ // until all of the ordered tags are in their correct places.
+ for (size_t i = 0, n = basicBlockTags.size(); i != n; ++i) {
+ BBMap[basicBlockTags[i].BB] = i;
+ }
+}
+
+bool DivergenceResult::computeLoopOrdering() {
+ loopOrdering.clear();
+ for (const auto &pair : LMap) {
+ loopOrdering.push_back(pair.second.get());
+ }
+
+ std::sort(loopOrdering.begin(), loopOrdering.end(),
+ [](const LoopTag *LHS, const LoopTag *RHS) -> bool {
+ return LHS->loop->getLoopDepth() < RHS->loop->getLoopDepth();
+ });
+
+ return true;
+}
+
+void DivergenceResult::markDivCausing(BasicBlock &BB, DivergenceInfo &DI,
+ PostDominatorTree &PDT) {
+ if (isDivCausing(BB)) {
+ return;
+ }
+
+ divCausingBlocks.push_back(&BB);
+ setFlag(BB, BlockDivergenceFlag::eBlockHasDivergentBranch);
+ LLVM_DEBUG(dbgs() << "Block " << BB.getName() << " is div_causing\n");
+
+ for (BasicBlock *succ : successors(&BB)) {
+ markDivergent(*succ);
+ }
+
+ // If a block is a join point (blend) of `BB`, then it is divergent (unless
+ // it is the post-dominator of `BB`).
+ const auto &joins = joinPoints(BB);
+ for (BasicBlock *const join : joins) {
+ setFlag(*join, BlockDivergenceFlag::eBlockIsBlend);
+ LLVM_DEBUG(dbgs() << "\tBlock " << join->getName() << " is blend\n");
+
+ if (!PDT.dominates(join, &BB)) {
+ markDivergent(*join);
+ }
+
+ for (BasicBlock *const pred : predecessors(join)) {
+ // If at least 2 successors of `pred` are join points of `BB`, then mark
+ // `pred` as a fake div causing block because its successors may be
+ // executed by multiple work-items.
+ if (std::count_if(
+ succ_begin(pred), succ_end(pred),
+ [&joins](BasicBlock *succ) { return joins.count(succ); }) > 1) {
+ fakeDivCausingBlocks.insert(pred);
+ }
+ }
+
+ // Join points of divergent branches need their PHIs marked varying.
+ DI.insert(join);
+ }
+}
+
+void DivergenceResult::markDivLoopDivBlocks(BasicBlock &BB, Loop &L,
+ DivergenceInfo &DI) {
+ markDivergent(L);
+
+ // Find loop exits through which some work-items may leave the loop while
+ // others keep iterating over it. These exit blocks can be reached from the
+ // div_causing block before reaching the latch because the divergent path
+ // cannot fully reconverge before leaving the loop (since the loop is
+ // divergent).
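+ // (Illustrative: in 'while (i < n) { if (id < k) break; ... }' with a
+ // varying 'id', some work-items take the 'break' exit while others keep
+ // iterating, so that exit block is divergent.)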
+ SmallVector<BasicBlock *, 4> exits;
+ L.getExitBlocks(exits);
+ const auto &divergentExits = escapePoints(BB, L);
+ for (BasicBlock *E : exits) {
+ if (divergentExits.contains(E)) {
+ markDivergent(*E);
+ }
+ // All loop exits of a divergent loop need their PHIs marked varying.
+ DI.insert(E);
+ }
+
+ // The latch of a divergent loop is divergent.
+ markDivergent(*L.getLoopLatch());
+}
+
+void DivergenceResult::markDivergent(const BasicBlock &BB) {
+ if (!isDivergent(BB)) {
+ setFlag(BB, BlockDivergenceFlag::eBlockIsDivergent);
+ LLVM_DEBUG(dbgs() << "\tBlock " << BB.getName() << " is divergent\n");
+ }
+}
+
+void DivergenceResult::markDivergent(const Loop &L) {
+ if (!getTag(&L).isLoopDivergent()) {
+ setFlag(L, LoopDivergenceFlag::eLoopIsDivergent);
+ LLVM_DEBUG(dbgs() << "\tLoop " << L.getName() << " is divergent\n");
+ }
+}
+
+void DivergenceResult::markByAll(BasicBlock &src) {
+ Function &F = *src.getParent();
+ const DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ const PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+
+ BlockQueue queue(*this);
+ queue.push(&src);
+
+ while (!queue.empty()) {
+ auto &BBTag = queue.pop();
+ auto *const BB = BBTag.BB;
+
+ if (isByAll(*BB)) {
+ continue;
+ }
+
+ const bool isHeaderDivLoop =
+ BBTag.isLoopHeader() && BBTag.loop->isLoopDivergent();
+ // If BB is a loop header, it can only be marked by_all if its loop does not
+ // diverge.
+ if (!isHeaderDivLoop) {
+ setFlag(*BB, BlockDivergenceFlag::eBlockIsByAll);
+ LLVM_DEBUG(dbgs() << "Block " << BB->getName() << " is by_all\n");
+ }
+
+ SmallVector<BasicBlock *, 8> descendants;
+ DT.getDescendants(BB, descendants);
+
+ // For all descendants `D` of `BB` that post-dominate `BB`, `D` is by_all.
+ for (BasicBlock *D : descendants) {
+ if (D != BB) {
+ if (PDT.dominates(D, BB)) {
+ const auto DIndex = getTagIndex(D);
+ const auto *const DLoopTag = basicBlockTags[DIndex].loop;
+ // If we are not in a loop, or the loop we live in does not diverge,
+ // nor does the enclosing one if it exists, then mark by_all.
+ if (DLoopTag) {
+ if (DLoopTag->isLoopDivergent())
+ continue;
+ Loop *parentLoop = DLoopTag->loop->getParentLoop();
+ if (parentLoop && !isByAll(*parentLoop->getHeader()))
+ continue;
+ }
+ queue.push(DIndex);
+ }
+ }
+ }
+
+ // For all descendants `D` of `BB` that do not post-dominate `BB`, `D` is
+ // by_all if all predecessors of `D` are by_all.
+ //
+ // If BB is a divergent branch, it cannot propagate by_all to its
+ // successors.
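+ // (e.g. if BB ends in 'br i1 %varying, label %a, label %b', neither %a
+ // nor %b is necessarily executed by all work-items, even when BB itself
+ // is by_all.)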
+ if (!isHeaderDivLoop && !isDivCausing(*BB)) {
+ for (BasicBlock *D : descendants) {
+ if (D != BB) {
+ if (!PDT.dominates(D, BB)) {
+ if (std::all_of(
+ pred_begin(D), pred_end(D),
+ [this](BasicBlock *pred) { return isByAll(*pred); })) {
+ queue.push(D);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+bool DivergenceResult::isReachable(BasicBlock *src, BasicBlock *dst,
+ bool allowLatch) const {
+ DenseSet<BasicBlock *> visited;
+ std::vector<BasicBlock *> worklist;
+
+ worklist.push_back(src);
+ visited.insert(src);
+
+ while (!worklist.empty()) {
+ BasicBlock *BB = worklist.back();
+ worklist.pop_back();
+
+ if (BB == dst) {
+ return true;
+ }
+
+ const auto &BBTag = getTag(BB);
+ for (BasicBlock *succ : successors(BB)) {
+ if (!allowLatch && BBTag.isLoopBackEdge(succ)) {
+ continue;
+ }
+ if (visited.insert(succ).second) {
+ worklist.push_back(succ);
+ }
+ }
+ }
+
+ return false;
+}
+
+DenseSet<BasicBlock *> DivergenceResult::joinPoints(BasicBlock &src) const {
+ if (src.getTerminator()->getNumSuccessors() < 2) {
+ return {};
+ }
+
+ Function &F = *src.getParent();
+ const PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+
+ DenseMap<BasicBlock *, const BasicBlock *> defMap;
+ DenseSet<BasicBlock *> joins;
+
+ BlockQueue queue(*this);
+
+ auto schedule = [&defMap, &joins, &queue](BasicBlock *block,
+ const BasicBlock *defBlock) {
+ auto defIt = defMap.find(block);
+ // First time we meet this block; not a join (yet).
+ if (defIt == defMap.end()) {
+ queue.push(block);
+ defMap.insert({block, defBlock});
+ } else if (defIt->second != defBlock) {
+ // We've found a block that has two different incoming definitions; it is
+ // a join point.
+ joins.insert(block);
+ }
+ };
+
+ for (BasicBlock *const succ : successors(&src)) {
+ schedule(succ, succ);
+ }
+
+ auto *Node = PDT.getNode(&src);
+ assert(Node && "Could not get node");
+ auto *IDom = Node->getIDom();
+ assert(IDom && "Could not get IDom");
+ BasicBlock *PIDom = IDom->getBlock();
+ assert(PIDom && "Could not get block");
+
+ while (!queue.empty()) {
+ auto &curTag = queue.pop();
+ BasicBlock *cur = curTag.BB;
+
+ if (cur == PIDom) {
+ continue;
+ }
+
+ const BasicBlock *const defBlock = defMap.find(cur)->second;
+
+ const auto *const curLTag = curTag.loop;
+ // If the successor is the header of a nested loop, pretend it's a single
+ // node with the loop's exits as successors.
+ if (curLTag && curLTag->header == cur) {
+ SmallVector<BasicBlock *, 4> exits;
+ curLTag->loop->getUniqueExitBlocks(exits);
+ for (BasicBlock *const exit : exits) {
+ if (exit == &src) {
+ continue;
+ }
+ schedule(exit, defBlock);
+ }
+ } else {
+ // The successors are either on the same loop level, or loop exits.
+ for (BasicBlock *const succ : successors(cur)) {
+ if (succ == &src) {
+ continue;
+ }
+ schedule(succ, defBlock);
+ }
+ }
+ }
+
+ return joins;
+}
+
+DenseSet<BasicBlock *> DivergenceResult::escapePoints(const BasicBlock &src,
+ const Loop &L) const {
+ const LoopTag &LTag = getTag(&L);
+
+ DenseSet<BasicBlock *> divergentExits;
+
+ DenseSet<const BasicBlock *> visited;
+ BlockQueue queue(*this);
+
+ queue.push(&src);
+ visited.insert(&src);
+
+ while (!queue.empty()) {
+ const auto &BBTag = queue.pop();
+ auto *const BB = BBTag.BB;
+
+ // We found a divergent loop exit.
+ if (!L.contains(BB)) {
+ divergentExits.insert(BB);
+ continue;
+ }
+
+ bool allowLatch = true;
+ auto *const loopTag = BBTag.loop;
+ // 'BB' is a backedge
+ if (loopTag && loopTag->latch == BB) {
+ if (loopTag == &LTag) {
+ // `BB` is the latch of the current loop; forbid the backedge.
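+ // (Otherwise the walk could re-enter the loop body and reach every
+ // exit via a later iteration, making all exits look divergent.)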
+ allowLatch = false;
+ } else {
+ // Otherwise, forbid the backedge only if none of the remaining blocks
+ // in the queue belong to `L`, in which case no exit block starting
+ // from the header of the nested loop can be divergent.
+ allowLatch =
+ std::any_of(queue.begin(), queue.end(), [this, &L](size_t index) {
+ return L.contains(basicBlockTags[index].BB);
+ });
+ }
+ }
+
+ for (BasicBlock *succ : successors(BB)) {
+ if (BBTag.isLoopBackEdge(succ) && !allowLatch) {
+ continue;
+ }
+ if (visited.insert(succ).second) {
+ queue.push(succ);
+ }
+ }
+ }
+
+ return divergentExits;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+llvm::AnalysisKey DivergenceAnalysis::Key;
+
+DivergenceResult DivergenceAnalysis::run(llvm::Function &F,
+ llvm::FunctionAnalysisManager &AM) {
+ DivergenceResult Res(F, AM);
+
+ LLVM_DEBUG(dbgs() << "DIVERGENCE ANALYSIS\n");
+ Res.basicBlockTags.reserve(F.size() * 4);
+
+ // Prepare the BasicBlockTags.
+ const LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+ for (BasicBlock &BB : F) {
+ // Create BB info entries.
+ BasicBlockTag &BBTag = Res.getOrCreateTag(&BB);
+
+ // Update loop info.
+ if (Loop *L = LI.getLoopFor(&BB)) {
+ if (!BBTag.loop) {
+ BBTag.loop = &Res.getOrCreateTag(L);
+ BBTag.loop->latch = L->getLoopLatch();
+ BBTag.loop->header = L->getHeader();
+ BBTag.loop->preheader = L->getLoopPreheader();
+ }
+ }
+ }
+
+ // Find loop live values and update loop exit information.
+ Res.computeLoopOrdering();
+ for (auto *const LTag : Res.loopOrdering) {
+ SmallVector<BasicBlock *, 4> loopExitBlocks;
+ LTag->loop->getExitBlocks(loopExitBlocks);
+ for (BasicBlock *BB : loopExitBlocks) {
+ auto &BBTag = Res.getTag(BB);
+ // If BB already leaves a loop, update it if the previous loop is nested
+ // in the current.
+ if (BBTag.outermostExitedLoop) {
+ if (BBTag.outermostExitedLoop->loop->getLoopDepth() >
+ LTag->loop->getLoopDepth()) {
+ BBTag.outermostExitedLoop = LTag;
+ }
+ } else {
+ BBTag.outermostExitedLoop = LTag;
+ }
+
+ // The LoopSimplify pass has already converted SSA form to LCSSA form.
+ // Let's use LCSSA phi nodes to find loop live variables, like the LLVM
+ // loop vectorizer does.
+ // The LoopSimplify pass is added in the PreparationPass of vectorizer.cpp.
+ //
+ // See the head comment in lib/Transforms/Utils/LCSSA.cpp
+ for (Instruction &I : *BB) {
+ if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+ // An LCSSA phi has incoming values defined in the loop.
+ for (Value *incoming : PHI->incoming_values()) {
+ if (Instruction *incomingInst = dyn_cast<Instruction>(incoming)) {
+ if (LTag->loop->contains(incomingInst->getParent())) {
+ LTag->loopLiveValues.insert(incoming);
+ LLVM_DEBUG(dbgs() << *incoming << " is a loop live value of "
+ << LTag->loop->getName() << "\n");
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // From the UVA, we know which conditions are varying, which allows us to
+ // find divergent branches.
+ // Moreover, from divergent branches - and therefore from divergent paths -
+ // we can find more varying values that are computed on those divergent paths.
+ // The latter allows us to find more divergent branches, and so on...
+ // We take a local copy of the UVR because it is not good to modify one
+ // analysis result from another analysis. However, after Control Flow
+ // Conversion has been run, all control flow divergence is converted into
+ // non-uniform dataflow, so any subsequent run of the UVA is still correct.
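+ // The worklist loop below is a fixed-point iteration: marking PHIs varying
+ // can turn more branches varying, which can mark further PHIs, and so on
+ // until no update is made.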
+ auto UVR = AM.getResult<UniformValueAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+
+ Res.computeBlockOrdering(DT);
+
+ std::vector<std::pair<BasicBlock *, Value *>> uniformBranches;
+ uniformBranches.reserve(F.size() - 1u);
+ for (BasicBlock &BB : F) {
+ if (BranchInst *B = dyn_cast<BranchInst>(BB.getTerminator())) {
+ if (B->isConditional()) {
+ uniformBranches.push_back({&BB, B->getCondition()});
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) {
+ uniformBranches.push_back({&BB, SI->getCondition()});
+ }
+ }
+
+ while (!uniformBranches.empty()) {
+ // Partition the list so all the varying branches are grouped at the end.
+ const auto varyingBranches =
+ std::partition(uniformBranches.begin(), uniformBranches.end(),
+ [&UVR](std::pair<BasicBlock *, Value *> &p) -> bool {
+ return !UVR.isVarying(p.second);
+ });
+
+ // Process all the varying branches.
+ DivergenceInfo divergenceInfo;
+ for (auto it = varyingBranches; it != uniformBranches.end(); ++it) {
+ BasicBlock *BB = it->first;
+
+ // Find blocks diverged by varying branch block.
+ Res.markDivCausing(*BB, divergenceInfo, PDT);
+
+ if (const auto *const LTag = Res.getTag(BB).loop) {
+ Loop *L = LTag->loop;
+ while (L) {
+ // If BB is a varying branch, mark the loop as diverging if any two
+ // instances of a SIMD group can leave the loop over different exit
+ // edges and/or in different iterations. This means that BB cannot
+ // be postdominated by any block of L.
+ auto *Node = PDT.getNode(BB);
+ assert(Node && "Could not get node");
+ auto *IDom = Node->getIDom();
+ assert(IDom && "Could not get IDom");
+ BasicBlock *PIDom = IDom->getBlock();
+ if (!L->contains(PIDom)) {
+ Res.markDivLoopDivBlocks(*BB, *L, divergenceInfo);
+ } else {
+ // If the loop does not diverge because of `BB`, none of its
+ // parent loops can diverge either.
+ break;
+ }
+ L = L->getParentLoop();
+ }
+ }
+ }
+
+ // Remove all the varying branches from the end of the list.
+ uniformBranches.erase(varyingBranches, uniformBranches.end());
+
+ // PHIs defined in join points of divergent branches and in exit blocks of
+ // divergent loops are varying.
+ bool updated = false;
+ for (BasicBlock *BB : divergenceInfo) {
+ const bool exitedLoop = Res.getTag(BB).outermostExitedLoop;
+ for (Instruction &I : *BB) {
+ if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+ // Loop exits might have constant phi nodes (lcssa value).
+ if (exitedLoop || !PHI->hasConstantOrUndefValue()) {
+ if (!UVR.isVarying(&I)) {
+ updated = true;
+ UVR.markVaryingValues(&I);
+ LLVM_DEBUG(dbgs()
+ << I.getName() << " is a varying instruction\n");
+ }
+ }
+ } else {
+ break;
+ }
+ }
+ }
+ if (!updated) {
+ // We made no updates, so we processed all the varying branches.
+ break;
+ }
+ }
+
+ // All blocks that are predecessors of join points of div causing blocks and
+ // have a uniform condition must be marked as fake div causing blocks, because
+ // divergence may have occurred at the div causing block and we must make sure
+ // we execute all paths that lead to the join point.
+ for (BasicBlock *BB : Res.fakeDivCausingBlocks) {
+ if (BB->getTerminator()->getNumSuccessors() > 1 && !Res.isDivCausing(*BB)) {
+ Res.setFlag(*BB, BlockDivergenceFlag::eBlockHasDivergentBranchFake);
+ LLVM_DEBUG(dbgs() << "Found fake div causing block " << BB->getName()
+ << "\n");
+ // Because we have marked `BB` as a target for linearization, its join
+ // points must be marked as `blend` because they may lose some
+ // predecessors during the rewiring.
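+ // (e.g. in a diamond A -> {B, C} -> D where A is fake div causing, D
+ // may lose one of its predecessors when the CFG is rewired, so its PHI
+ // values must already be blended correctly.)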
+ for (BasicBlock *join : Res.joinPoints(*BB)) {
+ Res.setFlag(*join, BlockDivergenceFlag::eBlockIsBlend);
+ LLVM_DEBUG(dbgs() << "\tBlock " << join->getName() << " is blend\n");
+ }
+ }
+ }
+
+ // By definition, the entry block is by_all.
+ Res.markByAll(F.getEntryBlock());
+
+ return Res;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
new file mode 100644
index 0000000000000..f14239789e598
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/instantiation_analysis.cpp
@@ -0,0 +1,135 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "analysis/instantiation_analysis.h"
+
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/Instructions.h>
+
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+#include "memory_operations.h"
+#include "vectorization_context.h"
+
+#define DEBUG_TYPE "vecz-instantiation"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+bool analyzeType(Type *Ty) {
+ return !Ty->isVoidTy() && !Ty->isVectorTy() &&
+ !FixedVectorType::isValidElementType(Ty);
+}
+
+bool analyzeMemOp(MemOp &Op) {
+ assert(Op.getPointerType()->isPointerTy() && "MemOp inconsistency");
+ return analyzeType(Op.getDataType());
+}
+
+bool analyzeCall(const VectorizationContext &Ctx, CallInst *CI) {
+ Function *Callee = CI->getCalledFunction();
+ VECZ_FAIL_IF(!Callee);
+
+ // Handle internal builtins.
+ if (Ctx.isInternalBuiltin(Callee)) {
+ if (auto Op = MemOp::get(CI)) {
+ return analyzeMemOp(*Op);
+ }
+ return false;
+ }
+
+ // Handle functions taking pointers as parameters.
+ if (any_of(Callee->args(),
+ [](const Argument &A) { return A.getType()->isPointerTy(); })) {
+ return true;
+ }
+
+ // Handle masked function calls.
+ if (Ctx.isMaskedFunction(Callee)) {
+ return true;
+ }
+
+ auto B = Ctx.builtins().analyzeBuiltin(*Callee);
+ const auto Props = B ? B->properties : 0;
+
+ // Intrinsics without side-effects can be safely instantiated.
+ if (Callee->isIntrinsic() &&
+ (Props & compiler::utils::eBuiltinPropertyNoSideEffects)) {
+ // If the intrinsic has a vector equivalent, then we can use it directly
+ // instead.
+ if (Props & compiler::utils::eBuiltinPropertyVectorEquivalent) {
+ return analyzeType(CI->getType());
+ }
+ return true;
+ }
+
+ // Functions returning void must have side-effects.
+ // We cannot vectorize them and instead we need to instantiate them.
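+ // (e.g. an OpenCL builtin like vstore4 returns void and only writes
+ // memory, so each lane's call has to be emitted individually.)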
+ const bool HasSideEffects =
+ Callee->getReturnType()->isVoidTy() ||
+ (Props & compiler::utils::eBuiltinPropertySideEffects);
+ if (HasSideEffects &&
+ (Props & compiler::utils::eBuiltinPropertySupportsInstantiation)) {
+ return true;
+ }
+
+ return analyzeType(CI->getType());
+}
+
+bool analyzeAlloca(const VectorizationContext &Ctx, AllocaInst *alloca) {
+ // Possibly, we could packetize by creating a wider array, but for now let's
+ // just let instantiation deal with it.
+ if (alloca->isArrayAllocation()) {
+ return true;
+ }
+
+ // We can create an array of anything; however, we need to be careful of
+ // alignment. In the case the alloca has a specific alignment requirement, we
+ // have to be sure it divides the type allocation size, otherwise only the
+ // first vector element would necessarily be correctly aligned.
+ auto *const dataTy = alloca->getAllocatedType();
+ const uint64_t memSize = Ctx.dataLayout()->getTypeAllocSize(dataTy);
+ const uint64_t align = alloca->getAlign().value();
+ return (align != 0 && (memSize % align) != 0);
+}
+} // namespace
+
+namespace vecz {
+bool needsInstantiation(const VectorizationContext &Ctx, Instruction &I) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ return analyzeCall(Ctx, CI);
+ } else if (LoadInst *Load = dyn_cast<LoadInst>(&I)) {
+ if (auto Op = MemOp::get(Load)) {
+ return analyzeMemOp(*Op);
+ }
+ // If it's not a MemOp, assume we don't need to instantiate.
+ return false;
+ } else if (StoreInst *Store = dyn_cast<StoreInst>(&I)) {
+ if (auto Op = MemOp::get(Store)) {
+ return analyzeMemOp(*Op);
+ }
+ // If it's not a MemOp, assume we don't need to instantiate.
+ return false;
+ } else if (AllocaInst *Alloca = dyn_cast<AllocaInst>(&I)) {
+ return analyzeAlloca(Ctx, Alloca);
+ } else if (isa(&I) || isa(&I)) {
+ return true;
+ } else {
+ return analyzeType(I.getType());
+ }
+}
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
new file mode 100644
index 0000000000000..6bdcf9c295412
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/liveness_analysis.cpp
@@ -0,0 +1,253 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Implementation based on Section 5.2 of the paper:
+// Florian Brandner, Benoit Boissinot, Alain Darte, Benoît Dupont de Dinechin,
+// Fabrice Rastello.
+// Computing Liveness Sets for SSA-Form Programs.
+// [Research Report] RR-7503, INRIA. 2011, pp.25.
inria-00558509v2
+//
+// https://hal.inria.fr/inria-00558509v2
+
+#include "analysis/liveness_analysis.h"
+
+#include <llvm/ADT/SmallPtrSet.h>
+#include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/CFG.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Instructions.h>
+
+#include "vectorization_unit.h"
+
+using namespace llvm;
+using namespace vecz;
+
+llvm::AnalysisKey LivenessAnalysis::Key;
+
+namespace {
+
+// Returns true if V defines a variable and is likely to require a register
+bool definesVariable(const Value &V) {
+ // Constants are likely to be immediate values
+ if (isa<Constant>(V)) {
+ return false;
+ }
+
+ // If a value isn't used, it can't be live
+ if (V.user_empty()) {
+ return false;
+ }
+
+ const auto valueType = V.getType();
+ return !valueType->isVoidTy() && !valueType->isLabelTy() &&
+ !valueType->isTokenTy() && !valueType->isMetadataTy();
+}
+
+// Tries to push a value onto the set, if it is not there already.
+// Returns true if the value was pushed, false otherwise.
+//
+// Note that since the implementation completely processes every instruction
+// sequentially, only the last element needs to be checked.
+inline bool pushOnce(BlockLivenessInfo::LiveSet &s, Value *V) {
+ if (!s.empty() && s.back() == V) {
+ return false;
+ }
+ s.push_back(V);
+ return true;
+}
+
+} // namespace
+
+class LivenessResult::Impl {
+public:
+ Impl(LivenessResult &lr) : LR(lr) {}
+
+ void recalculate();
+
+private:
+ LivenessResult &LR;
+
+ void computeByVar(const BasicBlock &BB);
+
+ void computeVar(Value *V, const BasicBlock *BB);
+
+ void mark(Value *V, const BasicBlock *parent, const BasicBlock *BB);
+
+ void calculateMaxRegistersInBlock(const llvm::BasicBlock *BB);
+
+ // private utility method for code conciseness
+ BlockLivenessInfo &info(const BasicBlock *BB) const {
+ auto BIi = LR.BlockInfos.find(BB);
+ assert(BIi != LR.BlockInfos.end() && "Block Liveness Info does not exist!");
+ return BIi->second;
+ }
+};
+
+LivenessResult LivenessAnalysis::run(llvm::Function &F,
+ llvm::FunctionAnalysisManager &) {
+ Result R(F);
+ R.recalculate();
+ return R;
+}
+
+size_t LivenessResult::getMaxLiveVirtualRegisters() const {
+ return maxNumberOfLiveValues;
+}
+
+const BlockLivenessInfo &
+LivenessResult::getBlockInfo(const BasicBlock *BB) const {
+ auto found = BlockInfos.find(BB);
+ assert(found != BlockInfos.end() && "No liveness information for BasicBlock");
+ return found->second;
+}
+
+void LivenessResult::recalculate() {
+ maxNumberOfLiveValues = 0;
+
+ BlockInfos.clear();
+
+ Impl impl(*this);
+ impl.recalculate();
+}
+
+void LivenessResult::Impl::recalculate() {
+ auto &F = LR.F;
+
+ // Create infos in advance so things don't relocate under our feet.
+ for (auto &BB : F) {
+ (void)LR.BlockInfos[&BB];
+ }
+
+ // Arguments are always live-ins of the entry block (if they are used).
+ {
+ auto *BB = &F.getEntryBlock();
+ auto &BI = info(BB);
+ for (auto &arg : F.args()) {
+ if (!arg.use_empty()) {
+ BI.LiveIn.push_back(&arg);
+ computeVar(&arg, BB);
+ }
+ }
+ }
+
+ // Add all other variables to the live sets.
+ for (auto &BB : F) {
+ auto &BI = LR.BlockInfos[&BB];
+ for (auto &I : BB) {
+ if (definesVariable(I)) {
+ if (isa<PHINode>(I)) {
+ // PHI nodes are always live-ins.
+ BI.LiveIn.push_back(&I);
+ }
+ computeVar(&I, &BB);
+ }
+ }
+ }
+
+ // Calculate the maximum number of live values in every block.
+ for (auto &BB : F) {
+ calculateMaxRegistersInBlock(&BB);
+ }
+
+ // Store the largest number of live values in the function.
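+ // (This maximum over all blocks is a simple proxy for the kernel's peak
+ // register pressure, used when choosing a SIMD width.)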
+ for (const auto &entry : LR.BlockInfos) {
+ LR.maxNumberOfLiveValues = std::max(LR.maxNumberOfLiveValues,
+ entry.getSecond().MaxRegistersInBlock);
+ }
+}
+
+void LivenessResult::Impl::computeVar(Value *V, const BasicBlock *BB) {
+ SmallPtrSet<const BasicBlock *, 16> UseBlocks;
+ for (auto *User : V->users()) {
+ if (auto *UI = dyn_cast<Instruction>(User)) {
+ if (auto *PHI = dyn_cast<PHINode>(UI)) {
+ for (unsigned i = 0, n = PHI->getNumIncomingValues(); i != n; ++i) {
+ if (PHI->getIncomingValue(i) == V) {
+ const auto *Incoming = PHI->getIncomingBlock(i);
+
+ if (pushOnce(info(Incoming).LiveOut, V) && Incoming != BB) {
+ UseBlocks.insert(Incoming);
+ }
+ }
+ }
+ } else {
+ const auto *Parent = UI->getParent();
+ if (Parent != BB) {
+ UseBlocks.insert(Parent);
+ }
+ }
+ }
+ }
+
+ for (auto *UB : UseBlocks) {
+ if (pushOnce(info(UB).LiveIn, V)) {
+ mark(V, BB, UB);
+ }
+ }
+}
+
+void LivenessResult::Impl::mark(Value *V, const BasicBlock *parent,
+ const BasicBlock *BB) {
+ // Propagate backward
+ for (const auto *pred : predecessors(BB)) {
+ auto &PBI = info(pred);
+ if (pushOnce(PBI.LiveOut, V) && pred != parent && pushOnce(PBI.LiveIn, V)) {
+ mark(V, parent, pred);
+ }
+ }
+}
+
+void LivenessResult::Impl::calculateMaxRegistersInBlock(const BasicBlock *BB) {
+ auto &BI = LR.BlockInfos[BB];
+ const SmallPtrSet<const Value *, 16> liveOut(BI.LiveOut.begin(),
+ BI.LiveOut.end());
+ SmallPtrSet<const Value *, 16> seenButNotInLiveOut;
+
+ auto maxRegistersUsed = liveOut.size();
+ auto registersUsed = liveOut.size();
+
+ // Walk backwards through instructions in a block to count the maximum number
+ // of live values in that block.
+ for (auto &inst : make_range(BB->rbegin(), BB->rend())) {
+ // Phi nodes were in live out or were counted as operands. No need to
+ // decrement the registerCount, as one of the arguments used a register.
+ if (isa<PHINode>(&inst)) {
+ break;
+ }
+
+ // Operands are live so they use a register. Increment registerCount if not
+ // in live out or already counted.
+ for (const auto *operand : inst.operand_values()) {
+ if (definesVariable(*operand) && !liveOut.contains(operand) &&
+ !seenButNotInLiveOut.contains(operand)) {
+ registersUsed++;
+ seenButNotInLiveOut.insert(operand);
+ }
+ }
+
+ // If inst defines a variable, one less register was used before it
+ if (definesVariable(inst)) {
+ registersUsed--;
+ }
+
+ maxRegistersUsed = std::max(registersUsed, maxRegistersUsed);
+ }
+
+ assert(registersUsed == BI.LiveIn.size() &&
+ "Final number of live values inconsistent with live-in");
+
+ BI.MaxRegistersInBlock = maxRegistersUsed;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
new file mode 100644
index 0000000000000..d5230a303e3c3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/packetization_analysis.cpp
@@ -0,0 +1,176 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "analysis/packetization_analysis.h" + +#include +#include +#include +#include +#include + +#include "analysis/stride_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "memory_operations.h" +#include "offset_info.h" +#include "vectorization_context.h" +#include "vectorization_unit.h" + +#define DEBUG_TYPE "vecz" + +using namespace vecz; +using namespace llvm; + +namespace { +bool isDivergenceReduction(const Function &F) { + compiler::utils::Lexer L(F.getName()); + return (L.Consume(VectorizationContext::InternalBuiltinPrefix) && + L.Consume("divergence_")); +} +} // namespace + +llvm::AnalysisKey PacketizationAnalysis::Key; + +PacketizationAnalysisResult::PacketizationAnalysisResult( + llvm::Function &f, StrideAnalysisResult &sar) + : F(f), SAR(sar), UVR(sar.UVR) { + // Vectorize branch conditions. + for (BasicBlock &BB : F) { + auto *TI = BB.getTerminator(); + if (UVR.isVarying(TI)) { + markForPacketization(TI); + } + } + + // Then vectorize other instructions, starting at leaves. + std::vector Leaves; + UVR.findVectorLeaves(Leaves); + + // Traverse the function from the leaves to find instructions that need to be + // packetized. + for (Instruction *I : Leaves) { + markForPacketization(I); + } +} + +void PacketizationAnalysisResult::markForPacketization(Value *V) { + if (!toPacketize.insert(V).second) { + return; + } + + auto *const I = dyn_cast(V); + if (!I) { + return; + } + + if (auto *phi = dyn_cast(I)) { + for (unsigned i = 0, n = phi->getNumIncomingValues(); i < n; ++i) { + auto *const incoming = phi->getIncomingValue(i); + if (UVR.isVarying(incoming)) { + markForPacketization(incoming); + } + } + return; + } + + auto mo = MemOp::get(I); + if (UVR.isMaskVarying(I)) { + if (mo) { + markForPacketization(mo->getMaskOperand()); + return; + } + + if (auto *const CI = dyn_cast(I)) { + Function *Callee = CI->getCalledFunction(); + if (Callee && UVR.Ctx.isInternalBuiltin(Callee) && + isDivergenceReduction(*Callee)) { + markForPacketization(CI->getOperand(0)); + return; + } + } + } + + if (mo) { + auto *const ptr = mo->getPointerOperand(); + if (ptr && UVR.isVarying(ptr)) { + const auto *info = SAR.getInfo(ptr); + assert(info && "markForPacketization: Unable to obtain stride info"); + + bool hasValidStride = info->hasStride(); + + // Analyse the computed stride to see if the pointer will need to be + // packetized. No packetization is necessary where a contiguous or + // interleaved memop can be created, since only the pointer to the + // first element will be used. + if (hasValidStride) { + // Get the pointer stride as a number of elements + auto *const eltTy = mo->getDataType(); + if (eltTy->isVectorTy() || eltTy->isPointerTy()) { + // No interleaved memops exist for vector element types or pointer + // types. We can only vectorize pointer loads/stores or widen vector + // load/stores if they are contiguous. 
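markForPacketization() is essentially a depth-first walk over varying operands, with the insert into toPacketize doubling as the visited check. A stripped-down model of that core, on a hypothetical value graph:

```cpp
#include <cassert>
#include <set>
#include <vector>

struct Val { std::vector<int> ops; bool varying; };

// Mark v, then recurse into any of its operands that are varying; the
// set-insert also prevents revisiting, as in the real implementation.
void markForPacketization(int v, const std::vector<Val> &g,
                          std::set<int> &toPacketize) {
  if (!toPacketize.insert(v).second) return;   // already marked
  for (int op : g[v].ops)
    if (g[op].varying) markForPacketization(op, g, toPacketize);
}

int main() {
  // 0: get_global_id (varying), 1: uniform base pointer,
  // 2: gep(1, 0) (varying), 3: load(2) (varying) -- 3 plays the leaf.
  std::vector<Val> g = {{{}, true}, {{}, false}, {{1, 0}, true}, {{2}, true}};
  std::set<int> toPacketize;
  markForPacketization(3, g, toPacketize);
  assert(toPacketize.count(0) && toPacketize.count(2));
  assert(!toPacketize.count(1));               // uniform operand untouched
  return 0;
}
```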
+ const auto stride = info->getConstantMemoryStride( + eltTy, &F.getParent()->getDataLayout()); + if (stride != 1) { + hasValidStride = false; + } + } else if (!VectorType::isValidElementType(eltTy)) { + hasValidStride = false; + } + } + + // Only mark the pointer for packetization if it does not have a + // valid linear stride + if (!hasValidStride) { + markForPacketization(ptr); + } + } + + auto *const data = mo->getDataOperand(); + auto *const mask = mo->getMaskOperand(); + if (data && UVR.isVarying(data)) { + markForPacketization(data); + } + if (mask && UVR.isVarying(mask)) { + markForPacketization(mask); + } + return; + } + + if (auto *const intrinsic = dyn_cast(I)) { + const auto intrinsicID = intrinsic->getIntrinsicID(); + if (intrinsicID == llvm::Intrinsic::lifetime_end || + intrinsicID == llvm::Intrinsic::lifetime_start) { + // We don't trace through lifetime intrinsics. + return; + } + } + + // Mark any varying operands for packetization.. + for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) { + auto *const opI = I->getOperand(i); + if (UVR.isVarying(opI)) { + markForPacketization(opI); + } + } +} + +PacketizationAnalysisResult +PacketizationAnalysis::run(Function &F, llvm::FunctionAnalysisManager &AM) { + auto &SAR = AM.getResult(F); + return Result(F, SAR); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp new file mode 100644 index 0000000000000..9354efd65bb12 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/simd_width_analysis.cpp @@ -0,0 +1,199 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "analysis/simd_width_analysis.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "analysis/liveness_analysis.h" +#include "analysis/packetization_analysis.h" +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "vectorization_unit.h" +#include "vecz/vecz_target_info.h" + +#define DEBUG_TYPE "vecz-simd-width" + +using namespace llvm; +using namespace vecz; + +llvm::AnalysisKey SimdWidthAnalysis::Key; + +namespace { +bool definedOrUsedInLoop(Value *V, Loop *L) { + if (!L) { + // We're not in a loop, so consider everything. + return true; + } + + const auto *const I = dyn_cast(V); + if (I && L->contains(I)) { + // It's defined in the current loop. + return true; + } + + // If it's used in the current loop, return true, unless it is a PHI node. + // Values defined outwith the loop, but used only by a PHI node within it must + // be loop-carried variable initial values. 
If these are not otherwise used + directly within the loop, then they are not really live inside the loop. + for (const auto *const U : V->users()) { + const auto *const I = dyn_cast<Instruction>(U); + if (I && !isa<PHINode>(I) && L->contains(I)) { + return true; + } + } + return false; +} +} // namespace + +// Avoid Spill implementation. It focuses on avoiding register spill by +// optimizing register pressure. +unsigned SimdWidthAnalysis::avoidSpillImpl(Function &F, + FunctionAnalysisManager &AM, + unsigned MinWidth) { + VectorizationUnit &VU = AM.getResult<VectorizationUnitAnalysis>(F).getVU(); + const TargetTransformInfo TTI = VU.context().getTargetTransformInfo(F); + const auto &Liveness = AM.getResult<LivenessAnalysis>(F); + const auto &PAR = AM.getResult<PacketizationAnalysis>(F); + const LoopInfo &LI = AM.getResult<LoopAnalysis>(F); + // Determine the SIMD width based on a live values register usage estimation. + assert(!VU.width().isScalable() && "Can't handle scalable-vectors"); + unsigned SimdWidth = VU.width().getFixedValue(); + assert(SimdWidth != 0 && "SimdWidthAnalysis: SimdWidth == 0"); + + SmallSet OpenIntervals; + SmallVector IntervalArray; + + auto ShouldConsider = [&](const Value *V) -> bool { + // Filter out work item builtin calls such as get_local_id() + if (auto *const CI = dyn_cast<CallInst>(V)) { + if (const Function *Callee = CI->getCalledFunction()) { + if (auto B = VU.context().builtins().analyzeBuiltin(*Callee)) { + if (B->properties == compiler::utils::eBuiltinPropertyWorkItem) { + return false; + } + } + } + } + return true; + }; + + LLVM_DEBUG(dbgs() << "VEC(REG): Calculating max register usage:\n"); + for (const auto &BB : F) { + // Get the LiveIns for this Basic Block. + // The principle of the Loop Aware SIMD Width Analysis is that it is not + // acceptable to spill values in the middle of a loop; however, it may be + // acceptable to spill some values before entering a loop. + const auto &BI = Liveness.getBlockInfo(&BB); + OpenIntervals.clear(); + auto *const CurLoop = LI.getLoopFor(&BB); + for (auto *V : BI.LiveOut) { + if (ShouldConsider(V) && PAR.needsPacketization(V) && + definedOrUsedInLoop(V, CurLoop)) { + OpenIntervals.insert(V); + } + } + + // Walk backwards through instructions in a block to count the maximum + // number of live values in that block. + for (auto &inst : make_range(BB.rbegin(), BB.rend())) { + if (isa<PHINode>(&inst)) { + break; + } + + // The first instruction in the reverse range will be the terminator, + // so we don't really need to consider it. However we do need to consider + // the live set at the point before the last (i.e. first) instruction, so + // we deal with the operands first and then process the live set.
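estimateSimdWidth() is a target hook and not part of this patch; purely to show the shape of the decision being made with OpenIntervals, here is a hypothetical stand-in that narrows the candidate width until the packetized live values fit a made-up register file:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for the target hook; not the real interface.
unsigned estimateSimdWidth(const std::vector<unsigned> &liveBits,
                           unsigned width, unsigned numRegs,
                           unsigned regBits) {
  while (width > 1) {
    std::uint64_t needed = 0;
    for (unsigned bits : liveBits)   // registers each value needs once widened
      needed += (std::uint64_t(bits) * width + regBits - 1) / regBits;
    if (needed <= numRegs) break;    // everything fits at this width
    width /= 2;                      // otherwise try a narrower vector
  }
  return width;
}

int main() {
  // Eight live 32-bit values against 32 x 128-bit vector registers:
  // width 32 would need 64 registers, width 16 needs exactly 32.
  assert(estimateSimdWidth(std::vector<unsigned>(8, 32), 32, 32, 128) == 16);
  return 0;
}
```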
+ if (PAR.needsPacketization(&inst)) { + const bool isGEP = isa(&inst); + for (auto operand : inst.operand_values()) { + if (isa(operand) || isa(operand)) { + if (!isGEP || PAR.needsPacketization(operand)) { + OpenIntervals.insert(operand); + } + } + } + } + + OpenIntervals.erase(&inst); + IntervalArray.assign(OpenIntervals.begin(), OpenIntervals.end()); + SimdWidth = VU.context().targetInfo().estimateSimdWidth( + TTI, IntervalArray, SimdWidth); + LLVM_DEBUG(dbgs() << "VEC(REG): Interval # " << OpenIntervals.size() + << " at SIMD Width " << SimdWidth << '\n'); + LLVM_DEBUG( + for (auto OII = OpenIntervals.begin(), OIIE = OpenIntervals.end(); + OII != OIIE; OII++) { dbgs() << "inst:" << **OII << '\n'; }); + + if (SimdWidth < MinWidth) { + return 0; + } + } + } + + LLVM_DEBUG(dbgs() << "VEC(REG): Found widest fitting SIMD width: " + << SimdWidth << '\n'); + return SimdWidth; +} + +SimdWidthAnalysis::Result +SimdWidthAnalysis::run(Function &F, llvm::FunctionAnalysisManager &AM) { + const TargetTransformInfo &TTI = AM.getResult(F); + const VectorizationUnit &VU = + AM.getResult(F).getVU(); + + // If the target does not provide vector registers, return 0. + MaxVecRegBitWidth = + TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector) + .getFixedValue(); + + if (MaxVecRegBitWidth == 0) { + return 0; + } + + // If the vectorization factor is for scalable vectors, return 0. + if (VU.width().isScalable()) { + return 0; + } + + auto SimdWidth = avoidSpillImpl(F, AM, 1); + if (SimdWidth != 0 && SimdWidth < 4) { + // We only return 0 (i.e. don't vectorize) in the case that the packetized + // values wouldn't fit into vector registers even with a factor of 1. If + // the packetized values fit into vector registers for any width, we use + // a baseline factor of 4 since this is empirically better than 2. + SimdWidth = 4; + } + + return SimdWidth; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp new file mode 100644 index 0000000000000..b98a149c97b12 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/stride_analysis.cpp @@ -0,0 +1,124 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "analysis/stride_analysis.h" + +#include +#include +#include + +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "memory_operations.h" +#include "offset_info.h" +#include "vectorization_context.h" +#include "vectorization_unit.h" + +#define DEBUG_TYPE "vecz" + +using namespace vecz; +using namespace llvm; + +llvm::AnalysisKey StrideAnalysis::Key; + +OffsetInfo &StrideAnalysisResult::analyze(Value *V) { + const auto find = analyzed.find(V); + if (find != analyzed.end()) { + return find->second; + } + + // We construct it on the stack first, and copy it into the map, because + // the constructor itself can create more things in the map and constructing + // it in-place could result in the storage being re-allocated while the + // constructor is still running. + const auto OI = OffsetInfo(*this, V); + return analyzed.try_emplace(V, OI).first->second; +} + +StrideAnalysisResult::StrideAnalysisResult(llvm::Function &f, + UniformValueResult &uvr, + AssumptionCache &AC) + : F(f), UVR(uvr), AC(AC) { + for (auto &BB : F) { + for (auto &I : BB) { + if (!UVR.isVarying(&I)) { + continue; + } + + if (auto mo = MemOp::get(&I)) { + auto *const ptr = mo->getPointerOperand(); + analyze(ptr); + } + } + } +} + +void StrideAnalysisResult::manifestAll(IRBuilder<> &B) { + const auto saved = B.GetInsertPoint(); + for (auto &info : analyzed) { + info.second.manifest(B, *this); + } + B.SetInsertPoint(saved->getParent(), saved); +} + +Value *StrideAnalysisResult::buildMemoryStride(IRBuilder<> &B, llvm::Value *Ptr, + llvm::Type *EleTy) const { + if (auto *const info = getInfo(Ptr)) { + return info->buildMemoryStride(B, EleTy, &F.getParent()->getDataLayout()); + } + return nullptr; +} + +StrideAnalysisResult StrideAnalysis::run(llvm::Function &F, + llvm::FunctionAnalysisManager &AM) { + auto &AC = AM.getResult(F); + auto &UVR = AM.getResult(F); + return Result(F, UVR, AC); +} + +PreservedAnalyses StrideAnalysisPrinterPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &SAR = AM.getResult(F); + OS << "StrideAnalysis for function '" << F.getName() << "':\n"; + + for (auto &BB : F) { + for (auto &I : BB) { + if (auto MO = MemOp::get(&I)) { + auto *const Ptr = MO->getPointerOperand(); + if (!Ptr) { + continue; + } + if (const OffsetInfo *Info = SAR.getInfo(Ptr)) { + OS << "* Stride for " << *Ptr << "\n - "; + if (Info->mayDiverge()) { + OS << "divergent"; + } else if (Info->hasStride()) { + OS << "linear"; + } else if (Info->isUniform()) { + OS << "uniform"; + } else { + OS << "unknown"; + } + if (Info->isStrideConstantInt()) { + OS << " stride of " << Info->getStrideAsConstantInt(); + } + OS << "\n"; + } + } + } + } + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp new file mode 100644 index 0000000000000..0d24a43a81921 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/uniform_value_analysis.cpp @@ -0,0 +1,563 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. 
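The comment in analyze() deserves emphasis: OffsetInfo's constructor can recursively call analyze() and so grow the `analyzed` map mid-construction, which with a relocating container (DenseMap here) would invalidate an in-place construction. A stand-alone illustration of the same construct-then-insert pattern, with a plain vector playing the relocating cache:

```cpp
#include <cassert>
#include <utility>
#include <vector>

struct Info { int depth = 0; };

// cache[v] is (computed?, Info). The recursive call can grow the cache, and a
// container that relocates on growth (DenseMap in LLVM, vector here) would
// leave a dangling slot if we wrote into it before recursing.
Info &analyze(int v, std::vector<std::pair<bool, Info>> &cache) {
  if (cache.size() <= unsigned(v)) cache.resize(v + 1);
  if (cache[v].first) return cache[v].second;
  Info local;                                   // build on the stack first
  if (v > 0) local.depth = analyze(v - 1, cache).depth + 1;
  cache[v] = {true, local};                     // then copy into the cache
  return cache[v].second;
}

int main() {
  std::vector<std::pair<bool, Info>> cache;
  assert(analyze(5, cache).depth == 5);
  return 0;
}
```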
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "analysis/uniform_value_analysis.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "analysis/instantiation_analysis.h" +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "memory_operations.h" +#include "vectorization_unit.h" + +#define DEBUG_TYPE "vecz" + +using namespace vecz; +using namespace llvm; + +namespace { + +// Find leaves by recursing through an instruction's uses +bool findStrayLeaves(UniformValueResult &UVR, Instruction &I, + DenseSet &Visited) { + for (const Use &U : I.uses()) { + auto *User = U.getUser(); + if (isa(User) || isa(User) || + isa(User)) { + if (UVR.isValueOrMaskVarying(User)) { + return true; + } + } else if (auto *CI = dyn_cast(User)) { + if (CI->use_empty()) { + // Any call instruction with no uses is counted as a leaf. This case + // should also cover any kind of masked stores, since masked stores are + // builtin calls with no uses, there is no need to explicitly check for + // masked stores. + if (UVR.isValueOrMaskVarying(CI)) { + return true; + } + } + } else if (auto *UI = dyn_cast(User)) { + if (isa(User)) { + // Don't trace through loads + } else if (Visited.insert(UI).second) { + if (findStrayLeaves(UVR, *UI, Visited)) { + return true; + } + } + } + } + return false; +} + +bool isDivergenceReduction(const Function &F) { + compiler::utils::Lexer L(F.getName()); + return (L.Consume(VectorizationContext::InternalBuiltinPrefix) && + L.Consume("divergence_")); +} + +bool isTrueUniformInternal(const Value *V, unsigned Depth) { + if (!V) { + return false; + } + + // Constants and Arguments that can't be undef/poison are truly uniform + if (isa(V) || isa(V)) { + return isGuaranteedNotToBePoison(V); + } + + constexpr unsigned DepthLimit = 6; + + if (Depth < DepthLimit) { + // For a specific subset of instructions, if all operands are truly + // uniform, then the instruction is too. + // FIXME: This is pessimistic. We could improve this by extending the list + // of instructions covered. We could also use flow-sensitive analysis in + // isGuaranteedNotToBePoison to enhance its capabilities. 
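isTrueUniformInternal() is a depth-limited structural recursion: provably non-poison literals are truly uniform, and a small whitelist of operations is truly uniform when every operand is. The same skeleton on a toy expression graph, where the whitelist is shrunk to a single Add kind for illustration:

```cpp
#include <cassert>
#include <vector>

enum Kind { Literal, Add, Load };              // Load: not on the whitelist
struct Node { Kind kind; std::vector<int> ops; };

bool trulyUniform(int v, const std::vector<Node> &g, unsigned depth = 0) {
  constexpr unsigned DepthLimit = 6;           // same guard as the real code
  const Node &n = g[v];
  if (n.kind == Literal) return true;
  if (depth >= DepthLimit || n.kind != Add) return false;
  for (int op : n.ops)
    if (!trulyUniform(op, g, depth + 1)) return false;
  return true;
}

int main() {
  std::vector<Node> g = {{Literal, {}}, {Literal, {}},
                         {Add, {0, 1}}, {Load, {}}, {Add, {2, 3}}};
  assert(trulyUniform(2, g));      // literal + literal
  assert(!trulyUniform(4, g));     // feeds on a load, which may vary
  return 0;
}
```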
+ if (const auto *I = dyn_cast(V)) { + if (isa(I) || isa(I) || isa(I) || + isa(I) || isa(I) || isa(I)) { + return isGuaranteedNotToBePoison(I) && + llvm::all_of(I->operands(), [Depth](Value *Op) { + return isTrueUniformInternal(Op, Depth + 1); + }); + } + } + } + + return false; +} + +} // namespace + +UniformValueResult::UniformValueResult(Function &F, VectorizationUnit &vu) + : F(F), VU(vu), Ctx(VU.context()), dimension(VU.dimension()) {} + +bool UniformValueResult::isVarying(const Value *V) const { + auto found = varying.find(V); + if (found == varying.end()) { + return false; + } + return found->second == VaryingKind::eValueVarying; +} + +bool UniformValueResult::isMaskVarying(const Value *V) const { + auto found = varying.find(V); + if (found == varying.end()) { + return false; + } + return found->second == VaryingKind::eMaskVarying; +} + +bool UniformValueResult::isValueOrMaskVarying(const Value *V) const { + auto found = varying.find(V); + if (found == varying.end()) { + return false; + } + return found->second != VaryingKind::eValueTrueUniform && + found->second != VaryingKind::eValueActiveUniform; +} + +bool UniformValueResult::isTrueUniform(const Value *V) { + auto found = varying.find(V); + if (found != varying.end()) { + return found->second == VaryingKind::eValueTrueUniform; + } + if (!isTrueUniformInternal(V, /*Depth=*/0)) { + return false; + } + // Cache this result to help speed up future queries + varying[V] = VaryingKind::eValueTrueUniform; + return true; +} + +/// @brief Utility function to check whether an instruction is a call to a +/// reduction or broadcast operaton. +/// +/// @param[in] I Instruction to check +/// @param[in] BI BuiltinInfo for platform-specific builtin IDs +/// @return true if the instruction is a call to a reduction or broadcast +/// builtin. +static bool +isGroupBroadcastOrReduction(const Instruction &I, + const compiler::utils::BuiltinInfo &BI) { + if (!isa(&I)) { + return false; + } + auto *const CI = cast(&I); + auto *const Callee = CI->getCalledFunction(); + if (!Callee) { + return false; + } + auto B = BI.analyzeBuiltin(*Callee); + if (!B) { + return false; + } + auto Info = BI.isMuxGroupCollective(B->ID); + return Info && (Info->isSubGroupScope() || Info->isWorkGroupScope()) && + (Info->isAnyAll() || Info->isReduction() || Info->isBroadcast()); +} + +void UniformValueResult::findVectorLeaves( + std::vector &Leaves) const { + const compiler::utils::BuiltinInfo &BI = Ctx.builtins(); + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + // Reductions and broadcasts are always vector leaves regardless of + // uniformity. + if (isGroupBroadcastOrReduction(I, BI)) { + Leaves.push_back(&I); + continue; + } + + if (!isVarying(&I)) { + if (isMaskVarying(&I)) { + // it's a leaf if only its mask operand is varying, since the value + // itself will be uniform and won't propagate "varying" to its users. 
+ Leaves.push_back(&I); + continue; + } + if (CallInst *CI = dyn_cast<CallInst>(&I)) { + Function *Callee = CI->getCalledFunction(); + if (!Callee) { + continue; + } + + // If it's a call to a user-defined function whose use list is empty, and + // which is uniform, then add it to the leaves + if (!Callee->isIntrinsic() && CI->use_empty()) { + // Try to identify the called function + const auto Builtin = BI.analyzeBuiltin(*Callee); + if (!Builtin) { + Leaves.push_back(CI); + } + } + } + continue; + } + + if (StoreInst *Store = dyn_cast<StoreInst>(&I)) { + Instruction *Ptr = dyn_cast<Instruction>(Store->getPointerOperand()); + if (Ptr && isVarying(Ptr)) { + Leaves.push_back(Store); + } + continue; + } + + if (ReturnInst *Ret = dyn_cast<ReturnInst>(&I)) { + Leaves.push_back(Ret); + continue; + } + + if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&I)) { + Leaves.push_back(RMW); + continue; + } else if (AtomicCmpXchgInst *CmpXchg = dyn_cast<AtomicCmpXchgInst>(&I)) { + Leaves.push_back(CmpXchg); + continue; + } + + // Functions that have no uses are leaves. + if (CallInst *CI = dyn_cast<CallInst>(&I)) { + bool IsCallLeaf = false; + if (CI->use_empty()) { + IsCallLeaf = true; + } else if (auto Op = MemOp::get(CI)) { + // Handle masked stores. + if (Op->isStore() && + (Op->isMaskedMemOp() || Op->isMaskedInterleavedMemOp() || + Op->isMaskedScatterGatherMemOp())) { + IsCallLeaf = true; + } + } else if (Ctx.isMaskedAtomicFunction(*CI->getCalledFunction())) { + IsCallLeaf = true; + } + if (IsCallLeaf) { + Leaves.push_back(CI); + continue; + } + } + } + } +} + +void UniformValueResult::findVectorRoots(std::vector<Value *> &Roots) const { + const compiler::utils::BuiltinInfo &BI = Ctx.builtins(); + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + CallInst *CI = dyn_cast<CallInst>(&I); + if (!CI || !CI->getCalledFunction()) { + continue; + } + const auto Builtin = BI.analyzeBuiltinCall(*CI, dimension); + if (!Builtin) { + continue; + } + const auto Uniformity = Builtin->uniformity; + if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID || + Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) { + // Calls to `get_global_id`/`get_local_id` are roots. + Roots.push_back(CI); + } else if ((Uniformity == compiler::utils::eBuiltinUniformityNever) && + !CI->getType()->isVoidTy()) { + // Non-void builtins with side-effects are also roots. + Roots.push_back(CI); + } + } + } + + // Add vectorized arguments to the list of roots. + for (const VectorizerTargetArgument &TargetArg : VU.arguments()) { + if (!TargetArg.IsVectorized && !TargetArg.PointerRetPointeeTy) { + continue; + } + + if (&F == VU.scalarFunction()) { + Roots.push_back(TargetArg.OldArg); + } else if (&F == VU.vectorizedFunction()) { + if (TargetArg.Placeholder) { + Roots.push_back(TargetArg.Placeholder); + } else { + Roots.push_back(TargetArg.NewArg); + } + } + } +} + +AllocaInst *UniformValueResult::findAllocaFromPointer(Value *Pointer) { + while (Pointer) { + if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Pointer)) { + return Alloca; + } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Pointer)) { + Pointer = GEP->getPointerOperand(); + } else if (BitCastInst *BC = dyn_cast<BitCastInst>(Pointer)) { + Pointer = BC->getOperand(0); + } else if (LoadInst *Load = dyn_cast<LoadInst>(Pointer)) { + Pointer = Load->getPointerOperand(); + } else { + return nullptr; + } + } + + return nullptr; +} + +void UniformValueResult::markVaryingValues(Value *V, Value *From) { + auto &vary = varying[V]; + // Do not visit values twice. + if (vary == VaryingKind::eValueVarying) { + return; + } + + if (CallInst *CI = dyn_cast<CallInst>(V)) { + // Some builtins produce a uniform value regardless of their inputs.
+ Function *Callee = CI->getCalledFunction(); + if (Callee) { + const compiler::utils::BuiltinInfo &BI = Ctx.builtins(); + if (const auto Builtin = BI.analyzeBuiltinCall(*CI, dimension)) { + const auto Uniformity = Builtin->uniformity; + if (Uniformity == compiler::utils::eBuiltinUniformityAlways) { + return; + } + } + if (auto Op = MemOp::get(CI)) { + // The mask cannot affect the MemOp value, even though we may still + // need to packetize the mask.. + auto *Mask = Op->getMaskOperand(); + if (Mask && From == Mask) { + vary = VaryingKind::eMaskVarying; + return; + } + } else if (Ctx.isInternalBuiltin(Callee)) { + // A divergence reduction builtin's value is uniform even though its + // argument is not, since it is a reduction over the SIMD width. + if (isDivergenceReduction(*Callee)) { + vary = VaryingKind::eMaskVarying; + return; + } + } + } + } + + // Mark V as being varying. + vary = VaryingKind::eValueVarying; + LLVM_DEBUG(dbgs() << "vecz: Needs packetization: " << *V << "\n"); + + // Visit all users of V, they are varying too. + for (const Use &Use : V->uses()) { + User *User = Use.getUser(); + markVaryingValues(User, V); + } + + // Mark uses of V for certain kinds of values. + Instruction *VIns = dyn_cast(V); + if (!VIns) { + return; + } + + if (StoreInst *Store = dyn_cast(VIns)) { + // Find the base address for the store. Storing varying values to an + // alloca location requires the alloca to be vectorized. + // We don't want to use extractMemOffset here because this requires the + // uniform value analysis to be finished. + AllocaInst *Alloca = findAllocaFromPointer(Store->getPointerOperand()); + if (Alloca) { + markVaryingValues(Alloca); + } + } else if (LoadInst *Load = dyn_cast(VIns)) { + AllocaInst *Alloca = findAllocaFromPointer(Load->getPointerOperand()); + if (Alloca) { + markVaryingValues(Alloca); + } + } else if (GetElementPtrInst *GEP = dyn_cast(VIns)) { + // We need to clear the flags because the initial address may be out of + // bounds but masked out. + GEP->setNoWrapFlags(GEPNoWrapFlags::none()); + + // Same as with the stores + AllocaInst *Alloca = findAllocaFromPointer(GEP->getPointerOperand()); + if (Alloca) { + markVaryingValues(Alloca); + } + } else if (BitCastInst *BC = dyn_cast(VIns)) { + // Same as with the stores + AllocaInst *Alloca = findAllocaFromPointer(BC->getOperand(0)); + if (Alloca) { + markVaryingValues(Alloca); + } + } else if (CallInst *CI = dyn_cast(VIns)) { + // Stores might be function calls as well + // Known MemOps have one known pointer operand which we can check. + if (auto Op = MemOp::get(CI)) { + if (auto *const Ptr = Op->getPointerOperand()) { + if (auto *Alloca = findAllocaFromPointer(Ptr)) { + markVaryingValues(Alloca); + } + } + } else { + // Check all parameters of unknown calls with pointer arguments. + for (auto &A : CI->args()) { + if (A->getType()->isPointerTy()) { + if (auto *Alloca = findAllocaFromPointer(A)) { + markVaryingValues(Alloca); + } + } + } + } + } +} + +Value *UniformValueResult::extractMemBase(Value *Address) { + if (BitCastInst *BCast = dyn_cast(Address)) { + return extractMemBase(BCast->getOperand(0)); + } else if (auto *ASCast = dyn_cast(Address)) { + return extractMemBase(ASCast->getOperand(0)); + } else if (isa(Address)) { + return Address; + } else if (isa(Address)) { + return Address; + } else if (isa(Address)) { + return Address; + } else if (isa(Address)) { + return Address; + } else if (auto *const Phi = dyn_cast(Address)) { + // If all the incoming values are the same, we can trace through it. 
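markVaryingValues() combines two propagations: a flood forwards through users, plus an escalation from varying memory accesses to the underlying alloca, which then re-floods the alloca's own users. A compressed model of that interplay on an invented node graph:

```cpp
#include <cassert>
#include <set>
#include <vector>

enum Op { Alloca, Gep, Store, Load, Id };
struct Node { Op op; int ptr = -1; std::vector<int> users; };

// Chase pointer operands back to an alloca, like findAllocaFromPointer().
int allocaOf(int v, const std::vector<Node> &g) {
  while (v != -1 && g[v].op != Alloca) v = g[v].ptr;
  return v;
}

void markVarying(int v, const std::vector<Node> &g, std::set<int> &varying) {
  if (!varying.insert(v).second) return;       // do not visit values twice
  for (int u : g[v].users) markVarying(u, g, varying);
  if (g[v].op == Store || g[v].op == Load) {   // escalate to the alloca
    const int a = allocaOf(g[v].ptr, g);
    if (a != -1) markVarying(a, g, varying);
  }
}

int main() {
  // 0: alloca, 1: gep(0), 2: store id -> gep, 3: load(0), 4: get_global_id.
  std::vector<Node> g(5);
  g[0] = {Alloca, -1, {1, 3}};
  g[1] = {Gep, 0, {2}};
  g[2] = {Store, 1, {}};
  g[3] = {Load, 0, {}};
  g[4] = {Id, -1, {2}};
  std::set<int> varying;
  markVarying(4, g, varying);                  // root: the work-item id
  assert(varying.count(0) && varying.count(3));  // alloca and its load too
  return 0;
}
```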
In + // the general case, it's not trivial to check that the stride is the same + // from every incoming block, and since incoming values may not dominate + // the IRBuilder insert point, we might not even be able to build the + // offset expression instructions there. + if (auto *const CVal = Phi->hasConstantValue()) { + return extractMemBase(CVal); + } + + // In the simple case of a loop-incremented pointer using a GEP, we can + // handle it thus: + auto NumIncoming = Phi->getNumIncomingValues(); + if (NumIncoming != 2) { + // Perhaps we can handle more than one loop latch, but not yet. + return nullptr; + } + + if (auto *const GEP = + dyn_cast(Phi->getIncomingValue(1))) { + // If it's a simple loop iterator, the base can be analyzed from the + // initial value. + if (GEP->getPointerOperand() == Phi) { + for (const auto &index : GEP->indices()) { + if (isVarying(index.get())) { + return nullptr; + } + } + return extractMemBase(Phi->getIncomingValue(0)); + } + } + + return nullptr; + } else if (auto *GEP = dyn_cast(Address)) { + // Try to recursively extract the base from the GEP base. + return extractMemBase(GEP->getPointerOperand()); + } else if (isVarying(Address)) { + // If it's varying we can't analyze it any further. + return nullptr; + } else { + // If it's uniform we can just return the uniform address. + return Address; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +llvm::AnalysisKey UniformValueAnalysis::Key; + +UniformValueResult +UniformValueAnalysis::run(llvm::Function &F, + llvm::FunctionAnalysisManager &AM) { + VectorizationUnit &VU = AM.getResult(F).getVU(); + UniformValueResult Res(F, VU); + std::vector Roots; + Res.findVectorRoots(Roots); + + // Mark all roots and their uses as being varying. + for (Value *Root : Roots) { + Res.markVaryingValues(Root); + } + + const compiler::utils::BuiltinInfo &BI = Res.Ctx.builtins(); + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + // Find atomic instructions, these are always varying + if (I.isAtomic()) { + Res.markVaryingValues(&I); + continue; + } + + // The same goes for the atomic builtins as well + if (CallInst *CI = dyn_cast(&I)) { + if (Function *Callee = CI->getCalledFunction()) { + const auto Builtin = BI.analyzeBuiltin(*Callee); + if (Builtin && + Builtin->properties & compiler::utils::eBuiltinPropertyAtomic) { + Res.markVaryingValues(&I); + continue; + } + } + } + } + } + + // If an alloca has been initialized with a uniform value, findVectorLeaves() + // will not pick up the store instruction as a leaf, even when that alloca is + // used by some other leaves. We have to go through all the allocas and mark + // them as varying if any varying instructions use them. This is the case + // also for masked stores where only the mask is varying. + bool Changed = true; + while (Changed) { + DenseSet Visited; + Changed = false; + bool Remaining = false; + for (Instruction &I : F.front()) { + if (isa(&I)) { + if (!Res.isVarying(&I)) { + if (findStrayLeaves(Res, I, Visited)) { + // We found a varying leaf, so this Alloca is non-uniform. + Res.markVaryingValues(&I); + + // Marking an alloca as varying could mark a leaf as varying that + // may also depend on a different alloca, so we have to go again. 
+ Changed = true; + } else { + Remaining = true; + } + } + } else { + break; + } + } + Changed &= Remaining; + } + + return Res; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp new file mode 100644 index 0000000000000..edf0101ba883a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorizable_function_analysis.cpp @@ -0,0 +1,133 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "analysis/vectorizable_function_analysis.h" + +#include +#include +#include +#include + +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "vectorization_context.h" + +#define DEBUG_TYPE "vecz-function-analysis" + +using namespace vecz; +using namespace llvm; + +llvm::AnalysisKey VectorizableFunctionAnalysis::Key; + +/// @brief Tell Vecz to go ahead and handle calls to declaration-only functions +/// +/// This flag is for testing and debugging purposes and it should not be used +/// for normal code as instantiating undefined functions is not always valid. +static cl::opt HandleDeclOnlyCalls( + "vecz-handle-declaration-only-calls", + cl::desc("Go ahead and handle calls to declaration-only functions")); + +namespace { + +/// @brief Determine whether the instruction can be vectorized or not. +/// +/// @param[in] I Instruction to check for vectorizability. +/// @param[in] Ctx VectorizationContext for BuiltinInfo. +/// +/// @return true if I can be vectorized, false otherwise. +bool canVectorize(const Instruction &I, const VectorizationContext &Ctx) { + // Certain instructions just cannot appear. + switch (I.getOpcode()) { + default: + break; + case Instruction::IndirectBr: + case Instruction::VAArg: + case Instruction::Invoke: + case Instruction::Resume: + case Instruction::LandingPad: + return false; + } + + // User function calls. + if (const CallInst *CI = dyn_cast(&I)) { + if (const Function *Callee = CI->getCalledFunction()) { + // We are going to assume that we can handle LLVM intrinsics for now and + // let the later passes deal with them + if (Callee->isIntrinsic()) { + return true; + } + + // All builtins should be vectorizable, in principle. "Invalid builtins" + // correspond to user functions. + const compiler::utils::BuiltinInfo &BI = Ctx.builtins(); + const auto Builtin = BI.analyzeBuiltin(*Callee); + if (!Builtin) { + // If it is a user function missing a definition, we cannot safely + // instantiate it. For example, what if it contains calls to + // get_global_id internally? + if (Callee->isDeclaration()) { + return HandleDeclOnlyCalls; + } + // The same goes for functions we cannot inline, at least until we have + // a way of determining if a function can be safely instantiated or not. 
+ if (Callee->hasFnAttribute(Attribute::NoInline)) { + return false; + } + } + } + } + + return true; +} + +/// @brief Determine whether the function can be vectorized or not. +/// +/// @param[in] F Function to check for vectorizability. +/// @param[in] Ctx VectorizationContext for BuiltinInfo. +/// +/// @return true if F can be vectorized, false otherwise. +bool canVectorize(const Function &F, const VectorizationContext &Ctx) { + // Do not vectorize functions with the OptNone attribute. Also do not + // vectorize functions with the NoInline attribute, since conceptually, the + // vectorized kernel calls the original kernel in a loop, and then that gets + // inlined and optimized. + if (F.hasFnAttribute(Attribute::OptimizeNone) || + F.hasFnAttribute(Attribute::NoInline)) { + return false; + } + + // Look for things that are not (yet?) supported. + for (const BasicBlock &BB : F) { + for (const Instruction &I : BB) { + if (!canVectorize(I, Ctx)) { + return false; + } + } + } + return true; +} + +} // namespace + +VectorizableFunctionAnalysis::Result +VectorizableFunctionAnalysis::run(llvm::Function &F, + llvm::FunctionAnalysisManager &AM) { + Result res; + auto &Ctx = AM.getResult(F).getContext(); + + res.canVectorize = canVectorize(F, Ctx); + return res; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp new file mode 100644 index 0000000000000..484da6c6c8eae --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/analysis/vectorization_unit_analysis.cpp @@ -0,0 +1,40 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
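The two canVectorize() overloads reduce to a short opcode blacklist plus function-level attribute checks. Schematically, with toy enums standing in for LLVM opcodes and attributes:

```cpp
#include <cassert>
#include <vector>

enum Opcode { AddOp, LoadOp, IndirectBrOp, VAArgOp, InvokeOp };
struct Fn { bool optNone = false, noInline = false;
            std::vector<Opcode> body; };

// Per-instruction screen: certain opcodes simply cannot appear.
bool canVectorize(Opcode op) {
  switch (op) {
    case IndirectBrOp: case VAArgOp: case InvokeOp: return false;
    default: return true;
  }
}

// Per-function screen: attribute checks first, then scan the body.
bool canVectorize(const Fn &f) {
  if (f.optNone || f.noInline) return false;
  for (Opcode op : f.body)
    if (!canVectorize(op)) return false;
  return true;
}

int main() {
  assert(canVectorize(Fn{false, false, {AddOp, LoadOp}}));
  assert(!canVectorize(Fn{false, false, {AddOp, InvokeOp}}));
  assert(!canVectorize(Fn{true, false, {AddOp}}));
  return 0;
}
```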
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "analysis/vectorization_unit_analysis.h" + +#define DEBUG_TYPE "vecz-unit-analysis" + +using namespace vecz; + +llvm::AnalysisKey VectorizationUnitAnalysis::Key; + +VectorizationUnitAnalysis::Result +VectorizationUnitAnalysis::run(llvm::Function &F, + llvm::FunctionAnalysisManager &) { + return Result{Ctx.getActiveVU(&F)}; +} + +#undef DEBUG_TYPE +#define DEBUG_TYPE "vecz-context-analysis" + +llvm::AnalysisKey VectorizationContextAnalysis::Key; + +VectorizationContextAnalysis::Result +VectorizationContextAnalysis::run(llvm::Function &, + llvm::FunctionAnalysisManager &) { + return Result{Context}; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp new file mode 100644 index 0000000000000..b6099bad61731 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_boscc.cpp @@ -0,0 +1,1401 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "control_flow_boscc.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "analysis/divergence_analysis.h" +#include "analysis/liveness_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "ir_cleanup.h" +#include "llvm_helpers.h" +#include "reachability.h" +#include "vectorization_context.h" +#include "vectorization_unit.h" +#include "vecz/vecz_choices.h" + +#define DEBUG_TYPE "vecz-cf" + +using namespace llvm; +using namespace vecz; + +namespace { +using RPOT = ReversePostOrderTraversal<Function *>; + +bool isUsedOutsideDefinitionBlock(Value *V) { + if (Instruction *I = dyn_cast<Instruction>(V)) { + return std::any_of(I->user_begin(), I->user_end(), [&I](User *U) { + return cast<Instruction>(U)->getParent() != I->getParent(); + }); + } + return false; +} + +/// @brief Check whether a block is "trivial" according to a heuristic +/// @param[in] BB the Basic Block to check +/// @return true if the block is trivial +bool isTrivialBlock(const BasicBlock &BB) { + if (BB.size() > 3) { + return false; + } + + for (const auto &I : BB) { + if (I.mayReadOrWriteMemory() || I.mayHaveSideEffects() || + isa(&I)) { + return false; + } + } + return true; +} + +} // namespace + +/// @brief Check whether a uniform region is viable and worth keeping. +/// @param[in] region the region to check +/// @param[in] noDuplicateBlocks blocks the region is not allowed to contain +/// @return false iff the region should be discarded. + +bool ControlFlowConversionState::BOSCCGadget::duplicateUniformRegions() { + LLVM_DEBUG(dbgs() << "DUPLICATE UNIFORM REGIONS\n"); + + // Keep track of blocks that contain NoDuplicate calls.
+ DenseSet noDuplicateBlocks; + SmallPtrSet noDuplicateLoops; + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + if (CallInst *CI = dyn_cast(&I)) { + if (CI->hasFnAttr(Attribute::NoDuplicate)) { + noDuplicateBlocks.insert(&BB); + auto *const loop = DR->getTag(&BB).loop; + if (loop) { + noDuplicateLoops.insert(loop->loop); + } + break; + } + } + } + } + + // First, create the regions. + VECZ_FAIL_IF(!createUniformRegions(noDuplicateBlocks)); + + // Keep track of blocks that belong to loops. If a whole loop is duplicated, + // then a new loop object should be created for the uniform version. + SmallVector duplicatedLoops; + SmallPtrSet duplicatedLoopSet; + + const size_t size = + std::accumulate(uniformRegions.begin(), uniformRegions.end(), 0, + [](size_t base, const UniformRegion ®ion) { + return base + region.predicatedBlocks.size(); + }); + std::vector newBlocks; + newBlocks.reserve(size); + + // Conserve the original edges of the CFG. + for (BasicBlock &BB : F) { + for (BasicBlock *succ : successors(&BB)) { + uniformEdges[&BB].push_back(succ); + } + } + + // Then duplicate them. + for (auto ®ion : uniformRegions) { + BasicBlock *entry = region.entryBlock; + + std::vector sortedNewRegionBlocks; + sortedNewRegionBlocks.reserve(region.predicatedBlocks.size()); + + // Process the region's predicated blocks in DCBI order. + // Gather the block indices, then sort them. + std::vector predicatedBlockIndices; + predicatedBlockIndices.reserve(region.predicatedBlocks.size()); + for (auto *const B : region.predicatedBlocks) { + predicatedBlockIndices.push_back(DR->getTagIndex(B)); + } + std::sort(predicatedBlockIndices.begin(), predicatedBlockIndices.end()); + + for (const auto index : predicatedBlockIndices) { + const auto &BTag = DR->getBlockTag(index); + auto *const B = BTag.BB; + auto *const LTag = BTag.loop; + + // If the block is the BOSCC entry block, we don't want to duplicate it + // unless it is part of a loop. + if (B == entry && !LTag) { + continue; + } + + BasicBlock *newB = nullptr; + // If we have already cloned 'B', then we can reuse the cloned version. + if (VMap.count(B)) { + continue; + } + + newB = CloneBasicBlock(B, VMap, ".uniform", &F); + VMap.insert({B, newB}); + region.uniformBlocks.insert(newB); + newBlocks.push_back(newB); + sortedNewRegionBlocks.push_back(newB); + + // The new blocks will remain uniform + BasicBlockTag &newBTag = DR->getOrCreateTag(newB); + DR->setFlag(*newB, eBlockIsUniform); + + if (LTag) { + auto *const loop = LTag->loop; + if (LTag->header == B) { + duplicatedLoopSet.insert(loop); + duplicatedLoops.push_back(loop); + } + + if (!duplicatedLoopSet.contains(loop)) { + newBTag.loop = LTag; + loop->addBasicBlockToLoop(newB, *LI); + } + } + } + + // Splice the newly inserted blocks into the function right before the + // first div_causing block. + if (!sortedNewRegionBlocks.empty() && + entry->getNextNode() != sortedNewRegionBlocks[0]) { + F.splice(entry->getNextNode()->getIterator(), &F, + sortedNewRegionBlocks[0]->getIterator(), F.end()); + } + } + + // Since we added all loops by their headers in DCBI order, inner loops will + // always follow outer loops, so there is no need to sort them. + for (Loop *L : duplicatedLoops) { + if (!LMap.contains(L) && !noDuplicateLoops.contains(L)) { + VECZ_FAIL_IF(!duplicateUniformLoops(L)); + } + } + + // Fix the duplicated instructions arguments. 
+ for (BasicBlock *B : newBlocks) { + const bool notHeader = !DR->getTag(B).isLoopHeader(); + + for (Instruction &I : *B) { + RemapInstruction(&I, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + + // Update the phi nodes if a uniform block has any incoming blocks* + // that are not div causing. In that case, the predicated incoming blocks + // will never be rewired to the uniform block so we can remove the + // incoming block from the phi node, unless 'B' is a loop header, in which + // case its predicated preheader (if any) will be rewired to it while we + // connect the regions). + // + // *NOTE a non-div-causing incoming block may or may not be a predicated + // block. A By All block with a non-varying branch can still branch into + // a BOSCC region, which would seem to break the SESE criteria. + if (notHeader) { + if (PHINode *PHI = dyn_cast(&I)) { + for (unsigned i = 0; i < PHI->getNumIncomingValues(); ++i) { + BasicBlock *PHIB = PHI->getIncomingBlock(i); + if (!DR->isUniform(*PHIB) && + !DR->hasFlag(*PHIB, + BlockDivergenceFlag::eBlockHasDivergentBranch)) { + PHI->removeIncomingValue(i--); + } + } + } + } + } + } + + return true; +} + +bool ControlFlowConversionState::BOSCCGadget::duplicateUniformLoops(Loop *L) { + const LoopTag <ag = DR->getTag(L); + Loop *const uniformL = LI->AllocateLoop(); + + // Either add 'uniformL' as a child of a loop or as a top level loop. + // If it is a child loop, either add it as a child of a uniform loop if it + // exists, otherwise as a child of a predicated loop. + if (Loop *parentL = L->getParentLoop()) { + auto it = LMap.find(parentL); + if (it != LMap.end()) { + it->second->addChildLoop(uniformL); + } else { + parentL->addChildLoop(uniformL); + } + } else { + LI->addTopLevelLoop(uniformL); + } + + LMap.insert({L, uniformL}); + + LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " has been duplicated\n"); + + // Fill the loop tag. + LoopTag *uniformLTag = &DR->getOrCreateTag(uniformL); + + // The preheader of the loop may not have been duplicated. + BasicBlock *preheader = LTag.preheader; + if (BasicBlock *uniformPreheader = getBlock(preheader)) { + preheader = uniformPreheader; + } + uniformLTag->preheader = preheader; + uniformLTag->header = getBlock(LTag.header); + uniformLTag->latch = getBlock(LTag.latch); + + LLVM_DEBUG(dbgs() << "\tPreheader: " << uniformLTag->preheader->getName() + << "\n"); + LLVM_DEBUG(dbgs() << "\tHeader: " << uniformLTag->header->getName() << "\n"); + LLVM_DEBUG(dbgs() << "\tLatch: " << uniformLTag->latch->getName() << "\n"); + + // Add all blocks to the uniform version. 
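The duplication above uses the usual LLVM clone-then-remap idiom: CloneBasicBlock() yields clones whose operands still reference the originals, and only once VMap is complete does RemapInstruction() rewire intra-region references, leaving unmapped ones (RF_IgnoreMissingLocals) alone. The same two-phase idea on a toy CFG:

```cpp
#include <cassert>
#include <map>
#include <vector>

struct Block { std::vector<int> succs; };

int main() {
  std::vector<Block> cfg = {{{1}}, {{2}}, {{}}};   // 0 -> 1 -> 2
  std::map<int, int> vmap;                         // old block -> clone

  // Phase 1: clone blocks 1 and 2 (the "predicated" region); the clones'
  // successor edges still point at the originals for now.
  for (int b : {1, 2}) {
    vmap[b] = int(cfg.size());
    cfg.push_back(cfg[b]);
  }

  // Phase 2: remap. References with no mapping (block 0 here) are left
  // untouched, analogous to RF_IgnoreMissingLocals.
  for (const auto &kv : vmap)
    for (int &s : cfg[kv.second].succs) {
      auto it = vmap.find(s);
      if (it != vmap.end()) s = it->second;
    }

  assert(cfg[vmap[1]].succs[0] == vmap[2]);        // clone edge 1' -> 2'
  assert(cfg[1].succs[0] == 2);                    // original untouched
  return 0;
}
```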
+ for (BasicBlock *blockL : L->blocks()) { + if (DR->getTag(blockL).loop->loop == L) { + BasicBlockTag &uniformBlockLTag = DR->getTag(getBlock(blockL)); + uniformL->addBasicBlockToLoop(uniformBlockLTag.BB, *LI); + uniformBlockLTag.loop = uniformLTag; + } + } + + return true; +} + +bool ControlFlowConversionState::BOSCCGadget::createUniformRegions( + const DenseSet<BasicBlock *> &noDuplicateBlocks) { + auto discardRegion = + [&noDuplicateBlocks](const UniformRegion &region) -> bool { + // To determine whether it is worth duplicating the uniform region, we must + // take several elements into account: + // - The length of the duplicated code + // - branch probability + // size_t cost = + // std::accumulate(Region->predicatedBlocks.begin(), + // Region->predicatedBlocks.end(), 0, + // [](int x, BasicBlock *B) { return x + + // B->size(); }); + // PercentageOfAllTrue = + // runTimeValuesOfVectorPredicateAllTrue / + // runTimeValuesOfVectorPredicate; + // + // It may not be worth duplicating the whole uniform region but still worth + // duplicating some of the divergent branches in it. + + if (region.predicatedBlocks.empty() /*|| cost > max*/) { + return true; + } + + // If the region we want to duplicate contains NoDuplicate + // function calls, then we cannot duplicate it. + if (std::any_of(region.predicatedBlocks.begin(), + region.predicatedBlocks.end(), + [&noDuplicateBlocks](BasicBlock *B) { + return noDuplicateBlocks.count(B); + })) { + LLVM_DEBUG(dbgs() << "Region of " << region.entryBlock->getName() + << " cannot be duplicated because of " + "NoDuplicate instructions\n"); + return true; + } + + // It's not worth BOSCCing if all the blocks are trivial + if (std::all_of(region.predicatedBlocks.begin(), + region.predicatedBlocks.end(), + [](BasicBlock *B) { return isTrivialBlock(*B); })) { + return true; + } + + return false; + }; + + // We wish to identify Single-Entry, Single-Exit regions of the CFG + // that contain divergence-causing branches. A SESE region is defined + // as a subgraph of the CFG with an entry point at A and an exit point + // at B such that: + // 1. A dominates B + // 2. B post-dominates A + // 3. Any loop containing A also contains B, and vice-versa. + // + // The properties of the Dominance-Compact Block Indexing also happen to + // imply SESE-compactness, so once we identify an entry point, we can + // construct a SESE region by finding the exit block that post-dominates + // everything in a subsequence of the DCBI starting from A. + // + // We had assumed initially that any divergence-causing block will be the + // start of a SESE region. However, certain edge cases have arisen during + // testing that demonstrate that this is not the case. In practice, this + // doesn't seem to matter, as long as we can fully identify the predicated + // subset of the SESE region, so we are really working with Multiple-Entry, + // Single-Exit regions here. This was the cause of the BOSCC Back Door bug + // that was encountered previously, where the entry block of a + // supposed SESE region did not actually dominate everything in the region, + // which in this case was caused by an additional non-divergent code path + // (the "back door" entry point), but it is equally possible for two + // divergence-causing branches to enter a predicated region. + // + // a) A* b) A c) A d) A . + // / \ / \ / \ / \ . + // B D B* D B* D* B* D* . + // / \ / \ / \ / \ / \ / \ / \ / \ . + // C F E C F E C F E C F E . + // \ | / \ | / \ | / \ / / . + // \ | / \ | / \ | / G / . + // \|/ \|/ \|/ \ / . + // X X X X .
+ // + // Figure 1. CFGs showing SESE regions. Divergence-causing blocks are marked + // with an asterisk. Blocks are labelled alphabetically in DCBI order. + // + // (1a) shows the case of a SESE region with a divergence-causing entry block. + // + // (1b) shows the "back door" case, where a block inside the predicated + // sub-region has a non-divergent predecessor outside of it. + // + // (1c) shows a SESE region with two divergence-causing entry points into the + // predicated sub-region. This will result in two overlapping regions. + // + // (1d) shows a case where the exit block of the SESE region is not the + // immediate post-dominator of B, the first-encountered divergence causing + // block. Therefore the two overlapping regions have different exit blocks. + // + // Another situation can arise where the SESE region can contain + // two completely unconnected predicated subregions. Although the DCBI is + // SESE compact, a SESE region can still contain other, nested SESE regions. + // Since an entry point into the predicated subregion is not necessarily the + // SESE entry point, all predicated blocks may not be reachable from every + // entry point. Because of these cases, it is necessary to consider each + // divergence causing block that is not part of the predicated subregion of + // any other divergence causing block as the entry point of their own SESE + // regions, even though this does not strictly satisfy the SESE criteria. + // + // a) A b) A Figure 2. + // / \ / \ . + // B* E* / D* (2a) shows a case of two independent regions + // / \ / \ / / \ sharing an exit block. + // C D F G B* E F . + // \ | | / / \ \ / (2b) shows a case where a SESE subregion will + // \| |/ C \ G appear in the middle of the DCBI of the + // \ / \ \ / subregion beginning with B. G post-dominates + // X \ H D, forming a complete nested SESE region. + // \ / . + // X . + + struct SESEInfo { + BasicBlock *BB = nullptr; + bool divCausing = false; + bool predicated = false; + }; + + // Collect all the blocks in the worklist + const auto &DCBI = DR->getBlockOrdering(); + const size_t numBlocks = DCBI.size(); + SmallVector SESE; + SESE.reserve(numBlocks); + for (const auto &BBTag : DCBI) { + SESE.emplace_back(); + SESE.back().BB = BBTag.BB; + } + + // Mark all the divergence-causing blocks + for (auto *const BB : DR->getDivCausingBlocks()) { + SESE[DR->getTagIndex(BB)].divCausing = true; + } + + // Create the BOSCC regions + for (size_t i = 0; i != numBlocks;) { + auto &info = SESE[i]; + if (!info.divCausing) { + ++i; + continue; + } + + uniformRegions.emplace_back(); + auto ®ion = uniformRegions.back(); + const size_t entryPos = i; + size_t exitPos = 0u; + size_t firstPredicated = numBlocks; + + region.entryBlock = info.BB; + region.divergentBranches.push_back(info.BB); + + SmallVector stack; + + // If we are in a divergent loop, then the whole loop needs a uniform + // version. 
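The traversal implemented a little further down can be previewed in miniature: flood the successors of a divergence-causing entry and treat the unique successor that post-dominates the entry as the region exit. A sketch over the CFG of figure (1a), with the post-dominance query hard-coded for the toy graph:

```cpp
#include <cassert>
#include <set>
#include <vector>

int main() {
  // Figure (1a): A=0, B=1, D=2, C=3, F=4, E=5, X=6; A is divergence-causing.
  std::vector<std::vector<int>> succs = {
      {1, 2}, {3, 4}, {4, 5}, {6}, {6}, {6}, {}};
  auto postDominatesEntry = [](int b) { return b == 6; };  // X pdom A

  const int entry = 0;
  int exitBlock = -1;
  std::set<int> predicated;
  std::vector<int> stack = {entry};
  while (!stack.empty()) {
    const int cur = stack.back();
    stack.pop_back();
    for (int s : succs[cur]) {
      if (postDominatesEntry(s)) { exitBlock = s; continue; }  // region exit
      if (predicated.insert(s).second) stack.push_back(s);     // predicate it
    }
  }

  assert(exitBlock == 6);
  assert(predicated == std::set<int>({1, 2, 3, 4, 5}));
  return 0;
}
```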
+ const auto *const entryLoopTag = DR->getTag(info.BB).loop; + if (entryLoopTag && entryLoopTag->isLoopDivergent()) { + auto *const loop = entryLoopTag->loop; + for (BasicBlock *loopB : loop->blocks()) { + const size_t pos = DR->getTagIndex(loopB); + firstPredicated = std::min(firstPredicated, pos); + SESE[pos].predicated = true; + region.predicatedBlocks.insert(loopB); + + if (loop->isLoopExiting(loopB)) { + stack.push_back(pos); + } + } + } + + // Traverse the CFG from the entry point, marking blocks for predication + stack.push_back(entryPos); + while (!stack.empty()) { + auto *const cur = SESE[stack.pop_back_val()].BB; + for (BasicBlock *succ : successors(cur)) { + const size_t succPos = DR->getTagIndex(succ); + + auto *const succLoopTag = DR->getBlockTag(succPos).loop; + if ((!succLoopTag || !succLoopTag->isLoopDivergent()) && + // The region 'entry' creates contains only blocks that are + // contained in its SESE region. + PDT->properlyDominates(succ, region.entryBlock)) { + VECZ_ERROR_IF(exitPos != 0u && succPos != exitPos, + "SESE region multiple exit blocks identified"); + exitPos = succPos; + continue; + } + + auto &succInfo = SESE[succPos]; + if (!succInfo.predicated) { + firstPredicated = std::min(firstPredicated, succPos); + stack.push_back(succPos); + region.predicatedBlocks.insert(succ); + succInfo.predicated = true; + } + } + } + VECZ_ERROR_IF(exitPos == 0u, "SESE region exit block not identified"); + region.exitBlock = SESE[exitPos].BB; + i = exitPos; + + // Collect any other divergent branches in the predicated region, and clear + // the predication flags so regions can overlap. + for (unsigned j = firstPredicated; j != exitPos; ++j) { + auto &ji = SESE[j]; + if (ji.divCausing && j > entryPos) { + if (ji.predicated) { + region.divergentBranches.push_back(ji.BB); + ji.divCausing = false; + } else if (j < i) { + // Found another unpredicated divergent branch between the entry + // point and the exit point. Reset the iterator so we can process it. + i = j; + } + } + ji.predicated = false; + } + + if (discardRegion(region)) { + // It's not worth keeping this region. + uniformRegions.pop_back(); + } + } + + return true; +} + +bool ControlFlowConversionState::BOSCCGadget::connectBOSCCRegions() { + LLVM_DEBUG(dbgs() << "CONNECT BOSCC REGIONS\n"); + + // If we have not duplicated a loop but we have duplicated the preheader, + // then the loop now has 2 preheaders. We thus need to blend them into one + // single preheader. + for (auto *const LTag : DR->getLoopOrdering()) { + if (!LTag->isLoopDivergent() && !LMap.contains(LTag->loop)) { + BasicBlock *predicatedPreheader = LTag->preheader; + if (BasicBlock *uniformPreheader = getBlock(predicatedPreheader)) { + BasicBlock *header = LTag->header; + + LLVM_DEBUG(dbgs() << "Loop " << header->getName() + << " has two preheaders\n"); + + // Create a new loop preheader that blends both the uniform and + // predicated preheaders, to keep well formed loops (with only one + // incoming preheader). + BasicBlock *newPreheader = BasicBlock::Create( + F.getContext(), predicatedPreheader->getName() + ".blend", &F, + header); + BranchInst::Create(header, newPreheader); + + // Set the successor of both preheaders to be the new preheader. 
+ auto *predicatedPreheaderT = predicatedPreheader->getTerminator(); + auto *uniformPreheaderT = uniformPreheader->getTerminator(); + VECZ_ERROR_IF(predicatedPreheaderT->getNumSuccessors() != 1, + "Preheader should have only one successor"); + VECZ_ERROR_IF(uniformPreheaderT->getNumSuccessors() != 1, + "Preheader should have only one successor"); + predicatedPreheaderT->setSuccessor(0, newPreheader); + uniformPreheaderT->setSuccessor(0, newPreheader); + + // Update the tags. + BasicBlockTag &newPreheaderTag = DR->getOrCreateTag(newPreheader); + newPreheaderTag.loop = DR->getTag(predicatedPreheader).loop; + LTag->preheader = newPreheader; + + DR->setFlag(*newPreheader, DR->getFlag(*predicatedPreheader)); + + addInRegions(newPreheader, predicatedPreheader); + } + } + } + + // We must make the outermost non duplicated loop's preheader target the + // outermost duplicated uniform and predicated loop's headers. The first + // iteration of the loop will necessarily have all lanes activated until it + // reaches the first divergent block. Also, once the loop starts diverging, + // there is no way to go back to a dynamically uniform loop, so there is no + // point allowing the loop to go back and forth between its uniform and + // predicated versions. Only going from the uniform to the predicated + // version makes sense. + for (const auto &pair : LMap) { + Loop *uniformL = pair.second; + const Loop *L = pair.first; + + if (Loop *parentL = L->getParentLoop()) { + if (LMap.contains(parentL)) { + continue; + } + } + + const auto <ag = DR->getTag(L); + BasicBlock *preheader = LTag.preheader; + if (!VMap.count(preheader)) { + auto *T = preheader->getTerminator(); + VECZ_ERROR_IF(T->getNumSuccessors() != 1, + "Preheader has more than one successor"); + + LLVM_DEBUG(dbgs() << "Non duplicated preheader " << preheader->getName() + << "must target uniform loop " << uniformL->getName() + << "\n"); + + // Add a path from 'preheader' to the uniform loop header and make it + // always branch to it. We want to keep the edge from 'preheader' to the + // predicated loop header (even though we will never branch to it) to ease + // some needed blendings later on. + IRCleanup::deleteInstructionNow(T); + BranchInst::Create(DR->getTag(uniformL).header, LTag.header, + ConstantInt::getTrue(F.getContext()), preheader); + } + } + + DenseSet connectedBlocks; + for (auto ®ion : uniformRegions) { + // Each uniform version of div causing blocks need an entry point to the + // predicated CFG. + for (BasicBlock *B : region.divergentBranches) { + if (connectedBlocks.insert(B).second) { + if (BasicBlock *uniformB = getBlock(B)) { + VECZ_FAIL_IF(!connectUniformRegion(region, B, uniformB)); + } else { + VECZ_FAIL_IF(!connectUniformRegion(region, B, B)); + } + } else { + // No other region should have connected the entry block. + BasicBlock *entry = region.entryBlock; + VECZ_FAIL_IF(B == entry); + } + } + } + + // If a uniform block targets a predicated block, the latter needs its + // operands that have a uniform and predicated version blended. + for (const auto &predicatedBTag : DR->getBlockOrdering()) { + if (BasicBlock *uniformB = getBlock(predicatedBTag.BB)) { + for (BasicBlock *succ : successors(uniformB)) { + // We've found a uniform block that targets a predicated block prior + // to connecting the regions. 
+        if (!DR->isUniform(*succ)) {
+          LLVM_DEBUG(dbgs() << "Uniform block " << uniformB->getName()
+                            << " targets predicated block " << succ->getName()
+                            << "\n");
+          VECZ_FAIL_IF(
+              !blendConnectionPoint(succ, {predicatedBTag.BB, uniformB}));
+        }
+      }
+    }
+  }
+
+  // Add all the uniform blocks into the worklist now that they have been
+  // connected.
+  DT->recalculate(F);
+  PDT->recalculate(F);
+  VECZ_ERROR_IF(!DT->verify(), "DominatorTree incorrectly updated");
+  VECZ_ERROR_IF(!PDT->verify(), "PostDominatorTree incorrectly updated");
+  VECZ_FAIL_IF(!computeBlockOrdering());
+
+  // NOTE doing the Liveness Analysis here is potentially dangerous, since we
+  // have yet to fully restore SSA form.
+  liveness = &AM.getResult<LivenessAnalysis>(F);
+  RC->recalculate(F);
+  VECZ_FAIL_IF(!blendFinalize());
+
+  // Sort URVBlender in a post order so that the replaced new values don't
+  // overlap with old ones.
+  if (!URVB.empty()) {
+    std::sort(URVB.begin(), URVB.end(),
+              [this](const URVBlender::value_type &LHS,
+                     const URVBlender::value_type &RHS) {
+                return DR->getTagIndex(LHS.first) > DR->getTagIndex(RHS.first);
+              });
+
+    // Now that the CFG has been fully rewired and every node is correctly
+    // connected, we can replace the blended values' uses with their new
+    // value.
+    DenseSet<Instruction *> toDelete;
+    for (const URVBlender::value_type &blender : URVB) {
+      BasicBlock *block = blender.first;
+      Value *from = blender.second.first;
+      Instruction *to = blender.second.second;
+      if (!isUsedOutsideDefinitionBlock(from)) {
+        toDelete.insert(to);
+      } else {
+        VECZ_ERROR_IF(!isa<Instruction>(from),
+                      "Trying to replace uses of a value");
+        VECZ_FAIL_IF(
+            !replaceReachableUses(*RC, cast<Instruction>(from), to, block));
+      }
+    }
+
+    for (Instruction *I : toDelete) {
+      IRCleanup::deleteInstructionNow(I);
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::BOSCCGadget::connectUniformRegion(
+    UniformRegion &region, BasicBlock *predicatedB, BasicBlock *uniformB) {
+  auto replaceIncomingBlock = [](BasicBlock *B, BasicBlock *from,
+                                 BasicBlock *to) {
+    for (Instruction &I : *B) {
+      if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+        const int fromIdx = PHI->getBasicBlockIndex(from);
+        if (fromIdx != -1) {
+          PHI->setIncomingBlock(fromIdx, to);
+        }
+      } else {
+        break;
+      }
+    }
+  };
+
+  LLVM_DEBUG(dbgs() << "\tConnect uniform region of " << predicatedB->getName()
+                    << "\n");
+
+  ConstantInt *trueCI = ConstantInt::getTrue(F.getContext());
+
+  auto *T = uniformB->getTerminator();
+
+  BasicBlock *target = predicatedB->getTerminator()->getSuccessor(0);
+
+  // 1. For each pair {taken, fallthrough} of successors of uniformB,
+  // a. 'taken' is taken if the exit mask towards that edge is full, i.e. if
+  // it contains all-true values.
+  // b. otherwise, we branch to a new block, 'boscc_indir'. If the exit mask
+  // towards 'fallthrough' is full, branch to the latter.
+  // c. Otherwise, it means the mask is not dynamically uniform, but varying,
+  // so we need to branch into the varying counterpart of the uniform
+  // region. The chosen block to branch to is the first successor of
+  // predicatedB.
+  // 2. When a latch is divergent, we make the uniform latch target the
+  // predicated header.
+  // 3. We need to feed the last computed uniform values when transitioning to
+  // the varying version.
+  BasicBlock *runtimeCheckerBlock = uniformB;
+  DR->setFlag(*uniformB, eBlockNeedsAllOfMask);
+
+  // 1.
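+  // As an illustration (block names hypothetical), for two successors
+  // {succ0, succ1} of 'uniformB', the code below emits a chain of the form:
+  //
+  //   uniformB:             br (exitMask(succ0) == all-true), succ0, indir
+  //   uniformB.boscc_indir: br (exitMask(succ1) == all-true), succ1, target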
+ SmallVector succs = uniformEdges[predicatedB]; + const size_t size = succs.size(); + VECZ_ERROR_IF(size == 0, "BasicBlock has no successors"); + for (size_t i = 0; i < size; ++i) { + // Not all successors of a BOSCC entry block may be duplicated. + if (BasicBlock *uniformSucc = getBlock(succs[i])) { + succs[i] = uniformSucc; + } + LLVM_DEBUG(dbgs() << "\tSuccessor " << i << ": " << succs[i]->getName() + << "\n"); + } + + for (size_t i = 0; i + 1 < size; ++i) { + BasicBlock *succ = succs[i]; + + BasicBlock *BOSCCIndir = BasicBlock::Create( + uniformB->getContext(), uniformB->getName() + ".boscc_indir", &F, + succ->getNextNode()); + + region.uniformBlocks.insert(BOSCCIndir); + + BasicBlockTag &BOSCCIndirTag = DR->getOrCreateTag(BOSCCIndir); + DR->setFlag(*BOSCCIndir, static_cast( + eBlockNeedsAllOfMask | eBlockIsUniform)); + BOSCCIndirTag.loop = DR->getTag(runtimeCheckerBlock).loop; + if (BOSCCIndirTag.loop) { + BOSCCIndirTag.loop->loop->addBasicBlockToLoop(BOSCCIndir, *LI); + } + + auto *cond = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + PassState.getMaskInfo(uniformB).exitMasks.lookup(succ), + trueCI, "", runtimeCheckerBlock); + BranchInst::Create(succ, BOSCCIndir, cond, runtimeCheckerBlock); + + if (i > 0) { + // Update the incoming block of the phi nodes in 'succ' from 'uniformB' + // to 'runtimeCheckerBlock'. + replaceIncomingBlock(succ, uniformB, runtimeCheckerBlock); + } + + runtimeCheckerBlock = BOSCCIndir; + } + + BasicBlock *succ = succs[size - 1]; + auto *cond = + CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, + PassState.getMaskInfo(uniformB).exitMasks.lookup(succ), + trueCI, "", runtimeCheckerBlock); + + BasicBlock *connectionPoint = target; + + const auto *const LTag = DR->getTag(predicatedB).loop; + const bool needsStore = LTag && LMap.contains(LTag->loop); + if (needsStore) { + // 'store' is a block that will contain all the uniform versions of the + // live in instructions of the predicated target. + BasicBlock *store = BasicBlock::Create( + target->getContext(), uniformB->getName() + ".boscc_store", &F, + runtimeCheckerBlock->getNextNode()); + + region.uniformBlocks.insert(store); + + BasicBlockTag &storeTag = DR->getOrCreateTag(store); + DR->setFlag(*store, eBlockIsUniform); + + // 2. + auto *const uniformLTag = DR->getTag(uniformB).loop; + const bool isLoopLatch = uniformLTag && (uniformLTag->latch == uniformB); + if (isLoopLatch) { + BasicBlock *header = LTag->header; + PHINode *entryMask = + cast(PassState.getMaskInfo(header).entryMask); + Value *latchMask = + PassState.getMaskInfo(uniformB).exitMasks.lookup(uniformLTag->header); + VECZ_ERROR_IF(!latchMask, "Exit mask does not exist"); + entryMask->addIncoming(latchMask, store); + connectionPoint = header; + + if (succ == uniformLTag->header) { + uniformLTag->latch = runtimeCheckerBlock; + } + } + + BranchInst::Create(connectionPoint, store); + + // 'store' belongs in the first outer loop non duplicated. + Loop *parentLoop = LTag->loop->getParentLoop(); + while (parentLoop && LMap.contains(parentLoop)) { + parentLoop = parentLoop->getParentLoop(); + } + if (parentLoop) { + storeTag.loop = &DR->getTag(parentLoop); + parentLoop->addBasicBlockToLoop(store, *LI); + } + + target = store; + } + + // 1.c. 'uniformB' has a new runtime check, we can remove its old one. + IRCleanup::deleteInstructionNow(T); + BranchInst::Create(succ, target, cond, runtimeCheckerBlock); + + // Update the incoming block of the new successors of 'runTimeCheckerBlock'. 
+ replaceIncomingBlock(succ, uniformB, runtimeCheckerBlock); + + if (uniformB == predicatedB) { + replaceIncomingBlock(connectionPoint, predicatedB, runtimeCheckerBlock); + } else { + // 3. + VECZ_FAIL_IF(!blendConnectionPoint( + connectionPoint, + {predicatedB, needsStore ? target : runtimeCheckerBlock})); + + if (needsStore) { + region.storeBlocks.emplace_back(); + auto &sb = region.storeBlocks.back(); + sb.connectionPoint = connectionPoint; + sb.target = target; + sb.runtimeCheckerBlock = runtimeCheckerBlock; + } + } + + return true; +} + +bool ControlFlowConversionState::BOSCCGadget::blendConnectionPoint( + BasicBlock *CP, const std::pair &incoming) { + const auto *const CPLTag = DR->getTag(CP).loop; + for (auto ®ion : uniformRegions) { + // Create blend instructions at each blend point following 'CP'. + if (region.contains(CP) || (CP == region.exitBlock) || + (CP == region.entryBlock)) { + // Compute all the blend points that will need to have blend instructions + // because of 'CP'. These blocks are all the blocks that have more than + // one predecessor, that belong to the same region as 'CP', and that + // succeed it. + if (!region.blendPoints.contains(CP)) { + // The first blend point impacted by 'CP' is 'CP' itself. + region.blendPoints.insert({CP, {CP}}); + + DenseSet visited{CP}; + std::queue queue; + queue.push(CP); + while (!queue.empty()) { + BasicBlock *cur = queue.front(); + queue.pop(); + // The region exit block is the delimiter of the region. + if (cur == region.exitBlock) { + continue; + } + for (BasicBlock *succ : successors(cur)) { + if (visited.insert(succ).second) { + queue.push(succ); + if (std::distance(pred_begin(succ), pred_end(succ)) > 1) { + // Nested loops are dominated. + if (CPLTag == DR->getTag(succ).loop || + (CPLTag && !CPLTag->loop->contains(succ))) { + region.blendPoints[CP].push_back(succ); + } + } + } + } + } + } + + region.connections.push_back(UniformRegion::ConnectionInfo{CP, incoming}); + } + } + return true; +} + +bool ControlFlowConversionState::BOSCCGadget::blendFinalize() { + for (auto ®ion : uniformRegions) { + for (const auto &connection : region.connections) { + BasicBlock *CP = connection.connectionPoint; + auto &incoming = connection.incoming; + + // Create blend instructions at each blend point following 'CP'. + for (BasicBlock *blendPoint : region.blendPoints[CP]) { + LLVM_DEBUG(dbgs() << "BLEND CONNECTION POINT " << blendPoint->getName() + << "\n"); + + for (Instruction &I : *blendPoint) { + if (PHINode *PHI = dyn_cast(&I)) { + // Only add 'incoming' for 'CP' because for the other blend points + // we don't actually add a new edge. + if (blendPoint != CP || + PHI->getBasicBlockIndex(incoming.second) != -1) { + continue; + } + + unsigned idx = 0; + for (; idx < PHI->getNumIncomingValues(); ++idx) { + // If one incoming block of the phi node is the predicated version + // of the new, uniform, incoming block, use its uniform incoming + // value version if it exists. 
+ if (PHI->getIncomingBlock(idx) == incoming.first) { + if (Value *V = getUniformV(PHI->getIncomingValue(idx))) { + if (Instruction *VI = dyn_cast(V)) { + if (RC->isReachable(VI->getParent(), incoming.second)) { + PHI->addIncoming(VI, incoming.second); + break; + } + } + } + } + } + if (idx == PHI->getNumIncomingValues()) { + PHI->addIncoming(getDefaultValue(PHI->getType()), + incoming.second); + } + LLVM_DEBUG( + dbgs() + << "PHINode " << PHI->getName() << ": Add incoming value " + << PHI->getIncomingValueForBlock(incoming.second)->getName() + << " from " << incoming.second->getName() << " in " + << blendPoint->getName() << "\n"); + } else { + break; + } + } + } + } + region.connections.clear(); + } + + DenseSet blendBlocks; + for (const auto ®ion : uniformRegions) { + for (auto &CP : region.blendPoints) { + for (BasicBlock *blendPoint : CP.second) { + blendBlocks.insert(blendPoint); + } + } + } + + for (const auto &tag : DR->getBlockOrdering()) { + BasicBlock *blendPoint = tag.BB; + if (!blendBlocks.contains(blendPoint)) { + continue; + } + + DenseSet blendedValues; + for (Instruction &I : *blendPoint) { + if (PHINode *PHI = dyn_cast(&I)) { + if (PHI->getName().contains(".boscc_blend")) { + for (Value *v : PHI->incoming_values()) { + blendedValues.insert(v); + } + } + } else { + break; + } + } + + for (auto *liveInVal : liveness->getBlockInfo(blendPoint).LiveIn) { + if (blendedValues.contains(liveInVal)) { + continue; + } + + auto *liveIn = dyn_cast(liveInVal); + if (!liveIn) { + continue; + } + + BasicBlock *src = liveIn->getParent(); + + // Nothing to be done if the definition block has no uniform + // equivalent. + BasicBlock *uniformSrc = getBlock(src); + if (!uniformSrc) { + continue; + } + + // Nothing to be done if the instruction: + // - dominates the connection point, + // - cannot reach 'CP'. + if (DT->dominates(src, blendPoint)) { + continue; + } + + if (!RC->isReachable(src, blendPoint)) { + continue; + } + + Value *uniformLiveIn = getDefaultValue(liveIn->getType()); + if (Value *V = getUniformV(liveIn)) { + uniformLiveIn = V; + } + + LLVM_DEBUG(dbgs() << "Blend live in " << liveIn->getName() << " in " + << blendPoint->getName() << "\n"); + + PHINode *blend = PHINode::Create(liveIn->getType(), 2, + liveIn->getName() + ".boscc_blend"); + blend->insertBefore(blendPoint->begin()); + bool replaceUniform = false; + bool replacePredicate = false; + // For each predecessor, if it can reach the instruction, set the + // latter as the incoming value, otherwise set a default value. + for (BasicBlock *pred : predecessors(blendPoint)) { + if (DR->isUniform(*pred)) { + Instruction *uniformLiveInI = dyn_cast(uniformLiveIn); + if (uniformLiveInI && + !RC->isReachable(uniformLiveInI->getParent(), pred)) { + blend->addIncoming(getDefaultValue(uniformLiveInI->getType()), + pred); + } else { + replaceUniform = true; + blend->addIncoming(uniformLiveIn, pred); + } + } else if (DR->getTag(pred).isLoopBackEdge(blendPoint)) { + blend->addIncoming(blend, pred); + } else { + if (!RC->isReachable(liveIn->getParent(), pred)) { + blend->addIncoming(getDefaultValue(liveIn->getType()), pred); + } else { + replacePredicate = true; + blend->addIncoming(liveIn, pred); + } + } + LLVM_DEBUG(dbgs() << "\tAdd incoming value " + << blend->getIncomingValueForBlock(pred)->getName() + << " from " << pred->getName() << "\n"); + } + + // If we have blended 'liveIn' in 'CP', update the uses. 
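+      // Note that the use replacement itself is deferred: the URVB entries
+      // recorded below are only applied in connectBOSCCRegions(), once the
+      // CFG has been fully rewired.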
+ if (replacePredicate) { + URVB.push_back({blendPoint, {liveIn, blend}}); + addReference(blend, liveIn); + } + // If we have blended 'uniformLiveIn' in 'CP', update the uses. + if (replaceUniform && isa(uniformLiveIn)) { + URVB.push_back({blendPoint, {uniformLiveIn, blend}}); + } + + // Update the blend instructions in the loop header, if any. + VECZ_FAIL_IF( + !updateLoopBlendValues(DR->getTag(blendPoint).loop, liveIn, blend)); + blendedValues.insert(liveIn); + } + } + + for (const auto ®ion : uniformRegions) { + for (auto &sb : region.storeBlocks) { + BasicBlock *connectionPoint = sb.connectionPoint; + BasicBlock *target = sb.target; + BasicBlock *runtimeCheckerBlock = sb.runtimeCheckerBlock; + + // Create a bunch of lcssa instructions into 'store' so that the repair + // SSA doesn't have to look for the instructions inside the uniform loop. + for (Instruction &I : *connectionPoint) { + if (PHINode *PHI = dyn_cast(&I)) { + const int idx = PHI->getBasicBlockIndex(target); + VECZ_ERROR_IF(idx == -1, "Connection point PHIs must have incoming " + "block from the target"); + if (Instruction *incoming = + dyn_cast(PHI->getIncomingValue(idx))) { + LLVM_DEBUG(dbgs() + << "Create live-in lcssa of " << incoming->getName() + << " in " << target->getName() << "\n"); + + PHINode *blend = PHINode::Create( + incoming->getType(), 1, incoming->getName() + ".boscc_lcssa"); + blend->insertBefore(target->begin()); + blend->addIncoming(incoming, runtimeCheckerBlock); + PHI->setIncomingValue(idx, blend); + } + } else { + break; + } + } + } + } + return true; +} + +BasicBlock *ControlFlowConversionState::BOSCCGadget::getBlock(BasicBlock *B) { + auto BUniform = VMap.find(B); + if (BUniform != VMap.end()) { + return cast(BUniform->second); + } + return nullptr; +} + +Loop *ControlFlowConversionState::BOSCCGadget::getLoop(Loop *L) { + auto LUniform = LMap.find(L); + if (LUniform != LMap.end()) { + return LUniform->second; + } + return nullptr; +} + +void ControlFlowConversionState::BOSCCGadget::getUnduplicatedEntryBlocks( + SmallVectorImpl &blocks) const { + for (const auto ®ion : uniformRegions) { + if (VMap.count(region.entryBlock) == 0) { + blocks.push_back(region.entryBlock); + } + } +} + +void ControlFlowConversionState::BOSCCGadget::createReference( + Value *pred, Value *uni, bool needsMapping) { + if (!pred || !uni) { + return; + } + auto predIt = VMap.find(pred); + if (predIt != VMap.end()) { + predIt->second = uni; + } else { + VMap.insert({pred, uni}); + } + + if (needsMapping) { + if (Instruction *uniI = dyn_cast(uni)) { + RemapInstruction(uniI, VMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + } + } +} + +void ControlFlowConversionState::BOSCCGadget::addReference(Value *pred, + Value *old) { + auto uniformOldIt = VMap.find(old); + if (uniformOldIt != VMap.end()) { + VMap.insert({pred, uniformOldIt->second}); + } +} + +void ControlFlowConversionState::BOSCCGadget::addInRegions(BasicBlock *newB, + BasicBlock *refB) { + for (auto ®ion : uniformRegions) { + if (region.contains(refB)) { + if (region.predicatedBlocks.insert(newB).second) { + LLVM_DEBUG(dbgs() << "BasicBlock " << newB->getName() + << " added to BOSCC region: " + << region.entryBlock->getName() << "\n"); + } + } + } +} + +Value * +ControlFlowConversionState::BOSCCGadget::getUniformV(Value *predicatedV) { + auto uniformVIt = VMap.find(predicatedV); + if (uniformVIt != VMap.end()) { + return uniformVIt->second; + } + return nullptr; +} + +void ControlFlowConversionState::BOSCCGadget::updateValue(Value *from, + Value *to) { + auto 
fromIt = VMap.find(from);
+  if (fromIt != VMap.end()) {
+    Value *fromUniform = fromIt->second;
+    VMap.erase(from);
+    VMap.insert({to, fromUniform});
+  }
+}
+
+bool ControlFlowConversionState::BOSCCGadget::linkMasks() {
+  for (const auto &BTag : DR->getBlockOrdering()) {
+    auto *const BB = BTag.BB;
+    if (auto *const uniformB = getBlock(BB)) {
+      // Both sets of masks had better exist by this point.
+      auto &masks = PassState.getMaskInfo(BB);
+      auto &masksUniform = PassState.getMaskInfo(uniformB);
+      createReference(masks.entryMask, masksUniform.entryMask);
+
+      for (auto *const succ : successors(BB)) {
+        auto *const uniformSucc = getBlock(succ);
+        auto *const target = uniformSucc ? uniformSucc : succ;
+        createReference(masks.exitMasks.lookup(succ),
+                        masksUniform.exitMasks.lookup(target));
+      }
+    }
+  }
+  return true;
+}
+
+bool ControlFlowConversionState::BOSCCGadget::updateLoopBlendValues(
+    LoopTag *LTag, Instruction *from, Instruction *to) {
+  auto createLatchIncoming = [&from, &LTag, this] {
+    auto *ret =
+        PHINode::Create(from->getType(), 2, from->getName() + ".boscc_blend");
+    ret->insertBefore(LTag->latch->begin());
+    Value *uniform = getUniformV(from);
+    Value *default_val = getDefaultValue(from->getType());
+    for (BasicBlock *pred : predecessors(LTag->latch)) {
+      Value *incoming = default_val;
+      if (RC->isReachable(from->getParent(), pred)) {
+        incoming = from;
+      } else if (uniform) {
+        Instruction *uinst = dyn_cast<Instruction>(uniform);
+        if (!uinst || RC->isReachable(uinst->getParent(), pred)) {
+          incoming = uniform;
+        }
+      }
+      ret->addIncoming(incoming, pred);
+    }
+    URVB.push_back({LTag->latch, {from, ret}});
+    addReference(ret, from);
+    return ret;
+  };
+
+  while (LTag) {
+    PHINode *latchIncoming = nullptr;
+    // Try looking for an existing `boscc_blend` value for `from` to avoid
+    // creating a new one in the latch.
+    for (Instruction &latchI : *LTag->latch) {
+      if (PHINode *PHI = dyn_cast<PHINode>(&latchI)) {
+        if (PHI->getName().contains(".boscc_blend")) {
+          for (Value *incomingValue : PHI->incoming_values()) {
+            if (incomingValue == from) {
+              latchIncoming = PHI;
+              break;
+            }
+          }
+          if (latchIncoming) {
+            break;
+          }
+        }
+      } else {
+        break;
+      }
+    }
+    // Update all uses of `from` in the header with the blended value from the
+    // latch. Since the CFG is final now, this should cover everything.
+    for (Instruction &headerI : *LTag->header) {
+      if (PHINode *PHI = dyn_cast<PHINode>(&headerI)) {
+        const int latchIdx = PHI->getBasicBlockIndex(LTag->latch);
+        VECZ_ERROR_IF(latchIdx == -1,
+                      "Header has no incoming value from the latch");
+        if ((PHI == to) || (PHI->getIncomingValue(latchIdx) == from)) {
+          if (!latchIncoming) {
+            latchIncoming = createLatchIncoming();
+          }
+          PHI->setIncomingValue(latchIdx, latchIncoming);
+        }
+      } else {
+        break;
+      }
+    }
+
+    if (Loop *L = LTag->loop->getParentLoop()) {
+      LTag = &DR->getTag(L);
+    } else {
+      break;
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::BOSCCGadget::computeBlockOrdering() {
+  // Create a map from entry blocks to their uniform regions
+  DenseMap<BasicBlock *, const UniformRegion *> entryMap;
+  unsigned maxUBlocks = 0;
+  for (const auto &region : uniformRegions) {
+    if (!region.uniformBlocks.empty()) {
+      entryMap[region.entryBlock] = &region;
+    }
+    maxUBlocks = std::max(maxUBlocks, region.uniformBlocks.size());
+  }
+
+  // Gather the blocks outside of the uniform regions according to the already
+  // computed order, leaving gaps for the uniform regions to fill in.
+  // Note that uniform region blocks do not appear in the block ordering yet.
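+  // For example (illustrative): given the ordering [A, E, X], where E is the
+  // entry block of a region with two uniform blocks, 'filtered' is first
+  // built as [A, E, _, _, X]; the gaps are filled in further below.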
+ // Also note that we can't use pointers to BasicBlockTags here since + // `PassState.computeBlockOrdering()` re-orders the tags vector. + SmallVector filtered; + for (const auto &tag : DR->getBlockOrdering()) { + filtered.push_back(tag.BB); + const auto found = entryMap.find(tag.BB); + if (found != entryMap.end()) { + const auto *const region = found->second; + filtered.resize(filtered.size() + region->uniformBlocks.size()); + } + } + + // Recompute the ordering over the uniform regions + VECZ_FAIL_IF(!PassState.computeBlockOrdering()); + + // Filter by region and fill in the gaps + SmallVector uniformBlocks; + uniformBlocks.reserve(maxUBlocks); + for (auto it = filtered.begin(), ie = filtered.end(); it != ie;) { + auto *const BB = *it; + + const auto found = entryMap.find(BB); + if (found != entryMap.end()) { + // If the entry block of the region is NOT duplicated, add the uniform + // blocks after it. + const bool entryDupe = getBlock(BB); + if (!entryDupe) { + ++it; + } + + // Gather the indices of the uniform blocks and sort them. + const auto ®ion = *found->second; + uniformBlocks.clear(); + for (auto *const uBB : region.uniformBlocks) { + uniformBlocks.push_back(DR->getTagIndex(uBB)); + } + std::sort(uniformBlocks.begin(), uniformBlocks.end()); + + // Insert the uniform blocks into the gap. + for (const auto uBBi : uniformBlocks) { + (*it++) = DR->getBlockTag(uBBi).BB; + } + + // If the entry block of the region IS duplicated, add it after the + // uniform blocks. + if (entryDupe) { + (*it++) = BB; + } + } else { + ++it; + } + } + + uint32_t pos = 0; + for (auto *const BB : filtered) { + DR->getTag(BB).pos = pos++; + } + DR->reorderTags(filtered.size()); + + return true; +} + +bool ControlFlowConversionState::BOSCCGadget::cleanUp() { + // BOSCC can create a lot of PHI nodes that are not really necessary. + // LCSSA PHI nodes (in Store Blocks) are only required as an intermediate + // state and are trivially redundant, and sometimes blends are created that + // blend the same two values together. Also, sometimes values are blended + // even though they have no further uses and can be removed as dead code. + + const RPOT rpot(&F); + std::vector blends; + for (auto *BB : rpot) { + for (auto I = BB->begin(); I != BB->end();) { + auto *PHI = dyn_cast(&*(I++)); + if (!PHI) { + break; + } + if (!PHI->getName().contains(".boscc_")) { + continue; + } + + if (auto *V = PHI->hasConstantValue()) { + PHI->replaceAllUsesWith(V); + IRCleanup::deleteInstructionNow(PHI); + } else { + blends.push_back(PHI); + } + } + } + + while (!blends.empty()) { + PHINode *PHI = blends.back(); + if (PHI->use_empty()) { + IRCleanup::deleteInstructionNow(PHI); + } + blends.pop_back(); + } + + return true; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp new file mode 100644 index 0000000000000..02f6e9e68ca9b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/control_flow_roscc.cpp @@ -0,0 +1,150 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "control_flow_roscc.h" + +#include +#include +#include +#include + +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "ir_cleanup.h" + +#define DEBUG_TYPE "vecz-cf" + +// WHAT THIS DOES +// +// A common pattern in OpenCL kernels is a line near the start of the program +// like the following: +// +// if (some_condition) return; +// +// Where "some_condition" is non-uniform, the BOSCC control flow optimization +// can do very well with this. However, without BOSCC, the entire program will +// have been linearized and the early return will disappear entirely. It is +// desirable to maintain this sort of early exit branch in order to avoid +// doing unnecessary work. We can do this by inserting a uniform branch to the +// return block without the need to duplicate the rest of the kernel into +// uniform and non-uniform versions, as BOSCC does. This can improve the +// performance significantly without requiring complex CFG changes. + +using namespace llvm; +using namespace vecz; + +namespace { +/// @brief checks if the given block contains only a return instruction +bool isReturnBlock(const llvm::BasicBlock &BB) { + if (BB.size() != 1) { + return false; + } + + auto *T = BB.getTerminator(); + if (auto *const branch = dyn_cast(T)) { + if (branch->isUnconditional()) { + // We can see straight through a block that only contains a single + // unconditional branch. + return isReturnBlock(*branch->getSuccessor(0)); + } + } + + return isa(T); +} +} // namespace + +bool ControlFlowConversionState::ROSCCGadget::run(Function &F) { + bool changed = false; + + SmallVector RetBranches; + for (auto &BB : F) { + if (LI->getLoopFor(&BB)) { + // No need to do this transform on loop exits + continue; + } + + auto *T = BB.getTerminator(); + if (auto *Branch = dyn_cast(T)) { + if (Branch->isConditional() && Branch->getNumSuccessors() == 2) { + Value *cond = Branch->getCondition(); + if (UVR->isVarying(cond)) { + size_t countReturns = 0; + for (auto *succ : Branch->successors()) { + if (isReturnBlock(*succ)) { + ++countReturns; + } + } + + // Only consider ROSCC when there is exactly one returning successor. + if (countReturns == 1) { + RetBranches.push_back(Branch); + } + } + } + } + } + + ConstantInt *trueCI = ConstantInt::getTrue(F.getContext()); + ConstantInt *falseCI = ConstantInt::getFalse(F.getContext()); + + for (auto *Branch : RetBranches) { + BasicBlock *BB = Branch->getParent(); + + BasicBlock *newBB = SplitBlock(BB, Branch, DT, LI); + newBB->setName(Twine(BB->getName(), ".ROSCC")); + + // update the PostDominatorTree manually.. + auto *Node = PDT->getNode(BB); + assert(Node && "Could not get node"); + auto *IDom = Node->getIDom(); + assert(IDom && "Could not get IDom"); + auto *Block = IDom->getBlock(); + assert(Block && "Could not get Block"); + PDT->addNewBlock(newBB, Block); + + // Remove the unconditional branch created by splitting.. 
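+    // Net effect (illustrative), assuming the original branch was
+    // `br %cond, %ret.bb, %rest` with %ret.bb a return block:
+    //
+    //   BB:        %cond.ROSCC = icmp eq %cond, false
+    //              br %cond.ROSCC, %BB.ROSCC, %ret.bb
+    //   BB.ROSCC:  br %cond, %ret.bb, %rest  ; the original varying branch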
+ IRCleanup::deleteInstructionNow(BB->getTerminator()); + + // Create a new Uniform branch condition to the Return block.. + // Note that a conditional branch's successors are returned in reverse + // order, relative to how they appear in the IR, with the "true" target + // last. However, "getSuccessor(n)" also indexes backwards, from the end. + BasicBlock *SuccT = Branch->getSuccessor(0); + BasicBlock *SuccF = Branch->getSuccessor(1); + const bool Which = isReturnBlock(*SuccT); + + BasicBlock *ReturnBlock = Which ? SuccT : SuccF; + Value *Cond = Branch->getCondition(); + auto *newCond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Cond, + Which ? falseCI : trueCI, "", BB); + newCond->setName(Twine(Cond->getName(), ".ROSCC")); + BranchInst::Create(newBB, ReturnBlock, newCond, BB); + + // Update Dominator and PostDominator trees.. + DT->insertEdge(BB, ReturnBlock); + PDT->insertEdge(BB, ReturnBlock); + + changed = true; + } + + assert((!changed || DT->verify()) && + "ROSCC: Dominator Tree failed verification"); + + assert((!changed || PDT->verify()) && + "ROSCC: Post-Dominator Tree failed verification"); + + return changed; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp new file mode 100644 index 0000000000000..9d30786cf3d39 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/debugging.cpp @@ -0,0 +1,88 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "debugging.h" + +#include + +using namespace llvm; + +namespace vecz { + +/// @brief Create the std::string containing the message for the remark +/// +/// @param[in] V The value (can be `nullptr`) to be included in the remark +/// @param[in] Msg The main remark message +/// @param[in] Note An optional additional note to provide more context/info. +/// @return The remark message as it is to be printed +static std::string createRemarkMessage(const Value *V, StringRef Msg, + StringRef Note = "") { + std::string helper_str("Vecz: "); + raw_string_ostream helper_stream(helper_str); + helper_stream << Msg; + if (V) { + if (isa(V)) { + // Instructions are already prefixed by two spaces when printed + V->print(helper_stream, /*IsForDebug=*/true); + } else if (const Function *F = dyn_cast(V)) { + // Printing a functions leads to its whole body being printed + helper_stream << " function \"" << F->getName() << "\""; + } else { + helper_stream << " "; + V->print(helper_stream, /*IsForDebug=*/true); + } + } + helper_stream << '\n'; + + // Provide extra context, if supplied + if (!Note.empty()) { + helper_stream << " note: " << Note << '\n'; + } + + return helper_stream.str(); +} + +void emitVeczRemarkMissed(const Function *F, const Value *V, StringRef Msg, + StringRef Note) { + const Instruction *I = V ? 
dyn_cast(V) : nullptr; + auto RemarkMsg = createRemarkMessage(V, Msg, Note); + OptimizationRemarkEmitter ORE(F); + if (I) { + ORE.emit(OptimizationRemarkMissed("vecz", "vecz", I) << RemarkMsg); + } else { + const DebugLoc D = I ? DebugLoc(I->getDebugLoc()) : DebugLoc(); + ORE.emit(OptimizationRemarkMissed("vecz", "vecz", D, &(F->getEntryBlock())) + << RemarkMsg); + } +} + +void emitVeczRemarkMissed(const Function *F, StringRef Msg, StringRef Note) { + emitVeczRemarkMissed(F, nullptr, Msg, Note); +} + +void emitVeczRemark(const Function *F, const Value *V, StringRef Msg) { + const Instruction *I = V ? dyn_cast(V) : nullptr; + const DebugLoc D = I ? DebugLoc(I->getDebugLoc()) : DebugLoc(); + + auto RemarkMsg = createRemarkMessage(V, Msg); + OptimizationRemarkEmitter ORE(F); + ORE.emit(OptimizationRemark("vecz", "vecz", F) << RemarkMsg); +} + +void emitVeczRemark(const Function *F, StringRef Msg) { + emitVeczRemark(F, nullptr, Msg); +} +} // namespace vecz diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h new file mode 100644 index 0000000000000..f538de0e6bed4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/control_flow_analysis.h @@ -0,0 +1,98 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @brief Analysis of control flow. + +#ifndef VECZ_ANALYSIS_CONTROL_FLOW_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_CONTROL_FLOW_ANALYSIS_H_INCLUDED + +#include + +namespace llvm { +class BasicBlock; +} // namespace llvm + +namespace vecz { + +/// @brief Holds the results and state for CFG analysis. +struct CFGResult { + /// @brief true if analysis failed, e.g. CFG conversion cannot be done. + bool failed = false; + /// @brief true if CFG conversion is needed to vectorize the function. + bool convNeeded = false; + /// @brief Single basic block that exits the function. + llvm::BasicBlock *exitBB = nullptr; + + /// @brief Create new analysis results for the given function. + CFGResult() = default; + + /// @brief Deleted copy constructor. + CFGResult(const CFGResult &) = delete; + + /// @brief Move constructor. + /// + /// @param[in,out] Res Existing results to move. + CFGResult(CFGResult &&Res) = default; + + /// @brief Access the failed flag. + /// @return true if analysis failed. + bool getFailed() const { return failed; } + + /// @brief Access the failed flag. + /// @param[in] newVal New value for the flag. + void setFailed(bool newVal) { failed = newVal; } + + /// @brief Determine whether CFG conversion is needed for the function or not. + bool isConversionNeeded() const { return convNeeded; } + /// @brief Set whether CFG conversion is needed for the function or not. 
+ /// @param[in] newVal Whether conversion is needed or not. + void setConversionNeeded(bool newVal) { convNeeded = newVal; } + + /// @brief Single block in the function that returns to the caller or null. + llvm::BasicBlock *getExitBlock() const { return exitBB; } +}; + +/// @brief Analysis that determines whether a function can have divergent +/// control flow and so whether CFG conversion is needed or not. +class CFGAnalysis : public llvm::AnalysisInfoMixin { +public: + /// @brief Create a new CFG analysis object. + CFGAnalysis() = default; + + /// @brief Type of the analaysis result. + using Result = CFGResult; + + /// @brief Perform CFG analysis on the function to determine whether control + /// flow conversion is required and possible or not. + /// + /// @param[in,out] F Function to analyze. + /// @param[in,out] AM FunctionAnalysisManager providing analyses + /// + /// @return CFG analysis result. + CFGResult run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); + + /// @brief Analysis name. + static llvm::StringRef name() { return "CFG analysis"; } + +private: + friend llvm::AnalysisInfoMixin; + /// @brief Unique identifier for the analysis. + static llvm::AnalysisKey Key; +}; + +} // namespace vecz + +#endif // VECZ_ANALYSIS_CONTROL_FLOW_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h new file mode 100644 index 0000000000000..cb66e38ba1bde --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/divergence_analysis.h @@ -0,0 +1,480 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Divergence analysis. + +#ifndef VECZ_ANALYSIS_DIVERGENCE_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_DIVERGENCE_ANALYSIS_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace llvm { +class BasicBlock; +class Loop; +} // namespace llvm + +namespace vecz { +struct BasicBlockTag; +struct LoopTag; + +/// @brief Analysis flags that can be attached to LLVM basic blocks. +enum BlockDivergenceFlag { + /// @brief Flag value where no flag is set. + eBlockHasNoFlag = 0, + /// @brief True if the block has a divergent branch (different paths might be + /// taken by different work items. + eBlockHasDivergentBranch = (1 << 0), + /// @brief True if the block has no divergent branch but has all its + /// successors divergent. + eBlockHasDivergentBranchFake = (1 << 1), + /// @brief True if the block belongs in a diverged path. + eBlockIsDivergent = (1 << 2), + /// @brief True if the block is an introduced divergent conditional loop exit. + /// The operation is performed during the transformation of a divergent loop. 
+ eBlockIsVirtualDivergentLoopExit = (1 << 3), + /// @brief True if the block is a join point of a divergent branch. + eBlockIsBlend = (1 << 4), + /// @brief True if no divergence is present when reaching the block. + eBlockIsByAll = (1 << 5), + /// @brief True if the block is uniform (duplicated version of a predicated + /// block from BOSCC). + eBlockIsUniform = (1 << 6), + /// @brief True if the block needs an all-of mask. + eBlockNeedsAllOfMask = (1 << 7) +}; + +/// @brief Analysis flags that can be attached to LLVM loops. +enum LoopDivergenceFlag { + /// @brief Flag value where no flag is set. + eLoopNoFlag = 0, + /// @brief Whether or not the loop may diverge because of a diverging block. + eLoopIsDivergent = (1 << 0) +}; + +/// @brief Type that maps basic blocks to tags. +using DenseBBMap = llvm::DenseMap; +/// @brief Type that maps loops to tags. +using DenseLoopMap = + llvm::DenseMap>; +/// @brief Type that maps loop live values and their associated state from the +/// previous loop iteration. +using DenseLoopResultPHIsMap = + llvm::SmallDenseMap; +/// @brief Type that maps loop live values and updated value. +using DenseLoopResultUpdatesMap = + llvm::SmallDenseMap; + +class DivergenceResult; + +/// @brief Queue that orders blocks by their DCBI (smallest first). +struct BlockQueue { + using index_type = uint32_t; + using index_list = std::vector; + + const DivergenceResult &DR; + + /// @brief The DCBI indices of the blocks in the queue, in min-heap order. + /// Since we can easily retrieve the BasicBlockTag from the DCBI ordered + /// `blockOrdering` vector, and since the queue priority is entirly based on + /// the index, it is sufficient to store only the indices to perform the + /// queue operations. + index_list indices; + + /// @brief Constructs an empty BlockQueue + BlockQueue(const DivergenceResult &dr) : DR(dr) {}; + + /// @brief Constructs a BlockQueue from a set of blocks. + BlockQueue(const DivergenceResult &dr, + const llvm::DenseSet &blocks); + + /// @brief Returns the number of blocks in the queue. + size_t size() const { return indices.size(); } + + /// @brief Returns whether the queue is empty. + bool empty() const { return indices.empty(); } + + /// @brief Pushes a block on the queue by its DCBI index. + void push(size_t index); + + /// @brief Pushes a block on the queue by pointer. + /// Prefer `push(size_t)` if the tag index is available. + void push(const llvm::BasicBlock *bb); + + /// @brief Pops a block from the queue and returns it. + const BasicBlockTag &pop(); + + /// @brief Const iterator to beginning of index list, for inspection. + index_list::const_iterator begin() const { return indices.begin(); } + + /// @brief Const iterator to end of index list, for inspection. + index_list::const_iterator end() const { return indices.end(); } +}; + +/// @brief Describes a loop contained in the function to vectorize. +struct LoopTag { + /// @brief Compiler loop info. + llvm::Loop *loop = nullptr; + /// @brief Loop entering point. + llvm::BasicBlock *preheader = nullptr; + /// @brief Loop entry point. + llvm::BasicBlock *header = nullptr; + /// @brief Single block that jumps back to the loop header. + llvm::BasicBlock *latch = nullptr; + /// @brief Loop live values on the loop. + llvm::SmallPtrSet loopLiveValues; + /// @brief Map between loop live values and their associated state from the + /// previous loop iteration. + DenseLoopResultPHIsMap loopResultPrevs; + /// @brief Map between loop live values and their updated value. 
+ DenseLoopResultUpdatesMap loopResultUpdates; + /// @brief Loop exit that has been chosen during partial linearization. + llvm::BasicBlock *pureExit = nullptr; + + LoopDivergenceFlag divergenceFlag = LoopDivergenceFlag::eLoopNoFlag; + + bool isLoopDivergent() const { + return divergenceFlag & LoopDivergenceFlag::eLoopIsDivergent; + } +}; + +/// @brief Describes a basic block contained in the function to vectorize. +struct BasicBlockTag { + /// @brief Compiler basic block object. + llvm::BasicBlock *BB = nullptr; + /// @brief Inner most loop this block belongs to, if any. + LoopTag *loop = nullptr; + /// @brief Outermost loop left by this block. + LoopTag *outermostExitedLoop = nullptr; + + /// @brief Unique sorted block index. + uint32_t pos = ~0u; + + /// @brief Create a new basic block tag. + BasicBlockTag() = default; + /// @brief Deleted address-of operator + BasicBlockTag *operator&() = delete; + /// @brief Deleted const address-of operator + const BasicBlockTag *operator&() const = delete; + + BlockDivergenceFlag divergenceFlag = BlockDivergenceFlag::eBlockHasNoFlag; + + /// @brief Convenience function for finding the varying property of the branch + /// without having to query the Uniform Value Analysis + bool hasVaryingBranch() const { + return divergenceFlag & BlockDivergenceFlag::eBlockHasDivergentBranch; + } + + /// @brief Determine whether there is a backedge from this tag's basic block + /// to the target basic block. + /// + /// @param[in] toBB Potential target for the backedge. + /// + /// @return true if there is a backedge, false otherwise. + bool isLoopBackEdge(llvm::BasicBlock *toBB) const { + return loop && (loop->latch == BB) && (loop->header == toBB); + } + + /// @brief Determine whether this block is the header of its loop (if any). + /// @return true iff the block is the loop header for its loop + bool isLoopHeader() const { return loop && loop->header == BB; } +}; + +/// @brief Divergent blocks whose PHI nodes may vary. +using DivergenceInfo = llvm::DenseSet; + +/// @brief Holds the result of Divergence Analysis for a given function. +class DivergenceResult { +public: + /// @brief Create a new DA result for the given unit. + /// @param[in] AM FunctionAnalysisManager providing analyses. + DivergenceResult(llvm::Function &F, llvm::FunctionAnalysisManager &AM); + + /// @brief Generate a block ordering. + /// + /// This is based on a dominance-compact block indexing (DCBI) where we + /// topologically order blocks that belong to the same dominator tree. + /// + /// @returns true if no errors occurred. + bool computeBlockOrdering(llvm::DominatorTree &DT); + + /// @brief Reorders the tags in the tags vector according to their DBCI + /// indices. + /// @param[in] n the number of tags in the DCBI + void reorderTags(size_t n); + + /// @brief Generate a loop ordering. + /// + /// This populates the `loopOrdering` vector with loop tags sorted by depth. + /// + /// @returns true if no errors occurred. + bool computeLoopOrdering(); + + /// @brief Gets a BasicBlockTag by its DCBI index + /// @param[in] index the DCBI index + /// @returns reference to the BasicBlockTag + const BasicBlockTag &getBlockTag(size_t index) const { + return basicBlockTags[index]; + } + + /// @brief Gets the DCBI ordered range of BasicBlockTags. 
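+  /// Note that only the first `numOrderedBlocks` tags form the range; tags
+  /// created after the ordering was last computed are not included.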
+ llvm::ArrayRef getBlockOrdering() const { + return llvm::ArrayRef(basicBlockTags.data(), + numOrderedBlocks); + } + + llvm::ArrayRef getLoopOrdering() { return loopOrdering; } + + size_t getTagIndex(const llvm::BasicBlock *BB) const; + + /// @brief Retrieve a tag for the given basic block. + /// + /// @param[in] BB Basic block to retrieve a tag for. + /// + /// @return Basic block tag. + BasicBlockTag &getTag(const llvm::BasicBlock *BB) { + return basicBlockTags[getTagIndex(BB)]; + } + + const BasicBlockTag &getTag(const llvm::BasicBlock *BB) const { + return basicBlockTags[getTagIndex(BB)]; + } + + /// @brief Retrieve or create a tag for the given basic block. + /// + /// @param[in] BB Basic block to retrieve or create a tag for. + /// + /// @return Basic block tag. + BasicBlockTag &getOrCreateTag(llvm::BasicBlock *BB); + + /// @brief Try to retrieve a tag for the given loop. + /// + /// @param[in] L Loop to retrieve a tag for. + /// + /// @return Loop tag. + LoopTag &getTag(const llvm::Loop *L) const; + + /// @brief Retrieve or create a tag for the given loop. + /// + /// @param[in] L Loop to retrieve a tag for. + /// + /// @return Loop tag. + LoopTag &getOrCreateTag(llvm::Loop *L); + + /// @brief Determine whether the tag contains the given flags or not. + /// + /// @param[in] BB Basic block whose flag we check. + /// @param[in] F Flags to test. + /// + /// @return true if the tag contains all the given flags, false otherwise. + bool hasFlag(const llvm::BasicBlock &BB, BlockDivergenceFlag F) const; + /// @brief Get the given flags for the tag. + /// + /// @param[in] BB Basic block whose flag we want to get. + BlockDivergenceFlag getFlag(const llvm::BasicBlock &BB) const; + /// @brief Set the given flags for the tag. + /// + /// @param[in] BB Basic block whose flag we set. + /// @param[in] F Flags to set for the tag. + void setFlag(const llvm::BasicBlock &BB, BlockDivergenceFlag F); + /// @brief Clear the given flags for the tag. + /// + /// @param[in] BB Basic block whose flag we clear. + /// @param[in] F Flags to clear for the tag. + void clearFlag(const llvm::BasicBlock &BB, BlockDivergenceFlag F); + /// @brief Check whether the basic block contains a div causing flag. + /// + /// @param[in] BB Basic block whose flag we check. + /// + /// @return true if the tag is div causing, false otherwise. + bool isDivCausing(const llvm::BasicBlock &BB) const; + /// @brief Check whether the basic block contains a divergent flag. + /// + /// @param[in] BB Basic block whose flag we check. + /// + /// @return true if the tag is divergent, false otherwise. + bool isDivergent(const llvm::BasicBlock &BB) const; + /// @brief Check whether the basic block contains an optional flag. + /// + /// @param[in] BB Basic block whose flag we check. + /// + /// @return true if the tag is optional, false otherwise. + bool isOptional(const llvm::BasicBlock &BB) const; + /// @brief Check whether the basic block contains a by_all flag. + /// + /// @param[in] BB Basic block whose flag we check. + /// + /// @return true if the tag is by_all, false otherwise. + bool isByAll(const llvm::BasicBlock &BB) const; + /// @brief Check whether the basic block contains a blend flag. + /// + /// @param[in] BB Basic block whose flag we check. + /// + /// @return true if the tag is blend, false otherwise. + bool isBlend(const llvm::BasicBlock &BB) const; + /// @brief Check whether the basic block contains a uniform flag. + /// + /// @param[in] BB Basic block whose flag we check. 
+ /// + /// @return true if the tag is uniform, false otherwise. + bool isUniform(const llvm::BasicBlock &BB) const; + + /// @brief Determine whether the tag contains the given flags or not. + /// + /// @param[in] L Loop whose flag we check. + /// @param[in] F Flags to test. + /// + /// @return true if the tag contains all the given flags, false otherwise. + bool hasFlag(const llvm::Loop &L, LoopDivergenceFlag F) const; + /// @brief Get the given flags for the tag. + /// + /// @param[in] L Loop whose flag we want to get. + LoopDivergenceFlag getFlag(const llvm::Loop &L) const; + /// @brief Set the given flags for the tag. + /// + /// @param[in] L Loop whose flag we set. + /// @param[in] F Flags to set for the tag. + void setFlag(const llvm::Loop &L, LoopDivergenceFlag F); + /// @brief Clear the given flags for the tag. + /// + /// @param[in] L Loop whose flag we clear. + /// @param[in] F Flags to clear for the tag. + void clearFlag(const llvm::Loop &L, LoopDivergenceFlag F); + + /// @brief Check if a block Src can reach a block Dst, either within the same + /// SESE region, or outside too. + /// @param[in] src Source node. + /// @param[in] dst Destination node. + /// @param[in] allowLatch Whether reachability is computed with latches or + /// not. + /// @return Whether or not dst is reachable from src. + bool isReachable(llvm::BasicBlock *src, llvm::BasicBlock *dst, + bool allowLatch = false) const; + + /// @brief List of blocks having a divergent branch. + const std::vector &getDivCausingBlocks() const { + return divCausingBlocks; + } + +private: + friend class DivergenceAnalysis; + + /// @brief Mark a block div causing and mark blocks that are control dependent + /// to be divergent + /// @param[in] BB Div causing block. + /// @param[in,out] DI Divergence information of the function. + /// @param[in,out] PDT PostDominatorTree of the function. + void markDivCausing(llvm::BasicBlock &BB, DivergenceInfo &DI, + llvm::PostDominatorTree &PDT); + /// @brief Mark divergent blocks in a loop (loop exits and latch) that are + /// control dependent of a divergent branch. + /// @param[in] BB Div causing block. + /// @param[in] L Loop that BB diverges. + /// @param[in,out] DI Divergence information of the function. + void markDivLoopDivBlocks(llvm::BasicBlock &BB, llvm::Loop &L, + DivergenceInfo &DI); + /// @brief Mark a block to be divergent. + /// @param[in] BB Block to mark. + void markDivergent(const llvm::BasicBlock &BB); + /// @brief Mark a loop to be divergent. + /// @param[in] L Loop to mark. + void markDivergent(const llvm::Loop &L); + /// @brief Recursively mark a block by_all. + /// @param[in] BB Block to mark. + void markByAll(llvm::BasicBlock &BB); + + /// @brief Find join points of a block. + /// @param[in] src Starting block + /// @return List of blocks that have a disjoint path from the starting block. + llvm::DenseSet joinPoints(llvm::BasicBlock &src) const; + /// @brief Find escape points of a divergent loop. + /// + /// Escape points are loop exit blocks from which some work-items may leave + /// through because of a divergent branch. + /// @param[in] src Divergent branch + /// @param[in] L Divergent loop + /// @return List of exit blocks some work-item may leave through. + llvm::DenseSet escapePoints(const llvm::BasicBlock &src, + const llvm::Loop &L) const; + + /// @brief the Function the analysis was run on + llvm::Function &F; + /// @brief AM FunctionAnalysisManager providing analyses. + llvm::FunctionAnalysisManager &AM; + + /// @brief Basic block tag mappings. 
+ DenseBBMap BBMap; + /// @brief Loop tag mappings. + DenseLoopMap LMap; + + /// @brief Storage for the Basic Block Tags + std::vector basicBlockTags; + /// @brief The number of blocks in the DCBI ordering. + size_t numOrderedBlocks = 0; + + /// @brief List of Loop Tags ordered by loop depth + llvm::SmallVector loopOrdering; + + /// @brief Blocks that have a divergent branch. + std::vector divCausingBlocks; + + /// @brief Blocks with uniform conditions that must be considered div causing + /// because they have a join point of a div causing block as their + /// successor. + llvm::DenseSet fakeDivCausingBlocks; +}; + +/// @brief Analysis that determines divergent blocks, i.e. program points +/// that must not be skipped during SIMD execution. +class DivergenceAnalysis : public llvm::AnalysisInfoMixin { + friend llvm::AnalysisInfoMixin; + +public: + /// @brief Create a new analysis object. + DivergenceAnalysis() = default; + + /// @brief Type of result produced by the analysis. + using Result = DivergenceResult; + + /// @brief Determine which values in the function are uniform and which are + /// potentially varying. + /// + /// @param[in] F Function to analyze. + /// @param[in] AM FunctionAnalysisManager providing analyses. + /// + /// @return Analysis result for the function. + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Divergence analysis"; } + +private: + /// @brief Unique identifier for the pass. + static llvm::AnalysisKey Key; +}; +} // namespace vecz + +#endif // VECZ_ANALYSIS_DIVERGENCE_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h new file mode 100644 index 0000000000000..daf31e624a35d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/instantiation_analysis.h @@ -0,0 +1,36 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef VECZ_ANALYSIS_INSTANTIATION_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_INSTANTIATION_ANALYSIS_H_INCLUDED + +namespace llvm { +class Instruction; +} // namespace llvm + +namespace vecz { +class VectorizationContext; + +/// @brief Determine whether the given instruction needs to be instantiated. +/// +/// @param[in] CTx the vectorization context +/// @param[in] I Instruction to analyze. +/// +/// @return true iff the instruction requires instantiation. 
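+///
+/// (Instantiation here means executing a scalar copy of the instruction once
+/// per SIMD lane, rather than widening it into a single vector operation.)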
+bool needsInstantiation(const VectorizationContext &Ctx, llvm::Instruction &I); +} // namespace vecz + +#endif // VECZ_ANALYSIS_INSTANTIATION_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h new file mode 100644 index 0000000000000..e36188b41dff5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/liveness_analysis.h @@ -0,0 +1,100 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file liveness_analysis.h +/// +/// @brief Live Variable Set Analysis + +#ifndef VECZ_ANALYSIS_LIVENESS_ANALYSIS_H +#define VECZ_ANALYSIS_LIVENESS_ANALYSIS_H + +#include +#include +#include +#include + +namespace llvm { +class Loop; +class LoopInfo; +class Function; +class BasicBlock; +class Value; +} // namespace llvm + +namespace vecz { +class VectorizationUnit; + +struct BlockLivenessInfo { + using LiveSet = llvm::SmallVector; + + LiveSet LiveIn; + LiveSet LiveOut; + size_t MaxRegistersInBlock = 0; +}; + +class LivenessResult { +public: + LivenessResult(llvm::Function &F) : F(F) {} + + LivenessResult() = delete; + LivenessResult(const LivenessResult &) = delete; + LivenessResult(LivenessResult &&) = default; + ~LivenessResult() = default; + + void recalculate(); + + size_t getMaxLiveVirtualRegisters() const; + const BlockLivenessInfo &getBlockInfo(const llvm::BasicBlock *) const; + +private: + class Impl; + + llvm::Function &F; + + size_t maxNumberOfLiveValues; + + llvm::DenseMap BlockInfos; +}; + +/// Analysis pass to perform liveness analysis and estimate register pressure by +/// counting the number of live virtual registers in a function. +/// +/// Values in a basic block's live set are guaranteed to be in program order. +class LivenessAnalysis : public llvm::AnalysisInfoMixin { + friend llvm::AnalysisInfoMixin; + +public: + using Result = LivenessResult; + + LivenessAnalysis() = default; + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Liveness analysis"; } + + /// Estimate the number of registers needed by F by counting the number of + /// live values. + /// + /// Assumes a reducible CFG. In OpenCL 1.2 whether or not irreducible control + /// flow is illegal is implementation defined. + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &); + + /// @brief Unique pass identifier. 
+ static llvm::AnalysisKey Key; +}; + +} // namespace vecz + +#endif // VECZ_ANALYSIS_LIVENESS_ANALYSIS_H diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h new file mode 100644 index 0000000000000..ddd8c97d0c8f2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/packetization_analysis.h @@ -0,0 +1,106 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Stride analysis. + +#ifndef VECZ_ANALYSIS_PACKETIZATION_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_PACKETIZATION_ANALYSIS_H_INCLUDED + +#include +#include +#include +#include + +namespace llvm { +class Function; +class Value; +} // namespace llvm + +namespace vecz { + +class StrideAnalysisResult; +struct UniformValueResult; + +/// @brief Holds the result of Packetization Analysis for a given function. +class PacketizationAnalysisResult { +public: + /// @brief The function being analyzed + llvm::Function &F; + /// @brief The Stride Analysis Result to use during analysis + StrideAnalysisResult &SAR; + /// @brief The Uniform Value Result to use during analysis + UniformValueResult &UVR; + + /// @brief Traverse the function, starting from the vector leaves, and mark + /// instructions for packetization where needed. Note that the resulting set + /// MAY not be exhaustive, since it is not always easy to predict where the + /// packetizer might fail and fall back on instantiation, in which case + /// pointers will need to be packetized regardless of linear stride. + PacketizationAnalysisResult(llvm::Function &f, StrideAnalysisResult &sar); + + /// @brief Returns whether the packetization set is empty or not. + bool isEmpty() const { return toPacketize.empty(); } + + /// @brief query whether the given value has been marked for packetization. + /// + /// @param[in] V the value to query + /// @return true if the value was marked for packetization, false otherwise. + bool needsPacketization(const llvm::Value *V) const { + return toPacketize.contains(V); + } + +private: + void markForPacketization(llvm::Value *V); + + /// @brief The set of instructions that need to be packetized. + /// This equates to all non-uniform values except for values used only in + /// address computations with constant linear strides. + llvm::DenseSet toPacketize; +}; + +/// @brief Analysis that determines whether pointer operands of memory +/// operations have a linear dependence on the work item ID. +class PacketizationAnalysis + : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + /// @brief Create a new analysis object. 
+ PacketizationAnalysis() {} + + using Result = PacketizationAnalysisResult; + + /// @brief Run the Packetization Analysis + /// + /// @param[in] F Function to analyze. + /// @param[in] AM FunctionAnalysisManager providing analyses. + /// + /// @return Analysis result for the function. + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Packetization analysis"; } + +private: + /// @brief Unique identifier for the pass. + static llvm::AnalysisKey Key; +}; + +} // namespace vecz + +#endif // VECZ_ANALYSIS_PACKETIZATION_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h new file mode 100644 index 0000000000000..bee7f0f1c0046 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/simd_width_analysis.h @@ -0,0 +1,68 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief SIMD width analysis. + +#ifndef VECZ_ANALYSIS_SIMD_WIDTH_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_SIMD_WIDTH_ANALYSIS_H_INCLUDED + +#include +#include + +#include "vectorization_unit.h" + +namespace vecz { + +class LivenessResult; + +/// @brief Choose a good SIMD width for the given function. +class SimdWidthAnalysis : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + /// @brief Create a new instance of the pass. + SimdWidthAnalysis() = default; + + /// @brief Type of result produced by the analysis. + struct Result { + Result(unsigned value) : value(value) {} + unsigned value; + }; + + /// @brief Run the SIMD width analysis pass on the given function. + /// @param[in] F Function to analyze. + /// @param[in] AM FunctionAnalysisManager providing analyses. + /// @return Preferred SIMD vectorization factor for the function or zero. + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "SIMD width analysis"; } + +private: + unsigned avoidSpillImpl(llvm::Function &, llvm::FunctionAnalysisManager &, + unsigned MinWidth = 2); + + /// @brief Vector register width from TTI, if available. + unsigned MaxVecRegBitWidth; + + /// @brief Unique pass identifier. 
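Per the `run` documentation above, a result of zero means the analysis expresses no preference, so a caller needs its own fallback. A sketch of that pattern (`chooseWidth` and `DefaultWidth` are illustrative, not part of the original API):

```cpp
#include <llvm/IR/Function.h>
#include <llvm/IR/PassManager.h>

// Illustrative only: pick a vectorization factor, falling back to a
// caller-supplied default when the analysis returns 0 ("no preference").
unsigned chooseWidth(llvm::Function &F, llvm::FunctionAnalysisManager &FAM,
                     unsigned DefaultWidth = 4) {
  const unsigned Preferred = FAM.getResult<vecz::SimdWidthAnalysis>(F).value;
  return Preferred ? Preferred : DefaultWidth;
}
```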
+ static llvm::AnalysisKey Key; +}; +} // namespace vecz + +#endif // VECZ_ANALYSIS_SIMD_WIDTH_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h new file mode 100644 index 0000000000000..8b6d641e29681 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/stride_analysis.h @@ -0,0 +1,141 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Stride analysis. + +#ifndef VECZ_ANALYSIS_STRIDE_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_STRIDE_ANALYSIS_H_INCLUDED + +#include +#include +#include +#include +#include + +#include "offset_info.h" + +namespace llvm { +class Function; +class Value; +} // namespace llvm + +namespace vecz { + +struct UniformValueResult; + +/// @brief Holds the result of Stride Analysis for a given function. +class StrideAnalysisResult { +public: + /// @brief The function being analyzed + llvm::Function &F; + /// @brief The Uniform Value Result to use during analysis + UniformValueResult &UVR; + /// @brief AssumptionCache for computing live bits of uniform values + llvm::AssumptionCache &AC; + + StrideAnalysisResult(llvm::Function &f, UniformValueResult &uvr, + llvm::AssumptionCache &AC); + + /// @brief generate stride `ConstantInt`s or `Instruction`s for all analyzed + /// values. + void manifestAll(llvm::IRBuilder<> &B); + + /// @brief gets a pointer to the info struct for this value's analysis. + OffsetInfo *getInfo(llvm::Value *V) { + const auto find = analyzed.find(V); + return (find != analyzed.end()) ? &find->second : nullptr; + } + + /// @brief gets a pointer to the info struct for this value's analysis. + const OffsetInfo *getInfo(llvm::Value *V) const { + const auto find = analyzed.find(V); + return (find != analyzed.end()) ? &find->second : nullptr; + } + + /// @brief construct the offset info for the given value. + OffsetInfo &analyze(llvm::Value *V); + + /// @brief build the strides as `Instructions` or `ConstantInts`. + /// Strides may be needed as `llvm::Values` by transform passes, but we are + /// not allowed to construct them during an analysis pass. However, note that + /// information about manifested stride `Value`s will survive until the + /// analysis is invalidated. + const OffsetInfo &manifest(llvm::IRBuilder<> &B, llvm::Value *V) { + const auto find = analyzed.find(V); + assert(find != analyzed.end() && + "Trying to manifest unanalyzed OffsetInfo"); + return find->second.manifest(B, *this); + } + + /// @brief gets the manifested memory stride for this value, if present. 
+ /// + /// @param[in] B IRBuilder for creating new instructions/values + /// @param[in] Ptr the pointer to calculate the stride for + /// @param[in] EleTy the type that the pointer points to + /// @returns the stride of the memory operation, in number of elements + llvm::Value *buildMemoryStride(llvm::IRBuilder<> &B, llvm::Value *Ptr, + llvm::Type *EleTy) const; + +private: + /// @brief A map of values onto OffsetInfos that were already analyzed. + llvm::DenseMap analyzed; +}; + +/// @brief Analysis that determines whether pointer operands of memory +/// operations have a linear dependence on the work item ID. +class StrideAnalysis : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + /// @brief Create a new analysis object. + StrideAnalysis() {} + + using Result = StrideAnalysisResult; + + /// @brief Run the Stride Analysis + /// + /// @param[in] F Function to analyze. + /// @param[in] AM FunctionAnalysisManager providing analyses. + /// + /// @return Analysis result for the function. + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Stride analysis"; } + +private: + /// @brief Unique identifier for the pass. + static llvm::AnalysisKey Key; +}; + +/// @brief Helper pass to print out the contents of the StrideAnalysis +/// analysis. +class StrideAnalysisPrinterPass + : public llvm::PassInfoMixin { + llvm::raw_ostream &OS; + +public: + explicit StrideAnalysisPrinterPass(llvm::raw_ostream &OS) : OS(OS) {} + + llvm::PreservedAnalyses run(llvm::Function &F, + llvm::FunctionAnalysisManager &AM); +}; + +} // namespace vecz + +#endif // VECZ_ANALYSIS_STRIDE_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h new file mode 100644 index 0000000000000..a221e6cba1447 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/uniform_value_analysis.h @@ -0,0 +1,200 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Uniform Value analysis. + +#ifndef VECZ_ANALYSIS_UNIFORM_VALUE_RANGE_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_UNIFORM_VALUE_RANGE_ANALYSIS_H_INCLUDED + +#include +#include +#include +#include + +#include + +namespace llvm { +class Value; +class Instruction; +} // namespace llvm + +namespace vecz { + +class VectorizationContext; +class VectorizationUnit; + +/// @brief Holds the result of Uniform Value Analysis for a given function. +struct UniformValueResult { + enum class VaryingKind { + /// @brief The value is truly uniform on all active and inactive lanes. + eValueTrueUniform, + /// @brief The value is uniform on active lanes. 
May be poison or undefined
+    /// on inactive lanes.
+    eValueActiveUniform,
+    /// @brief The value is varying and lanes may see different values.
+    eValueVarying,
+    /// @brief The value is uniform, but its mask is not.
+    /// Used for masked memory operations with a uniform address but varying
+    /// mask.
+    eMaskVarying,
+  };
+
+  /// @brief The function the analysis was run on.
+  llvm::Function &F;
+  /// @brief Vectorization unit the analysis was run on.
+  VectorizationUnit &VU;
+  /// @brief The Vectorization Context of the analysis.
+  VectorizationContext &Ctx;
+  /// @brief The vectorization dimension
+  unsigned dimension;
+  /// @brief The actual results of the analysis.
+  llvm::DenseMap varying;
+
+  /// @brief Create a new UVA result for the given unit.
+  /// @param[in] F Function to analyze.
+  /// @param[in] VU Vectorization unit to analyze.
+  UniformValueResult(llvm::Function &F, VectorizationUnit &VU);
+
+  /// @brief Determine whether the given value needs to be packetized or not.
+  ///
+  /// @param[in] V Value to analyze.
+  ///
+  /// @return true if the value needs to be packetized, false otherwise.
+  bool isVarying(const llvm::Value *V) const;
+
+  /// @brief Determine whether the given value has a varying mask or not.
+  ///
+  /// @param[in] V Value to analyze.
+  ///
+  /// @return true if the value has a varying mask, false otherwise.
+  bool isMaskVarying(const llvm::Value *V) const;
+
+  /// @brief Determine whether the given value is varying or has a varying
+  /// mask.
+  ///
+  /// @param[in] V Value to analyze.
+  ///
+  /// @return true if the value is varying or has a varying mask, false
+  /// otherwise.
+  bool isValueOrMaskVarying(const llvm::Value *V) const;
+
+  /// @brief Determine (on demand) whether the given value is a true uniform
+  /// value.
+  ///
+  /// @param[in] V Value to analyze.
+  ///
+  /// @return true if the value is true uniform, false otherwise. Caches the
+  /// result for future queries.
+  bool isTrueUniform(const llvm::Value *V);
+
+  /// @brief Remove the value from the analysis.
+  ///
+  /// @param[in] V Value to remove.
+  void remove(const llvm::Value *V) { varying.erase(V); }
+
+  /// @brief Uncritically set a value to varying.
+  /// This can be used to keep the result valid after expression transforms.
+  /// Use with care, since it does not recursively update value users.
+  ///
+  /// @param[in] V Value to set.
+  void setVarying(const llvm::Value *V) {
+    varying[V] = VaryingKind::eValueVarying;
+  }
+
+  /// @brief Look for vector roots in the function.
+  ///
+  /// Roots are values which are scalar in the original function but are
+  /// defined to be vector in the vectorized function.
+  ///
+  /// Users of roots need to be vectorized too but are not considered roots.
+  /// As such they will not be returned in Roots.
+  ///
+  /// Examples:
+  /// * Calls to get_global_id()
+  /// * Calls to get_local_id()
+  ///
+  /// @param[in,out] Roots List of roots to update.
+  void findVectorRoots(std::vector &Roots) const;
+
+  /// @brief Look for vector leaves in the function.
+  ///
+  /// Leaves are instructions that allow vectorized values to 'escape' from
+  /// the function.
+  ///
+  /// Examples:
+  /// * Store instructions (when the value to store is vectorized)
+  /// * Operands of call instructions (when the call needs to be vectorized)
+  /// * Return instructions
+  ///
+  /// @param[in,out] Leaves List of leaves to update.
+ void findVectorLeaves(std::vector &Leaves) const; + + /// @brief Find the alloca that this pointer points to + /// + /// @param[in] Pointer The pointer that is (potentially) pointing in an alloca + /// + /// @return the alloca if found, or nullptr otherwise + static llvm::AllocaInst *findAllocaFromPointer(llvm::Value *Pointer); + + /// @brief Try to extract the base pointer of the address. + /// + /// @param[in] Address Address to split into base and offset. + /// + /// @return Base address. + llvm::Value *extractMemBase(llvm::Value *Address); + + // private: + /// @brief Mark any value in the function that depends on V as being varying. + /// + /// @param[in] V Value used to start the vectorization search. + /// @param[in] From Optional value being used by `V`. + void markVaryingValues(llvm::Value *V, llvm::Value *From = nullptr); +}; + +/// @brief Analysis that determine whether values in a function are uniform or +/// varying. +class UniformValueAnalysis + : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + /// @brief Create a new analysis object. + UniformValueAnalysis() {} + + /// @brief Type of result produced by the analysis. + using Result = UniformValueResult; + + /// @brief Determine which values in the function are uniform and which are + /// potentially varying. + /// + /// @param[in] F Function to analyze. + /// @param[in] AM FunctionAnalysisManager providing analyses. + /// + /// @return Analysis result for the function. + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &AM); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Uniform value analysis"; } + +private: + /// @brief Unique identifier for the pass. + static llvm::AnalysisKey Key; +}; + +} // namespace vecz + +#endif // VECZ_ANALYSIS_UNIFORM_VALUE_RANGE_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h new file mode 100644 index 0000000000000..6bc813caeea0e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorizable_function_analysis.h @@ -0,0 +1,71 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Vectorizable Function analysis. + +#ifndef VECZ_ANALYSIS_VECTORIZABLE_FUNCTION_ANALYSIS_H_INCLUDED +#define VECZ_ANALYSIS_VECTORIZABLE_FUNCTION_ANALYSIS_H_INCLUDED + +#include +#include + +namespace llvm { +class Value; +} + +namespace vecz { + +/// @brief Determines whether vectorization of a function is possible. +class VectorizableFunctionAnalysis + : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + /// @brief Create a new instance of the pass. 
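A sketch of gating on this analysis (illustrative only; `tryVectorize` is a hypothetical driver, and the real pipeline wires the check up differently):

```cpp
#include <llvm/IR/Function.h>
#include <llvm/IR/PassManager.h>

// Illustrative only: bail out early when the function cannot be vectorized
// at all (see Result::canVectorize below).
bool tryVectorize(llvm::Function &F, llvm::FunctionAnalysisManager &FAM) {
  if (!FAM.getResult<vecz::VectorizableFunctionAnalysis>(F).canVectorize) {
    return false;
  }
  // ... run the remaining vecz passes on F ...
  return true;
}
```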
+ VectorizableFunctionAnalysis() = default; + + /// @brief Type of result produced by the analysis. + struct Result { + /// @brief Whether the function can be vectorized. + bool canVectorize = false; + + /// @brief Handle invalidation events from the new pass manager. + /// + /// @return false, as this analysis can never be invalidated. + bool invalidate(llvm::Function &, const llvm::PreservedAnalyses &, + llvm::FunctionAnalysisManager::Invalidator &) { + return false; + } + }; + + /// @brief Determine whether vectorization of a function is possible. + /// @param[in] F Function to analyze. + /// @return VectorizationUnit corresponding to this function + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "Vectorizable Function analysis"; } + +private: + /// @brief Unique pass identifier. + static llvm::AnalysisKey Key; +}; + +} // namespace vecz + +#endif // VECZ_ANALYSIS_VECTORIZABLE_FUNCTION_ANALYSIS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h new file mode 100644 index 0000000000000..7244236587d2f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/analysis/vectorization_unit_analysis.h @@ -0,0 +1,121 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file vectorization_unit_analysis.h +/// +/// @brief VectorizationUnit analysis. + +#ifndef VECZ_ANALYSIS_VECTORIZATION_UNIT_H_INCLUDED +#define VECZ_ANALYSIS_VECTORIZATION_UNIT_H_INCLUDED + +#include +#include + +#include + +#include "vectorization_context.h" +#include "vectorization_unit.h" + +namespace vecz { + +/// @brief Caches and returns the VectorizationUnit for a Function. +class VectorizationUnitAnalysis + : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + /// @brief Create a new instance of the pass. + VectorizationUnitAnalysis(const VectorizationContext &Ctx) : Ctx(Ctx) {} + + /// @brief Type of result produced by the analysis. + class Result { + VectorizationUnit *VU = nullptr; + + public: + Result() = default; + Result(VectorizationUnit *VU) : VU(VU) {} + VectorizationUnit &getVU() { + assert(hasResult()); + return *VU; + } + bool hasResult() { return VU; } + + /// @brief Handle invalidation events from the new pass manager. + /// + /// @return false, as this analysis can never be invalidated. + bool invalidate(llvm::Function &, const llvm::PreservedAnalyses &, + llvm::FunctionAnalysisManager::Invalidator &) { + return false; + } + }; + + /// @brief Retrieve the VectorizationUnit for the requested function. + /// @param[in] F Function to analyze. 
+ /// @return VectorizationUnit corresponding to this function + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "VectorizationUnit analysis"; } + +private: + const VectorizationContext &Ctx; + /// @brief Unique pass identifier. + static llvm::AnalysisKey Key; +}; + +/// @brief Caches and returns the VectorizationContext for a Function. +class VectorizationContextAnalysis + : public llvm::AnalysisInfoMixin { + friend AnalysisInfoMixin; + +public: + /// @brief Create a new instance of the pass. + VectorizationContextAnalysis(VectorizationContext &Ctx) : Context(Ctx) {} + + /// @brief Type of result produced by the analysis. + class Result { + VectorizationContext &Ctx; + + public: + Result(VectorizationContext &Ctx) : Ctx(Ctx) {} + VectorizationContext &getContext() { return Ctx; } + const VectorizationContext &getContext() const { return Ctx; } + + /// @brief Handle invalidation events from the new pass manager. + /// + /// @return false, as this analysis can never be invalidated. + bool invalidate(llvm::Function &, const llvm::PreservedAnalyses &, + llvm::FunctionAnalysisManager::Invalidator &) { + return false; + } + }; + + /// @brief Retrieve the VectorizationContext for the requested function. + /// @param[in] F Function to analyze. + /// @return VectorizationContext corresponding to this function + Result run(llvm::Function &F, llvm::FunctionAnalysisManager &); + + /// @brief Return the name of the pass. + static llvm::StringRef name() { return "VectorizationContext analysis"; } + +private: + VectorizationContext &Context; + /// @brief Unique pass identifier. + static llvm::AnalysisKey Key; +}; +} // namespace vecz + +#endif // VECZ_ANALYSIS_VECTORIZATION_UNIT_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h new file mode 100644 index 0000000000000..cad9caaa7bead --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_boscc.h @@ -0,0 +1,267 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief BOSCC control flow transformation. +/// +/// Style guideline 004 exemption note: This inner class declaration is in its +/// own header file, because it's quite large. 
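The transform this header implements can be pictured with a scalar analogy: keep an unpredicated clone of each region and branch to it whenever the entry mask turns out to be dynamically uniform. A sketch of the idea only, not of the vecz code (`uniformRegion` and `predicatedRegion` are placeholders):

```cpp
#include <algorithm>
#include <array>

// Scalar analogy of BOSCC ("branch on superword condition code"): when all
// lanes agree on the condition, run the original unpredicated region; only
// genuinely divergent masks pay for the predicated version.
template <size_t N>
void bosccRegion(const std::array<bool, N> &Mask) {
  const bool AllTrue =
      std::all_of(Mask.begin(), Mask.end(), [](bool B) { return B; });
  const bool AnyTrue =
      std::any_of(Mask.begin(), Mask.end(), [](bool B) { return B; });
  if (AllTrue) {
    // uniformRegion();        // duplicated, unpredicated blocks
  } else if (AnyTrue) {
    // predicatedRegion(Mask); // original blocks, executed under the mask
  }
  // connection/blend points afterwards merge values from both versions
}
```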
+
+#ifndef VECZ_CONTROL_FLOW_BOSCC_H_INCLUDED
+#define VECZ_CONTROL_FLOW_BOSCC_H_INCLUDED
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "transform/control_flow_conversion_pass.h"
+
+namespace llvm {
+class Instruction;
+class BasicBlock;
+class Function;
+class Loop;
+} // namespace llvm
+
+namespace vecz {
+
+class LivenessResult;
+
+class ControlFlowConversionState::BOSCCGadget final {
+public:
+  BOSCCGadget(ControlFlowConversionState &Pass)
+      : PassState(Pass), F(Pass.F), AM(Pass.AM), DT(Pass.DT), PDT(Pass.PDT),
+        LI(Pass.LI), DR(Pass.DR), RC(Pass.RC.get()) {}
+
+  /// @brief Region of code that will remain uniform after vectorization.
+  ///
+  /// Such regions won't have their instructions predicated. A UniformRegion
+  /// is delimited by a single-entry-single-exit region and is represented
+  /// by the blocks it contains.
+  struct UniformRegion final {
+    /// @brief Predicated blocks duplicated in the region.
+    llvm::DenseSet predicatedBlocks;
+    /// @brief Uniform blocks created in the region.
+    llvm::DenseSet uniformBlocks;
+    /// @brief Divergent branches that need a connection from the uniform
+    /// region.
+    std::vector divergentBranches;
+    /// @brief The entry block of the uniform region.
+    llvm::BasicBlock *entryBlock;
+    /// @brief The exit block of the uniform region.
+    llvm::BasicBlock *exitBlock;
+
+    /// @brief Mapping between a connection point of a predicated region
+    /// and the blend points of that region impacted by the former.
+    ///
+    /// Said "impacted blocks" are blocks with more than one predecessor that
+    /// need to have blend instructions because instructions defined within
+    /// that region may no longer dominate said "impacted blocks".
+    llvm::DenseMap blendPoints;
+
+    /// @brief Stores information about the connection points while
+    /// the CFG is being updated, to be applied afterwards.
+    struct ConnectionInfo {
+      llvm::BasicBlock *connectionPoint;
+      std::pair incoming;
+    };
+
+    /// @brief The list of ConnectionInfos to be applied at finalization.
+    std::vector connections;
+
+    /// @brief Stores information about new blocks created to contain
+    /// blend LCSSA PHI nodes, so they can be created after the CFG
+    /// has been updated.
+    struct StoreBlock {
+      llvm::BasicBlock *connectionPoint;
+      llvm::BasicBlock *target;
+      llvm::BasicBlock *runtimeCheckerBlock;
+    };
+
+    /// @brief The list of blend `StoreBlocks` to be applied at finalization.
+    llvm::SmallVector storeBlocks;
+
+    /// @brief Find if a predicated block belongs to this region.
+    /// @param[in] B Block to look for in the region
+    /// @return Whether the block belongs to the region or not.
+    bool contains(llvm::BasicBlock *B) const {
+      return predicatedBlocks.contains(B);
+    }
+  };
+  /// @brief List of all duplicated uniform regions.
+  using UniformRegions = std::vector;
+
+  /// @brief Create uniform regions to duplicate the blocks within such
+  /// regions.
+  ///
+  /// This allows us to retain their uniform version to skip divergent
+  /// branches when the entry mask of a div causing block is dynamically
+  /// uniform (i.e. all true or all false). Nested uniform regions need not
+  /// be duplicated multiple times.
+  ///
+  /// @return true if no problem occurred, false otherwise.
+  bool duplicateUniformRegions();
+
+  /// @brief Connect the BOSCC regions.
+  /// @return true if no problem occurred, false otherwise.
+  bool connectBOSCCRegions();
+
+  /// @brief Get the uniform version of 'B'.
+  /// @param[in] B The predicated block whose uniform version we want.
+  /// @return A uniform block if it exists, nullptr otherwise.
+  llvm::BasicBlock *getBlock(llvm::BasicBlock *B);
+  /// @brief Get the uniform version of 'L'.
+  /// @param[in] L The predicated loop whose uniform version we want.
+  /// @return A uniform loop if it exists, nullptr otherwise.
+  llvm::Loop *getLoop(llvm::Loop *L);
+
+  /// @brief Get the region entry blocks that have not been duplicated.
+  /// @param[out] blocks SmallVector to hold the result
+  void getUnduplicatedEntryBlocks(
+      llvm::SmallVectorImpl &blocks) const;
+
+  /// @brief Create an entry in the VMap so that 'uni' becomes a uniform
+  /// equivalent of 'pred'.
+  /// @param[in] pred Predicated value
+  /// @param[in] uni Uniform value
+  /// @param[in] needsMapping Whether 'uni' needs to be remapped
+  void createReference(llvm::Value *pred, llvm::Value *uni,
+                       bool needsMapping = false);
+  /// @brief Add an entry in the VMap so that the uniform equivalent of
+  /// 'old' becomes the uniform equivalent of 'pred' as well.
+  /// @param[in] pred Predicated value
+  /// @param[in] old Predicated value whose uniform equivalent we want
+  void addReference(llvm::Value *pred, llvm::Value *old);
+  /// @brief Add a new block to all the regions the reference block is part
+  /// of.
+  /// @param[in] newB New block
+  /// @param[in] refB Reference block
+  void addInRegions(llvm::BasicBlock *newB, llvm::BasicBlock *refB);
+
+  /// @brief Link the masks of the predicated regions to the uniform regions.
+  /// @return true on success, false on failure.
+  bool linkMasks();
+
+  /// @brief Retrieve the uniform version of predicatedV, if one exists.
+  /// @param[in] predicatedV The predicated value whose uniform version we
+  /// want to get.
+  /// @return the uniform version if it exists, null otherwise.
+  llvm::Value *getUniformV(llvm::Value *predicatedV);
+  /// @brief Update the value a uniform value should be a duplicate of.
+  /// @param[in] from The old value
+  /// @param[in] to The new value
+  void updateValue(llvm::Value *from, llvm::Value *to);
+
+  /// @brief Clean up redundant PHI nodes created by BOSCC.
+  /// @return true if no problem occurred, false otherwise.
+  bool cleanUp();
+
+private:
+  ControlFlowConversionState &PassState;
+  llvm::Function &F;
+  llvm::FunctionAnalysisManager &AM;
+  llvm::DominatorTree *DT = nullptr;
+  llvm::PostDominatorTree *PDT = nullptr;
+  llvm::LoopInfo *LI = nullptr;
+  DivergenceResult *DR = nullptr;
+  Reachability *RC = nullptr;
+
+  /// @brief Mapping between the uniform version and the predicated version
+  /// of the BOSCC. This is useful to keep information between both
+  /// versions shared, such as exit masks.
+  llvm::ValueToValueMapTy VMap;
+
+  /// @brief Mapping between the predicated version and the uniform version
+  /// of the BOSCC loops.
+  llvm::DenseMap LMap;
+
+  UniformRegions uniformRegions;
+
+  /// @brief Original edges of the CFG. Used to connect the uniform regions
+  /// to their predicated version.
+  llvm::DenseMap uniformEdges;
+
+  /// @brief Mapping from a block to a value that should be replaced by its
+  /// blended value.
+  using URVBlender = std::vector;
+
+  URVBlender URVB;
+
+  LivenessResult *liveness = nullptr;
+
+  /// @brief Create uniform regions
+  /// @return true if no problem occurred, false otherwise.
+  bool createUniformRegions(
+      const llvm::DenseSet &noDuplicateBlocks);
+  /// @brief Duplicate a loop, creating a new LoopTag and updating all the
+  /// relevant information.
+  /// @param[in] L The loop to duplicate
+  /// @return true if no problem occurred, false otherwise.
+  bool duplicateUniformLoops(llvm::Loop *L);
+
+  /// @brief Connect the uniform blocks that belong to the uniform region
+  /// @param[in] region Uniform region we are connecting
+  /// @param[in] predicatedB Div causing block in the predicated version
+  /// @param[in] uniformB Div causing block in the uniform version
+  /// @return true if no problem occurred, false otherwise.
+  bool connectUniformRegion(UniformRegion &region,
+                            llvm::BasicBlock *predicatedB,
+                            llvm::BasicBlock *uniformB);
+
+  /// @brief Blend uniform region instructions into the predicated region
+  /// connection point 'CP'.
+  /// @param[in] CP Connection point between a uniform and predicated region.
+  /// @param[in] incoming Predicated and uniform incoming block of 'CP'.
+  /// @return true if no problem occurred, false otherwise.
+  bool blendConnectionPoint(
+      llvm::BasicBlock *CP,
+      const std::pair &incoming);
+
+  /// @brief Apply all the changes stored up by `connectUniformRegion`
+  /// and `blendConnectionPoint` once the CFG has been fully updated.
+  /// @return true if no problem occurred, false otherwise.
+  bool blendFinalize();
+
+  /// @brief Update blend values in loop headers.
+  /// @param[in] LTag Loop whose blend values we update
+  /// @param[in] from The value we want to update
+  /// @param[in] to The value we update 'from' with.
+  /// @return true if no problem occurred, false otherwise.
+  bool updateLoopBlendValues(LoopTag *LTag, llvm::Instruction *from,
+                             llvm::Instruction *to);
+
+  /// @brief Generate a block ordering.
+  ///
+  /// This ordering differs slightly from the one in
+  /// ControlFlowConversionPass as we must process all the blocks that belong
+  /// to the same uniform region at once.
+  ///
+  /// @returns true if no errors occurred.
+  bool computeBlockOrdering();
+};
+} // namespace vecz
+
+#endif // VECZ_CONTROL_FLOW_BOSCC_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h
new file mode 100644
index 0000000000000..187299c997307
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/control_flow_roscc.h
@@ -0,0 +1,56 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief ROSCC control flow transformation.
+///
+/// Style guideline 004 exemption note: This inner class declaration is in its
+/// own header to match `control_flow_boscc.h`.
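As with the BOSCC header, a scalar analogy may help: a varying branch to a return block can only actually return once every lane has taken it; until then the exiting lanes are simply masked off. A sketch of the idea only (illustrative names, not vecz code):

```cpp
#include <array>

// Scalar analogy of ROSCC ("return on superword condition code"): lanes
// that want to leave are removed from the mask; the function may only
// return once no active lane remains.
template <size_t N>
bool rosccExit(std::array<bool, N> &Mask,
               const std::array<bool, N> &WantExit) {
  bool AnyLive = false;
  for (size_t I = 0; I != N; ++I) {
    Mask[I] = Mask[I] && !WantExit[I]; // exiting lanes become inactive
    AnyLive |= Mask[I];
  }
  return !AnyLive; // true: all lanes have exited, the return may execute
}
```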
+ +#ifndef VECZ_CONTROL_FLOW_ROSCC_H_INCLUDED +#define VECZ_CONTROL_FLOW_ROSCC_H_INCLUDED + +#include "transform/control_flow_conversion_pass.h" + +namespace llvm { +class Instruction; +class BasicBlock; +class Loop; +} // namespace llvm + +namespace vecz { + +/// @brief class that encapsulates the ROSCC transformation, which stands for +/// "Return On Superword Condition Code" and optimizes non-uniform +/// branches to the function return block(s). +class ControlFlowConversionState::ROSCCGadget final { +public: + ROSCCGadget(ControlFlowConversionState &Pass) + : UVR(Pass.UVR), DT(Pass.DT), PDT(Pass.PDT), LI(Pass.LI) {} + + /// @brief perform the ROSCC transformation + bool run(llvm::Function &F); + +private: + UniformValueResult *UVR = nullptr; + llvm::DominatorTree *DT = nullptr; + llvm::PostDominatorTree *PDT = nullptr; + llvm::LoopInfo *LI = nullptr; +}; +} // namespace vecz + +#endif // VECZ_CONTROL_FLOW_ROSCC_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h new file mode 100644 index 0000000000000..0be9fa33de99a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/debugging.h @@ -0,0 +1,200 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Functions, macros, etc used for debugging + +#ifndef VECZ_DEBUGGING_H_INCLUDED +#define VECZ_DEBUGGING_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace vecz { + +/// @brief Namespace used for vecz utils that we don't want to pollute the whole +/// vecz namespace +namespace internal { +/// @brief Helper type for signaling a failure from functions that return either +/// a pointer or a boolean to indicate if vectorization was successful or not +struct VeczFailResult { + /// @brief For functions that return a boolean value + operator bool() const { return false; } + /// @brief For functions that return a pointer + template operator T *() const { return nullptr; } + /// @brief For functions that return an std::shared_ptr + template operator std::shared_ptr() const { return nullptr; } + /// @brief For functions that return an std::unique_ptr + template operator std::unique_ptr() const { return nullptr; } + /// @brief For functions that return an llvm::Optional + template operator std::optional() const { + return std::nullopt; + } + + /// @brief For functions that return an llvm::Error + operator llvm::Error() const { + return llvm::make_error("Unknown VeczFailResult", + llvm::inconvertibleErrorCode()); + } +}; + +struct AnalysisFailResult : public internal::VeczFailResult { + AnalysisFailResult() = default; + ~AnalysisFailResult() = default; + // If an optimization failed we'd better not have altered the validity of any + // analysis... + operator llvm::PreservedAnalyses() const { + return llvm::PreservedAnalyses::all(); + } +}; + +/* + * The following macros are available: + * + * VECZ_FAIL: Return from the function with a failure value (e.g. `false` or + * `nullptr`). + * + * VECZ_FAIL_IF(cond): If (cond == true) then VECZ_FAIL + * + * VECZ_STAT_FAIL_IF(cond, stat): If (cond == true) then VECZ_FAIL and increment + * stat + * + * VECZ_ERROR_IF(cond, message): Similar to VECZ_FAIL_IF, but when NDEBUG is not + * set it aborts instead of returning a failure value. + * + * VECZ_ERROR(message): Similar to VECZ_ERROR_IF(true, message) + * + * VECZ_WARN_IF(cond, message): Similar to VECZ_ERROR_IF, but it doesn't abort + * but warns and carries on. + * + * VECZ_UNREACHABLE(message): Unconditionally terminate with an error message. + * + * For all the macros, the message is <<'d to llvm::errs(), so it is possible to + * print llvm Values etc. For example, this works: + * VECZ_WARN_IF(cond, "Warning: Value = " << *V) + */ + +#define VECZ_FAIL() return vecz::internal::VeczFailResult() + +#define VECZ_FAIL_IF(cond) \ + do { \ + if (cond) { \ + VECZ_FAIL(); \ + } \ + } while (false) + +#define VECZ_STAT_FAIL_IF(cond, stat) \ + do { \ + if (cond) { \ + ++stat; \ + VECZ_FAIL(); \ + } \ + } while (false) + +#define VECZ_ERROR_IF(cond, message) \ + do { \ + if (cond) { \ + VECZ_ERROR(message); \ + } \ + } while (false) + +#ifdef NDEBUG + +#define VECZ_ERROR(message) \ + do { \ + llvm::errs() << "!! Vecz: ERROR in " << __FILE__ << ":" << __LINE__ \ + << "\n"; \ + llvm::errs() << "!! Reason: " << message << "\n"; \ + VECZ_FAIL(); \ + } while (false) + +#define VECZ_WARN_IF(cond, message) /* Nothing */ +#define VECZ_UNREACHABLE(message) /* Nothing */ + +#else /* !NDEBUG */ + +#define VECZ_ERROR(message) \ + do { \ + llvm::errs() << "!! Vecz: ERROR in " << __FILE__ << ":" << __LINE__ \ + << "\n"; \ + llvm::errs() << "!! 
Reason: " << (message) << "\n"; \ + std::abort(); \ + } while (false) + +#define VECZ_WARN_IF(cond, message) \ + do { \ + if (cond) { \ + llvm::errs() << "!! Vecz: WARNING in " << __FILE__ << ":" << __LINE__ \ + << "\n"; \ + llvm::errs() << "!! Reason: " << (message) << "\n"; \ + } \ + } while (false) + +#define VECZ_UNREACHABLE(message) \ + do { \ + llvm::errs() << "!! Vecz: UNREACHABLE reached in " << __FILE__ << ":" \ + << __LINE__ << "\n"; \ + llvm::errs() << "!! Message: " << (message) << "\n"; \ + std::abort(); \ + } while (false) +#endif /* NDEBUG */ +} // namespace internal + +#define VECZ_UNUSED(x) ((void)(x)) + +/// @brief Emit a RemarkMissed message +/// +/// @param[in] F The function in which we are currently working +/// @param[in] V The value (can be `nullptr`) to be included in the message +/// @param[in] Msg The main remark message text +/// @param[in] Note An optional additional note to provide more context/info. +void emitVeczRemarkMissed(const llvm::Function *F, const llvm::Value *V, + llvm::StringRef Msg, llvm::StringRef Note = ""); +/// @brief Emit a RemarkMissed message +/// +/// @param[in] F The function in which we are currently working +/// @param[in] Msg The main remark message text +/// @param[in] Note An optional additional note to provide more context/info. +void emitVeczRemarkMissed(const llvm::Function *F, llvm::StringRef Msg, + llvm::StringRef Note = ""); +/// @brief Emit a Remark message +/// +/// @param[in] F The function in which we are currently working +/// @param[in] V The value (can be `nullptr`) to be included in the message +/// @param[in] Msg The main remark message text +void emitVeczRemark(const llvm::Function *F, const llvm::Value *V, + llvm::StringRef Msg); +/// @brief Emit a Remark message +/// +/// @param[in] F The function in which we are currently working +/// @param[in] Msg The main remark message text +void emitVeczRemark(const llvm::Function *F, llvm::StringRef Msg); + +} // namespace vecz + +#endif // VECZ_DEBUGGING_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h new file mode 100644 index 0000000000000..1321237311322 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/ir_cleanup.h @@ -0,0 +1,52 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef VECZ_IR_CLEANUP_H_INCLUDED +#define VECZ_IR_CLEANUP_H_INCLUDED + +#include + +namespace llvm { +class Instruction; +} + +namespace vecz { +class IRCleanup { +public: + /// @brief Mark the instruction as needing deletion. It will only be deleted + /// if it is unused. This is used to mark instructions with side-effects + /// (e.g. call, load, store and leaves) that have been replaced and are no + /// longer needed. 
Dead Code Elimination will not touch such instructions. + /// + /// @param[in] I Instruction to mark as needing deletion. + void deleteInstructionLater(llvm::Instruction *I); + + /// @brief Get rid of instructions that have been marked for deletion. + void deleteInstructions(); + + /// @brief Immediately delete an instruction, and replace all uses with undef + /// + /// @param[in] I Instruction to delete. + static void deleteInstructionNow(llvm::Instruction *I); + +private: + /// @brief Instructions that have been marked for deletion. + llvm::SmallPtrSet InstructionsToDelete; +}; + +} // namespace vecz + +#endif // VECZ_VECTORIZATION_UNIT_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h new file mode 100644 index 0000000000000..d4aafaa610cc5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/llvm_helpers.h @@ -0,0 +1,54 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief LLVM helper methods. + +#ifndef VECZ_LLVM_HELPERS_H_INCLUDED +#define VECZ_LLVM_HELPERS_H_INCLUDED + +#include +#include +#include +#include + +namespace vecz { + +/// @brief Determine if the value has vector type, and return it. +/// +/// @param[in] V Value to analyze. +/// +/// @return Vector type of V or null. +llvm::FixedVectorType *getVectorType(llvm::Value *V); + +/// @brief Get the default value for a type. +/// +/// @param[in] T Type to get default value of. +/// @param[in] V Default value to use for numeric type +/// +/// @return Default value, which will be poison for non-numeric types +llvm::Value *getDefaultValue(llvm::Type *T, uint64_t V = 0UL); + +/// @brief Get the shuffle mask as sequence of integers. +/// +/// @param[in] Shuffle Instruction +/// +/// @return Array of integers representing the Shuffle mask +llvm::ArrayRef getShuffleVecMask(llvm::ShuffleVectorInst *Shuffle); +} // namespace vecz + +#endif // VECZ_LLVM_HELPERS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h new file mode 100644 index 0000000000000..a02bb446174d4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/memory_operations.h @@ -0,0 +1,615 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Manipulation of memory operations like loads and stores. + +#ifndef VECZ_MEMORY_OPERATIONS_H_INCLUDED +#define VECZ_MEMORY_OPERATIONS_H_INCLUDED + +#include +#include +#include +#include + +#include + +namespace llvm { +class CallInst; +class LoadInst; +class StoreInst; +class Argument; +class Function; +class Instruction; +class Value; +class Type; +} // namespace llvm + +namespace vecz { + +class VectorizationContext; +struct UniformValueResult; + +/// @brief Return or declare a masked memory operation builtin function. +/// +/// @param[in] Ctx Context used to manipulate internal builtins. +/// @param[in] DataTy Loaded type or stored value type. +/// @param[in] PtrTy Pointer type. Must either be opaque or have its pointee +/// type match DataTy. +/// @param[in] Alignment Alignment of the operation. +/// @param[in] IsLoad true if defined a masked load, false if a masked store. +/// @param[in] IsVP true if defining a vector-predicated operation +/// +/// @return Masked builtin function. +llvm::Function *getOrCreateMaskedMemOpFn(VectorizationContext &Ctx, + llvm::Type *DataTy, + llvm::PointerType *PtrTy, + unsigned Alignment, bool IsLoad, + bool IsVP); + +/// @brief Create a call to a masked load operation builtin function. +/// +/// @param[in] Ctx Context used to retrieve the builtin function. +/// @param[in] Ty Type to load from memory. +/// @param[in] Ptr Pointer. Internally bitcast to point to Ty. +/// @param[in] Mask Mask. +/// @param[in] EVL vector length as i32, else null (full width operation). +/// @param[in] Alignment Alignment +/// @param[in] Name Name to give to the call instruction. +/// +/// @return Call instruction or null on error. +llvm::CallInst *createMaskedLoad(VectorizationContext &Ctx, llvm::Type *Ty, + llvm::Value *Ptr, llvm::Value *Mask, + llvm::Value *EVL, unsigned Alignment, + llvm::Twine Name = ""); + +/// @brief Create a call to a masked store operation builtin function. +/// +/// @param[in] Ctx Context used to retrieve the builtin function. +/// @param[in] Data Stored value. +/// @param[in] Ptr Pointer. Internally bitcast to pointer to Data's type. +/// @param[in] Mask Mask. +/// @param[in] EVL vector length as i32, else null (full width operation). +/// @param[in] Alignment Alignment +/// @param[in] Name Name to give to the call instruction. +/// +/// @return Call instruction or null on error. +llvm::CallInst *createMaskedStore(VectorizationContext &Ctx, llvm::Value *Data, + llvm::Value *Ptr, llvm::Value *Mask, + llvm::Value *EVL, unsigned Alignment, + llvm::Twine Name = ""); + +/// @brief Return or declare a (masked) interleaved memory operation builtin +/// function. + +/// @param[in] Ctx Context used to manipulate internal builtins. +/// @param[in] DataTy Loaded type or stored value type. +/// @param[in] PtrTy Pointer type. Must either be opaque or have its pointee +/// type match DataTy's element type. +/// @param[in] Stride The stride of the access. May be null in which case the +/// default stride is used. 
+/// @param[in] MaskTy The mask type. May be null for an unmasked operation. +/// @param[in] Alignment Alignment of the operation. +/// @param[in] IsLoad true if defining a load, false if defining a store. +/// @param[in] IsVP true if defining a vector-predicated operation +/// +/// @return (Masked) interleaved builtin function. +llvm::Function * +getOrCreateInterleavedMemOpFn(VectorizationContext &Ctx, llvm::Type *DataTy, + llvm::PointerType *PtrTy, llvm::Value *Stride, + llvm::Type *MaskTy, unsigned Alignment, + bool IsLoad, bool IsVP); + +/// @brief Create a call to a (masked) interleaved load builtin function. Also +/// known as a strided load. +/// +/// @param[in] Ctx Vectorization Context used to retrieve the builtin info. +/// @param[in] Ty Type to load from memory +/// @param[in] Ptr Pointer. Internally bitcast to a pointer to Ty's element +/// type. +/// @param[in] Stride The stride of the operation. May be null in which case +/// the default stride is used. +/// @param[in] Mask The mask controlling the operation. May be null in which +/// case an unmasked builtin is called. +/// @param[in] Alignment Alignment of the operation. +/// @param[in] Name Name to give to the call instruction. +/// +/// @return Call instruction or null on error. +llvm::CallInst *createInterleavedLoad(VectorizationContext &Ctx, llvm::Type *Ty, + llvm::Value *Ptr, llvm::Value *Stride, + llvm::Value *Mask, llvm::Value *EVL, + unsigned Alignment, + llvm::Twine Name = ""); + +/// @brief Create a call to a (masked) interleaved store builtin function. Also +/// known as a strided store. +/// +/// @param[in] Ctx Vectorization Context used to retrieve the builtin info. +/// @param[in] Data Data value to store to memory. +/// @param[in] Ptr Pointer. Internally bitcast to a pointer to Data's element +/// type. +/// @param[in] Stride The stride of the operation. May be null in which case +/// the default stride is used. +/// @param[in] Mask The mask controlling the operation. May be null in which +/// case an unmasked builtin is called. +/// @param[in] Alignment Alignment of the operation. +/// @param[in] Name Name to give to the call instruction. +/// +/// @return Call instruction or null on error. +llvm::CallInst *createInterleavedStore(VectorizationContext &Ctx, + llvm::Value *Data, llvm::Value *Ptr, + llvm::Value *Stride, llvm::Value *Mask, + llvm::Value *EVL, unsigned Alignment, + llvm::Twine Name = ""); + +/// @brief Return or declare a (masked) scatter/gather memory operation builtin +/// function. +/// +/// @param[in] Ctx Context used to manipulate internal builtins. +/// @param[in] DataTy Loaded type or stored value type. +/// @param[in] VecPtrTy Pointer type. Must be a vector of pointers, each of +/// which are either opaque or have a pointee type matching DataTy's element +/// type. +/// @param[in] MaskTy The mask type. May be null for an unmasked operation. +/// @param[in] Alignment Alignment of the operation. +/// @param[in] IsGather true if defining a gather (load), false if defining a +/// scatter (store). +/// @param[in] IsVP true if defining a vector-predicated operation +/// +/// @return Scatter/gather builtin function. +llvm::Function *getOrCreateScatterGatherMemOpFn(vecz::VectorizationContext &Ctx, + llvm::Type *DataTy, + llvm::VectorType *VecPtrTy, + llvm::Type *MaskTy, + unsigned Alignment, + bool IsGather, bool IsVP); + +/// @brief Create a call to a (masked) gather memory operation builtin +/// function. +/// +/// @param[in] Ctx Context used to retrieve the builtin function. 
+/// @param[in] Ty Type to load from memory. +/// @param[in] VecPtr Pointer value. Must be a vector of pointers, each of +/// which are either opaque or have a pointee type matching DataTy's element +/// type. +/// @param[in] Mask The predicate of the masked instruction. May be null in +/// which case an unmasked builtin is created. +/// @param[in] Alignment Alignment of the operation. +/// @param[in] EVL vector length as i32, else null (full width operation). +/// @param[in] Name Name to give to the call instruction. +/// +/// @return Call instruction or null on error. +llvm::CallInst *createGather(VectorizationContext &Ctx, llvm::Type *Ty, + llvm::Value *VecPtr, llvm::Value *Mask, + llvm::Value *EVL, unsigned Alignment, + llvm::Twine Name = ""); + +/// @brief Create a call to a (masked) scatter memory operation builtin +/// function. +/// +/// @param[in] Ctx Context used to retrieve the builtin function. +/// @param[in] VecData Value to store to memory. +/// @param[in] VecPtr Pointer value. Must be a vector of pointers, each of +/// which are either opaque or have a pointee type matching DataTy's element +/// type. +/// @param[in] Mask The predicate of the masked instruction. May be null in +/// which case an unmasked builtin is created. +/// @param[in] Alignment Alignment of the operation. +/// @param[in] EVL vector length as i32, else null (full width operation). +/// @param[in] Name Name to give to the call instruction. +/// +/// @return Call instruction or null on error. +llvm::CallInst *createScatter(VectorizationContext &Ctx, llvm::Value *VecData, + llvm::Value *VecPtr, llvm::Value *Mask, + llvm::Value *EVL, unsigned Alignment, + llvm::Twine Name = ""); + +/// @brief an enum to distinguish between loads and stores, and between builtin +/// memop calls and native IR memop instructions. +enum class MemOpKind : int { + /// @brief The object does not contain a valid memory operation. + Invalid = 0, + /// @brief The object contains a LLVM load instruction. + LoadInstruction, + /// @brief The object contains a LLVM store instruction. + StoreInstruction, + /// @brief The object contains a 'load-like' function call. + LoadCall, + /// @brief The object contains a 'store-like' function call. + StoreCall, +}; + +/// @brief an enum to distinguish between different memory access patterns +enum class MemOpAccessKind : int { + /// @brief The object does not represent a vecz memop call + Native = 0, + /// @brief The object represents a masked memory operation + Masked, + /// @brief The object represents an interleaved memory operation + Interleaved, + /// @brief The object represents a masked interleaved memory operation + MaskedInterleaved, + /// @brief The object represents a scatter/gather memory operation + ScatterGather, + /// @brief The object represents a masked scatter/gather memory operation + MaskedScatterGather, +}; + +struct MemOp; + +/// @brief Describes a memory operation such as a load or a store. +class MemOpDesc { + /// @brief Type of the data operand for stores, or memory type for loads. + llvm::Type *DataTy; + /// @brief Type of the pointer used to access memory. + llvm::Type *PtrTy; + /// @brief In the case of masked operations, type of the mask operand. + llvm::Type *MaskTy; + /// @brief Identifies the kind of memory operation which is performed. + MemOpKind Kind; + /// @brief Idenfities the kind of memory access pattern + MemOpAccessKind AccessKind; + /// @brief Whether or not the memory access is vector-length predicated. + bool IsVLOp; + /// @brief Memory alignment. 
+  unsigned Alignment;
+  /// @brief Distance between consecutive elements in memory, in number of
+  /// elements. Zero means uniform access, one means sequential access.
+  /// Negative values mean the access is done in reverse order.
+  llvm::Value *Stride;
+  /// @brief Index of the data operand, for stores, or negative value.
+  int8_t DataOpIdx;
+  /// @brief Index of the pointer operand.
+  int8_t PtrOpIdx;
+  /// @brief Index of the mask operand, for masked operations, or negative
+  /// value.
+  int8_t MaskOpIdx;
+  /// @brief Index of vector length operand, or negative value.
+  int8_t VLOpIdx;
+
+  friend struct MemOp;
+
+public:
+  /// @brief Create an invalid memory operation.
+  MemOpDesc();
+
+  bool isMaskedMemOp() const { return AccessKind == MemOpAccessKind::Masked; }
+  bool isInterleavedMemOp() const {
+    return AccessKind == MemOpAccessKind::Interleaved;
+  }
+  bool isMaskedInterleavedMemOp() const {
+    return AccessKind == MemOpAccessKind::MaskedInterleaved;
+  }
+  bool isScatterGatherMemOp() const {
+    return AccessKind == MemOpAccessKind::ScatterGather;
+  }
+  bool isMaskedScatterGatherMemOp() const {
+    return AccessKind == MemOpAccessKind::MaskedScatterGather;
+  }
+
+  /// @brief In the case of stores, return the data element being stored.
+  llvm::Value *getDataOperand(llvm::Function *F) const {
+    return getOperand(F, DataOpIdx);
+  }
+
+  /// @brief Return the pointer used by the memory operation.
+  llvm::Value *getPointerOperand(llvm::Function *F) const {
+    return getOperand(F, PtrOpIdx);
+  }
+
+  /// @brief In the case of a masked memory operation, return the mask.
+  llvm::Value *getMaskOperand(llvm::Function *F) const {
+    return getOperand(F, MaskOpIdx);
+  }
+
+  /// @brief In the case of a vector-length-predicated memory operation, return
+  /// the vector length.
+  llvm::Value *getVLOperand(llvm::Function *F) const {
+    return getOperand(F, VLOpIdx);
+  }
+
+  /// @brief Index of the data operand of the MemOp
+  /// @return The index, or -1 if no data operand
+  int8_t getDataOperandIndex() const { return DataOpIdx; }
+  /// @brief Index of the pointer operand of the MemOp
+  /// @return The index, or -1 if no pointer operand
+  int8_t getPointerOperandIndex() const { return PtrOpIdx; }
+  /// @brief Index of the mask operand of the MemOp
+  /// @return The index, or -1 if no mask operand
+  int8_t getMaskOperandIndex() const { return MaskOpIdx; }
+  /// @brief Index of the vector-length operand of the MemOp
+  /// @return The index, or -1 if no vector-length operand
+  int8_t getVLOperandIndex() const { return VLOpIdx; }
+
+  /// @brief Get what kind of memory operation this is.
+  /// @return The kind of the memory operation
+  MemOpKind getKind() const { return Kind; }
+
+  /// @brief Get the alignment of the memory operation.
+  /// @return The alignment in bytes
+  unsigned getAlignment() const { return Alignment; }
+
+  /// @brief In the case of an interleaved memory operation, return the stride.
+  /// @return The Value determining the stride
+  llvm::Value *getStride() const { return Stride; }
+  /// @brief Determine if the stride is an integer whose value can be determined
+  /// at compile time.
+  /// @return True if the stride is a compile-time integer constant
+  bool isStrideConstantInt() const;
+  /// @brief Get the stride as a constant int. It assumes that it is possible
+  /// and valid to do so.
+  /// @return The stride in elements
+  int64_t getStrideAsConstantInt() const;
+
+  /// @brief Return the type of data element being accessed in memory.
+  /// @return The type of the data element being accessed in memory.
+  llvm::Type *getDataType() const { return DataTy; }
+
+  /// @brief Return the type of the pointer operand.
+  /// @return The type of the pointer operand
+  llvm::Type *getPointerType() const { return PtrTy; }
+
+  /// @brief Return the specified operand from the function.
+  ///
+  /// @param[in] F Function to retrieve the operand from.
+  /// @param[in] OpIdx Index of the operand to retrieve.
+  ///
+  /// @return Operand or null.
+  llvm::Argument *getOperand(llvm::Function *F, int OpIdx) const;
+
+  /// @brief Determine whether the given function is a memory operation.
+  /// If that's the case, the descriptor is populated and returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is a memory operation.
+  /// std::nullopt otherwise.
+  static std::optional<MemOpDesc> analyzeMemOpFunction(llvm::Function &F);
+
+  /// @brief Determine whether the given function is a masked memory operation.
+  /// If that's the case, the descriptor is populated and returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is a masked memory operation.
+  /// std::nullopt otherwise.
+  static std::optional<MemOpDesc> analyzeMaskedMemOp(llvm::Function &F);
+
+  /// @brief Determine whether the given function is an interleaved memory
+  /// operation or not. If that's the case, the descriptor is populated and
+  /// returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is an interleaved memory
+  /// operation. std::nullopt otherwise.
+  static std::optional<MemOpDesc> analyzeInterleavedMemOp(llvm::Function &F);
+
+  /// @brief Determine whether the given function is a masked interleaved memory
+  /// operation or not. If that's the case, the descriptor is populated and
+  /// returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is a masked interleaved memory
+  /// operation. std::nullopt otherwise.
+  static std::optional<MemOpDesc>
+  analyzeMaskedInterleavedMemOp(llvm::Function &F);
+
+  /// @brief Determine whether the given function is a scatter/gather memory
+  /// operation or not. If that's the case, the descriptor is populated and
+  /// returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is a scatter/gather operation.
+  /// std::nullopt otherwise.
+  static std::optional<MemOpDesc> analyzeScatterGatherMemOp(llvm::Function &F);
+
+  /// @brief Determine whether the given function is a masked scatter/gather
+  /// memory operation or not. If that's the case, the descriptor is populated
+  /// and returned.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return A MemOpDesc if the given function is a masked scatter/gather
+  /// operation. std::nullopt otherwise.
+  static std::optional<MemOpDesc>
+  analyzeMaskedScatterGatherMemOp(llvm::Function &F);
+
+  /// @brief Determine whether the operation is a load or not.
+  bool isLoad() const {
+    switch (Kind) {
+    default:
+      return false;
+    case MemOpKind::LoadInstruction:
+    case MemOpKind::LoadCall:
+      return true;
+    }
+  }
+
+  /// @brief Determine whether the operation is a store or not.
+  bool isStore() const {
+    switch (Kind) {
+    default:
+      return false;
+    case MemOpKind::StoreInstruction:
+    case MemOpKind::StoreCall:
+      return true;
+    }
+  }
+
+  /// @brief Determine whether the operation is an instruction or not.
+  bool isLoadStoreInst() const {
+    switch (Kind) {
+    default:
+      return false;
+    case MemOpKind::LoadInstruction:
+    case MemOpKind::StoreInstruction:
+      return true;
+    }
+  }
+
+  bool isVLOp() const { return IsVLOp; }
+};
+
+/// @brief Wrapper that combines a memory operation descriptor and instruction.
+/// This allows manipulating different kinds of memory operations (load and
+/// store instructions, vecz builtins) in the same way.
+struct MemOp {
+  /// @brief Create an invalid memory operation.
+  MemOp() {}
+  /// @brief Create a memory operation from an instruction and an existing
+  /// memory operation descriptor.
+  ///
+  /// @param[in] I Memory instruction.
+  /// @param[in] Desc Memory operation descriptor.
+  MemOp(llvm::Instruction *I, const MemOpDesc &Desc);
+  /// @brief Create a memory operation from an instruction.
+  /// @param[in] I Instruction that may be a memory operation.
+  static std::optional<MemOp> get(llvm::Instruction *I);
+  /// @brief Create a memory operation from a memory builtin call instruction,
+  /// analyzed according to the given access kind.
+  ///
+  /// @param[in] CI Memory builtin call instruction.
+  /// @param[in] AccessKind the kind of access to consider
+  static std::optional<MemOp> get(llvm::CallInst *CI,
+                                  MemOpAccessKind AccessKind);
+
+  /// @brief Access the memory operation descriptor.
+  const MemOpDesc &getDesc() const { return Desc; }
+
+  /// @brief Access the memory operation descriptor.
+  MemOpDesc &getDesc() { return Desc; }
+
+  /// @brief Return the instruction that performs the memory operation.
+  llvm::Instruction *getInstr() const { return Ins; }
+
+  /// @brief Return the alignment in bytes.
+  unsigned getAlignment() const { return Desc.getAlignment(); }
+
+  /// @brief In the case of an interleaved memory operation, return the stride.
+  llvm::Value *getStride() const { return Desc.getStride(); }
+
+  /// @brief Return the type of data element being accessed in memory.
+  llvm::Type *getDataType() const { return Desc.getDataType(); }
+
+  /// @brief Return the type of the pointer operand.
+  llvm::Type *getPointerType() const { return Desc.getPointerType(); }
+
+  /// @brief Determine whether the operation is a load or not.
+  bool isLoad() const { return Desc.isLoad(); }
+
+  /// @brief Determine whether the operation is a store or not.
+  bool isStore() const { return Desc.isStore(); }
+
+  /// @brief Determine whether the operation is an instruction or not.
+  bool isLoadStoreInst() const { return Desc.isLoadStoreInst(); }
+
+  /// @brief Determine whether the operation is a masked memop call
+  bool isMaskedMemOp() const { return Desc.isMaskedMemOp(); }
+
+  /// @brief Determine whether the operation is a masked scatter/gather memop
+  /// call
+  bool isMaskedScatterGatherMemOp() const {
+    return Desc.isMaskedScatterGatherMemOp();
+  }
+
+  /// @brief Determine whether the operation is a masked interleaved memop call
+  bool isMaskedInterleavedMemOp() const {
+    return Desc.isMaskedInterleavedMemOp();
+  }
+
+  /// @brief In the case of stores, return the data element being stored.
+  /// @return Data operand or null.
+  llvm::Value *getDataOperand() const;
+  /// @brief Return the pointer used by the memory operation.
+  /// @return Pointer used by the memory operation or null for invalid
+  /// operations.
+  llvm::Value *getPointerOperand() const;
+  /// @brief In the case of a masked memory operation, return the mask.
+  /// @return Mask operand or null.
+  llvm::Value *getMaskOperand() const;
+
+  /// @brief In the case of stores, set the data element being stored.
+  /// @return true on success.
+  bool setDataOperand(llvm::Value *V);
+  /// @brief Set the pointer used by the memory operation.
+  /// @return true on success.
+  bool setPointerOperand(llvm::Value *V);
+  /// @brief In the case of a masked memory operation, set the mask.
+  /// @return true on success.
+  bool setMaskOperand(llvm::Value *V);
+
+  /// @brief In the case of a builtin memory operation, return the call.
+  /// @return Call instruction or null.
+  llvm::CallInst *getCall() const;
+
+  /// @brief Determine if the stride is an integer whose value can be determined
+  /// at compile time.
+  /// @return True if the stride is a compile-time integer constant
+  bool isStrideConstantInt() const { return Desc.isStrideConstantInt(); }
+  /// @brief Get the stride as a constant int. It assumes that it is possible
+  /// and valid to do so.
+  /// @return The stride in elements
+  int64_t getStrideAsConstantInt() const {
+    return Desc.getStrideAsConstantInt();
+  }
+
+private:
+  /// @brief Access an operand of the call instruction.
+  ///
+  /// @param[in] OpIdx Index of the operand to access.
+  ///
+  /// @return Specified operand of the call instruction.
+  llvm::Value *getCallOperand(int OpIdx) const;
+
+  /// @brief Set an operand of the call instruction.
+  ///
+  /// @param[in] OpIdx Index of the operand to access.
+  /// @param[in] V the Value to set
+  ///
+  /// @return true on success.
+  bool setCallOperand(int OpIdx, llvm::Value *V);
+
+  /// @brief Instruction that performs the memory operation.
+  llvm::Instruction *Ins = nullptr;
+  /// @brief Describes the memory operation.
+  MemOpDesc Desc;
+};
+
+namespace {
+inline llvm::ConstantInt *getSizeInt(llvm::IRBuilder<> &B, int64_t val) {
+  if (B.GetInsertBlock()->getModule()->getDataLayout().getPointerSize() == 4) {
+    return B.getInt32(val);
+  }
+  return B.getInt64(val);
+}
+
+inline llvm::IntegerType *getSizeTy(llvm::Module &M) {
+  if (M.getDataLayout().getPointerSize() == 4) {
+    return llvm::Type::getInt32Ty(M.getContext());
+  }
+  return llvm::Type::getInt64Ty(M.getContext());
+}
+
+inline llvm::IntegerType *getSizeTy(llvm::IRBuilder<> &B) {
+  return getSizeTy(*(B.GetInsertBlock()->getModule()));
+}
+} // namespace
+} // namespace vecz
+
+#endif // VECZ_MEMORY_OPERATIONS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
new file mode 100644
index 0000000000000..2ad2d60a3a78c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/offset_info.h
@@ -0,0 +1,268 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Analysis of memory pointer offsets.
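As an illustration of the offset categories this analysis distinguishes (a
sketch for orientation only, not part of the imported sources; `gid` stands in
for the work-item ID and `n` for an arbitrary uniform kernel argument):

```cpp
// How the analysis would classify typical pointer offsets.
void kernel(const float *p, float *q, int n, int gid) {
  float a = p[7];          // eOffsetConstant: compile-time constant
  float b = p[n];          // eOffsetUniformVariable: same for every work-item
  float c = p[3 * gid];    // eOffsetLinear: stride 3 in the work-item ID
  float d = p[gid * gid];  // eOffsetMayDiverge: no linear model exists
  q[gid] = a + b + c + d;  // eOffsetLinear: stride 1, sequential access
}
```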
+
+#ifndef VECZ_OFFSET_INFO_H_INCLUDED
+#define VECZ_OFFSET_INFO_H_INCLUDED
+
+#include 
+#include 
+
+namespace llvm {
+class CallInst;
+class Value;
+class Type;
+} // namespace llvm
+
+namespace vecz {
+
+struct UniformValueResult;
+class ValueTagMap;
+
+/// @brief Item ID dependence kinds that an expression can have.
+/// Note that these are all mutually exclusive.
+enum OffsetKind {
+  /// @brief The offset may diverge in unmodelled ways when vectorized. This
+  /// state is to be assumed unless it can be proved otherwise.
+  eOffsetMayDiverge,
+  /// @brief The offset is a compile-time constant.
+  eOffsetConstant,
+  /// @brief The offset is a uniform variable.
+  eOffsetUniformVariable,
+  /// @brief The offset has a work-item ID dependence. The ID might be scaled
+  /// by some stride != 1, in which case loads or stores dependent on it will
+  /// be interleaved.
+  eOffsetLinear
+};
+
+class StrideAnalysisResult;
+
+/// @brief Describes an offset used by a load or store instruction we want to
+/// vectorize.
+struct OffsetInfo {
+  /// @brief Properties of the offset, which may prevent vectorization.
+  OffsetKind Kind;
+  /// @brief The actual value of the analyzed expression.
+  llvm::Value *const ActualValue;
+  /// @brief The difference in this value between two consecutive work items,
+  /// as a constant integer.
+  /// When the stride is a pointer, the difference is in bytes.
+  int64_t StrideInt;
+  /// @brief The difference in this value between two consecutive work items,
+  /// as a uniform value.
+  /// When the stride is a pointer, the difference is in bytes.
+  /// This is nullptr after analysis and is set upon calling `manifest()`.
+  llvm::Value *ManifestStride;
+
+  /// @brief A bit mask indicating which bits of the value can possibly be set,
+  /// based on the expressions it depends on.
+  uint64_t BitMask;
+
+  /// @brief Construct a new offset information object from a general value.
+  /// @param[in] SAR The StrideAnalysisResult used to retrieve other
+  /// OffsetInfos.
+  /// @param[in] V Offset value to analyze.
+  OffsetInfo(StrideAnalysisResult &SAR, llvm::Value *V);
+
+  OffsetInfo() = delete;
+  OffsetInfo(const OffsetInfo &) = default;
+
+  /// @brief Return whether the offset has a non-analytical dependence on work
+  /// item ID.
+  bool mayDiverge() const { return Kind == eOffsetMayDiverge; }
+
+  /// @brief Return whether the offset has a linear dependence on work item ID.
+  bool hasStride() const { return Kind == eOffsetLinear; }
+
+  /// @brief Return whether the offset is a compile-time constant.
+  bool isConstant() const { return Kind == eOffsetConstant; }
+
+  /// @brief Return whether the offset has no dependence on work item ID.
+  bool isUniform() const {
+    return Kind == eOffsetConstant || Kind == eOffsetUniformVariable;
+  }
+
+  /// @brief Returns the actual value of the analyzed offset if it is uniform.
+  ///
+  /// @return The uniform Value or nullptr otherwise
+  llvm::Value *getUniformValue() const;
+  /// @brief Get the offset as a constant int. It assumes that it is possible to
+  /// do so.
+  /// @return The offset as an integer
+  int64_t getValueAsConstantInt() const;
+  /// @brief Get the Stride of the analyzed and manifested value.
+  /// @return The stride in number of elements
+  llvm::Value *getStride() const { return ManifestStride; }
+  /// @brief Determine whether the stride is simply a constant compile time
+  /// integer.
+  /// @return true if the stride is linear and constant, false otherwise.
+  bool isStrideConstantInt() const;
+  /// @brief Get the stride as a constant int.
+  /// @return The stride as an integer, or zero if the stride is not constant.
+  int64_t getStrideAsConstantInt() const;
+
+  /// @brief Convert the bytewise stride into an element-wise stride based on
+  /// the data type and data layout, as an integer.
+  ///
+  /// @param[in] PtrEleTy The element data type.
+  /// @param[in] DL The Data Layout.
+  /// @return The memory stride as number of elements.
+  uint64_t getConstantMemoryStride(llvm::Type *PtrEleTy,
+                                   const llvm::DataLayout *DL) const;
+
+  /// @brief Convert the bytewise stride into an element-wise stride based on
+  /// the data type and data layout, building instructions where needed. Note
+  /// that the stride must be manifested first.
+  ///
+  /// @param[in] B an IRBuilder used for creating constants or instructions.
+  /// @param[in] PtrEleTy The element data type.
+  /// @param[in] DL The Data Layout.
+  /// @return The memory stride as number of elements.
+  llvm::Value *buildMemoryStride(llvm::IRBuilder<> &B, llvm::Type *PtrEleTy,
+                                 const llvm::DataLayout *DL) const;
+
+  /// @brief Create Values that represent or compute strides.
+  ///
+  /// @param[in] B an IRBuilder used for creating constants or instructions.
+  /// @param[in] SAR The StrideAnalysisResult used to retrieve other
+  /// OffsetInfos.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &manifest(llvm::IRBuilder<> &B, StrideAnalysisResult &SAR);
+
+private:
+  /// @brief Mark this offset with the given flag.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &setKind(OffsetKind Kind);
+  /// @brief Mark this offset as having a stride component.
+  /// @param[in] Stride Stride component applied to the item ID.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &setStride(llvm::Value *Stride);
+  /// @brief Mark this offset as having a stride component.
+  /// @param[in] Stride Stride component applied to the item ID.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &setStride(int64_t Stride);
+  /// @brief Mark this offset as possibly diverging.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &setMayDiverge();
+
+  /// @brief Analyze the given integer offset for properties that we need to
+  /// know in order to vectorize loads and stores. In particular we are
+  /// interested in knowing whether the offset can diverge (be different for
+  /// different items) or not. We can handle divergence in several cases but not
+  /// all.
+  ///
+  /// @param[in] Offset Offset value to analyze.
+  /// @param[in] SAR Result of the stride analysis.
+  ///
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &analyze(llvm::Value *Offset, StrideAnalysisResult &SAR);
+
+  /// @brief Analyze the given pointer for properties that we need to
+  /// know in order to vectorize loads and stores. In particular we are
+  /// interested in knowing whether the offset can diverge (be different for
+  /// different items) or not. We can handle divergence in several cases but not
+  /// all.
+  ///
+  /// @param[in] Address Pointer to analyze.
+  /// @param[in] SAR Result of the stride analysis.
+  ///
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &analyzePtr(llvm::Value *Address, StrideAnalysisResult &SAR);
+
+  /// @brief Combine the offset info of LHS and RHS operands of an add
+  /// operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineAdd(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestAdd(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of a sub operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineSub(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestSub(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of an and
+  /// operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineAnd(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestAnd(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of an or operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineOr(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestOr(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                         const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of an xor
+  /// operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineXor(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestXor(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of a shl operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineShl(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestShl(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of an ashr
+  /// operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineAShr(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestAShr(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                           const OffsetInfo &RHS);
+
+  /// @brief Combine the offset info of LHS and RHS operands of a mul operation.
+  /// @param[in] LHS Offset info for the LHS operand.
+  /// @param[in] RHS Offset info for the RHS operand.
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &combineMul(const OffsetInfo &LHS, const OffsetInfo &RHS);
+  OffsetInfo &manifestMul(llvm::IRBuilder<> &B, const OffsetInfo &LHS,
+                          const OffsetInfo &RHS);
+
+  /// @brief Copies the stride information from another OffsetInfo into this
+  /// one.
+  /// @param[in] Other the other OffsetInfo to copy from
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &copyStrideFrom(const OffsetInfo &Other);
+
+  /// @brief Copies the stride and bitmask information from another OffsetInfo
+  /// into this one.
+  /// @param[in] Other the other OffsetInfo to copy from
+  /// @return Reference to the current object for chaining.
+  OffsetInfo &copyStrideAndBitMaskFrom(const OffsetInfo &Other);
+};
+
+} // namespace vecz
+
+#endif // VECZ_OFFSET_INFO_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h
new file mode 100644
index 0000000000000..2506c79921928
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/reachability.h
@@ -0,0 +1,116 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief A utility class to speed up reachability queries on a CFG
+
+#ifndef VECZ_REACHABILITY_H_INCLUDED
+#define VECZ_REACHABILITY_H_INCLUDED
+
+#include 
+#include 
+
+#include 
+
+namespace llvm {
+class BasicBlock;
+class DominatorTree;
+class Function;
+class LoopInfo;
+class PostDominatorTree;
+} // namespace llvm
+
+namespace vecz {
+
+/// @brief A data structure to handle reachability queries
+class Reachability {
+public:
+  /// @brief Construct the Reachability computation from a Dominator Tree
+  /// and a Post-Dominator Tree, which are used to speed up the queries.
+  /// @param[in] DT the Dominator Tree
+  /// @param[in] PDT the Post-Dominator Tree
+  /// @param[in] LI the Loop Info
+  Reachability(llvm::DominatorTree &DT, llvm::PostDominatorTree &PDT,
+               llvm::LoopInfo &LI);
+
+  /// @brief Destructor
+  ~Reachability() = default;
+
+  /// @brief Computes a new data structure from the given function's CFG,
+  /// overwriting any data that was already present.
+  ///
+  /// Back edges are disregarded during this process.
+  void recalculate(llvm::Function &F);
+
+  /// @brief Computes a new data structure from the given function's CFG,
+  /// only if the structure is currently empty. Otherwise, does nothing.
+  void update(llvm::Function &F);
+
+  /// @brief Clears the data structure.
+  ///
+  /// Updating the underlying CFG invalidates the Reachability computations,
+  /// so it is required to clear the data ready to accept a new CFG.
+  void clear();
+
+  /// @brief Checks the internal consistency of the computed data structure.
+  bool validate() const;
+
+  /// @brief Check if a block is reachable from another.
+  ///
+  /// @param[in] from the BasicBlock to start from
+  /// @param[in] to the BasicBlock we are trying to reach
+  ///
+  /// @return True if "to" is reachable from "from"
+  bool isReachable(llvm::BasicBlock *from, llvm::BasicBlock *to) const;
+
+private:
+  /// @brief Internal implementation of isReachable
+  ///
+  /// @param[in] from the graph node index to start from
+  /// @param[in] to the graph node index we are trying to reach
+  ///
+  /// @return True if "to" is reachable from "from"
+  bool isReachableImpl(size_t from, size_t to) const;
+
+  /// @brief The Dominator Tree
+  llvm::DominatorTree &DT;
+  /// @brief The Post-Dominator Tree
+  llvm::PostDominatorTree &PDT;
+  /// @brief The Loop Info, used to determine back-edges
+  llvm::LoopInfo &LI;
+
+  /// @brief Node structure containing implementation details
+  /// computed and used by the algorithm.
+  struct Rnode {
+    size_t X = 0;
+    size_t Y = 0;
+    size_t dom = 0;
+    size_t postDom = 0;
+    unsigned predTmp = 0;
+    unsigned predecessors = 0;
+    llvm::SmallVector<size_t> successors;
+  };
+
+  /// @brief The list of graph nodes that encode the graph.
+  std::vector<Rnode> graph;
+
+  /// @brief A mapping between BasicBlock pointers and graph node indices.
+  llvm::DenseMap<llvm::BasicBlock *, size_t> indexMap;
+};
+} // namespace vecz
+
+#endif // VECZ_REACHABILITY_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
new file mode 100644
index 0000000000000..40acd42336a0e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/simd_packet.h
@@ -0,0 +1,99 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief SIMD packets hold a value for each lane.
+
+#ifndef VECZ_SIMD_PACKET_H_INCLUDED
+#define VECZ_SIMD_PACKET_H_INCLUDED
+
+#include "debugging.h"
+
+namespace llvm {
+class Value;
+}
+
+namespace vecz {
+
+/// @brief Represents the status of lanes within a packet. The most common
+/// status would be that a lane can be either enabled or disabled.
+struct PacketMask {
+  /// @brief Create a new mask where all lanes are disabled.
+  explicit PacketMask() : Value(0) {}
+  /// @brief Create a new mask using an existing bit field.
+  explicit PacketMask(uint64_t Mask) : Value(Mask) {}
+
+  /// @brief Determine whether the given lane is enabled or not.
+  /// @param[in] Lane Index of the lane to test.
+  /// @return true if the lane is enabled, false otherwise.
+  bool isEnabled(unsigned Lane) const {
+    assert(Lane < CHAR_BIT * sizeof(Value) &&
+           "Invalid lane, possible mask overflow");
+    return (Value & (1ull << Lane)) != 0ull;
+  }
+
+  /// @brief Enable the given lane.
+  /// @param[in] Lane Index of the lane to enable.
+  void enable(unsigned Lane) {
+    assert(Lane < CHAR_BIT * sizeof(Value) &&
+           "Invalid lane, possible mask overflow");
+    Value |= (1ull << Lane);
+  }
+
+  /// @brief Disable the given lane.
+  /// @param[in] Lane Index of the lane to disable.
+  void disable(unsigned Lane) {
+    assert(Lane < CHAR_BIT * sizeof(Value) &&
+           "Invalid lane, possible mask overflow");
+    Value &= ~(1ull << Lane);
+  }
+  /// @brief Enable multiple lanes [0: NumLanes)
+  /// @param[in] NumLanes Number of lanes to enable.
+  void enableAll(unsigned NumLanes);
+
+  /// @brief Bit field that describes which lanes are enabled.
+  /// NOTE: The number of lanes is limited to sizeof(uint64_t) * CHAR_BIT (64).
+  uint64_t Value;
+};
+
+/// @brief Packet of LLVM values (e.g. instructions), one for each SIMD lane.
+struct SimdPacket : public llvm::SmallVector<llvm::Value *> {
+  using SmallVector::SmallVector;
+
+  /// @brief Return the value at the given index.
+  /// @param[in] Index Index of the value to return.
+  /// @return Value at the given index or null.
+  llvm::Value *at(unsigned Index) const;
+  /// @brief Set the value at the given index and enable the corresponding lane.
+  /// @param[in] Index Index of the value to set.
+  /// @param[in] V Value to store at the given index.
+  void set(unsigned Index, llvm::Value *V);
+  /// @brief Copy all enabled lanes from the other packet and update the mask.
+  /// @param[in] Other Packet to copy values from.
+  /// @return Reference to the current packet.
+  SimdPacket &update(const SimdPacket &Other);
+
+  /// @brief Bitmask of lanes that are 'enabled' in this packet.
+  /// This can mean different things depending on the context:
+  /// * By default, only lanes that are 'enabled' have a valid value.
+  /// * When scalarizing, only lanes that are 'enabled' will be scalarized.
+  PacketMask Mask;
+};
+
+} // namespace vecz
+
+#endif // VECZ_SIMD_PACKET_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h
new file mode 100644
index 0000000000000..15f848257d446
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/common_gep_elimination_pass.h
@@ -0,0 +1,56 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Remove duplicate GEP instructions.
+
+#ifndef VECZ_TRANSFORM_COMMON_GEP_ELIMINATION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_COMMON_GEP_ELIMINATION_PASS_H_INCLUDED
+
+#include 
+#include 
+
+namespace vecz {
+
+class VectorizationUnit;
+
+/// @brief This pass removes every duplicate GEP instruction before the
+/// packetization pass.
+class CommonGEPEliminationPass
+    : public llvm::PassInfoMixin<CommonGEPEliminationPass> {
+public:
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Remove duplicate GEP instructions.
+  ///
+  /// @param[in] F Function to optimize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() { return "Common GEP Elimination pass"; }
+
+private:
+  /// @brief Identifier for the pass.
+  static char PassID;
+};
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_COMMON_GEP_ELIMINATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
new file mode 100644
index 0000000000000..9cffc83720217
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/control_flow_conversion_pass.h
@@ -0,0 +1,155 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Control flow partial linearization transform.
+
+#ifndef VECZ_TRANSFORM_CONTROL_FLOW_CONVERSION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_CONTROL_FLOW_CONVERSION_PASS_H_INCLUDED
+
+#include 
+#include 
+
+#include 
+
+namespace llvm {
+class BasicBlock;
+class Function;
+class Instruction;
+class Value;
+class DominatorTree;
+class PostDominatorTree;
+class PreservedAnalyses;
+class LoopInfo;
+} // namespace llvm
+
+namespace vecz {
+struct BasicBlockTag;
+struct LoopTag;
+struct UniformValueResult;
+class DivergenceResult;
+class VectorizationUnit;
+class VectorizationContext;
+class Reachability;
+
+/// \addtogroup cfg-conversion Control Flow Conversion Stage
+/// @{
+/// \ingroup vecz
+
+/// @brief Pass that performs control-flow to data-flow conversion for
+/// a function.
+class ControlFlowConversionPass
+    : public llvm::PassInfoMixin<ControlFlowConversionPass> {
+public:
+  /// @brief Unique identifier for the pass.
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Perform control-flow to data-flow conversion on the function's CFG.
+  ///
+  /// @param[in] F Function to convert.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() {
+    return "Control flow to data flow conversion";
+  }
+
+private:
+  /// @brief Unique identifier for the pass.
+  static char PassID;
+};
+
+class ControlFlowConversionState {
+public:
+  /// @brief The actual implementation of this pass
+  class Impl;
+
+protected:
+  ControlFlowConversionState(llvm::Function &,
+                             llvm::FunctionAnalysisManager &AM);
+
+  /// @brief BOSCC (Branch On Superword Condition Code) data structure that
+  /// encloses regions of the CFG that contain blocks that need to be
+  /// duplicated.
+  class BOSCCGadget;
+
+  /// @brief ROSCC (Return On Superword Condition Code) utility class to
+  /// optimize conditional function return branches.
+  class ROSCCGadget;
+
+  llvm::Function &F;
+  llvm::FunctionAnalysisManager &AM;
+  VectorizationUnit &VU;
+  VectorizationContext &Ctx;
+  llvm::DominatorTree *DT = nullptr;
+  llvm::PostDominatorTree *PDT = nullptr;
+  llvm::LoopInfo *LI = nullptr;
+  DivergenceResult *DR = nullptr;
+  UniformValueResult *UVR = nullptr;
+  std::unique_ptr<BOSCCGadget> BOSCC;
+  std::unique_ptr<Reachability> RC;
+
+private:
+  struct MaskInfo {
+    /// @brief Mask that describes which lanes have exited the block.
+    llvm::SmallDenseMap exitMasks;
+    /// @brief Mask that describes which lanes are active at the start of the
+    /// basic block.
+    llvm::Instruction *entryMask = nullptr;
+  };
+  llvm::DenseMap<llvm::BasicBlock *, MaskInfo> MaskInfos;
+
+  /// @brief get the Mask Info struct for a Basic Block.
+  /// Note that the returned reference may be invalidated by subsequent calls.
+  ///
+  /// @param[in] BB the BasicBlock
+  /// @returns a reference to the MaskInfo
+  const MaskInfo &getMaskInfo(llvm::BasicBlock *BB) const {
+    const auto found = MaskInfos.find(BB);
+    assert(found != MaskInfos.end() &&
+           "Mask Info not constructed for Basic Block!");
+    return found->second;
+  }
+
+  /// @brief replaces reachable uses of a value
+  ///
+  /// @param[in] RC the reachability computation to use
+  /// @param[in] from the value to replace
+  /// @param[in] to the value to substitute
+  /// @param[in] src the basic block from which the value must be reachable
+  ///
+  /// @returns true
+  static bool replaceReachableUses(Reachability &RC, llvm::Instruction *from,
+                                   llvm::Value *to, llvm::BasicBlock *src);
+
+  /// @brief Generate a block ordering.
+  ///
+  /// This is based on a dominance-compact block indexing (DCBI) where we
+  /// topologically order blocks that belong to the same dominator tree.
+  ///
+  /// @returns true if no errors occurred.
+  bool computeBlockOrdering();
+};
+
+/// @}
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_CONTROL_FLOW_CONVERSION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h
new file mode 100644
index 0000000000000..bcd63aa00cac6
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/inline_post_vectorization_pass.h
@@ -0,0 +1,49 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Replace calls to certain builtins with an inline implementation after
+/// vectorization.
+
+#ifndef VECZ_TRANSFORM_INLINE_POST_VECTORIZATION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_INLINE_POST_VECTORIZATION_PASS_H_INCLUDED
+
+#include 
+
+namespace vecz {
+
+/// @brief This pass replaces calls to builtins that require special attention
+/// after vectorization.
+class InlinePostVectorizationPass
+    : public llvm::PassInfoMixin<InlinePostVectorizationPass> {
+public:
+  /// @brief Create a new pass object.
+  InlinePostVectorizationPass() {}
+
+  /// @brief The entry point to the pass.
+  /// @param[in,out] F Function to optimize.
+  /// @param[in,out] AM FunctionAnalysisManager providing analyses.
+  /// @returns The analyses preserved by the pass.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+  /// @brief Retrieve the pass's name.
+  /// @return The pass name.
+  static llvm::StringRef name() { return "Inline Post Vectorization pass"; }
+};
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_INLINE_POST_VECTORIZATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h
new file mode 100644
index 0000000000000..ce9140ad64586
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/instantiation_pass.h
@@ -0,0 +1,113 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function instantiator.
+
+#ifndef VECZ_TRANSFORM_INSTANTIATION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_INSTANTIATION_PASS_H_INCLUDED
+
+#include 
+#include 
+
+namespace vecz {
+
+class Packetizer;
+class VectorizationContext;
+class PacketRange;
+struct MemOp;
+
+/// @brief Instantiation pass where instructions that need it (vector or not)
+/// are instantiated (i.e. duplicated with lane ID substitution), starting from
+/// the leaves.
+class InstantiationPass {
+public:
+  /// @brief Create a new instantiation pass.
+  ///
+  /// @param[in] PP The packetizer object to call back to when required.
+  InstantiationPass(Packetizer &PP);
+
+  /// @brief Instantiate the given value from the function.
+  /// The returned value is equivalent to a clone of the V 'expression' with any
+  /// work-item ID (e.g. from get_global_id) adjusted with the lane's ID.
+  ///
+  /// @param[in] V Value to instantiate.
+  ///
+  /// @return Instantiated value.
+  PacketRange instantiate(llvm::Value *V);
+
+private:
+  /// @brief Duplicates an instruction across all SIMD Lanes.
+  ///
+  /// @param[in] I The instruction to duplicate across lanes
+  ///
+  /// @return The SIMD Packet
+  PacketRange instantiateByCloning(llvm::Instruction *I);
+  /// @brief Broadcasts an instruction across all SIMD Lanes.
+  ///
+  /// @param[in] I The instruction to extract elements from
+  ///
+  /// @return The SIMD Packet
+  PacketRange simdBroadcast(llvm::Instruction *I);
+  /// @brief Instantiate the given value from the function.
+  /// The returned value is equivalent to a clone of the V 'expression' with any
+  /// work-item ID (e.g. from get_global_id) adjusted with the lane's ID.
+  ///
+  /// @param[in] V Value to instantiate.
+  ///
+  /// @return Instantiated value.
+  PacketRange instantiateInternal(llvm::Value *V);
+  /// @brief Instantiate the given instruction from the function.
+  /// The returned value is equivalent to a clone of the V 'expression' with any
+  /// work-item ID (e.g. from get_global_id) adjusted with the lane's ID.
+  ///
+  /// @param[in] Ins instruction to instantiate.
+  ///
+  /// @return Instantiated value.
+  PacketRange instantiateInstruction(llvm::Instruction *Ins);
+  /// @brief Perform post-instantiation tasks.
+  ///
+  /// @param[in] P Packet that is the result of instantiation or null.
+  /// @param[in] V Value that was instantiated.
+  ///
+  /// @return Instantiated packet or null.
+  PacketRange assignInstance(const PacketRange P, llvm::Value *V);
+  /// @brief Create a packet where all lanes contain the same value.
+  ///
+  /// @param[in] V Value to broadcast.
+  ///
+  /// @return Packet with the broadcasted value.
+  PacketRange broadcast(llvm::Value *V);
+  /// @brief Instantiate a call instruction.
+  ///
+  /// @param[in] CI Instruction to instantiate.
+  ///
+  /// @return Instantiated packet for the given instruction.
+  PacketRange instantiateCall(llvm::CallInst *CI);
+  /// @brief Instantiate an alloca instruction.
+  ///
+  /// @param[in] Alloca Instruction to instantiate.
+  ///
+  /// @return Instantiated packet for the given instruction.
+  PacketRange instantiateAlloca(llvm::AllocaInst *Alloca);
+
+  VectorizationContext &Ctx;
+  Packetizer &packetizer;
+};
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_INSTANTIATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h
new file mode 100644
index 0000000000000..ae6deb613826c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/interleaved_group_combine_pass.h
@@ -0,0 +1,94 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Combine groups of interleaved memory operations.
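The access pattern this pass targets can be modelled in plain scalar code. In
the illustrative sketch below (not from the sources), four stride-4 loads with
constant offsets 0..3 form a group that collectively reads a contiguous block,
so the pass can replace them with one wide load plus element shuffles:

```cpp
#include <cstddef>

struct RGBA {
  float r, g, b, a;
};

// Four interleaved (stride-4) loads forming a combinable group: together
// they read the contiguous range p[4*i] .. p[4*i + 3].
void deinterleave(const float *p, std::size_t i, RGBA &out) {
  out.r = p[4 * i + 0];  // stride 4, offset 0
  out.g = p[4 * i + 1];  // stride 4, offset 1
  out.b = p[4 * i + 2];  // stride 4, offset 2
  out.a = p[4 * i + 3];  // stride 4, offset 3
}
```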
+
+#ifndef VECZ_TRANSFORM_INTERLEAVED_GROUP_COMBINE_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_INTERLEAVED_GROUP_COMBINE_PASS_H_INCLUDED
+
+#include 
+
+#include "analysis/uniform_value_analysis.h"
+#include "vecz/vecz_target_info.h"
+
+namespace llvm {
+class ScalarEvolution;
+}
+
+namespace vecz {
+
+class VectorizationUnit;
+
+/// @brief Combine groups of interleaved memory operations.
+class InterleavedGroupCombinePass
+    : public llvm::PassInfoMixin<InterleavedGroupCombinePass> {
+public:
+  /// @brief Create a new pass object.
+  ///
+  /// @param[in] kind Kind of interleaved operation to combine.
+  InterleavedGroupCombinePass(InterleavedOperation kind)
+      : Kind(kind), scalarEvolution(nullptr) {}
+
+  /// @brief Unique identifier for the pass.
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Combine groups of interleaved operations.
+  ///
+  /// @param[in] F Function to analyze.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() {
+    return "Combine interleaved memory instructions";
+  }
+
+private:
+  /// @brief Information about an interleaved operation.
+  struct InterleavedOpInfo;
+
+  /// @brief Information about a group of interleaved operations.
+  struct InterleavedGroupInfo;
+
+  /// @brief Try to find a group of interleaved instructions that have the same
+  /// stride and collectively access a consecutive chunk of memory.
+  ///
+  /// @param[in] Ops List of interleaved operations to analyze.
+  /// @param[in] UVR Result of uniform value analysis.
+  /// @param[out] Info information about a group of interleaved instructions.
+  ///
+  /// @return true if a group was found or false otherwise.
+  bool findGroup(const std::vector<InterleavedOpInfo> &Ops,
+                 UniformValueResult &UVR, InterleavedGroupInfo &Info);
+
+  /// @brief Unique identifier for the pass.
+  static char PassID;
+  /// @brief Kind of interleaved operation to combine.
+  InterleavedOperation Kind;
+
+  /// @brief Scalar Evolution Analysis that allows us to subtract two pointers
+  /// to find any constant offset between them.
+  llvm::ScalarEvolution *scalarEvolution;
+};
+
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_INTERLEAVED_GROUP_COMBINE_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
new file mode 100644
index 0000000000000..1c9cfe79dac53
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_helpers.h
@@ -0,0 +1,261 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function packetizer helper classes.
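One of the helpers declared below, createSubSplats, produces the
"AAAABBBBCCCCDDDD" expansion documented on it. A minimal sketch of the same
effect for a single fixed-width vector, written with a plain shufflevector
(the standalone function is an assumption for illustration; the real helper
also applies target-specific optimizations and operates on whole packets):

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"

// Sub-splat a <4 x T> value "ABCD" into a <16 x T> "AAAABBBBCCCCDDDD" by
// building the shuffle mask {0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3}.
llvm::Value *subSplat4x4(llvm::IRBuilder<> &B, llvm::Value *src) {
  llvm::SmallVector<int, 16> mask;
  for (int elt = 0; elt < 4; ++elt)
    for (int rep = 0; rep < 4; ++rep)
      mask.push_back(elt);  // each element repeated subWidth (4) times
  return B.CreateShuffleVector(src, mask, "subsplat");
}
```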
+
+#ifndef VECZ_TRANSFORM_PACKETIZATION_HELPERS_H_INCLUDED
+#define VECZ_TRANSFORM_PACKETIZATION_HELPERS_H_INCLUDED
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include "debugging.h"
+
+namespace llvm {
+class Value;
+class ShuffleVectorInst;
+class Twine;
+} // namespace llvm
+
+namespace vecz {
+class TargetInfo;
+struct SimdPacket;
+
+/// @brief Determines the insertion point after the value V. If V has a position
+/// in the function (e.g., an Instruction), this method will return an
+/// IRBuilder set to the next point after that. If V has no position (e.g., a
+/// Constant or an Argument) then this method will return an IRBuilder set to a
+/// suitable insertion point at the beginning of the function.
+///
+/// @param[in] V Value to insert instructions after, if an llvm::Instruction.
+/// @param[in] F Function to insert instructions into, if V is not an
+/// llvm::Instruction.
+/// @param[in] IsPhi true if the instructions to insert are phis, false if the
+/// insertion point should be after all phis in the basic block.
+///
+/// @return IRBuilder set to a suitable insertion point.
+llvm::IRBuilder<> buildAfter(llvm::Value *V, llvm::Function &F,
+                             bool IsPhi = false);
+
+/// @brief Utility function for building a shufflevector instruction, absorbing
+/// its operands where possible.
+///
+/// @param[in] B IRBuilder to build any new instruction created
+/// @param[in] srcA the first vector operand of the new shuffle
+/// @param[in] srcB the second vector operand of the new shuffle
+/// @param[in] mask the shuffle mask
+/// @param[in] name the name of the new instruction
+///
+/// @return a value identical to the requested shufflevector
+llvm::Value *createOptimalShuffle(llvm::IRBuilder<> &B, llvm::Value *srcA,
+                                  llvm::Value *srcB,
+                                  const llvm::SmallVectorImpl<int> &mask,
+                                  const llvm::Twine &name = llvm::Twine());
+
+/// @brief Utility function for splatting a vector of scalars to create a
+/// "vector of vectors", being the concatenation of vector splats of its
+/// elements, e.g. subSplat("ABCD", 4) == "AAAABBBBCCCCDDDD"
+///
+/// Only works on fixed vector types.
+///
+/// @param[in] TI TargetInfo for target-dependent optimizations
+/// @param[in] B IRBuilder to build any new instructions created
+/// @param[in,out] srcs The packet of vectors to sub-splat
+/// @param[in] subWidth The width of the individual splats
+///
+/// @return true on success
+bool createSubSplats(const vecz::TargetInfo &TI, llvm::IRBuilder<> &B,
+                     llvm::SmallVectorImpl<llvm::Value *> &srcs,
+                     unsigned subWidth);
+
+/// @brief Utility function for creating a reduction operation.
+///
+/// The value must be a vector.
+///
+/// If VL is passed and is non-null, it is assumed to be the i32 value
+/// representing the active vector length. The reduction will be
+/// vector-predicated according to this length.
+///
+/// Only works on RecurKind::And, Or, Xor, Add, Mul, FAdd, FMul, {S,U,F}Min,
+/// {S,U,F}Max.
+llvm::Value *createMaybeVPReduction(llvm::IRBuilderBase &B, llvm::Value *Val,
+                                    llvm::RecurKind Kind,
+                                    llvm::Value *VL = nullptr);
+
+/// @brief Utility function to obtain an indices vector to be used in a gather
+/// operation.
+///
+/// When accessing a vector using an indices vector, this must be
+/// modified taking into account the SIMD width.
+///
+/// @return An indices vector to be used in a gather operation; nullptr for LLVM
+/// version < 13.
+///
+/// @param[in] B IRBuilder to build any new instructions created
+/// @param[in] Indices Original indices vector
+/// @param[in] Ty Type of the output vector
+/// @param[in] FixedVecElts Original vector length
+/// @param[in] N Name of the output variable
+llvm::Value *getGatherIndicesVector(llvm::IRBuilder<> &B, llvm::Value *Indices,
+                                    llvm::Type *Ty, unsigned FixedVecElts,
+                                    const llvm::Twine &N = "");
+
+/// @brief Returns a boolean vector with all elements set to 'true'.
+llvm::Value *createAllTrueMask(llvm::IRBuilderBase &B, llvm::ElementCount EC);
+
+/// @brief Returns an integer step vector, representing the sequence 0 ... N-1.
+llvm::Value *createIndexSequence(llvm::IRBuilder<> &Builder,
+                                 llvm::VectorType *VecTy,
+                                 const llvm::Twine &Name = "");
+
+/// @brief Class that represents a range in a vector of Value pointers.
+/// The range is represented by its integer starting index and length, so that
+/// it remains valid if the vector re-allocates its storage.
+class PacketRange {
+public:
+  using value_type = llvm::Value *;
+  using iterator = value_type *;
+  using const_iterator = const value_type *;
+  using reference = value_type &;
+  using const_reference = const value_type &;
+
+  /// @brief Construct an empty range
+  constexpr PacketRange(std::vector<llvm::Value *> &d)
+      : data(d), start(0), length(0) {}
+  /// @brief Construct a range with given start index and length
+  constexpr PacketRange(std::vector<llvm::Value *> &d, size_t s, size_t l)
+      : data(d), start(s), length(l) {}
+
+  /// @brief Copy constructor
+  constexpr PacketRange(const PacketRange &) = default;
+  /// @brief Move constructor
+  constexpr PacketRange(PacketRange &&) = default;
+  /// @brief Destructor
+  ~PacketRange() = default;
+
+  /// @brief Return the length of the range
+  size_t size() const { return length; }
+  /// @brief Standard container begin iterator
+  iterator begin() { return &*data.begin() + start; }
+  /// @brief Standard container begin const iterator
+  const_iterator begin() const { return &*data.begin() + start; }
+  /// @brief Standard container end iterator
+  iterator end() { return begin() + length; }
+  /// @brief Standard container end const iterator
+  const_iterator end() const { return begin() + length; }
+  /// @brief Return a reference to the element at given index
+  reference at(size_t i) { return data[start + i]; }
+  /// @brief Return a const reference to the element at given index
+  const_reference at(size_t i) const { return data[start + i]; }
+  /// @brief Return a reference to the element at given index
+  reference operator[](size_t i) { return at(i); }
+  /// @brief Return a const reference to the element at given index
+  const_reference operator[](size_t i) const { return at(i); }
+  /// @brief Return a reference to the first element in the range
+  reference front() { return data[start]; }
+  /// @brief Return a const reference to the first element in the range
+  const_reference front() const { return data[start]; }
+  /// @brief Return a reference to the last element in the range
+  reference back() { return data[start + length - 1]; }
+  /// @brief Return a const reference to the last element in the range
+  const_reference back() const { return data[start + length - 1]; }
+
+  /// @brief Convert to bool
+  /// @returns false if length is zero, true otherwise
+  operator bool() const { return length != 0; }
+
+private:
+  std::vector<llvm::Value *> &data;
+  const size_t start;
+  const size_t length;
+};
+
+/// @brief Structure to hold the strategy-agnostic result of packetizing an
+/// instruction (i.e. can represent either a vectorized or an instantiated
can represent either a vectorized or an instantiated +/// value) that enables the result to be converted on demand. +struct PacketInfo { + /// @brief The number of instances created during packetization + unsigned numInstances = 0; + + /// @brief Vectorized value. Each element in the vector represents a scalar + /// instance (SIMD lane). + llvm::Value *vector = nullptr; + + /// @brief Map of vector widths to packet range start indices + llvm::SmallDenseMap packets; + + /// @brief Default constructor + PacketInfo() = default; + /// @brief Deleted copy constructor + PacketInfo(const PacketInfo &) = delete; + /// @brief Move constructor + PacketInfo(PacketInfo &&) = default; + /// @brief Destructor + ~PacketInfo() = default; + /// @brief Deleted copy assignment operator + PacketInfo &operator=(const PacketInfo &) = delete; + /// @brief Move assignment operator + PacketInfo &operator=(PacketInfo &&) = default; + + /// @brief get the range of values for a given packet width + PacketRange getRange(std::vector &d, unsigned width) const; + + /// @brief get the range of values for the originally created packet. + PacketRange getRange(std::vector &d) const { + return getRange(d, numInstances); + } +}; + +inline llvm::Type *getWideType(llvm::Type *ty, llvm::ElementCount factor) { + if (!ty->isVectorTy()) { + // The wide type of a struct literal is the wide type of each of its + // elements. + if (auto *structTy = llvm::dyn_cast(ty); + structTy && structTy->isLiteral()) { + llvm::SmallVector wideElts(structTy->elements()); + for (unsigned i = 0, e = wideElts.size(); i != e; i++) { + wideElts[i] = getWideType(wideElts[i], factor); + } + return llvm::StructType::get(ty->getContext(), wideElts); + } else if (structTy) { + VECZ_ERROR("Can't create wide type for structure type"); + } + return llvm::VectorType::get(ty, factor); + } + const bool isScalable = llvm::isa(ty); + assert((!factor.isScalable() || !isScalable) && + "Can't widen a scalable vector by a scalable amount"); + auto *vecTy = llvm::cast(ty); + const unsigned elts = vecTy->getElementCount().getKnownMinValue(); + // If we're widening a scalable type then set the fixed factor to scalable + // here. + if (isScalable && !factor.isScalable()) { + factor = llvm::ElementCount::getScalable(factor.getKnownMinValue()); + } + ty = vecTy->getElementType(); + return llvm::VectorType::get(ty, factor * elts); +} +} // namespace vecz + +#endif // VECZ_TRANSFORM_PACKETIZATION_HELPERS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h new file mode 100644 index 0000000000000..fb5b49bc106ba --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetization_pass.h @@ -0,0 +1,77 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function packetizer.
+
+#ifndef VECZ_TRANSFORM_PACKETIZATION_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_PACKETIZATION_PASS_H_INCLUDED
+
+#include
+
+namespace vecz {
+
+class VectorizationUnit;
+
+/// \addtogroup packetization Packetization Stage
+/// @{
+/// \ingroup vecz
+
+/// @brief Vectorization pass where scalar instructions that need it are
+/// packetized, starting from leaves.
+class PacketizationPass : public llvm::PassInfoMixin<PacketizationPass> {
+public:
+  /// @brief Create a new packetization pass object.
+  PacketizationPass() = default;
+
+  /// @brief Move-construct a packetization pass object.
+  ///
+  /// @param[in] P Pass to move.
+  PacketizationPass(PacketizationPass &&P) = default;
+
+  // Mark the copy constructor as deleted.
+  PacketizationPass(const PacketizationPass &) = delete;
+
+  /// @brief Deleted move assignment operator.
+  ///
+  /// Also deletes the copy assignment operator.
+  PacketizationPass &operator=(PacketizationPass &&) = delete;
+
+  /// @brief Unique identifier for the pass.
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Packetize the given function, duplicating its behaviour (defined
+  /// values and side effects) for each lane of a SIMD packet.
+  ///
+  /// @param[in] F Function to packetize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() { return "Function packetization"; }
+
+  /// @brief Unique identifier for the pass.
+  static char PassID;
+};
+
+/// @}
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_PACKETIZATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h
new file mode 100644
index 0000000000000..4e9ff96a07e56
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/packetizer.h
@@ -0,0 +1,234 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function packetizer.
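+///
+/// An illustrative usage sketch (not part of the original header): the static
+/// entry point Packetizer::packetize declared below might be driven like
+/// this, assuming a populated llvm::FunctionAnalysisManager `FAM`:
+/// @code{.cpp}
+/// // Packetize F by a fixed factor of 4 along dimension x (0).
+/// bool Changed = vecz::Packetizer::packetize(
+///     F, FAM, llvm::ElementCount::getFixed(4), /*Dim=*/0);
+/// @endcode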
+ +#ifndef VECZ_TRANSFORM_PACKETIZER_H_INCLUDED +#define VECZ_TRANSFORM_PACKETIZER_H_INCLUDED + +#include +#include +#include +#include +#include +#include + +#include + +#include "ir_cleanup.h" +#include "transform/packetization_helpers.h" + +namespace vecz { + +struct MemOp; +class InstantiationPass; +class PacketizationAnalysisResult; +class StrideAnalysisResult; +struct UniformValueResult; +class VectorizationUnit; +class VectorizationContext; +class VectorizationChoices; + +/// \addtogroup packetization Packetization Stage +/// @{ +/// \ingroup vecz + +/// @brief The implementation of the packetization process +class Packetizer { +public: + class Result { + friend class Packetizer; + + public: + Result() = delete; + Result(const Result &) = default; + constexpr Result(Result &&) = default; + + Result(Packetizer &p) : packetizer(p), scalar(nullptr), info(nullptr) {} + Result(Packetizer &p, llvm::Value *s, PacketInfo *i) + : packetizer(p), scalar(s), info(i) {} + + operator bool() const { return info; } + + /// @brief Get a packetized/instantiated instruction as a vector value. + /// If the value was instantiated, this will construct and return a gather + /// of the SIMD lanes. + /// + /// @return Packetized value + llvm::Value *getAsValue() const; + + /// @brief Get a packetized/instantiated instruction as a SIMD packet. + /// If the value was packetized, this will construct a new packet by + /// extracting the elements. + /// + /// @param[in] width the width of the packet to get. + /// + /// @return Instantiated packet + PacketRange getAsPacket(unsigned width) const; + + /// @brief Get a copy of all the Values from the vector or packet, as + /// the width it was originally packetized to. + /// + /// @param[out] vals a vector of Values representing the result. + void getPacketValues(llvm::SmallVectorImpl &vals) const; + + /// @brief Get a copy of all the Values from the vector or packet. + /// When `width == 1` this will return a length-1 result containing the + /// vector valued result. Otherwise, it copies the values from the + /// packet of the requested width. + /// + /// @param[in] width the width of the packet to get. + /// @param[out] vals a vector of Values representing the result. + void getPacketValues(unsigned width, + llvm::SmallVectorImpl &vals) const; + + private: + Packetizer &packetizer; + llvm::Value *const scalar; + PacketInfo *const info; + + PacketRange createPacket(unsigned width) const; + PacketRange getRange(unsigned width) const; + PacketRange widen(unsigned width) const; + PacketRange narrow(unsigned width) const; + const Result &broadcast(unsigned width) const; + }; + + /// @brief Packetize the given function, duplicating its behaviour (defined + /// values and side effects) for each lane of a SIMD packet. + /// + /// @param[in] F Function to packetize. + /// @param[in] AM FunctionAnalysisManager providing analyses. + /// @param[in] Width the vectorization factor + /// @param[in] Dim the vectorization dimension + /// + /// @return true if the function was packetized, false otherwise. + static bool packetize(llvm::Function &F, llvm::FunctionAnalysisManager &AM, + llvm::ElementCount Width, unsigned Dim); + + /// @brief Packetize the given value from the function. + /// + /// @param[in] V Value to packetize. + /// + /// @return Packetized value. + Result packetize(llvm::Value *V); + + /// @brief Return an already packetized value. + /// + /// @param[in] V Value to query. + /// + /// @return Packetized value or nullptr. 
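+///
+/// A hypothetical caller-side sketch (illustrative, not from the original
+/// source; `P` is an assumed Packetizer instance), using the Result
+/// conversion helpers declared above:
+/// @code{.cpp}
+/// if (Packetizer::Result R = P.getPacketized(V)) {
+///   llvm::Value *Wide = R.getAsValue();  // vector form of the scalar V
+/// }
+/// @endcode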
+  Result getPacketized(llvm::Value *V);
+
+  /// @brief Create a new SIMD packet to hold an instantiated value.
+  ///
+  /// @param[in] V the value the packet will represent
+  /// @param[in] width the SIMD width of the packet
+  ///
+  /// @returns a new packet
+  PacketRange createPacket(llvm::Value *V, unsigned width);
+
+  /// @brief Get the Uniform Value Result
+  ///
+  /// @return the Uniform Value Result
+  const UniformValueResult &uniform() const { return UVR; }
+
+  /// @brief get the vectorization factor.
+  llvm::ElementCount width() const { return SimdWidth; }
+
+  /// @brief get the vectorization dimension.
+  unsigned dimension() const { return Dimension; }
+
+  /// @brief get the function being packetized
+  llvm::Function &function() { return F; }
+
+  /// @brief get the Vectorization Context
+  VectorizationContext &context() { return Ctx; }
+
+  /// @brief get the Vectorization Choices
+  const VectorizationChoices &choices() const { return Choices; }
+
+  /// @brief get an empty packet range
+  PacketRange getEmptyRange() { return PacketRange(packetData); }
+
+  /// @brief mark the instruction for deletion when packetization finishes
+  void deleteInstructionLater(llvm::Instruction *I) {
+    IC.deleteInstructionLater(I);
+  }
+
+private:
+  Packetizer(llvm::Function &, llvm::FunctionAnalysisManager &AM,
+             llvm::ElementCount Width, unsigned Dim);
+  Packetizer() = delete;
+  Packetizer(const Packetizer &) = delete;
+  Packetizer(Packetizer &&) = delete;
+  ~Packetizer() = default;
+
+  llvm::FunctionAnalysisManager &AM;
+  VectorizationUnit &VU;
+  VectorizationContext &Ctx;
+  const VectorizationChoices &Choices;
+  UniformValueResult &UVR;
+  StrideAnalysisResult &SAR;
+  PacketizationAnalysisResult &PAR;
+  llvm::Function &F;
+  IRCleanup IC;
+
+  /// @brief Vectorization factor
+  llvm::ElementCount SimdWidth;
+
+  /// @brief Vectorization dimension
+  unsigned Dimension;
+
+  /// @brief Map onto packetized versions of scalar values
+  llvm::DenseMap<llvm::Value *, PacketInfo> packets;
+
+  /// @brief Central storage for all the packetized values
+  ///
+  /// This vector is a contiguous storage for all the wide packets created
+  /// during the packetization process. New packets get allocated to a
+  /// range at the end of the vector, and are referenced by index so that
+  /// they are not invalidated when the storage is re-allocated. Vector
+  /// elements will never be erased during packetization, and the data will
+  /// not be cleared until the packetizer itself is destroyed.
+  /*
+               /^ ^\
+    "No take" / 0 0 \
+              V\ Y /V  */
+  std::vector<llvm::Value *> packetData;
+  /*             |  \
+                || (__V  "ONLY GROW"
+  */
+
+  /// @brief The value representing the current (dynamic) active vector length
+  /// for this kernel. This value is the *base* vector length for one scalar
+  /// work-item; vector operations must be scaled according to their vector
+  /// width.
+  /// If non-null, packetized operations are required to respect this active
+  /// length if they would produce side effects.
+  llvm::Value *VL = nullptr;
+
+  /// @brief This class contains the private implementation of the packetizer.
+  /// Declaring it as an inner class of the Packetizer class allows it access
+  /// to its private members (including its constructor).
+  class Impl;
+};
+
+/// @}
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_PACKETIZER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
new file mode 100644
index 0000000000000..bbc9cd6428a2c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/passes.h
@@ -0,0 +1,209 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Factory functions for some Vecz support passes
+
+#ifndef VECZ_TRANSFORM_PASSES_H_INCLUDED
+#define VECZ_TRANSFORM_PASSES_H_INCLUDED
+
+#include
+#include
+
+namespace compiler {
+namespace utils {
+class BuiltinInfo;
+} // namespace utils
+} // namespace compiler
+
+namespace vecz {
+class SimplifyInfiniteLoopPass
+    : public llvm::PassInfoMixin<SimplifyInfiniteLoopPass> {
+public:
+  SimplifyInfiniteLoopPass() = default;
+
+  llvm::PreservedAnalyses run(llvm::Loop &L, llvm::LoopAnalysisManager &,
+                              llvm::LoopStandardAnalysisResults &,
+                              llvm::LPMUpdater &);
+};
+
+/// @brief This pass replaces calls to builtins that require special attention
+/// (e.g. there is no scalar or vector equivalent) with inline implementations.
+class BuiltinInliningPass : public llvm::PassInfoMixin<BuiltinInliningPass> {
+public:
+  /// @brief Create a new pass object.
+  BuiltinInliningPass() = default;
+
+  /// @brief The entry point to the pass.
+  /// @param[in,out] M Module to optimize.
+  /// @param[in,out] AM ModuleAnalysisManager providing analyses.
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
+
+  /// @brief Retrieve the pass's name.
+  /// @return pointer to text description.
+  static llvm::StringRef name() { return "OpenCL builtin inlining pass"; }
+
+private:
+  /// @brief Process a call site, inlining it or marking it as needing inlining
+  /// if required.
+  ///
+  /// @param[in] CI Call site to inspect.
+  /// @param[out] NeedLLVMInline Whether the call site needs LLVM inlining.
+  ///
+  /// @return New return value for the call instruction.
+  llvm::Value *processCallSite(llvm::CallInst *CI, bool &NeedLLVMInline);
+};
+
+/// @brief This pass tries to remove unnecessary allocas that are not optimized
+/// away by LLVM's Mem2Reg pass, for example in the presence of bitcasts. It
+/// is, however, much simpler than LLVM's.
+class BasicMem2RegPass : public llvm::PassInfoMixin<BasicMem2RegPass> {
+public:
+  BasicMem2RegPass() = default;
+
+  /// @brief The entry point to the pass.
+  /// @param[in,out] F Function to optimize.
+  /// @param[in,out] AM FunctionAnalysisManager providing analyses.
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+  /// @brief Retrieve the pass's name.
+  /// @return pointer to text description.
+  static llvm::StringRef name() { return "Basic Mem2Reg Pass"; }
+
+private:
+  /// @brief Determine whether the alloca can be promoted or not.
+  ///
+  /// This is the case when it is inside the entry block, there is at most one
+  /// store to it and all other users are loads (possibly through bitcasts).
+  /// The store must also be in the entry block and precede all loads.
+  ///
+  /// @param[in] Alloca Alloca instruction to analyze.
+  /// @return true if the alloca can be promoted, false otherwise.
+  bool canPromoteAlloca(llvm::AllocaInst *Alloca) const;
+  /// @brief Try to promote the alloca, removing store users and replacing
+  /// load users with the stored values. The alloca itself isn't touched.
+  /// @param[in] Alloca Alloca instruction to promote.
+  /// @return true if the alloca was promoted, false otherwise.
+  bool promoteAlloca(llvm::AllocaInst *Alloca) const;
+};
+
+class PreLinearizePass : public llvm::PassInfoMixin<PreLinearizePass> {
+public:
+  PreLinearizePass() = default;
+
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  static llvm::StringRef name() { return "Prepare for SPMD linearization"; }
+};
+
+/// @brief Wraps LLVM's LoopRotatePass but restricts the range of loops on
+/// which it works.
+class VeczLoopRotatePass : public llvm::PassInfoMixin<VeczLoopRotatePass> {
+public:
+  VeczLoopRotatePass() = default;
+
+  llvm::PreservedAnalyses run(llvm::Loop &L, llvm::LoopAnalysisManager &,
+                              llvm::LoopStandardAnalysisResults &,
+                              llvm::LPMUpdater &);
+
+  static llvm::StringRef name() { return "Vecz Loop Rotation Wrapper"; }
+};
+
+class RemoveIntPtrPass : public llvm::PassInfoMixin<RemoveIntPtrPass> {
+public:
+  RemoveIntPtrPass() = default;
+
+  static llvm::StringRef name() { return "Remove IntPtr instructions"; }
+
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &);
+};
+
+class SquashSmallVectorsPass
+    : public llvm::PassInfoMixin<SquashSmallVectorsPass> {
+public:
+  SquashSmallVectorsPass() = default;
+
+  static llvm::StringRef name() { return "Squash Small Vectors"; }
+
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &);
+};
+
+/// @brief Try to replace or remove masked memory operations that are trivially
+/// not needed or can be converted to non-masked operations.
+class SimplifyMaskedMemOpsPass
+    : public llvm::PassInfoMixin<SimplifyMaskedMemOpsPass> {
+public:
+  /// @brief Create a new pass object.
+  SimplifyMaskedMemOpsPass() = default;
+
+  /// @brief Replace masked memory operations that use 'all true' masks by
+  /// regular memory operations, and remove masked operations that use 'all
+  /// false' masks.
+  ///
+  /// @param[in] F Function to optimize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() { return "Simplify masked memory operations"; }
+};
+
+/// @brief reassociate uniform binary operators and split branches
+class UniformReassociationPass
+    : public llvm::PassInfoMixin<UniformReassociationPass> {
+public:
+  UniformReassociationPass() = default;
+
+  static llvm::StringRef name() { return "Reassociate uniform binops"; }
+
+  llvm::PreservedAnalyses run(llvm::Function &,
+                              llvm::FunctionAnalysisManager &);
+};
+
+/// @brief Removes uniform divergence reductions created by CFG conversion
+class DivergenceCleanupPass
+    : public llvm::PassInfoMixin<DivergenceCleanupPass> {
+public:
+  /// @brief Create a new pass object.
+  DivergenceCleanupPass() = default;
+
+  /// @brief Remove uniform divergence reductions.
+  ///
+  /// @param[in] F Function to optimize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() {
+    return "Remove uniform divergence reductions";
+  }
+};
+
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_PASSES_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h
new file mode 100644
index 0000000000000..2d4885059b3db
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/printf_scalarizer.h
@@ -0,0 +1,117 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+
+#ifndef VECZ_TRANSFORM_PRINTF_SCALARIZER_H_INCLUDED
+#define VECZ_TRANSFORM_PRINTF_SCALARIZER_H_INCLUDED
+
+#include
+
+namespace llvm {
+class Module;
+class User;
+class Instruction;
+template <typename T, unsigned N> class SmallVector;
+class GlobalVariable;
+class Value;
+class CallInst;
+} // namespace llvm
+
+namespace vecz {
+
+/// @brief An enumeration of errors that can occur when processing a format
+/// string.
+enum EnumPrintfError {
+  kPrintfError_success,
+  kPrintfError_fail,
+  kPrintfError_invalidFormatString
+};
+
+/// @brief Retrieves a module-level global variable for a printf format string
+/// from a Value.
+/// @param[in] op The value that uses a global variable representing a printf
+/// format string.
+/// @return The module-level global variable for the printf format string.
+llvm::GlobalVariable *GetFormatStringAsValue(llvm::Value *op);
+
+/// @brief Extracts the raw string contents from a module-level global variable
+/// containing a printf format string.
+///
+/// The @p op parameter must be a GlobalVariable with an initializer.
+///
+/// @param[in] op The module-level global variable for a printf format string.
+/// @return The raw string contents of the format string global variable, or ""
+/// if there was an error.
+std::string GetFormatStringAsString(llvm::Value *op);
+
+/// @brief Creates a global variable for a scalarized format string.
+/// @param[in,out] module The parent module given to the pass.
+/// @param[in] string_value The GlobalVariable for the old format string,
+/// used to copy attributes over.
+/// @param[in] new_format_string The scalarized format string to create a
+/// global variable from.
+/// @return The newly created global variable for the format string.
+llvm::GlobalVariable *
+GetNewFormatStringAsGlobalVar(llvm::Module &module,
+                              llvm::GlobalVariable *const string_value,
+                              const std::string &new_format_string);
+
+/// @brief This function transforms an OpenCL printf format string into a
+/// C99-conformant one.
+ +/// Its main job is to scalarize vector format specifiers into scalarized form. +/// It does this by taking a vector specifier and determining the specifier +/// corresponding to each vector element. It then emits the element specifier +/// into the new format string for each element in the vector, separated by a +/// comma. +/// +/// Special care needs to be taken for modifiers that aren't supported by C99 +/// such as the 'hl' length modifier. The new format string will have 'hl' +/// stripped out. +/// +/// Examples: +/// @code{.cpp} +/// // vector 2, 8-bit sized hexadecimal integers +/// "%v2hhx" --> "%hhx,%hhx" +/// // vector 4, 32-bit sized floats +/// "%v4hlf" --> "%f,%f,%f,%f" +/// @endcode +/// +/// It also does some checking to ensure the printf string is conformant to the +/// OpenCL 1.2 specification, and returns an error if it is not. +/// @param[in] str The format string to scalarize and check. +/// @param[out] new_str The new, scalarized, format string. +/// @return The status of the scalarization (kPrintfError_success on success, +/// otherwise kPrintfError_invalidFormatString if we detected an illegal OpenCL +/// printf format string). +EnumPrintfError ScalarizeAndCheckFormatString(const std::string &str, + std::string &new_str); + +/// @brief Builds a new scalarized printf call given an existing call and a new +/// format string. +/// +/// @param[in,out] module The parent module given to the pass. +/// @param[in] old_inst The old call to the printf function. +/// @param[in] new_format_string_gvar The module-level global variable for the +/// new format string. +/// @return A new call instruction to the new printf function. +llvm::Instruction * +BuildNewPrintfCall(llvm::Module &module, llvm::CallInst *const old_inst, + llvm::GlobalVariable *const new_format_string_gvar); +} // namespace vecz + +#endif // VECZ_TRANSFORM_PRINTF_SCALARIZER_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h new file mode 100644 index 0000000000000..a494a1945e0a6 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarization_pass.h @@ -0,0 +1,68 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +/// @file +/// +/// @brief Function scalarizer. + +#ifndef VECZ_TRANSFORM_SCALARIZATION_PASS_H_INCLUDED +#define VECZ_TRANSFORM_SCALARIZATION_PASS_H_INCLUDED + +#include +#include + +namespace llvm { +class Function; +} // namespace llvm + +namespace vecz { + +class VectorizationUnit; + +/// \addtogroup scalarization Scalarization Stage +/// @{ +/// \ingroup vecz + +/// @brief Scalarization pass where vector instructions that need it are +/// scalarized, starting from leaves. 
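+///
+/// A minimal scheduling sketch (illustrative, not from the original source),
+/// assuming a configured llvm::FunctionPassManager `FPM`:
+/// @code{.cpp}
+/// FPM.addPass(vecz::ScalarizationPass());
+/// @endcode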
+class ScalarizationPass : public llvm::PassInfoMixin<ScalarizationPass> {
+public:
+  /// @brief Create a new scalarization pass.
+  ScalarizationPass();
+
+  /// @brief Unique identifier for the pass.
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Scalarize the given function.
+  ///
+  /// @param[in] F Function to scalarize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return Preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Name of the pass.
+  static llvm::StringRef name() { return "Function scalarization"; }
+
+private:
+  static char PassID;
+};
+
+/// @}
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_SCALARIZATION_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
new file mode 100644
index 0000000000000..ecb2136c6b73d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/scalarizer.h
@@ -0,0 +1,323 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Function scalarizer.
+
+#ifndef VECZ_TRANSFORM_SCALARIZER_H_INCLUDED
+#define VECZ_TRANSFORM_SCALARIZER_H_INCLUDED
+
+#include
+#include
+#include
+
+#include
+
+#include "debugging.h"
+#include "ir_cleanup.h"
+#include "simd_packet.h"
+
+namespace llvm {
+class Instruction;
+class LoadInst;
+class StoreInst;
+class CastInst;
+class BitCastInst;
+class BinaryOperator;
+class FreezeInst;
+class GetElementPtrInst;
+class UnaryOperator;
+class ICmpInst;
+class FCmpInst;
+class SelectInst;
+class CallInst;
+class ShuffleVectorInst;
+class InsertElementInst;
+class PHINode;
+class ExtractElementInst;
+class IntrinsicInst;
+} // namespace llvm
+
+namespace vecz {
+
+class VectorizationChoices;
+class VectorizationContext;
+struct MemOp;
+struct PacketMask;
+struct SimdPacket;
+
+/// \addtogroup scalarization Scalarization Stage
+/// @{
+/// \ingroup vecz
+
+/// @brief Holds the result of scalarization analysis for a given function.
+class Scalarizer {
+public:
+  /// @brief Create new scalarization results for the function.
+  ///
+  /// @param[in] F Function to scalarize.
+  /// @param[in] Ctx VectorizationContext for this Function.
+  /// @param[in] DoubleSupport True if double-precision floating point is
+  /// supported
+  Scalarizer(llvm::Function &F, VectorizationContext &Ctx, bool DoubleSupport);
+
+  /// @brief Mark the value as needing scalarization.
+  /// @param[in] V Value that needs scalarization.
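+  ///
+  /// A hypothetical driver sketch (illustrative, not from the original
+  /// source; `VecVal` is an assumed vector-typed llvm::Value*): values are
+  /// marked first, then scalarized in one batch:
+  /// @code{.cpp}
+  /// Scalarizer S(F, Ctx, /*DoubleSupport=*/true);
+  /// S.setNeedsScalarization(VecVal);
+  /// const bool Changed = S.scalarizeAll();
+  /// @endcode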
+  void setNeedsScalarization(llvm::Value *V);
+
+  /// @brief Scalarize everything that has been marked for scalarization
+  bool scalarizeAll();
+
+  /// @brief A container type for instructions that failed to scalarize
+  using FailureSet = llvm::DenseSet<llvm::Instruction *>;
+
+  /// @brief Get the set of instructions that failed to scalarize
+  const FailureSet &failures() const { return Failures; }
+
+private:
+  /// @brief Vectorization context for the function to scalarize.
+  VectorizationContext &Ctx;
+  llvm::Function &F;
+  IRCleanup IC;
+  bool DoubleSupport;
+
+  /// @brief The values to scalarize, in order
+  std::vector<llvm::Value *> ToScalarize;
+
+  /// @brief The un-ordered set of values to scalarize for fast lookup
+  llvm::DenseSet<llvm::Value *> ScalarizeSet;
+
+  /// @brief Map of values to a gather of their scalarized elements
+  llvm::DenseMap<llvm::Value *, llvm::Value *> Gathers;
+
+  /// @brief Map onto packetized versions of scalar values
+  llvm::DenseMap<const llvm::Value *, std::unique_ptr<SimdPacket>> packets;
+
+  /// @brief The set of instructions that failed to scalarize
+  FailureSet Failures;
+
+  /// @brief Transform values that have non-vector types and vector operands
+  /// by scalarizing their operands.
+  ///
+  /// @param[in] I Instruction whose operands to scalarize.
+  ///
+  /// @return A different value than I if the operands were scalarized; null if
+  /// scalarization failed; or I if the value has no vector operand.
+  llvm::Value *scalarizeOperands(llvm::Instruction *I);
+
+  /// @brief Scalarize the given value from the function. Multiple calls to
+  /// this function with the same value should return a cached result.
+  ///
+  /// @param[in] V Value to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarize(llvm::Value *V, PacketMask PM);
+
+  /// @brief Get or create a packet for the given value.
+  ///
+  /// @param[in] V Value to retrieve a packet for.
+  /// @param[in] SimdWidth Number of lanes in the packet.
+  /// @param[in] Create true if a packet should be created if not present.
+  ///
+  /// @return SIMD packet for the given value.
+  SimdPacket *getPacket(const llvm::Value *V, unsigned SimdWidth,
+                        bool Create = true);
+
+  /// @brief Get a gather of the scalarized elements of the given value.
+  llvm::Value *getGather(llvm::Value *V);
+
+  /// @brief Perform post-scalarization tasks for the given value.
+  ///
+  /// @param[in] P Packet resulting from scalarization or null.
+  /// @param[in] V Value to scalarize.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *assignScalar(SimdPacket *P, llvm::Value *V);
+  /// @brief Extract an element's values, for use by scalarized users
+  ///
+  /// @param[in] V Value to extract.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *extractLanes(llvm::Value *V, PacketMask PM);
+  /// @brief Scalarize a load instruction.
+  ///
+  /// @param[in] Load Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeLoad(llvm::LoadInst *Load, PacketMask PM);
+  /// @brief Scalarize a store instruction.
+  ///
+  /// @param[in] Store Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeStore(llvm::StoreInst *Store, PacketMask PM);
+  /// @brief Scalarize a cast instruction.
+  ///
+  /// @param[in] CastI Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeCast(llvm::CastInst *CastI, PacketMask PM);
+  /// @brief Scalarize a bitcast instruction.
+  ///
+  /// @param[in] BC Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeBitCast(llvm::BitCastInst *BC, PacketMask PM);
+  /// @brief Scalarize a binary operation instruction.
+  ///
+  /// @param[in] BinOp Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeBinaryOp(llvm::BinaryOperator *BinOp, PacketMask PM);
+  // The freeze instruction was not available in LLVM versions prior to 10.0
+  // and not used in LLVM versions prior to 11.0.
+  /// @brief Scalarize a freeze instruction.
+  ///
+  /// @param[in] FreezeI Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeFreeze(llvm::FreezeInst *FreezeI, PacketMask PM);
+  /// @brief Scalarize a unary operation instruction.
+  ///
+  /// @param[in] UnOp Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeUnaryOp(llvm::UnaryOperator *UnOp, PacketMask PM);
+  /// @brief Scalarize an integer compare instruction.
+  ///
+  /// @param[in] ICmp Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeICmp(llvm::ICmpInst *ICmp, PacketMask PM);
+  /// @brief Scalarize a floating-point compare instruction.
+  ///
+  /// @param[in] FCmp Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeFCmp(llvm::FCmpInst *FCmp, PacketMask PM);
+  /// @brief Scalarize a select instruction.
+  ///
+  /// @param[in] Select Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeSelect(llvm::SelectInst *Select, PacketMask PM);
+  /// @brief Scalarize a call instruction.
+  ///
+  /// @param[in] CI Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeCall(llvm::CallInst *CI, PacketMask PM);
+  /// @brief Scalarize a call instruction to a masked mem op.
+  ///
+  /// @param[in] CI Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  /// @param[in] MaskedOp Masked memory operation to scalarize.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeMaskedMemOp(llvm::CallInst *CI, PacketMask PM,
+                                   MemOp &MaskedOp);
+  /// @brief Scalarize a shuffle vector instruction.
+  ///
+  /// @param[in] Shuffle Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeShuffleVector(llvm::ShuffleVectorInst *Shuffle,
+                                     PacketMask PM);
+  /// @brief Scalarize an insert element instruction.
+  ///
+  /// @param[in] Insert Instruction to scalarize.
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return Packet containing scalarized values or null.
+  SimdPacket *scalarizeInsertElement(llvm::InsertElementInst *Insert,
+                                     PacketMask PM);
+  /// @brief Scalarize GEPs with vector arguments
+  ///
+  /// @param[in] GEP The GEP to scalarize
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return The packet containing the scalarized values or null
+  SimdPacket *scalarizeGEP(llvm::GetElementPtrInst *GEP, PacketMask PM);
+  /// @brief Scalarize Phi nodes with vector arguments
+  ///
+  /// @param[in] Phi The Phi node to scalarize
+  /// @param[in] PM Mask indicating which lanes are required.
+  ///
+  /// @return The packet containing the scalarized values or null
+  SimdPacket *scalarizePHI(llvm::PHINode *Phi, PacketMask PM);
+  /// @brief Preserves debug information attached to an old instruction
+  /// we have just scalarized, before it is removed.
+  ///
+  /// @param[in] Original Vector instruction which has been scalarized.
+  /// @param[in] Packet Packet of scalarized instructions.
+  /// @param[in] Width SIMD width of packet.
+  void scalarizeDI(llvm::Instruction *Original, const SimdPacket *Packet,
+                   unsigned Width);
+
+  // These functions work on scalar values that use vector values.
+
+  /// @brief Scalarize the operands of an extract element instruction.
+  ///
+  /// @param[in] Extr Instruction to scalarize.
+  ///
+  /// @return A different value than Extr if the operands were scalarized; null
+  /// if scalarization failed; or Extr if the value has no vector operand.
+  llvm::Value *scalarizeOperandsExtractElement(llvm::ExtractElementInst *Extr);
+  /// @brief Scalarize the operands of a bitcast instruction.
+  ///
+  /// @param[in] BC Instruction to scalarize.
+  ///
+  /// @return A different value than BC if the operands were scalarized; null
+  /// if scalarization failed; or BC if the value has no vector operand.
+  llvm::Value *scalarizeOperandsBitCast(llvm::BitCastInst *BC);
+
+  /// @brief Scalarize the operands of a printf call.
+  ///
+  /// @param[in] CI Instruction to scalarize.
+  ///
+  /// @return A different value than CI if the operands were scalarized;
+  /// null if scalarization failed; or CI if the value has no vector
+  /// operand.
+  llvm::Value *scalarizeOperandsPrintf(llvm::CallInst *CI);
+
+  /// @brief Scalarize the operands of a vector-reduce intrinsic call.
+  ///
+  /// @param[in] Intrin Instruction to scalarize.
+  ///
+  /// @return A different value than Intrin if the operands were scalarized;
+  /// null if scalarization failed; or Intrin if the value has no vector
+  /// operand.
+  llvm::Value *scalarizeReduceIntrinsic(llvm::IntrinsicInst *Intrin);
+};
+
+/// @}
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_SCALARIZER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h
new file mode 100644
index 0000000000000..a428b84ba9aa9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/transform/ternary_transform_pass.h
@@ -0,0 +1,49 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file
+///
+/// @brief Transform the pattern generated by ternary operators to a
+/// vectorizable instruction set
+
+#ifndef VECZ_TRANSFORM_TERNARY_TRANSFORM_PASS_H_INCLUDED
+#define VECZ_TRANSFORM_TERNARY_TRANSFORM_PASS_H_INCLUDED
+
+#include
+
+namespace vecz {
+
+/// @brief This pass tries to transform selects with pointer operands into
+/// individual GEPs followed by masked memory operations.
+class TernaryTransformPass
+    : public llvm::PassInfoMixin<TernaryTransformPass> {
+public:
+  TernaryTransformPass() = default;
+
+  /// @brief The entry point to the pass.
+  ///
+  /// @param[in] F Function to optimize.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  ///
+  /// @return The preserved analyses.
+  llvm::PreservedAnalyses run(llvm::Function &F,
+                              llvm::FunctionAnalysisManager &AM);
+
+  /// @brief Pass name.
+  static llvm::StringRef name() { return "Ternary transform pass"; }
+};
+} // namespace vecz
+
+#endif // VECZ_TRANSFORM_TERNARY_TRANSFORM_PASS_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
new file mode 100644
index 0000000000000..9d231c8b7b1d7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_context.h
@@ -0,0 +1,388 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file vectorization_context.h
+///
+/// @brief Hold global state and objects used for vectorization.
+
+#ifndef VECZ_VECTORIZATION_CONTEXT_H_INCLUDED
+#define VECZ_VECTORIZATION_CONTEXT_H_INCLUDED
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace llvm {
+class TargetTransformInfo;
+} // namespace llvm
+
+namespace compiler {
+namespace utils {
+class BuiltinInfo;
+} // namespace utils
+} // namespace compiler
+
+namespace vecz {
+class MemOpDesc;
+class TargetInfo;
+struct UniformValueResult;
+class VectorizationChoices;
+struct VectorizationResult;
+class VectorizationUnit;
+
+using ActiveUnitMap = llvm::DenseMap, VectorizationUnit *>;
+
+/// @brief Holds global (per-module) vectorization state.
+class VectorizationContext {
+public:
+  /// @brief Create a new vectorization context object.
+  ///
+  /// @param[in] target Module in which vectorization happens.
+  /// @param[in] vti Target information.
+  /// @param[in] bi Builtins information.
+  VectorizationContext(llvm::Module &target, TargetInfo &vti,
+                       compiler::utils::BuiltinInfo &bi);
+
+  /// @brief Access the public vectorizer API.
+
+  /// @brief Module in which vectorization happens.
+  llvm::Module &module() const { return Module; }
+
+  /// @brief Data layout for the target.
+  const llvm::DataLayout *dataLayout() const { return DL; }
+
+  /// @brief Information about the target.
+  TargetInfo &targetInfo() { return VTI; }
+
+  /// @brief Information about the target.
+  const TargetInfo &targetInfo() const { return VTI; }
+
+  /// @brief Get the TargetTransformInfo for the given function.
+  llvm::TargetTransformInfo getTargetTransformInfo(llvm::Function &F) const;
+
+  /// @brief Construct and initialize the PassManager to be used for
+  /// vectorizing.
+  /// @return true if no problem occurred, false otherwise.
+  bool buildPassPipeline();
+
+  /// @brief Get the VectorizationUnit currently governing the vectorization
+  /// of the given function, if any.
+  VectorizationUnit *getActiveVU(const llvm::Function *F) const;
+
+  /// @brief Log the Function's VectorizationUnit as the one governing the
+  /// current vectorization.
+  void setActiveVU(llvm::Function *F, VectorizationUnit *VU) {
+    ActiveVUs[F] = VU;
+  }
+  /// @brief Remove the Function's VectorizationUnit from the set governing
+  /// the current vectorization.
+  void clearActiveVU(llvm::Function *F) { ActiveVUs.erase(F); }
+
+  /// @brief Builtin database.
+  compiler::utils::BuiltinInfo &builtins();
+
+  /// @brief Builtin database.
+  const compiler::utils::BuiltinInfo &builtins() const;
+
+  /// @brief Determine whether the function is an internal builtin or not.
+  ///
+  /// @param[in] F Function to analyze.
+  ///
+  /// @return true if F is an internal builtin function, false otherwise.
+  static bool isInternalBuiltin(const llvm::Function *F);
+  /// @brief Create a new function with the given name and type, unless it
+  /// already exists in the module. Mark it as an internal builtin.
+  ///
+  /// @param[in] Name Name of the builtin function.
+  /// @param[in] FT Function type for the builtin.
+  ///
+  /// @return Internal builtin function with the given Name.
+  llvm::Function *getOrCreateInternalBuiltin(llvm::StringRef Name,
+                                             llvm::FunctionType *FT = nullptr);
+  /// @brief Define the internal builtin function, i.e. generate its body.
+  ///
+  /// @param[in] F Function declaration to emit a body for.
+  ///
+  /// @return true if the body of the builtin was emitted, false otherwise.
+  bool defineInternalBuiltin(llvm::Function *F);
+  /// @brief Given a scalar builtin function, return a vector equivalent if it
+  /// is an internal builtin.
+  ///
+  /// @param[in] ScalarFn Scalar builtin to map to a vector equivalent.
+  /// @param[in] SimdWidth SIMD width used to determine which vector equivalent
+  /// to select.
+  ///
+  /// @return Equivalent vector builtin function on success, or null.
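+  ///
+  /// An illustrative caller-side sketch (assumed usage, not from the
+  /// original source):
+  /// @code{.cpp}
+  /// // Try to map a scalar internal builtin onto its 4-wide counterpart.
+  /// if (llvm::Function *VecFn =
+  ///         Ctx.getInternalVectorEquivalent(ScalarFn, /*SimdWidth=*/4)) {
+  ///   // ... rewrite the call site to use VecFn ...
+  /// }
+  /// @endcode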
+  llvm::Function *getInternalVectorEquivalent(llvm::Function *ScalarFn,
+                                              unsigned SimdWidth);
+
+  /// @brief Check if the given function is a masked version of another
+  /// function
+  ///
+  /// @param[in] F The function to check
+  /// @return true if the function is a masked version, or false otherwise
+  bool isMaskedFunction(const llvm::Function *F) const;
+  /// @brief Get the original non-masked function from a masked function
+  ///
+  /// @param[in] F The masked function
+  /// @return The original non-masked function if it exists, or null
+  llvm::Function *getOriginalMaskedFunction(llvm::Function *F);
+  /// @brief Get (if it exists already) or create the masked version of a
+  /// function
+  ///
+  /// @param[in] CI Call to the function to be masked
+  /// @return The masked version of the function
+  llvm::Function *getOrCreateMaskedFunction(llvm::CallInst *CI);
+
+  /// @brief Represents either an atomicrmw or cmpxchg operation.
+  ///
+  /// Most fields are shared, with the exception of CmpXchgFailureOrdering and
+  /// IsWeak, which are only to be set for cmpxchg, and BinOp, which is only to
+  /// be set to a valid value for atomicrmw.
+  struct MaskedAtomic {
+    llvm::Type *PointerTy;
+    llvm::Type *ValTy;
+    /// @brief Must be set to BAD_BINOP for cmpxchg instructions
+    llvm::AtomicRMWInst::BinOp BinOp;
+    llvm::Align Align;
+    bool IsVolatile = false;
+    llvm::SyncScope::ID SyncScope;
+    llvm::AtomicOrdering Ordering;
+    /// @brief Must be set for cmpxchg instructions
+    std::optional<llvm::AtomicOrdering> CmpXchgFailureOrdering = std::nullopt;
+    /// @brief Must only be set for cmpxchg instructions
+    bool IsWeak = false;
+    // Vectorization info
+    llvm::ElementCount VF;
+    bool IsVectorPredicated = false;
+
+    /// @brief Returns true if this MaskedAtomic represents a cmpxchg
+    /// operation.
+    bool isCmpXchg() const {
+      if (CmpXchgFailureOrdering.has_value()) {
+        // 'binop' only applies to atomicrmw
+        assert(BinOp == llvm::AtomicRMWInst::BAD_BINOP &&
+               "Invalid MaskedAtomic state");
+        return true;
+      }
+      // 'weak' only applies to cmpxchg
+      assert(!IsWeak && "Invalid MaskedAtomic state");
+      return false;
+    }
+  };
+
+  /// @brief Check if the given function is a masked version of an atomicrmw
+  /// or cmpxchg operation.
+  ///
+  /// @param[in] F The function to check
+  /// @return A MaskedAtomic instance detailing the atomic operation if the
+  /// function is a masked atomic, or std::nullopt otherwise
+  std::optional<MaskedAtomic>
+  isMaskedAtomicFunction(const llvm::Function &F) const;
+  /// @brief Get (if it exists already) or create the function representing
+  /// the masked version of an atomicrmw/cmpxchg operation.
+  ///
+  /// @param[in] I Atomic to be masked
+  /// @param[in] Choices Choices to mangle into the function name
+  /// @param[in] VF The vectorization factor of the atomic operation
+  /// @return The masked version of the function
+  llvm::Function *
+  getOrCreateMaskedAtomicFunction(MaskedAtomic &I,
+                                  const VectorizationChoices &Choices,
+                                  llvm::ElementCount VF);
+
+  /// @brief Create a VectorizationUnit to use to vectorize the given scalar
+  /// function.
+  ///
+  /// The lifetime of the returned VectorizationUnit is managed by the
+  /// VectorizationContext.
+  ///
+  /// @param[in] F Function to vectorize.
+  /// @param[in] VF vectorization factor to use.
+  /// @param[in] Dimension SIMD dimension to use (0 => x, 1 => y, 2 => z).
+  /// @param[in] Ch Vectorization Choices for the vectorization.
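+  ///
+  /// A hypothetical sketch of the expected call sequence (illustrative only,
+  /// not from the original source):
+  /// @code{.cpp}
+  /// vecz::VectorizationChoices Ch;
+  /// vecz::VectorizationUnit *VU = Ctx.createVectorizationUnit(
+  ///     F, llvm::ElementCount::getFixed(8), /*Dimension=*/0, Ch);
+  /// Ctx.vectorize();  // vectorizes all units registered with the context
+  /// @endcode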
+  VectorizationUnit *createVectorizationUnit(llvm::Function &F,
+                                             llvm::ElementCount VF,
+                                             unsigned Dimension,
+                                             const VectorizationChoices &Ch);
+
+  /// @brief Vectorizes all Vectorization Units in the context
+  void vectorize();
+
+  /// @brief Try to get a vectorization result for the scalar builtin function.
+  ///
+  /// @param[in] F Builtin function to create or retrieve a unit for.
+  /// @param[in] SimdWidth Vectorization factor to use.
+  ///
+  /// @return a VectorizationResult representing the vectorized function.
+  VectorizationResult &getOrCreateBuiltin(llvm::Function &F,
+                                          unsigned SimdWidth);
+
+  /// @brief Vectorize a builtin function by a given factor
+  ///
+  /// @param[in] F the function to vectorize.
+  /// @param[in] factor the vectorization factor.
+  ///
+  /// @return a VectorizationResult representing the vectorized function.
+  VectorizationResult getVectorizedFunction(llvm::Function &F,
+                                            llvm::ElementCount factor);
+
+  /// @brief Determine whether I is a vector instruction or not, i.e. it has
+  /// any vector operand.
+  ///
+  /// @param[in] I Instruction to analyze.
+  ///
+  /// @return true if I is a vector instruction.
+  static bool isVector(const llvm::Instruction &I);
+
+  static const char *InternalBuiltinPrefix;
+
+private:
+  /// @brief Determine whether this scalar builtin function can be safely
+  /// expanded at vector call sites, i.e. it has no side effects.
+  ///
+  /// @param[in] ScalarFn Builtin function to analyze.
+  ///
+  /// @return true if the function can be expanded.
+  bool canExpandBuiltin(const llvm::Function *ScalarFn) const;
+
+  /// @brief Emit the body for the masked load or store internal builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] Desc The MemOpDesc for the memory operation
+  /// @returns true on success, false otherwise
+  bool emitMaskedMemOpBody(llvm::Function &F, const MemOpDesc &Desc) const;
+  /// @brief Emit the body for the interleaved load or store internal builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] Desc The MemOpDesc for the memory operation
+  /// @returns true on success, false otherwise
+  bool emitInterleavedMemOpBody(llvm::Function &F, const MemOpDesc &Desc) const;
+  /// @brief Emit the body for the masked interleaved load/store internal
+  /// builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] Desc The MemOpDesc for the memory operation
+  /// @returns true on success, false otherwise
+  bool emitMaskedInterleavedMemOpBody(llvm::Function &F,
+                                      const MemOpDesc &Desc) const;
+  /// @brief Emit the body for the scatter or gather internal builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] Desc The MemOpDesc for the memory operation
+  /// @returns true on success, false otherwise
+  bool emitScatterGatherMemOpBody(llvm::Function &F,
+                                  const MemOpDesc &Desc) const;
+  /// @brief Emit the body for the masked scatter or gather internal builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] Desc The MemOpDesc for the memory operation
+  /// @returns true on success, false otherwise
+  bool emitMaskedScatterGatherMemOpBody(llvm::Function &F,
+                                        const MemOpDesc &Desc) const;
+  /// @brief Add the masked function to the tracking set
+  ///
+  /// @param[in] F The function to add
+  /// @param[in] WrappedF The original function being masked
+  /// @return false if the function was already in the set, or true otherwise
+  bool insertMaskedFunction(llvm::Function *F, llvm::Function *WrappedF);
+
+  /// @brief Emit the body for the subgroup scan builtins
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] IsInclusive whether the scan should be inclusive (on true) or
+  /// exclusive (on false).
+  /// @param[in] OpKind the kind of scan to emit. Note: not all values of
+  /// llvm::RecurKind are supported scan operations.
+  /// @param[in] IsVP whether the scan is vector-predicated.
+  /// @returns true on success, false otherwise
+  bool emitSubgroupScanBody(llvm::Function &F, bool IsInclusive,
+                            llvm::RecurKind OpKind, bool IsVP) const;
+
+  /// @brief Emit the body for a masked atomic builtin
+  ///
+  /// @param[in] F The empty (declaration only) function to emit the body in
+  /// @param[in] MA The MaskedAtomic information
+  /// @returns true on success, false otherwise
+  bool emitMaskedAtomicBody(llvm::Function &F, const MaskedAtomic &MA) const;
+
+  /// @brief Helper for non-vectorization tasks.
+  TargetInfo &VTI;
+  /// @brief Module in which the vectorization happens.
+  llvm::Module &Module;
+  /// @brief Builtins database.
+  compiler::utils::BuiltinInfo &BI;
+  /// @brief Data layout object used to determine the size and alignment of
+  /// types.
+  const llvm::DataLayout *DL;
+  /// @brief Persistent storage for Kernel Vectorization Units
+  std::vector<std::unique_ptr<VectorizationUnit>> KernelUnits;
+  /// @brief Mapping between functions in the module and vectorization units.
+  llvm::DenseMap> VectorizedBuiltins;
+  /// @brief Maps vector functions to their VectorizationUnits
+  ActiveUnitMap ActiveVUs;
+  /// @brief Map of masked functions used in the module to their original
+  /// non-masked function.
+  llvm::ValueToValueMapTy MaskedFunctionsMap;
+  /// @brief All the masked versions of functions generated by Vecz
+  ///
+  /// Keeps track of all the functions we already have masked versions of. We
+  /// use the name of the masked function instead of just the Function pointer
+  /// because vararg functions have different masked versions for different
+  /// argument types.
+  std::map<std::string, llvm::Function *> MaskedVersions;
+};
+
+/// \addtogroup passes Passes
+/// @{
+/// \ingroup vecz
+
+/// @brief Implement internal builtins.
+class DefineInternalBuiltinsPass
+    : public llvm::PassInfoMixin<DefineInternalBuiltinsPass> {
+public:
+  /// @brief Create a new pass object.
+  DefineInternalBuiltinsPass() {}
+
+  static void *ID() { return (void *)&PassID; }
+
+  /// @brief Define all used internal builtins in the module, expanding bodies
+  /// for declaration only references.
+  ///
+  /// @param[in] M Module in which to define internal builtins.
+  /// @param[in] AM ModuleAnalysisManager providing analyses.
+  ///
+  /// @return Set of preserved analyses (all analyses).
+  llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM);
+
+  static llvm::StringRef name() { return "Define internal builtins"; }
+
+private:
+  /// @brief Identifier for the DefineInternalBuiltin pass.
+  static char PassID;
+};
+
+/// @}
+} // namespace vecz
+
+#endif // VECZ_VECTORIZATION_CONTEXT_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
new file mode 100644
index 0000000000000..c865601b90a55
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_helpers.h
@@ -0,0 +1,82 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef VECZ_VECTORIZATION_HELPERS_H_INCLUDED
+#define VECZ_VECTORIZATION_HELPERS_H_INCLUDED
+
+#include
+
+#include
+
+namespace llvm {
+class Function;
+class StringRef;
+} // namespace llvm
+
+namespace vecz {
+class VectorizationUnit;
+class VectorizationChoices;
+
+/// @brief Generate a name for the vectorized function, which depends on the
+/// original function name and SIMD width.
+///
+/// @param[in] ScalarName Name of the original function.
+/// @param[in] VF vectorization factor of the vectorized function.
+/// @param[in] Choices choices used for vectorization.
+/// @param[in] IsBuiltin True if this is an internal builtin.
+///
+/// @return Name for the vectorized function.
+std::string getVectorizedFunctionName(llvm::StringRef ScalarName,
+                                      llvm::ElementCount VF,
+                                      VectorizationChoices Choices,
+                                      bool IsBuiltin = false);
+
+/// @brief Parses a name generated for a vectorized function
+///
+/// @see getVectorizedFunctionName.
+///
+/// @param[in] Name Name of the vectorized function.
+///
+/// @return A tuple containing the original name of the function, and the
+/// element count and choices it was encoded with. Returns std::nullopt on
+/// failure.
+std::optional<std::tuple<std::string, llvm::ElementCount, VectorizationChoices>>
+decodeVectorizedFunctionName(llvm::StringRef Name);
+
+/// @brief Clone the scalar function's body into the function to vectorize,
+/// vectorizing function argument types where required.
+///
+/// @param[in] VU the Vectorization Unit of the scalar function to clone.
+///
+/// @return The cloned function.
+llvm::Function *cloneFunctionToVector(const VectorizationUnit &VU);
+
+/// @brief Create a copy of the scalar function's debug info metadata
+/// nodes and set the scope of the copied DI to the vectorized
+/// function.
+void cloneDebugInfo(const VectorizationUnit &VU);
+
+/// @brief Clone OpenCL related metadata from the scalar kernel to the
+/// vectorized one.
+///
+/// This function will copy any 'opencl.kernels' or
+/// 'opencl.kernel_wg_size_info' metadata from the scalar kernel to the
+/// vectorized one. Obviously, the kernel itself has to be cloned before
+/// calling this function.
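+///
+/// As an illustrative sketch only (the exact operands depend on the
+/// frontend that produced the module), the metadata concerned has this
+/// general shape:
+///
+///   !opencl.kernels = !{!0}
+///   !0 = !{ptr @scalar_kernel, ...}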
+void cloneOpenCLMetadata(const VectorizationUnit &VU); +} // namespace vecz + +#endif // VECZ_VECTORIZATION_HELPERS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h new file mode 100644 index 0000000000000..e80949be23143 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_heuristics.h @@ -0,0 +1,43 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef VECZ_VECTORIZATION_HEURISTICS_H_INCLUDED +#define VECZ_VECTORIZATION_HEURISTICS_H_INCLUDED + +#include + +namespace llvm { +class Function; +} // namespace llvm + +namespace vecz { +class VectorizationContext; + +/// @brief Decide whether a function is worth vectorizing for a given +/// vectorization factor. +/// +/// @param[in] F the function to analyze +/// @param[in] Ctx the vectorization context +/// @param[in] VF the vectorization factor +/// @param[in] SimdDimIdx the vectorization dimension +/// +/// @return Whether we should vectorize the function or not. +bool shouldVectorize(llvm::Function &F, VectorizationContext &Ctx, + llvm::ElementCount VF, unsigned SimdDimIdx); + +} // namespace vecz + +#endif // VECZ_VECTORIZATION_HEURISTICS_H_INCLUDED diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h new file mode 100644 index 0000000000000..820b83d53ad86 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorization_unit.h @@ -0,0 +1,258 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef VECZ_VECTORIZATION_UNIT_H_INCLUDED +#define VECZ_VECTORIZATION_UNIT_H_INCLUDED + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace llvm { +class Function; +class FunctionType; +class Module; +class Instruction; +class Argument; +} // namespace llvm + +namespace vecz { +namespace internal { +struct VeczFailResult; +struct AnalysisFailResult; +} // namespace internal + +struct SimdPacket; +struct UniformValueResult; +class ValueTagMap; +class VectorizationContext; +class VectorizationChoices; + +template class AnalysisWrapper; + +/// @brief Describe an argument of a function that needs to be vectorized. +struct VectorizerTargetArgument { + /// @brief Argument of the scalar function. + llvm::Argument *OldArg; + /// @brief Argument of the vectorized function. Might be scalar or vector. + llvm::Argument *NewArg; + /// @brief Whether the argument needs to be vectorized or not. + bool IsVectorized; + /// @brief If the argument is a 'byref' pointer used to return a value, this + /// is the type of that value. Else it is null. + llvm::Type *PointerRetPointeeTy; + /// @brief Placeholder instruction for arguments needing vectorization. + llvm::Instruction *Placeholder; +}; + +/// @brief Analysis flags that can be attached to LLVM functions. +enum FunctionFlags { + eFunctionNoFlag = 0, + /// @brief The function has been analyzed. + /// Set by the preliminary vectorization analysis (canVectorize). Set once. + eFunctionAnalysisDone = (1 << 0), + /// @brief The function can be vectorized. + /// Set by the preliminary vectorization analysis (canVectorize). Set once. + eFunctionVectorizable = (1 << 1), + /// @brief Vectorization of the function failed. + /// Can be set by any pass. Set once. + eFunctionVectorizationFailed = (1 << 2), +}; + +/// @brief struct to hold only the data needed to use a vectorized function +struct VectorizationResult { + struct Arg { + enum Kind { SCALAR, VECTORIZED, POINTER_RETURN } kind; + llvm::Type *type; + llvm::Type *pointerRetPointeeTy = nullptr; + constexpr Arg(Kind k, llvm::Type *ty, llvm::Type *ptrRetTy) + : kind(k), type(ty), pointerRetPointeeTy(ptrRetTy) {} + }; + + llvm::Function *func = nullptr; + llvm::SmallVector args; + + operator bool() const { return func; } + llvm::Function *get() const { return func; } +}; + +/// @brief Describe a function that needs to be vectorized. +class VectorizationUnit { +public: + /// @brief Create a new vectorization unit for the given scalar function. + /// + /// @param[in] F Function to vectorize. + /// @param[in] Width SIMD width (i.e. vectorization factor) to use. + /// @param[in] Dimension SIMD dimension to use (0 => x, 1 => y, 2 => z). + /// @param[in] Ctx Context for vectorization. + /// @param[in] Ch Vectorization Choices for the vectorization. + VectorizationUnit(llvm::Function &F, llvm::ElementCount Width, + unsigned Dimension, VectorizationContext &Ctx, + const VectorizationChoices &Ch); + /// @brief Free up any resource used by the function. + ~VectorizationUnit(); + + /// @brief Access the vectorization context linked to this function. + VectorizationContext &context() { return Ctx; } + + /// @brief Access the vectorization context linked to this function. + const VectorizationContext &context() const { return Ctx; } + + /// @brief Number of available SIMD lanes, i.e. vectorization factor. 
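+  ///
+  /// For example, llvm::ElementCount::getFixed(8) describes a fixed 8-wide
+  /// vectorization, while llvm::ElementCount::getScalable(4) describes a
+  /// scalable factor of vscale x 4.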
+ llvm::ElementCount width() const { return SimdWidth; } + + /// @brief Get the work group size along the vectorization dimension. + uint64_t getLocalSize() const { return LocalSize; } + + /// @brief Whether to run the SIMD Width Analysis during vectorization. + bool autoWidth() const { return AutoSimdWidth; } + + /// @brief Index of SIMD dimension used in vectorization. + unsigned dimension() const { return SimdDimIdx; } + + /// @brief Set the SIMD width, i.e. vectorization factor. After changing this + /// value a possible existing vectorized function is looked up in the module. + /// + /// @param[in] NewWidth New SIMD width. + void setWidth(llvm::ElementCount NewWidth); + + /// @brief Set the work group size along the vectorization dimension. + /// + /// @param[in] LS the local work group size + void setLocalSize(uint64_t LS) { LocalSize = LS; } + + /// @brief Set whether to use the SIMD width analysis + /// + /// @param[in] Auto true to use auto SIMD width, false otherwise + void setAutoWidth(bool Auto) { AutoSimdWidth = Auto; } + + /// @brief Determine whether vectorizing the function failed or not. + bool failed() const { return hasFlag(eFunctionVectorizationFailed); } + + /// @brief Mark this function as failing vectorization. + /// @param[in] Remark Message to print into the optimization remarks + /// @param[in] F Function to pass to emitVeczRemarkMissed + /// @param[in] V Value to pass to emitVeczRemarkMissed + /// @return unconditionally returns a VeczFailResult which can be safely + /// ignored. This can help cut down on some boilerplate in contexts where + /// we'll immediately return, via the following idiom: + /// ``` + /// if (!thing) { + /// return setFailed("thing wasn't"); + /// } + /// ``` + internal::AnalysisFailResult setFailed(const char *Remark, + const llvm::Function *F = nullptr, + const llvm::Value *V = nullptr); + + /// @brief Check whether the function has the given flag or not. + /// + /// @param[in] Flag Flag to check. + /// + /// @return true if the function has the given flag, false otherwise. + bool hasFlag(FunctionFlags Flag) const { return (FnFlags & Flag) == Flag; } + + /// @brief Set the given flag to the function. + /// + /// @param[in] Flag Flag to set. + void setFlag(FunctionFlags Flag) { + FnFlags = (FunctionFlags)(FnFlags | Flag); + } + + /// @brief Clear the given flag from the function. + /// + /// @param[in] Flag Flag to set. + void clearFlag(FunctionFlags Flag) { + FnFlags = (FunctionFlags)(FnFlags & ~Flag); + } + + /// @brief Access the arguments of the function to vectorize. + const llvm::SmallVectorImpl &arguments() const { + return Arguments; + } + + /// @brief Return the vectorized function if it exists, otherwise the original + /// function. + llvm::Function &function(); + + /// @brief Return the vectorized function if it exists, otherwise the original + /// function. + const llvm::Function &function() const; + + /// @brief Original function to vectorize. + llvm::Function *scalarFunction() const { return ScalarFn; } + + /// @brief Set the function to vectorize. This updates the function arguments. + /// + /// @param[in] NewFunction Original function. + void setScalarFunction(llvm::Function *NewFunction); + + /// @brief Vectorized function. + llvm::Function *vectorizedFunction() const { return VectorizedFn; } + + /// @brief Set the vectorized function. This updates the function arguments. + /// + /// @param[in] NewFunction Vectorized function. 
+  void setVectorizedFunction(llvm::Function *NewFunction);
+
+  /// @brief Name of the current function.
+  llvm::StringRef getName() const { return function().getName(); }
+
+  /// @brief Get the result of the vectorization
+  /// @return The VectorizationResult representing the vectorized function
+  VectorizationResult getResult() const;
+
+  /// @brief Get the Vecz optimizations tracker class
+  /// @return The Choices
+  const VectorizationChoices &choices() const { return Choices; }
+
+private:
+  /// @brief Context this function is vectorized in.
+  VectorizationContext &Ctx;
+  /// @brief Which Vecz code generation choices are enabled and which are not
+  const VectorizationChoices &Choices;
+  /// @brief Function to vectorize.
+  llvm::Function *ScalarFn;
+  /// @brief Target (vectorized) function.
+  llvm::Function *VectorizedFn;
+  /// @brief Arguments of the function to vectorize.
+  llvm::SmallVector<VectorizerTargetArgument, 4> Arguments;
+  /// @brief Vectorization factor to use.
+  llvm::ElementCount SimdWidth;
+  /// @brief The work group size along the vectorization dimension, if known,
+  /// zero otherwise. For our purposes, this need only be an upper bound.
+  uint64_t LocalSize;
+  /// @brief Use the SIMD Width Analysis to determine the SIMD width
+  bool AutoSimdWidth;
+  /// @brief Index of the vectorization dimension to use.
+  unsigned SimdDimIdx;
+  /// @brief Name of the builtin function, if the function to vectorize is one.
+  std::string BuiltinName;
+  /// @brief Per-function analysis flags.
+  FunctionFlags FnFlags;
+  /// @brief Placeholder instructions for arguments that will be vectorized.
+  llvm::SmallPtrSet<llvm::Instruction *, 4> ArgumentPlaceholders;
+};
+
+} // namespace vecz
+
+#endif // VECZ_VECTORIZATION_UNIT_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h
new file mode 100644
index 0000000000000..483a46af5c681
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vectorizer.h
@@ -0,0 +1,74 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file vectorizer.h
+///
+/// @brief Entry point for the kernel vectorizer.
+
+#ifndef VECZ_VECTORIZER_H_INCLUDED
+#define VECZ_VECTORIZER_H_INCLUDED
+
+#include
+#include
+
+namespace llvm {
+class Function;
+} // namespace llvm
+
+namespace vecz {
+
+/// @brief The maximum number of vectorization dimensions that Vecz can handle.
+///
+/// The current limitation is due to the assumption that work groups are
+/// being represented as 1-, 2- or 3-dimensional arrays of work items.
+const unsigned MAX_SIMD_DIM = 3;
+
+class VectorizationContext;
+class VectorizationUnit;
+struct VeczPassOptions;
+
+/// @brief Try to create a vectorization unit for the given kernel function,
+/// with the given vectorization factor and vectorization options.
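+///
+/// A minimal usage sketch (illustrative; assumes a configured context,
+/// options struct and function analysis manager):
+///
+///   if (auto *VU = createVectorizationUnit(Ctx, &Kernel, Opts, FAM,
+///                                          /*Check=*/true)) {
+///     // Kernel is vectorizable with the requested options.
+///   }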
+///
+/// @param[in] Ctx VectorizationContext used to perform the vectorization.
+/// @param[in] Kernel kernel function to vectorize.
+/// @param[in] Opts Vecz Pass Options struct for this vectorization.
+/// @param[in] FAM Function Analysis Manager for running analyses
+/// @param[in] Check check for vectorizability before creating the VU
+///
+/// @return Pointer to a vectorization unit on success, or nullptr on failure.
+VectorizationUnit *createVectorizationUnit(VectorizationContext &Ctx,
+                                           llvm::Function *Kernel,
+                                           const VeczPassOptions &Opts,
+                                           llvm::FunctionAnalysisManager &FAM,
+                                           bool Check);
+
+/// @brief Create metadata for the vectorization unit relating the vectorized
+/// function to the scalar function.
+///
+/// @param[in] VU the vectorization unit to create metadata for
+/// @returns true iff vectorization succeeded.
+bool createVectorizedFunctionMetadata(VectorizationUnit &VU);
+
+/// @brief Register failure or success, and update statistics, for the given
+/// VectorizationUnit.
+///
+/// @param[in] VU the vectorization unit to register and update statistics for
+void trackVeczSuccessFailure(VectorizationUnit &VU);
+} // namespace vecz
+
+#endif // VECZ_VECTORIZER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h
new file mode 100644
index 0000000000000..a51e66c4ec024
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/include/vecz_pass_builder.h
@@ -0,0 +1,68 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+/// @file vecz_pass_builder.h
+///
+/// @brief Class to initialize a Module Pass Manager to perform vectorization.
+
+#ifndef VECZ_VECZ_PASS_BUILDER_H_INCLUDED
+#define VECZ_VECZ_PASS_BUILDER_H_INCLUDED
+
+#include
+#include
+
+namespace llvm {
+class Module;
+class TargetTransformInfo;
+class TargetMachine;
+} // namespace llvm
+
+namespace vecz {
+class VectorizationContext;
+
+/// @brief A class that manages the lifetime and initialization of all
+/// components required to set up an LLVM pass manager to run Vecz passes.
+class VeczPassMachinery final : public compiler::utils::PassMachinery {
+public:
+  /// @brief Construct the pass machinery.
+  /// The base class method `initialize(TargetInfo)` must also be called.
+  ///
+  /// @param[in] llvmCtx the LLVM context to use.
+  /// @param[in] TM TargetMachine to be used for passes. May be nullptr
+  /// @param[in] ctx the vectorization context object for the module.
+  /// @param[in] verifyEach true if each pass should be verified
+  /// @param[in] debugLogLevel debug logging verbosity.
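+  ///
+  /// A minimal construction sketch (illustrative; assumes an existing
+  /// VectorizationContext `VeczCtx` for module `M`):
+  ///
+  ///   VeczPassMachinery Machinery(M.getContext(), /*TM=*/nullptr, VeczCtx,
+  ///                               /*verifyEach=*/false);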
+  VeczPassMachinery(llvm::LLVMContext &llvmCtx, llvm::TargetMachine *TM,
+                    VectorizationContext &ctx, bool verifyEach,
+                    compiler::utils::DebugLogging debugLogLevel =
+                        compiler::utils::DebugLogging::None);
+
+  virtual void registerPasses() override;
+
+private:
+  virtual void addClassToPassNames() override;
+  virtual void registerPassCallbacks() override;
+
+  VectorizationContext &Ctx;
+};
+
+/// @brief Add the full Vecz pass pipeline to the given pass manager.
+///
+/// @param[in] PM The Module Pass Manager to build.
+/// @return true on success.
+bool buildPassPipeline(llvm::ModulePassManager &PM);
+} // namespace vecz
+
+#endif // VECZ_VECZ_PASS_BUILDER_H_INCLUDED
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
new file mode 100644
index 0000000000000..28ec40fc6f7c7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/ir_cleanup.cpp
@@ -0,0 +1,143 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "ir_cleanup.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "memory_operations.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+
+/// @brief Determine whether all users of the instruction are dead. A user is
+/// dead if it has no uses, if it is present in the 'to delete' list, or if it
+/// is a phi node whose only remaining use is the loop 'backedge'.
+///
+/// @param[in] I Instruction to check for deletion.
+/// @param[in] DeadList Instructions marked for deletion.
+/// @param[in,out] WorkList Newly detected Instructions marked for deletion.
+/// @param[in,out] Visited Instructions visited for deletion.
+///
+/// @return true if all users of the instruction are dead, false otherwise.
+bool AreUsersDead(Instruction *I,
+                  const SmallPtrSetImpl<Instruction *> &DeadList,
+                  SmallPtrSetImpl<Instruction *> &WorkList,
+                  SmallPtrSetImpl<Instruction *> &Visited) {
+  for (User *U : I->users()) {
+    // Ignore non-instructions.
+    Instruction *UserI = dyn_cast<Instruction>(U);
+    if (!UserI) {
+      continue;
+    }
+
+    // Trivially dead users can be removed, even if we haven't explicitly marked
+    // them for deletion. The DCE pass would have removed these later on anyway,
+    // and by marking them for deletion here we can be more aggressive about
+    // what we delete.
+    if (isInstructionTriviallyDead(UserI)) {
+      WorkList.insert(UserI);
+    }
+
+    // I is held by a non-dead user.
+    if (!DeadList.contains(UserI) && !WorkList.contains(UserI)) {
+      return false;
+    }
+
+    // Recurse over the user's users.
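+    // Note: the Visited set ensures each user is explored at most once, so
+    // cyclic use chains (e.g. phi nodes in loops) cannot cause infinite
+    // recursion here.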
+    if (!UserI->user_empty() && Visited.insert(UserI).second &&
+        !AreUsersDead(UserI, DeadList, WorkList, Visited)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+} // namespace
+
+void IRCleanup::deleteInstructionLater(llvm::Instruction *I) {
+  if (InstructionsToDelete.insert(I).second) {
+    LLVM_DEBUG(dbgs() << "Marking for deletion: " << *I << "\n");
+  }
+}
+
+void IRCleanup::deleteInstructions() {
+  SmallPtrSet<Instruction *, 16> WorkList;
+  SmallPtrSet<Instruction *, 16> VisitedForCycles;
+  bool progress = true;
+  while (progress && !InstructionsToDelete.empty()) {
+    progress = false;
+    for (Instruction *I : InstructionsToDelete) {
+      WorkList.erase(I);
+      if (I->use_empty()) {
+        I->eraseFromParent();
+        progress = true;
+      } else if (PHINode *Phi = dyn_cast<PHINode>(I)) {
+        if (AreUsersDead(Phi, InstructionsToDelete, WorkList,
+                         VisitedForCycles)) {
+          Phi->replaceAllUsesWith(PoisonValue::get(Phi->getType()));
+          Phi->eraseFromParent();
+          progress = true;
+        } else {
+          WorkList.insert(Phi);
+        }
+        VisitedForCycles.clear();
+      } else if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        // MemOps make deleting unnecessary instructions harder, because they
+        // cannot be trivially dead instructions, thus breaking our recursive
+        // deletion. However, if we have packetized a load or a store, we
+        // definitely want to remove the scalar one, as it will be
+        // reading/writing to invalid pointers. To make things simpler, here we
+        // detect internal builtins that perform memory operations and erase
+        // them. Since stores have no users, they will be removed earlier on and
+        // we do not need to check here.
+        auto Op = MemOp::get(CI);
+        if (Op && Op->isLoad()) {
+          // We need to replace loads with nops, as we need to have a value for
+          // their users, which will be removed later on.
+          I->replaceAllUsesWith(PoisonValue::get(Op->getDataType()));
+          I->eraseFromParent();
+        } else {
+          WorkList.insert(I);
+        }
+      } else {
+        WorkList.insert(I);
+      }
+    }
+    InstructionsToDelete = std::move(WorkList);
+    WorkList.clear();
+  }
+
+  // Remove remaining instructions from the list.
+  LLVM_DEBUG(for (Instruction *I : InstructionsToDelete) {
+    dbgs() << "vecz: could not delete " << *I << "\n";
+  });
+  InstructionsToDelete.clear();
+}
+
+void IRCleanup::deleteInstructionNow(Instruction *I) {
+  I->replaceAllUsesWith(PoisonValue::get(I->getType()));
+  I->eraseFromParent();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp
new file mode 100644
index 0000000000000..a6252e834ad43
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/llvm_helpers.cpp
@@ -0,0 +1,73 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "llvm_helpers.h"
+
+#include
+#include
+#include
+
+#include "debugging.h"
+#include "memory_operations.h"
+
+using namespace llvm;
+
+/// @brief Determine if the value has vector type, and return it.
+///
+/// @param[in] V Value to analyze.
+///
+/// @return Vector type of V or null.
+FixedVectorType *vecz::getVectorType(Value *V) {
+  if (StoreInst *Store = dyn_cast<StoreInst>(V)) {
+    auto *VO = Store->getValueOperand();
+    assert(VO && "Could not get value operand");
+    return dyn_cast<FixedVectorType>(VO->getType());
+  } else if (CallInst *Call = dyn_cast<CallInst>(V)) {
+    if (auto MaskedOp = MemOp::get(Call, MemOpAccessKind::Masked)) {
+      if (MaskedOp->isMaskedMemOp() && MaskedOp->isStore()) {
+        return dyn_cast<FixedVectorType>(MaskedOp->getDataType());
+      }
+    }
+  }
+  return dyn_cast<FixedVectorType>(V->getType());
+}
+
+/// @brief Get the default value for a type.
+///
+/// @param[in] T Type to get default value of.
+/// @param[in] V Default value to use for numeric type
+///
+/// @return Default value, which will be poison for non-numeric types
+Value *vecz::getDefaultValue(Type *T, uint64_t V) {
+  if (T->isIntegerTy()) {
+    return ConstantInt::get(T, V);
+  }
+
+  if (T->isFloatTy() || T->isDoubleTy()) {
+    return ConstantFP::get(T, V);
+  }
+
+  return PoisonValue::get(T);
+}
+
+/// @brief Get the shuffle mask as sequence of integers.
+///
+/// @param[in] Shuffle Instruction
+///
+/// @return Array of integers representing the Shuffle mask
+ArrayRef<int> vecz::getShuffleVecMask(ShuffleVectorInst *Shuffle) {
+  return Shuffle->getShuffleMask();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
new file mode 100644
index 0000000000000..aedcd49128678
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/memory_operations.cpp
@@ -0,0 +1,966 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "memory_operations.h"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "analysis/instantiation_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "debugging.h"
+#include "vectorization_context.h"
+#include "vectorization_unit.h"
+
+using namespace vecz;
+using namespace llvm;
+
+static std::string getMaskedMemOpName(Type *DataTy, PointerType *PtrTy,
+                                      Type *MaskTy, unsigned Alignment,
+                                      bool IsLoad, bool IsVP) {
+  if (!DataTy) {
+    return std::string();
+  }
+  compiler::utils::NameMangler Mangler(&DataTy->getContext());
+  const char *BaseName = IsLoad ?
"masked_load" : "masked_store"; + const compiler::utils::TypeQualifiers DataQuals( + compiler::utils::eTypeQualNone); + const compiler::utils::TypeQualifiers PtrQuals( + compiler::utils::eTypeQualNone, compiler::utils::eTypeQualNone); + const compiler::utils::TypeQualifiers MaskQuals( + compiler::utils::eTypeQualNone); + std::string Name; + raw_string_ostream O(Name); + O << VectorizationContext::InternalBuiltinPrefix << BaseName << Alignment + << "_"; + if (IsVP) { + O << "vp_"; + } + if (!Mangler.mangleType(O, DataTy, DataQuals) || + !Mangler.mangleType(O, PtrTy, PtrQuals) || + !Mangler.mangleType(O, MaskTy, MaskQuals)) { + return std::string(); + } + if (IsVP) { + const compiler::utils::TypeQualifiers VLQuals( + compiler::utils::eTypeQualNone); + if (!Mangler.mangleType(O, IntegerType::getInt32Ty(DataTy->getContext()), + VLQuals)) { + return std::string(); + } + } + O.flush(); + return Name; +} + +Function *vecz::getOrCreateMaskedMemOpFn(VectorizationContext &Ctx, + Type *DataTy, PointerType *PtrTy, + unsigned Alignment, bool IsLoad, + bool IsVP) { + const Module &M = Ctx.module(); + LLVMContext &LLVMCtx = M.getContext(); + Type *MaskTy = IntegerType::getInt1Ty(LLVMCtx); + if (auto *VecTy = dyn_cast(DataTy)) { + MaskTy = VectorType::get(MaskTy, multi_llvm::getVectorElementCount(VecTy)); + } + + // Try to retrieve the builtin if it already exists. + const std::string Name = + getMaskedMemOpName(DataTy, PtrTy, MaskTy, Alignment, IsLoad, IsVP); + VECZ_FAIL_IF(Name.empty()); + Function *F = Ctx.getOrCreateInternalBuiltin(Name, nullptr); + if (!F) { + // Declare it if it doesn't exist. + SmallVector Tys; + if (!IsLoad) { + Tys.push_back(DataTy); + } + Tys.push_back(PtrTy); + Tys.push_back(MaskTy); + if (IsVP) { + Tys.push_back(IntegerType::getInt32Ty(LLVMCtx)); + } + + Type *RetTy = IsLoad ? DataTy : Type::getVoidTy(LLVMCtx); + FunctionType *FT = FunctionType::get(RetTy, Tys, false); + F = Ctx.getOrCreateInternalBuiltin(Name, FT); + } + return F; +} + +static CallInst *createMaskedMemOp(VectorizationContext &Ctx, Value *Data, + Type *DataTy, Value *Ptr, Value *Mask, + Value *EVL, unsigned Alignment, Twine Name) { + VECZ_FAIL_IF(!DataTy); + VECZ_FAIL_IF(!Ptr || !Ptr->getType()->isPointerTy()); + VECZ_FAIL_IF(!Mask); + assert(!Data || Data->getType() == DataTy); + auto *PtrTy = cast(Ptr->getType()); + Function *F = + getOrCreateMaskedMemOpFn(Ctx, DataTy, PtrTy, Alignment, + /*IsLoad*/ Data == nullptr, EVL != nullptr); + VECZ_FAIL_IF(!F); + SmallVector Ops; + if (Data) { + Ops.push_back(Data); + } + Ops.push_back(Ptr); + Ops.push_back(Mask); + if (EVL) { + Ops.push_back(EVL); + } + return CallInst::Create(F, Ops, Name); +} + +CallInst *vecz::createMaskedLoad(VectorizationContext &Ctx, Type *Ty, + Value *Ptr, Value *Mask, Value *EVL, + unsigned Alignment, Twine Name) { + return createMaskedMemOp(Ctx, /*Data*/ nullptr, Ty, Ptr, Mask, EVL, Alignment, + Name); +} + +CallInst *vecz::createMaskedStore(VectorizationContext &Ctx, Value *Data, + Value *Ptr, Value *Mask, Value *EVL, + unsigned Alignment, Twine Name) { + return createMaskedMemOp(Ctx, Data, Data->getType(), Ptr, Mask, EVL, + Alignment, Name); +} + +static std::string getInterleavedMemOpName(Type *DataTy, PointerType *PtrTy, + Value *Stride, Type *MaskTy, + unsigned Alignment, bool IsLoad, + bool IsVP) { + if (!DataTy) { + return std::string(); + } + compiler::utils::NameMangler Mangler(&DataTy->getContext()); + const char *BaseName = IsLoad ? 
"interleaved_load" : "interleaved_store"; + std::string Name; + const compiler::utils::TypeQualifiers VecQuals( + compiler::utils::eTypeQualNone, compiler::utils::eTypeQualNone); + const compiler::utils::TypeQualifiers PtrQuals( + compiler::utils::eTypeQualNone, compiler::utils::eTypeQualNone); + raw_string_ostream O(Name); + O << VectorizationContext::InternalBuiltinPrefix; + if (MaskTy) { + O << "masked_"; + } + O << BaseName << Alignment << "_"; + if (IsVP) { + O << "vp_"; + } + if (auto *CVal = dyn_cast(Stride)) { + O << CVal->getSExtValue(); + } else { + O << "V"; + } + O << "_"; + if (!Mangler.mangleType(O, DataTy, VecQuals) || + !Mangler.mangleType(O, PtrTy, PtrQuals)) { + return std::string(); + } + if (MaskTy) { + const compiler::utils::TypeQualifiers MaskQuals( + compiler::utils::eTypeQualNone); + if (!Mangler.mangleType(O, MaskTy, MaskQuals)) { + return std::string(); + } + } + if (IsVP) { + const compiler::utils::TypeQualifiers VLQuals( + compiler::utils::eTypeQualNone); + if (!Mangler.mangleType(O, IntegerType::getInt32Ty(DataTy->getContext()), + VLQuals)) { + return std::string(); + } + } + O.flush(); + return Name; +} + +Function *vecz::getOrCreateInterleavedMemOpFn(VectorizationContext &Ctx, + Type *DataTy, PointerType *PtrTy, + Value *Stride, Type *MaskTy, + unsigned Alignment, bool IsLoad, + bool IsVP) { + Module &M = Ctx.module(); + LLVMContext &LLVMCtx = M.getContext(); + + // Try to retrieve the builtin if it already exists. + const std::string Name = getInterleavedMemOpName( + DataTy, PtrTy, Stride, MaskTy, Alignment, IsLoad, IsVP); + VECZ_FAIL_IF(Name.empty()); + Function *F = Ctx.getOrCreateInternalBuiltin(Name, nullptr); + if (!F) { + // Declare it if it doesn't exist. + SmallVector Tys; + if (!IsLoad) { + VECZ_FAIL_IF(!DataTy); + Tys.push_back(DataTy); + } + VECZ_FAIL_IF(!PtrTy); + Tys.push_back(PtrTy); + if (MaskTy) { + Tys.push_back(MaskTy); + } + if (IsVP) { + Tys.push_back(IntegerType::getInt32Ty(LLVMCtx)); + } + if (!isa(Stride)) { + Tys.push_back(getSizeTy(M)); + } + Type *RetTy = IsLoad ? DataTy : Type::getVoidTy(LLVMCtx); + FunctionType *FT = FunctionType::get(RetTy, Tys, false); + F = Ctx.getOrCreateInternalBuiltin(Name, FT); + } + return F; +} + +static CallInst *createInterleavedMemOp(VectorizationContext &Ctx, Value *Data, + Type *DataTy, Value *Ptr, Value *Stride, + Value *Mask, Value *EVL, + unsigned Alignment, llvm::Twine Name) { + VECZ_FAIL_IF(!DataTy); + VECZ_FAIL_IF(!Ptr || !Ptr->getType()->isPointerTy()); + assert(!Data || Data->getType() == DataTy); + auto *PtrTy = cast(Ptr->getType()); + Type *MaskTy = Mask ? 
Mask->getType() : nullptr; + Function *F = getOrCreateInterleavedMemOpFn( + Ctx, DataTy, PtrTy, Stride, MaskTy, Alignment, + /*IsLoad*/ Data == nullptr, EVL != nullptr); + VECZ_FAIL_IF(!F); + SmallVector Ops; + if (Data) { + Ops.push_back(Data); + } + Ops.push_back(Ptr); + if (Mask) { + Ops.push_back(Mask); + } + if (EVL) { + Ops.push_back(EVL); + } + if (!isa(Stride)) { + Ops.push_back(Stride); + } + return CallInst::Create(F, Ops, Name); +} + +CallInst *vecz::createInterleavedLoad(VectorizationContext &Ctx, Type *Ty, + Value *Ptr, Value *Stride, Value *Mask, + Value *EVL, unsigned Alignment, + Twine Name) { + return createInterleavedMemOp(Ctx, /*Data*/ nullptr, Ty, Ptr, Stride, Mask, + EVL, Alignment, Name); +} + +CallInst *vecz::createInterleavedStore(VectorizationContext &Ctx, Value *Data, + Value *Ptr, Value *Stride, Value *Mask, + Value *EVL, unsigned Alignment, + Twine Name) { + return createInterleavedMemOp(Ctx, Data, Data->getType(), Ptr, Stride, Mask, + EVL, Alignment, Name); +} + +static std::string getScatterGatherMemOpName(Type *DataTy, VectorType *VecPtrTy, + Type *MaskTy, unsigned Alignment, + bool IsGather, bool IsVP) { + if (!DataTy) { + return std::string(); + } + compiler::utils::NameMangler Mangler(&DataTy->getContext()); + const char *BaseName = IsGather ? "gather_load" : "scatter_store"; + std::string Name; + const compiler::utils::TypeQualifiers VecQuals( + compiler::utils::eTypeQualNone, compiler::utils::eTypeQualNone); + compiler::utils::TypeQualifiers PtrQuals(compiler::utils::eTypeQualNone, + compiler::utils::eTypeQualNone); + const compiler::utils::TypeQualifiers MaskQuals( + compiler::utils::eTypeQualNone); + PtrQuals.push_back(compiler::utils::eTypeQualNone); + raw_string_ostream O(Name); + O << VectorizationContext::InternalBuiltinPrefix; + if (MaskTy) { + O << "masked_"; + } + O << BaseName << Alignment << "_"; + if (IsVP) { + O << "vp_"; + } + if (!Mangler.mangleType(O, DataTy, VecQuals) || + !Mangler.mangleType(O, VecPtrTy, PtrQuals)) { + return std::string(); + } + if (MaskTy && !Mangler.mangleType(O, MaskTy, MaskQuals)) { + return std::string(); + } + if (IsVP) { + const compiler::utils::TypeQualifiers VLQuals( + compiler::utils::eTypeQualNone); + if (!Mangler.mangleType(O, IntegerType::getInt32Ty(DataTy->getContext()), + VLQuals)) { + return std::string(); + } + } + O.flush(); + return Name; +} + +Function *vecz::getOrCreateScatterGatherMemOpFn(vecz::VectorizationContext &Ctx, + llvm::Type *DataTy, + llvm::VectorType *VecPtrTy, + llvm::Type *MaskTy, + unsigned Alignment, + bool IsGather, bool IsVP) { + const Module &M = Ctx.module(); + LLVMContext &LLVMCtx = M.getContext(); + assert(VecPtrTy); + assert(!MaskTy || multi_llvm::getVectorElementCount(MaskTy) == + multi_llvm::getVectorElementCount(DataTy)); + + // Try to retrieve the builtin if it already exists. + const std::string Name = getScatterGatherMemOpName(DataTy, VecPtrTy, MaskTy, + Alignment, IsGather, IsVP); + VECZ_FAIL_IF(Name.empty()); + Function *F = Ctx.getOrCreateInternalBuiltin(Name, nullptr); + if (!F) { + // Declare it if it doesn't exist. + SmallVector Tys; + if (!IsGather) { + VECZ_FAIL_IF(!DataTy); + Tys.push_back(DataTy); + } + Tys.push_back(VecPtrTy); + if (MaskTy) { + Tys.push_back(MaskTy); + } + if (IsVP) { + Tys.push_back(IntegerType::getInt32Ty(LLVMCtx)); + } + + Type *RetTy = IsGather ? 
DataTy : Type::getVoidTy(LLVMCtx); + FunctionType *FT = FunctionType::get(RetTy, Tys, false); + F = Ctx.getOrCreateInternalBuiltin(Name, FT); + } + return F; +} + +static CallInst *createScatterGatherMemOp(VectorizationContext &Ctx, + Value *VecData, Type *DataTy, + Value *VecPtr, Value *Mask, + Value *EVL, unsigned Alignment, + Twine Name) { + VECZ_FAIL_IF(!DataTy); + VECZ_FAIL_IF(!VecPtr || !VecPtr->getType()->isVectorTy() || + !VecPtr->getType()->getScalarType()->isPointerTy()); + Type *MaskTy = Mask ? Mask->getType() : nullptr; + Function *F = getOrCreateScatterGatherMemOpFn( + Ctx, DataTy, cast(VecPtr->getType()), MaskTy, Alignment, + /*IsGather*/ VecData == nullptr, EVL != nullptr); + VECZ_FAIL_IF(!F); + SmallVector Ops; + if (VecData) { + Ops.push_back(VecData); + } + Ops.push_back(VecPtr); + if (Mask) { + Ops.push_back(Mask); + } + if (EVL) { + Ops.push_back(EVL); + } + return CallInst::Create(F, Ops, Name); +} + +llvm::CallInst *vecz::createGather(VectorizationContext &Ctx, llvm::Type *Ty, + llvm::Value *VecPtr, llvm::Value *Mask, + llvm::Value *EVL, unsigned Alignment, + llvm::Twine Name) { + return createScatterGatherMemOp(Ctx, /*Data*/ nullptr, Ty, VecPtr, Mask, EVL, + Alignment, Name); +} + +llvm::CallInst *vecz::createScatter(VectorizationContext &Ctx, + llvm::Value *VecData, llvm::Value *VecPtr, + llvm::Value *Mask, llvm::Value *EVL, + unsigned Alignment, llvm::Twine Name) { + return createScatterGatherMemOp(Ctx, VecData, VecData->getType(), VecPtr, + Mask, EVL, Alignment, Name); +} + +MemOpDesc::MemOpDesc() + : DataTy(nullptr), PtrTy(nullptr), MaskTy(nullptr), + Kind(MemOpKind::Invalid), AccessKind(MemOpAccessKind::Native), + IsVLOp(false), Alignment(1), Stride(nullptr), DataOpIdx(-1), PtrOpIdx(-1), + MaskOpIdx(-1), VLOpIdx(-1) {} + +bool MemOpDesc::isStrideConstantInt() const { + return Stride && isa(Stride); +} + +int64_t MemOpDesc::getStrideAsConstantInt() const { + return cast(Stride)->getSExtValue(); +} + +Argument *MemOpDesc::getOperand(Function *F, int OpIdx) const { + VECZ_FAIL_IF(!F || (OpIdx < 0) || ((size_t)OpIdx >= F->arg_size())); + return F->getArg(OpIdx); +} + +std::optional MemOpDesc::analyzeMemOpFunction(Function &F) { + if (auto Op = MemOpDesc::analyzeMaskedMemOp(F)) { + return Op; + } + if (auto Op = MemOpDesc::analyzeInterleavedMemOp(F)) { + return Op; + } + if (auto Op = MemOpDesc::analyzeMaskedInterleavedMemOp(F)) { + return Op; + } + if (auto Op = MemOpDesc::analyzeScatterGatherMemOp(F)) { + return Op; + } + if (auto Op = MemOpDesc::analyzeMaskedScatterGatherMemOp(F)) { + return Op; + } + return std::nullopt; +} + +std::optional MemOpDesc::analyzeMaskedMemOp(Function &F) { + const StringRef MangledName = F.getName(); + compiler::utils::Lexer L(MangledName); + if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) { + return std::nullopt; + } + + MemOpDesc Desc; + if (L.Consume("masked_store")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + Desc.IsVLOp = L.Consume("vp_"); + if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.DataTy = Arg->getType(); + ++Arg; + Desc.PtrTy = Arg->getType(); + Desc.Kind = MemOpKind::StoreCall; + Desc.DataOpIdx = 0; + Desc.PtrOpIdx = 1; + Desc.MaskOpIdx = 2; + Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType(); + Desc.VLOpIdx = Desc.IsVLOp ? 
Desc.MaskOpIdx + 1 : -1; + Desc.AccessKind = MemOpAccessKind::Masked; + return Desc; + } + + if (L.Consume("masked_load")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + Desc.IsVLOp = L.Consume("vp_"); + if (F.arg_size() != 2 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.PtrTy = Arg->getType(); + Desc.DataTy = F.getReturnType(); + Desc.Kind = MemOpKind::LoadCall; + Desc.DataOpIdx = -1; + Desc.PtrOpIdx = 0; + Desc.MaskOpIdx = 1; + Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType(); + Desc.VLOpIdx = Desc.IsVLOp ? Desc.MaskOpIdx + 1 : -1; + Desc.AccessKind = MemOpAccessKind::Masked; + return Desc; + } + return std::nullopt; +} + +std::optional MemOpDesc::analyzeInterleavedMemOp(Function &F) { + const StringRef MangledName = F.getName(); + compiler::utils::Lexer L(MangledName); + if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) { + return std::nullopt; + } + MemOpDesc Desc; + int ConstantStride{}; + if (L.Consume("interleaved_store")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + if (L.ConsumeSignedInteger(ConstantStride)) { + VECZ_ERROR_IF(F.arg_size() != 2, + "Wrong argument list size for interleaved store"); + Desc.Stride = ConstantInt::get(getSizeTy(*F.getParent()), ConstantStride); + } else if (L.Consume("V")) { + VECZ_ERROR_IF(F.arg_size() != 3, + "Wrong argument list size for interleaved store"); + auto ArgIt = F.arg_begin(); + std::advance(ArgIt, 2); + Desc.Stride = &*ArgIt; + } else { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.DataTy = Arg->getType(); + ++Arg; + Desc.PtrTy = Arg->getType(); + Desc.Kind = MemOpKind::StoreCall; + Desc.DataOpIdx = 0; + Desc.PtrOpIdx = 1; + Desc.AccessKind = MemOpAccessKind::Interleaved; + return Desc; + } + + if (L.Consume("interleaved_load")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + if (L.ConsumeSignedInteger(ConstantStride)) { + VECZ_ERROR_IF(F.arg_size() != 1, + "Wrong argument list size for interleaved load"); + Desc.Stride = ConstantInt::get(getSizeTy(*F.getParent()), ConstantStride); + } else if (L.Consume("V")) { + VECZ_ERROR_IF(F.arg_size() != 2, + "Wrong argument list size for interleaved load"); + auto ArgIt = F.arg_begin(); + std::advance(ArgIt, 1); + Desc.Stride = &*ArgIt; + } else { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.PtrTy = Arg->getType(); + Desc.DataTy = F.getReturnType(); + Desc.Kind = MemOpKind::LoadCall; + Desc.DataOpIdx = -1; + Desc.PtrOpIdx = 0; + Desc.AccessKind = MemOpAccessKind::Interleaved; + return Desc; + } + + return std::nullopt; +} + +std::optional MemOpDesc::analyzeMaskedInterleavedMemOp(Function &F) { + const StringRef MangledName = F.getName(); + compiler::utils::Lexer L(MangledName); + if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) { + return std::nullopt; + } + MemOpDesc Desc; + if (L.Consume("masked_interleaved_store")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + Desc.IsVLOp = L.Consume("vp_"); + // KLOCWORK "UNINIT.STACK.MUST" possible false positive + // Initialization of ConstantStride looks like an uninitialized access to + 
// Klocwork + int ConstantStride; + if (L.ConsumeSignedInteger(ConstantStride)) { + if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + Desc.Stride = ConstantInt::get(getSizeTy(*F.getParent()), ConstantStride); + } else if (L.Consume("V")) { + if (F.arg_size() != 4 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + auto ArgIt = F.arg_begin(); + std::advance(ArgIt, 3 + Desc.IsVLOp); + Desc.Stride = &*ArgIt; + } else { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.DataTy = Arg->getType(); + ++Arg; + Desc.PtrTy = Arg->getType(); + Desc.Kind = MemOpKind::StoreCall; + Desc.DataOpIdx = 0; + Desc.PtrOpIdx = 1; + Desc.MaskOpIdx = 2; + Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType(); + Desc.VLOpIdx = Desc.IsVLOp ? Desc.MaskOpIdx + 1 : -1; + Desc.AccessKind = MemOpAccessKind::MaskedInterleaved; + return Desc; + } + if (L.Consume("masked_interleaved_load")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + Desc.IsVLOp = L.Consume("vp_"); + // KLOCWORK "UNINIT.STACK.MUST" possible false positive + // Initialization of ConstantStride looks like an uninitialized access to + // Klocwork + int ConstantStride; + if (L.ConsumeSignedInteger(ConstantStride)) { + if (F.arg_size() != 2 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + Desc.Stride = ConstantInt::get(getSizeTy(*F.getParent()), ConstantStride); + } else if (L.Consume("V")) { + if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + auto ArgIt = F.arg_begin(); + std::advance(ArgIt, 2 + Desc.IsVLOp); + Desc.Stride = &*ArgIt; + } else { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.PtrTy = Arg->getType(); + Desc.DataTy = F.getReturnType(); + Desc.Kind = MemOpKind::LoadCall; + Desc.DataOpIdx = -1; + Desc.PtrOpIdx = 0; + Desc.MaskOpIdx = 1; + Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType(); + Desc.VLOpIdx = Desc.IsVLOp ? 
Desc.MaskOpIdx + 1 : -1; + Desc.AccessKind = MemOpAccessKind::MaskedInterleaved; + return Desc; + } + + return std::nullopt; +} + +std::optional MemOpDesc::analyzeScatterGatherMemOp(Function &F) { + const StringRef MangledName = F.getName(); + compiler::utils::Lexer L(MangledName); + if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) { + return std::nullopt; + } + MemOpDesc Desc; + if (L.Consume("scatter_store")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + if (F.arg_size() != 2) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.DataTy = Arg->getType(); + ++Arg; + Desc.PtrTy = Arg->getType(); + Desc.Kind = MemOpKind::StoreCall; + Desc.DataOpIdx = 0; + Desc.PtrOpIdx = 1; + Desc.AccessKind = MemOpAccessKind::ScatterGather; + return Desc; + } + + if (L.Consume("gather_load")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + if (F.arg_size() != 1) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.PtrTy = Arg->getType(); + Desc.DataTy = F.getReturnType(); + Desc.Kind = MemOpKind::LoadCall; + Desc.DataOpIdx = -1; + Desc.PtrOpIdx = 0; + Desc.AccessKind = MemOpAccessKind::ScatterGather; + return Desc; + } + + return std::nullopt; +} + +std::optional +MemOpDesc::analyzeMaskedScatterGatherMemOp(Function &F) { + const StringRef MangledName = F.getName(); + compiler::utils::Lexer L(MangledName); + if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) { + return std::nullopt; + } + + MemOpDesc Desc; + if (L.Consume("masked_scatter_store")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + Desc.IsVLOp = L.Consume("vp_"); + if (F.arg_size() != 3 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.DataTy = Arg->getType(); + ++Arg; + Desc.PtrTy = Arg->getType(); + Desc.Kind = MemOpKind::StoreCall; + Desc.DataOpIdx = 0; + Desc.PtrOpIdx = 1; + Desc.MaskOpIdx = 2; + Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType(); + Desc.VLOpIdx = Desc.IsVLOp ? Desc.MaskOpIdx + 1 : -1; + Desc.AccessKind = MemOpAccessKind::MaskedScatterGather; + return Desc; + } + + if (L.Consume("masked_gather_load")) { + if (!L.ConsumeInteger(Desc.Alignment)) { + return std::nullopt; + } + if (!L.Consume("_")) { + return std::nullopt; + } + Desc.IsVLOp = L.Consume("vp_"); + if (F.arg_size() != 2 + (unsigned)Desc.IsVLOp) { + return std::nullopt; + } + + Function::arg_iterator Arg = F.arg_begin(); + Desc.PtrTy = Arg->getType(); + Desc.DataTy = F.getReturnType(); + Desc.Kind = MemOpKind::LoadCall; + Desc.DataOpIdx = -1; + Desc.PtrOpIdx = 0; + Desc.MaskOpIdx = 1; + Desc.MaskTy = F.getArg(Desc.MaskOpIdx)->getType(); + Desc.VLOpIdx = Desc.IsVLOp ? 
Desc.MaskOpIdx + 1 : -1; + Desc.AccessKind = MemOpAccessKind::MaskedScatterGather; + return Desc; + } + + return std::nullopt; +} + +//////////////////////////////////////////////////////////////////////////////// + +std::optional MemOp::get(llvm::Instruction *I) { + if (LoadInst *LI = dyn_cast(I)) { + MemOpDesc Desc; + Desc.Kind = MemOpKind::LoadInstruction; + Desc.Alignment = LI->getAlign().value(); + Desc.DataTy = LI->getType(); + auto *PO = LI->getPointerOperand(); + assert(PO && "Could not get pointer operand"); + Desc.PtrTy = PO->getType(); + return MemOp(I, Desc); + } + if (StoreInst *SI = dyn_cast(I)) { + MemOpDesc Desc; + Desc.Kind = MemOpKind::StoreInstruction; + Desc.Alignment = SI->getAlign().value(); + assert(SI->getValueOperand() && "Could not get value operand"); + Desc.DataTy = SI->getValueOperand()->getType(); + auto *PO = SI->getPointerOperand(); + assert(PO && "Could not get pointer operand"); + Desc.PtrTy = PO->getType(); + return MemOp(I, Desc); + } + if (CallInst *CI = dyn_cast(I)) { + if (Function *Caller = CI->getCalledFunction()) { + if (auto FnOp = MemOpDesc::analyzeMemOpFunction(*Caller)) { + return MemOp(I, *FnOp); + } + } + } + return std::nullopt; +} + +std::optional MemOp::get(llvm::CallInst *CI, + MemOpAccessKind AccessKind) { + if (!CI->getCalledFunction()) { + return std::nullopt; + } + std::optional Desc; + if (Function *Caller = CI->getCalledFunction()) { + switch (AccessKind) { + default: + return std::nullopt; + case MemOpAccessKind::Masked: + Desc = MemOpDesc::analyzeMaskedMemOp(*Caller); + break; + case MemOpAccessKind::Interleaved: + Desc = MemOpDesc::analyzeInterleavedMemOp(*Caller); + break; + case MemOpAccessKind::MaskedInterleaved: + Desc = MemOpDesc::analyzeMaskedInterleavedMemOp(*Caller); + break; + case MemOpAccessKind::ScatterGather: + Desc = MemOpDesc::analyzeScatterGatherMemOp(*Caller); + break; + case MemOpAccessKind::MaskedScatterGather: + Desc = MemOpDesc::analyzeMaskedScatterGatherMemOp(*Caller); + break; + } + } + if (!Desc) { + return std::nullopt; + } + return MemOp(CI, *Desc); +} + +MemOp::MemOp(Instruction *I, const MemOpDesc &desc) { + Ins = I; + Desc = desc; +} + +llvm::Value *MemOp::getCallOperand(int OpIdx) const { + VECZ_FAIL_IF((Desc.getKind() != MemOpKind::LoadCall) && + (Desc.getKind() != MemOpKind::StoreCall)); + CallInst *CI = dyn_cast(Ins); + VECZ_FAIL_IF(!CI || (OpIdx < 0) || ((unsigned)OpIdx >= CI->arg_size())); + return CI->getArgOperand((unsigned)OpIdx); +} + +bool MemOp::setCallOperand(int OpIdx, Value *V) { + VECZ_FAIL_IF((Desc.getKind() != MemOpKind::LoadCall) && + (Desc.getKind() != MemOpKind::StoreCall)); + CallInst *CI = dyn_cast(Ins); + VECZ_FAIL_IF(!CI || (OpIdx < 0) || ((unsigned)OpIdx >= CI->arg_size())); + CI->setArgOperand((unsigned)OpIdx, V); + return true; +} + +llvm::Value *MemOp::getDataOperand() const { + if (Desc.getKind() == MemOpKind::StoreInstruction) { + return cast(Ins)->getValueOperand(); + } else if (Desc.getKind() == MemOpKind::StoreCall) { + return getCallOperand(Desc.getDataOperandIndex()); + } else { + return nullptr; + } +} + +llvm::Value *MemOp::getPointerOperand() const { + switch (Desc.getKind()) { + default: + return nullptr; + case MemOpKind::LoadInstruction: + return cast(Ins)->getPointerOperand(); + case MemOpKind::StoreInstruction: + return cast(Ins)->getPointerOperand(); + case MemOpKind::LoadCall: + case MemOpKind::StoreCall: + return getCallOperand(Desc.getPointerOperandIndex()); + } +} + +llvm::Value *MemOp::getMaskOperand() const { + switch (Desc.getKind()) { + default: + 
return nullptr; + case MemOpKind::LoadCall: + case MemOpKind::StoreCall: + return getCallOperand(Desc.getMaskOperandIndex()); + } +} + +bool MemOp::setDataOperand(Value *V) { + if (Desc.getKind() == MemOpKind::StoreInstruction) { + cast(Ins)->setOperand(0, V); + return true; + } else if (Desc.getKind() == MemOpKind::StoreCall) { + return setCallOperand(Desc.getDataOperandIndex(), V); + } else { + return false; + } +} + +bool MemOp::setPointerOperand(Value *V) { + switch (Desc.getKind()) { + default: + return false; + case MemOpKind::LoadInstruction: + cast(Ins)->setOperand(0, V); + return true; + case MemOpKind::StoreInstruction: + cast(Ins)->setOperand(1, V); + return true; + case MemOpKind::LoadCall: + case MemOpKind::StoreCall: + return setCallOperand(Desc.getPointerOperandIndex(), V); + } +} + +bool MemOp::setMaskOperand(Value *V) { + switch (Desc.getKind()) { + default: + return false; + case MemOpKind::LoadCall: + case MemOpKind::StoreCall: + return setCallOperand(Desc.getMaskOperandIndex(), V); + } +} + +CallInst *MemOp::getCall() const { + VECZ_FAIL_IF((Desc.getKind() != MemOpKind::LoadCall) && + (Desc.getKind() != MemOpKind::StoreCall)); + return dyn_cast(Ins); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp new file mode 100644 index 0000000000000..a93fdacbdc9a2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/offset_info.cpp @@ -0,0 +1,1070 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "offset_info.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "analysis/instantiation_analysis.h" +#include "analysis/stride_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "memory_operations.h" +#include "vectorization_context.h" +#include "vectorization_unit.h" + +using namespace vecz; +using namespace llvm; + +namespace { +inline uint64_t SizeOrZero(TypeSize &&T) { + return T.isScalable() ? 0 : T.getFixedValue(); +} + +uint8_t highbit(const uint32_t x) { + assert(isPowerOf2_32(x) && "Value must be a power of two"); + // This is a De Bruijn hash table, it returns the index of the highest + // bit, which works when x is a power of 2. For details, see + // https://en.wikipedia.org/wiki/De_Bruijn_sequence#Uses + static const uint32_t deBruijn_magic = 0x06EB14F9U; + static const uint8_t tab[32] = { + 0, 1, 16, 2, 29, 17, 3, 22, 30, 20, 18, 11, 13, 4, 7, 23, + 31, 15, 28, 21, 19, 10, 12, 6, 14, 27, 9, 5, 26, 8, 25, 24, + }; + return tab[(uint32_t)(x * deBruijn_magic) >> 27]; +} + +// Returns a value extended or truncated to match the size type of the target. +// This will return the original value if it is already the correct size. 
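+// For instance, with a 64-bit size type an i32 offset is sign- or
+// zero-extended to i64 (depending on the `sext` flag), while an i64 offset
+// is returned unchanged.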
+Value *matchSizeType(IRBuilder<> &B, Value *V, bool sext) {
+  auto *const sizeTy = getSizeTy(B);
+
+  if (sext) {
+    return B.CreateSExtOrTrunc(V, sizeTy, "stride_conv");
+  } else {
+    return B.CreateZExtOrTrunc(V, sizeTy, "stride_conv");
+  }
+}
+
+uint64_t getTypeMask(Type *Ty) {
+  const auto bits = Ty->getIntegerBitWidth();
+  return bits < 64 ? ((uint64_t(1) << bits) - 1) : ~uint64_t(0);
+}
+
+// The index size potentially depends on the address space of the pointer,
+// but let's just use the pointer size for now.
+uint64_t getSizeTypeMask(const DataLayout &DL) {
+  const auto bits = DL.getPointerSizeInBits();
+  return bits < 64 ? ((uint64_t(1) << bits) - 1) : ~uint64_t(0);
+}
+
+OffsetKind combineKinds(OffsetKind LHS, OffsetKind RHS) {
+  assert(LHS != eOffsetLinear && RHS != eOffsetLinear &&
+         "OffsetInfo analysis functions should handle all linear cases");
+
+  if (LHS == RHS) {
+    return LHS;
+  }
+
+  if (LHS == eOffsetMayDiverge || RHS == eOffsetMayDiverge) {
+    return eOffsetMayDiverge;
+  }
+
+  // Uniform values are all that's left.
+  return eOffsetUniformVariable;
+}
+} // namespace
+
+OffsetInfo::OffsetInfo(StrideAnalysisResult &SAR, Value *V)
+    : Kind(eOffsetMayDiverge), ActualValue(V), StrideInt(0),
+      ManifestStride(nullptr), BitMask(~uint64_t(0)) {
+  auto *const ty = V->getType();
+  if (ty->isIntegerTy()) {
+    analyze(V, SAR);
+  } else if (ty->isPointerTy()) {
+    analyzePtr(V, SAR);
+  } else {
+    setMayDiverge();
+  }
+}
+
+Value *OffsetInfo::getUniformValue() const {
+  return isUniform() ? ActualValue : nullptr;
+}
+
+int64_t OffsetInfo::getValueAsConstantInt() const {
+  ConstantInt *CInt = cast<ConstantInt>(ActualValue);
+  return CInt->getSExtValue();
+}
+
+bool OffsetInfo::isStrideConstantInt() const {
+  return (Kind == eOffsetLinear && StrideInt != 0);
+}
+
+int64_t OffsetInfo::getStrideAsConstantInt() const { return StrideInt; }
+
+OffsetInfo &OffsetInfo::setMayDiverge() { return setKind(eOffsetMayDiverge); }
+
+OffsetInfo &OffsetInfo::setStride(Value *Stride) {
+  if (auto *const CInt = dyn_cast_or_null<ConstantInt>(Stride)) {
+    StrideInt = CInt->getSExtValue();
+  } else {
+    StrideInt = 0;
+  }
+  ManifestStride = Stride;
+  Kind = eOffsetLinear;
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::setStride(int64_t Stride) {
+  if (Stride == 0) {
+    Kind = eOffsetUniformVariable;
+  } else {
+    StrideInt = Stride;
+    ManifestStride = nullptr;
+    Kind = eOffsetLinear;
+  }
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::setKind(OffsetKind K) {
+  Kind = K;
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::analyze(Value *Offset, StrideAnalysisResult &SAR) {
+  Type *OffsetTy = Offset->getType();
+  if (!OffsetTy->isIntegerTy() || OffsetTy->isVectorTy()) {
+    return setMayDiverge();
+  }
+
+  if (auto *const CInt = dyn_cast<ConstantInt>(Offset)) {
+    BitMask = CInt->getZExtValue();
+    return setKind(eOffsetConstant);
+  }
+  BitMask = getTypeMask(OffsetTy);
+
+  if (isa<Argument>(Offset)) {
+    return setKind(eOffsetUniformVariable);
+  }
+
+  Instruction *Ins = dyn_cast<Instruction>(Offset);
+  if (!Ins) {
+    return setMayDiverge();
+  }
+
+  // If we have a uniform value here we don't need to analyse any further.
+  if (!SAR.UVR.isVarying(Ins)) {
+    SimplifyQuery SQ(SAR.F.getParent()->getDataLayout());
+    SQ.AC = &SAR.AC;
+    const WithCache<const Value *> InsWithCache(Ins);
+    const auto &KB = InsWithCache.getKnownBits(SQ);
+    const auto bitWidth = OffsetTy->getIntegerBitWidth();
+
+    // We are interested in the bits that are not known to be zero.
+    BitMask &= ~KB.Zero.extractBitsAsZExtValue(bitWidth, 0);
+    return setKind(eOffsetUniformVariable);
+  }
+
+  // Analyse binary instructions.
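+  // Illustrative example (not from the original source):
+  //   %mul = mul i64 %gid, 4       ; work-item ID scaled by a constant
+  //   %off = add i64 %mul, %c      ; %c is uniform
+  // combineMul gives %mul a constant stride of 4, and combineAdd of a linear
+  // and a uniform value keeps that stride, so %off is linear with stride 4.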
+ if (BinaryOperator *BOp = dyn_cast(Offset)) { + // Copy these values into local variables, because `SAR.analyze()` can + // invalidate any previously obtained references. + const auto LHS = SAR.analyze(BOp->getOperand(0)); + const auto RHS = SAR.analyze(BOp->getOperand(1)); + if (LHS.mayDiverge() || RHS.mayDiverge()) { + return setMayDiverge(); + } + + if (isa(BOp) && !BOp->hasNoUnsignedWrap()) { + // This operation can over/underflow, therefore all bets are off on + // which bits are on. We set it to all ones so a ZExt will catch it. + // SExt does not care since overflow is UB. + BitMask = ~uint64_t(0); + } + + switch (BOp->getOpcode()) { + default: + return setMayDiverge(); + case Instruction::Add: + return combineAdd(LHS, RHS); + case Instruction::Sub: + return combineSub(LHS, RHS); + case Instruction::And: + return combineAnd(LHS, RHS); + case Instruction::Or: + return combineOr(LHS, RHS); + case Instruction::Xor: + return combineXor(LHS, RHS); + case Instruction::Mul: + return combineMul(LHS, RHS); + case Instruction::Shl: + return combineShl(LHS, RHS); + case Instruction::AShr: + return combineAShr(LHS, RHS); + } + } + + // Consider that integer casts cannot scale item IDs. + if (CastInst *Cast = dyn_cast(Offset)) { + const auto &Src = SAR.analyze(Cast->getOperand(0)); + if (Src.mayDiverge()) { + return setMayDiverge(); + } + + // However, a Zero-extended offset can underflow. + if (isa(Cast)) { + // A zero-extended offset could underflow and result in an invalid base + // address, rendering the entire strided MemOp invalid, even when masked + // such that the read from the base address is not meant to execute. + // Note that we don't care about overflowing the index type. + const auto typeMask = getTypeMask(Cast->getSrcTy()); + const auto bitMaskSized = + Src.BitMask & getSizeTypeMask(Cast->getModule()->getDataLayout()); + if ((bitMaskSized & typeMask) != bitMaskSized) { + return setMayDiverge(); + } + BitMask = Src.BitMask & typeMask; + } else if (isa(Cast)) { + const uint64_t widthMask = getTypeMask(Cast->getSrcTy()); + const uint64_t signMask = (widthMask >> 1) + 1; + if (Src.BitMask & signMask) { + // If it's possible for the source value to be negative, all of the + // bits in the extended value might be set. + BitMask = Src.BitMask | ~widthMask; + } else { + BitMask = Src.BitMask & widthMask; + } + } else { + // We don't truncate the bitmask here, since we don't know if it's going + // to be sign extended or zero extended later, which affects whether we + // can ignore overflow or not. + BitMask = Src.BitMask; + } + return copyStrideFrom(Src); + } + + if (auto *Select = dyn_cast(Offset)) { + if (SAR.UVR.isVarying(Select->getCondition())) { + return setMayDiverge(); + } + + // If the condition isn't varying and both operands have the same + // constant stride, the result will also have the same constant stride. + const auto LHS = SAR.analyze(Select->getOperand(1)); + const auto RHS = SAR.analyze(Select->getOperand(2)); + if (LHS.hasStride() && RHS.hasStride() && LHS.StrideInt == RHS.StrideInt && + LHS.isStrideConstantInt()) { + // Merge the bitmasks from either source - we are selecting one of them. + BitMask = LHS.BitMask | RHS.BitMask; + return copyStrideFrom(LHS); + } + return setMayDiverge(); + } + + if (auto *Phi = dyn_cast(Offset)) { + if (auto *const CVal = Phi->hasConstantValue()) { + return copyStrideAndBitMaskFrom(SAR.analyze(CVal)); + } + + auto NumIncoming = Phi->getNumIncomingValues(); + if (NumIncoming == 1) { + // LCSSA Phi, just go right through it.. 
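+      // (An LCSSA phi has exactly one incoming value, so its stride and
+      // bitmask are simply those of that single operand.)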
+ return copyStrideAndBitMaskFrom(SAR.analyze(Phi->getIncomingValue(0))); + } else if (NumIncoming == 2) { + auto identifyIncrement = [&](Value *incoming) -> bool { + if (auto *BOp = dyn_cast(incoming)) { + auto Opcode = BOp->getOpcode(); + // If it's a simple loop iterator, the stride can be analyzed from the + // initial value. + return ((Opcode == Instruction::Add || Opcode == Instruction::Sub) && + BOp->getOperand(0) == Phi && + !SAR.UVR.isVarying(BOp->getOperand(1))) || + (Opcode == Instruction::Add && BOp->getOperand(1) == Phi && + !SAR.UVR.isVarying(BOp->getOperand(0))); + } + return false; + }; + + // Try the PHI node's incoming values both ways round. + if (identifyIncrement(Phi->getIncomingValue(1))) { + return copyStrideAndBitMaskFrom(SAR.analyze(Phi->getIncomingValue(0))); + } else if (identifyIncrement(Phi->getIncomingValue(0))) { + return copyStrideAndBitMaskFrom(SAR.analyze(Phi->getIncomingValue(1))); + } + } + return setMayDiverge(); + } + + // Analyse function calls. + if (CallInst *CI = dyn_cast(Offset)) { + const auto &BI = SAR.UVR.Ctx.builtins(); + if (const auto Builtin = BI.analyzeBuiltinCall(*CI, SAR.UVR.dimension)) { + switch (Builtin->uniformity) { + default: + case compiler::utils::eBuiltinUniformityMaybeInstanceID: + case compiler::utils::eBuiltinUniformityNever: + return setMayDiverge(); + case compiler::utils::eBuiltinUniformityLikeInputs: + break; + case compiler::utils::eBuiltinUniformityAlways: + return setKind(eOffsetUniformVariable); + case compiler::utils::eBuiltinUniformityInstanceID: + if (Builtin->properties & compiler::utils::eBuiltinPropertyLocalID) { + // If the local size is unknown (represented by zero), the resulting + // mask will be ~0ULL (all ones). Potentially, it is possible to use + // the CL_DEVICE_MAX_WORK_ITEM_SIZES property as an upper bound in + // this case. + uint64_t LocalBitMask = SAR.UVR.VU.getLocalSize() - 1; + LocalBitMask |= LocalBitMask >> 32; + LocalBitMask |= LocalBitMask >> 16; + LocalBitMask |= LocalBitMask >> 8; + LocalBitMask |= LocalBitMask >> 4; + LocalBitMask |= LocalBitMask >> 2; + LocalBitMask |= LocalBitMask >> 1; + BitMask = LocalBitMask; + } + return setStride(1); + } + } + } + + return setMayDiverge(); +} + +OffsetInfo &OffsetInfo::analyzePtr(Value *Address, StrideAnalysisResult &SAR) { + if (BitCastInst *BCast = dyn_cast(Address)) { + return copyStrideAndBitMaskFrom(SAR.analyze(BCast->getOperand(0))); + } else if (auto *ASCast = dyn_cast(Address)) { + return copyStrideAndBitMaskFrom(SAR.analyze(ASCast->getOperand(0))); + } else if (auto *IntPtr = dyn_cast(Address)) { + return copyStrideAndBitMaskFrom(SAR.analyze(IntPtr->getOperand(0))); + } else if (auto *Arg = dyn_cast(Address)) { + // 'Pointer return' arguments should be treated as having an implicit ItemID + // offset. This allows memory operations to be packetized instead of + // instantiated. + if (Arg->getType()->isPointerTy()) { + for (const VectorizerTargetArgument &VUArg : SAR.UVR.VU.arguments()) { + if (((VUArg.OldArg == Arg) || (VUArg.NewArg == Arg)) && + VUArg.PointerRetPointeeTy) { + Type *MemTy = VUArg.PointerRetPointeeTy; + const uint64_t MemSize = + SAR.UVR.Ctx.dataLayout()->getTypeAllocSize(MemTy); + return setStride(MemSize); + } + } + } + return setKind(eOffsetUniformVariable); + } else if (isa(Address)) { + return setKind(eOffsetUniformVariable); + } else if (!SAR.UVR.isVarying(Address)) { + // If it's uniform we can just return the uniform address. 
+ // Check this condition before bothering to descend into Phi nodes or GEPs, + // since we know stride is zero anyway. + return setKind(eOffsetUniformVariable); + } else if (auto *const Alloca = dyn_cast(Address)) { + if (needsInstantiation(SAR.UVR.Ctx, *Alloca)) { + // Instantiated allocas result in scatter/gather + return setMayDiverge(); + } + + Type *MemTy = Alloca->getAllocatedType(); + const uint64_t MemSize = SAR.UVR.Ctx.dataLayout()->getTypeAllocSize(MemTy); + return setStride(MemSize); + } else if (auto *const Phi = dyn_cast(Address)) { + // If all the incoming values are the same, we can trace through it. In + // the general case, it's not trivial to check that the stride is the same + // from every incoming block, and since incoming values may not dominate + // the IRBuilder insert point, we might not even be able to build the + // offset expression instructions there. + if (auto *const CVal = Phi->hasConstantValue()) { + return copyStrideAndBitMaskFrom(SAR.analyze(CVal)); + } + + // In the simple case of a loop-incremented pointer using a GEP, we can + // handle it thus: + auto NumIncoming = Phi->getNumIncomingValues(); + if (NumIncoming != 2) { + // Perhaps we can handle more than one loop latch, but not yet. + return setMayDiverge(); + } + + if (auto *const GEP = + dyn_cast(Phi->getIncomingValue(1))) { + // If it's a simple loop iterator, the stride can be analyzed from the + // initial value. + if (GEP->getPointerOperand() == Phi) { + for (const auto &index : GEP->indices()) { + if (SAR.UVR.isVarying(index.get())) { + return setMayDiverge(); + } + } + return copyStrideAndBitMaskFrom(SAR.analyze(Phi->getIncomingValue(0))); + } + } else if (auto *const GEP = + dyn_cast(Phi->getIncomingValue(0))) { + // If it's a simple loop iterator, the stride can be analyzed from the + // initial value. + if (GEP->getPointerOperand() == Phi) { + for (const auto &index : GEP->indices()) { + if (SAR.UVR.isVarying(index.get())) { + return setMayDiverge(); + } + } + return copyStrideAndBitMaskFrom(SAR.analyze(Phi->getIncomingValue(1))); + } + } + + return setMayDiverge(); + } else if (auto *GEP = dyn_cast(Address)) { + { + auto *const Ptr = GEP->getPointerOperand(); + const auto &PtrInfo = SAR.analyze(Ptr); + if (PtrInfo.mayDiverge()) { + if (isa(Ptr)) { + // For the benefit of the Ternary Transform Pass + for (Value *idx : GEP->indices()) { + SAR.analyze(idx); + } + } + return setMayDiverge(); + } + copyStrideFrom(PtrInfo); + } + + PointerType *GEPPtrTy = dyn_cast(GEP->getPointerOperandType()); + if (!GEPPtrTy) { + // A GEP base can be a vector of pointers, for instance. (Unexpected!) + return setMayDiverge(); + } + + int64_t GEPStrideInt = StrideInt; + bool StrideVariable = (hasStride() && StrideInt == 0); + SmallVector Indices; + for (unsigned i = 0; i < GEP->getNumIndices(); i++) { + // Analyze each GEP offset. + Value *GEPIndex = GEP->getOperand(1 + i); + assert(GEPIndex && "Could not get operand from GEP"); + + const auto &idxOffset = SAR.analyze(GEPIndex); + if (idxOffset.mayDiverge()) { + return setMayDiverge(); + } + + Indices.push_back(GEPIndex); + if (!idxOffset.hasStride()) { + continue; + } + + Type *MemTy = GetElementPtrInst::getIndexedType( + GEP->getSourceElementType(), Indices); + if (!MemTy) { + // A somewhat unlikely scenario...? 
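+        // (GetElementPtrInst::getIndexedType returns a null type when the
+        // index list doesn't describe a valid element, so bail out
+        // conservatively.)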
+ return setMayDiverge(); + } + + if (idxOffset.isStrideConstantInt()) { + // Add all the strides together, + // since `Base + (A * X) + (B * X) == Base + (A + B) * X` + const uint64_t MemSize = SizeOrZero( + GEP->getModule()->getDataLayout().getTypeAllocSize(MemTy)); + GEPStrideInt += idxOffset.StrideInt * MemSize; + } else { + StrideVariable = true; + } + } + + if (StrideVariable) { + // We don't know what the stride is yet, + // but we know it's linear and variable. + setStride(nullptr); + } else { + setStride(GEPStrideInt); + } + return *this; + } else if (auto *Select = dyn_cast(Address)) { + const auto LHS = SAR.analyze(Select->getOperand(1)); + const auto RHS = SAR.analyze(Select->getOperand(2)); + if (SAR.UVR.isVarying(Select->getCondition())) { + // Note that we analyze the operands before returning here, for the + // benefit of the Ternary Transform Pass, which does its work ONLY + // when the condition is varying. + return setMayDiverge(); + } + + // If the condition isn't varying and both operands have the same + // constant stride, the result will also have the same constant stride. + if (LHS.hasStride() && RHS.hasStride() && LHS.StrideInt == RHS.StrideInt && + LHS.isStrideConstantInt()) { + // Merge the bitmasks from either source - we are selecting one of them. + BitMask = LHS.BitMask | RHS.BitMask; + return copyStrideFrom(LHS); + } + return setMayDiverge(); + } + + // If it's varying we can't analyze it any further. + return setMayDiverge(); +} + +OffsetInfo &OffsetInfo::manifest(IRBuilder<> &B, StrideAnalysisResult &SAR) { + if (ManifestStride || Kind != eOffsetLinear) { + // If we already manifested the stride, or if it's not a linear value, + // there is nothing to do. + return *this; + } + + if (StrideInt != 0) { + // It's an integer stride so we can just create a `ConstantInt`. + ManifestStride = getSizeInt(B, StrideInt); + return *this; + } + + Instruction *Offset = cast(ActualValue); + // Analyse binary instructions. + if (BinaryOperator *BOp = dyn_cast(Offset)) { + const auto &LHS = SAR.manifest(B, BOp->getOperand(0)); + const auto &RHS = SAR.manifest(B, BOp->getOperand(1)); + + // Build strides immediately before their instructions + B.SetInsertPoint(BOp); + switch (BOp->getOpcode()) { + default: + return *this; + case Instruction::Add: + return manifestAdd(B, LHS, RHS); + case Instruction::Sub: + return manifestSub(B, LHS, RHS); + case Instruction::And: + return manifestAnd(B, LHS, RHS); + case Instruction::Or: + return manifestOr(B, LHS, RHS); + case Instruction::Xor: + return manifestXor(B, LHS, RHS); + case Instruction::Mul: + return manifestMul(B, LHS, RHS); + case Instruction::Shl: + return manifestShl(B, LHS, RHS); + case Instruction::AShr: + return manifestAShr(B, LHS, RHS); + } + } + + // Consider that integer casts cannot scale item IDs. + if (CastInst *Cast = dyn_cast(Offset)) { + return copyStrideFrom(SAR.manifest(B, Cast->getOperand(0))); + } + + if (auto *Phi = dyn_cast(Offset)) { + auto NumIncoming = Phi->getNumIncomingValues(); + Value *SrcVal = nullptr; + if (NumIncoming == 1) { + // LCSSA Phi, just go right through it.. + SrcVal = Phi->getIncomingValue(0); + } else if (auto *const CVal = Phi->hasConstantValue()) { + SrcVal = CVal; + } else if (NumIncoming == 2) { + auto identifyIncrement = [&](Value *incoming) -> bool { + if (auto *BOp = dyn_cast(incoming)) { + // If this consumes the Phi node, we have found the increment. 
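+          // (No uniformity re-check is needed here; the earlier analyze()
+          // step already vetted this increment before marking the phi as
+          // linear.)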
+ return BOp->getOperand(0) == Phi || BOp->getOperand(1) == Phi; + } else if (auto *GEP = dyn_cast(incoming)) { + return GEP->getPointerOperand() == Phi; + } + return false; + }; + + // Try the PHI node's incoming values both ways round. + if (identifyIncrement(Phi->getIncomingValue(1))) { + SrcVal = Phi->getIncomingValue(0); + } else if (identifyIncrement(Phi->getIncomingValue(0))) { + SrcVal = Phi->getIncomingValue(1); + } + } + assert(SrcVal && "Unexpected Phi node during stride manifestation"); + return copyStrideFrom(SAR.manifest(B, SrcVal)); + } + + if (auto *GEP = dyn_cast(Offset)) { + const auto &Ptr = SAR.manifest(B, GEP->getPointerOperand()); + copyStrideFrom(Ptr); + + PointerType *GEPPtrTy = dyn_cast(GEP->getPointerOperandType()); + if (!GEPPtrTy) { + // A GEP base can be a vector of pointers, for instance. (Unexpected!) + return setMayDiverge(); + } + + Value *GEPStride = nullptr; + SmallVector Indices; + for (unsigned i = 0; i < GEP->getNumIndices(); i++) { + // Analyze each GEP offset. + Value *GEPIndex = GEP->getOperand(1 + i); + assert(GEPIndex && "Could not get operand from GEP"); + + const auto &idxOffset = SAR.manifest(B, GEPIndex); + + Indices.push_back(GEPIndex); + if (!idxOffset.hasStride()) { + continue; + } + + Type *MemTy = GetElementPtrInst::getIndexedType( + GEP->getSourceElementType(), Indices); + + // Build stride instructions immediately before the GEP. Note that the + // process of manifesting the indices can change the insert point. + B.SetInsertPoint(GEP); + Value *idxStride = nullptr; + const uint64_t MemSize = + SizeOrZero(GEP->getModule()->getDataLayout().getTypeAllocSize(MemTy)); + if (MemSize == 1) { + // Don't need to do anything if the size is 1 + idxStride = idxOffset.ManifestStride; + } else { + if (isPowerOf2_64(MemSize)) { + // the size is a power of two, so shift to get the offset in bytes + auto *const SizeVal = getSizeInt(B, highbit(MemSize)); + idxStride = B.CreateShl(idxOffset.ManifestStride, SizeVal); + } else { + // otherwise, multiply + auto *const SizeVal = getSizeInt(B, MemSize); + idxStride = B.CreateMul(idxOffset.ManifestStride, SizeVal); + } + } + + // Add all the strides together, + // since `Base + (A * X) + (B * X) == Base + (A + B) * X` + if (GEPStride) { + GEPStride = B.CreateAdd(GEPStride, idxStride); + } else { + GEPStride = idxStride; + } + } + + if (GEPStride) { + setStride(GEPStride); + } + } + + return *this; +} + +uint64_t OffsetInfo::getConstantMemoryStride(Type *PtrEleTy, + const DataLayout *DL) const { + const uint64_t PtrEleSize = SizeOrZero(DL->getTypeAllocSize(PtrEleTy)); + VECZ_FAIL_IF(!PtrEleSize); + + // It's not a valid stride if it's not divisible by the element size. + // Can't generate a valid interleaved MemOp from it! + if (StrideInt != 0 && StrideInt % PtrEleSize != 0) { + return 0; + } + return StrideInt / PtrEleSize; +} + +Value *OffsetInfo::buildMemoryStride(IRBuilder<> &B, Type *PtrEleTy, + const DataLayout *DL) const { + if (!ManifestStride) { + assert(Kind != eOffsetLinear && + "buildMemoryStride: linear stride not manifest"); + return nullptr; + } + + const uint64_t PtrEleSize = SizeOrZero(DL->getTypeAllocSize(PtrEleTy)); + VECZ_FAIL_IF(!PtrEleSize); + + // It's not a valid stride if it's not divisible by the element size. + // Can't generate a valid interleaved MemOp from it! 
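+  // Illustrative example: an i32 element (4 bytes) with a byte stride of 6
+  // has no whole-element stride, so no interleaved operation can be built.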
+ if (StrideInt != 0 && StrideInt % PtrEleSize != 0) { + return nullptr; + } + + if (isPowerOf2_64(PtrEleSize)) { + auto ShiftVal = highbit(PtrEleSize); + if (auto *BinOp = dyn_cast(ManifestStride)) { + if (BinOp->getOpcode() == Instruction::Shl) { + if (auto *ConstSize = dyn_cast(BinOp->getOperand(1))) { + if (ConstSize->getZExtValue() == ShiftVal) { + return BinOp->getOperand(0); + } + } + } + } + + auto *const stride = + B.CreateAShr(ManifestStride, ConstantInt::get(getSizeTy(B), ShiftVal)); + return stride; + } else { + if (auto *BinOp = dyn_cast(ManifestStride)) { + if (BinOp->getOpcode() == Instruction::Mul) { + if (auto *ConstSize = dyn_cast(BinOp->getOperand(1))) { + if (ConstSize->getZExtValue() == PtrEleSize) { + return BinOp->getOperand(0); + } + } + } + } + + auto *const stride = B.CreateSDiv( + ManifestStride, ConstantInt::get(getSizeTy(B), PtrEleSize)); + return stride; + } +} + +OffsetInfo &OffsetInfo::combineAdd(const OffsetInfo &LHS, + const OffsetInfo &RHS) { + BitMask &= LHS.BitMask | RHS.BitMask | (LHS.BitMask + RHS.BitMask); + + if (LHS.hasStride()) { + if (RHS.hasStride()) { + // Linear + Linear + if (LHS.isStrideConstantInt() && RHS.isStrideConstantInt()) { + return setStride(LHS.StrideInt + RHS.StrideInt); + } else { + return setStride(nullptr); + } + } else { + // Linear + Uniform + return copyStrideFrom(LHS); + } + } else if (RHS.hasStride()) { + // Uniform + Linear + return copyStrideFrom(RHS); + } + + Kind = combineKinds(LHS.Kind, RHS.Kind); + return *this; +} + +OffsetInfo &OffsetInfo::manifestAdd(IRBuilder<> &B, const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if (LHS.hasStride()) { + if (RHS.hasStride()) { + // Linear + Linear + auto *const newAdd = B.CreateAdd(LHS.ManifestStride, RHS.ManifestStride); + return setStride(newAdd); + } else { + // Linear + Uniform + return copyStrideFrom(LHS); + } + } else if (RHS.hasStride()) { + // Uniform + Linear + return copyStrideFrom(RHS); + } + return *this; +} + +OffsetInfo &OffsetInfo::combineSub(const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if (LHS.hasStride()) { + if (RHS.hasStride()) { + // Linear - Linear + if (LHS.isStrideConstantInt() && RHS.isStrideConstantInt()) { + return setStride(LHS.StrideInt - RHS.StrideInt); + } else { + return setStride(nullptr); + } + } else { + // Linear - Uniform + return copyStrideFrom(LHS); + } + } else if (RHS.hasStride()) { + // Uniform - Linear + // Subtracting an item ID results in a negative stride. + if (RHS.isStrideConstantInt()) { + return setStride(-RHS.StrideInt); + } else { + return setStride(nullptr); + } + } + Kind = combineKinds(LHS.Kind, RHS.Kind); + return *this; +} + +OffsetInfo &OffsetInfo::manifestSub(IRBuilder<> &B, const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if (LHS.hasStride()) { + if (RHS.hasStride()) { + // Linear - Linear + auto *const newSub = B.CreateSub(LHS.ManifestStride, RHS.ManifestStride); + return setStride(newSub); + } else { + // Linear - Uniform + return copyStrideFrom(LHS); + } + } else if (RHS.hasStride()) { + // Uniform - Linear + // Subtracting an item ID results in a negative stride. + auto *const newNeg = B.CreateNeg(RHS.ManifestStride); + return setStride(newNeg); + } + return *this; +} + +OffsetInfo &OffsetInfo::combineAnd(const OffsetInfo &LHS, + const OffsetInfo &RHS) { + BitMask = LHS.BitMask & RHS.BitMask; + if (LHS.hasStride()) { + if (RHS.hasStride()) { + // Linear & Linear -> can't analyze + return setMayDiverge(); + } else { + // Linear & Uniform + // If we didn't lose any bits of the LHS, we can do it. 
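+      // e.g. masking (4 * %gid) with a constant whose set bits cover every
+      // possibly-set bit of the LHS leaves the value, and hence the stride,
+      // unchanged (illustrative example).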
+ if (BitMask == LHS.BitMask) { + return copyStrideFrom(LHS); + } else { + return setMayDiverge(); + } + } + } else if (RHS.hasStride()) { + // Uniform & Linear + // If we didn't lose any bits of the RHS, we can do it. + if (BitMask == RHS.BitMask) { + return copyStrideFrom(RHS); + } else { + return setMayDiverge(); + } + } + + Kind = combineKinds(LHS.Kind, RHS.Kind); + return *this; +} + +OffsetInfo &OffsetInfo::manifestAnd(IRBuilder<> &, const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if (LHS.hasStride()) { + return copyStrideFrom(LHS); + } else if (RHS.hasStride()) { + return copyStrideFrom(RHS); + } + return *this; +} + +OffsetInfo &OffsetInfo::combineOr(const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if ((LHS.BitMask & RHS.BitMask) == 0) { + // An Or is equivalent to an Add if the operands have no bits in common. + return combineAdd(LHS, RHS); + } + + if (LHS.hasStride() || RHS.hasStride()) { + return setMayDiverge(); + } + + BitMask = LHS.BitMask | RHS.BitMask; + Kind = combineKinds(LHS.Kind, RHS.Kind); + return *this; +} + +OffsetInfo &OffsetInfo::manifestOr(IRBuilder<> &B, const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if ((LHS.BitMask & RHS.BitMask) == 0) { + // An Or is equivalent to an Add if the operands have no bits in common. + return manifestAdd(B, LHS, RHS); + } + return *this; +} + +OffsetInfo &OffsetInfo::combineXor(const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if ((LHS.BitMask & RHS.BitMask) == 0) { + // An Xor is equivalent to an Add if the operands have no bits in common. + return combineAdd(LHS, RHS); + } + + if (LHS.hasStride() || RHS.hasStride()) { + return setMayDiverge(); + } + + BitMask = LHS.BitMask | RHS.BitMask; + Kind = combineKinds(LHS.Kind, RHS.Kind); + return *this; +} + +OffsetInfo &OffsetInfo::manifestXor(IRBuilder<> &B, const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if ((LHS.BitMask & RHS.BitMask) == 0) { + // An Xor is equivalent to an Add if the operands have no bits in common. + return manifestAdd(B, LHS, RHS); + } + return *this; +} + +OffsetInfo &OffsetInfo::combineShl(const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if (RHS.hasStride()) { + return setMayDiverge(); + } else if (LHS.hasStride()) { + auto *const Shift = RHS.getUniformValue(); + if (!Shift) { + return setMayDiverge(); + } + + if (ConstantInt *CShift = dyn_cast(Shift)) { + const auto CVal = CShift->getZExtValue(); + BitMask = LHS.BitMask << CVal; + return setStride(LHS.StrideInt << CVal); + } + + BitMask = ~uint64_t(0); + return setStride(nullptr); + } + + Kind = combineKinds(LHS.Kind, RHS.Kind); + return *this; +} + +OffsetInfo &OffsetInfo::manifestShl(IRBuilder<> &B, const OffsetInfo &LHS, + const OffsetInfo &RHS) { + auto *const Shift = RHS.getUniformValue(); + if (Shift && LHS.hasStride()) { + auto *const sizeShift = matchSizeType(B, Shift, false); + auto *const newShl = B.CreateShl(LHS.ManifestStride, sizeShift); + return setStride(newShl); + } + return *this; +} + +OffsetInfo &OffsetInfo::combineAShr(const OffsetInfo &LHS, + const OffsetInfo &RHS) { + if (RHS.hasStride()) { + return setMayDiverge(); + } else if (LHS.hasStride()) { + auto *const Shift = RHS.getUniformValue(); + if (!Shift) { + return setMayDiverge(); + } + + // We have to be careful with right shifts, because some bits of the stride + // could get shifted out of the right-hand-side, causing it not to be + // uniform anymore. + if (RHS.Kind == eOffsetConstant) { + auto CShift = RHS.getValueAsConstantInt(); + if (CShift < 0 || CShift >= 64) { + // Unlikely, but just in case.. 
+        return setMayDiverge();
+      }
+
+      // Note that we shift the bitmask as a signed value.
+      // Note also that the BitMask has been initialized to the width of the
+      // integer type.
+      const uint64_t signMask = (BitMask >> 1) + 1;
+      if (LHS.BitMask & signMask) {
+        // If it's possible for the source value to be negative, all of the
+        // bits in the extended value might be set.
+        BitMask &= (LHS.BitMask >> CShift) | ~(BitMask >> CShift);
+      } else {
+        BitMask &= LHS.BitMask >> CShift;
+      }
+
+      if (LHS.isStrideConstantInt()) {
+        const auto lostBits = ((uint64_t(1) << CShift) - 1);
+        if ((LHS.StrideInt & lostBits) == 0 || (LHS.BitMask & lostBits) == 0) {
+          return setStride(LHS.StrideInt >> CShift);
+        }
+      } else if ((LHS.BitMask & ((uint64_t(1) << CShift) - 1)) == 0) {
+        return setStride(nullptr);
+      }
+    }
+    return setMayDiverge();
+  }
+  Kind = combineKinds(LHS.Kind, RHS.Kind);
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::manifestAShr(IRBuilder<> &B, const OffsetInfo &LHS,
+                                     const OffsetInfo &RHS) {
+  if (RHS.Kind == eOffsetConstant) {
+    auto *const Shift = RHS.getUniformValue();
+    const auto CShift = RHS.getValueAsConstantInt();
+
+    if (!LHS.isStrideConstantInt() &&
+        (LHS.BitMask & ((uint64_t(1) << CShift) - 1)) == 0) {
+      auto *const sizeShift = matchSizeType(B, Shift, false);
+      auto *const newAShr = B.CreateAShr(LHS.ManifestStride, sizeShift);
+      return setStride(newAShr);
+    }
+  }
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::combineMul(const OffsetInfo &LHS,
+                                   const OffsetInfo &RHS) {
+  if (LHS.hasStride() && RHS.hasStride()) {
+    // Linear * Linear = not Linear
+    return setMayDiverge();
+  }
+
+  if (LHS.hasStride()) {
+    // Linear * Uniform
+    if (LHS.isStrideConstantInt() && RHS.Kind == eOffsetConstant) {
+      return setStride(LHS.StrideInt * RHS.getValueAsConstantInt());
+    } else {
+      return setStride(nullptr);
+    }
+  } else if (RHS.hasStride()) {
+    // Uniform * Linear
+    if (RHS.isStrideConstantInt() && LHS.Kind == eOffsetConstant) {
+      return setStride(RHS.StrideInt * LHS.getValueAsConstantInt());
+    } else {
+      return setStride(nullptr);
+    }
+  }
+
+  Kind = combineKinds(LHS.Kind, RHS.Kind);
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::manifestMul(IRBuilder<> &B, const OffsetInfo &LHS,
+                                    const OffsetInfo &RHS) {
+  if (LHS.hasStride()) {
+    // Linear * Uniform
+    if (auto *const RHSUniform = RHS.getUniformValue()) {
+      auto *const sizeMul = matchSizeType(B, RHSUniform, true);
+      auto *const newMul = B.CreateMul(LHS.ManifestStride, sizeMul);
+      return setStride(newMul);
+    }
+  } else if (RHS.hasStride()) {
+    // Uniform * Linear
+    if (auto *const LHSUniform = LHS.getUniformValue()) {
+      auto *const sizeMul = matchSizeType(B, LHSUniform, true);
+      auto *const newMul = B.CreateMul(RHS.ManifestStride, sizeMul);
+      return setStride(newMul);
+    }
+  }
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::copyStrideFrom(const OffsetInfo &Other) {
+  Kind = Other.Kind;
+  StrideInt = Other.StrideInt;
+  ManifestStride = Other.ManifestStride;
+  return *this;
+}
+
+OffsetInfo &OffsetInfo::copyStrideAndBitMaskFrom(const OffsetInfo &Other) {
+  BitMask = Other.BitMask;
+  return copyStrideFrom(Other);
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
new file mode 100644
index 0000000000000..5d27b424d9d00
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/pass.cpp
@@ -0,0 +1,364 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions;
you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "vecz/pass.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "vectorization_context.h" +#include "vectorization_helpers.h" +#include "vectorization_unit.h" +#include "vectorizer.h" +#include "vecz/vecz_choices.h" +#include "vecz/vecz_target_info.h" +#include "vecz_pass_builder.h" + +#define DEBUG_TYPE "vecz" + +using namespace llvm; + +/// @brief Provide debug logging for Vecz's PassManager +/// +/// This flag is intended for testing and debugging purposes. +static cl::opt + DebugVeczPipeline("debug-vecz-pipeline", + cl::desc("Enable debug logging of the vecz PassManager")); + +/// @brief Provide debug logging for Vecz's PassManager +/// +/// This flag specifies a textual description of the optimization pass pipeline +/// to run over the kernel. +static cl::opt VeczPassPipeline( + "vecz-passes", + cl::desc( + "A textual description of the pass pipeline. To have analysis passes " + "available before a certain pass, add 'require'.")); + +namespace vecz { +using FnVectorizationResult = std::pair; +AnalysisKey VeczPassOptionsAnalysis::Key; + +PreservedAnalyses RunVeczPass::run(Module &M, ModuleAnalysisManager &MAM) { + auto getVeczOptions = MAM.getResult(M); + auto preserved = PreservedAnalyses::none(); + // Cache the current set of functions as the vectorizer will insert new ones, + // which we don't want to revisit. + SmallVector>, 4> + FnOpts; + for (auto &Fn : M.functions()) { + llvm::SmallVector Opts; + if (!getVeczOptions(Fn, MAM, Opts)) { + continue; + } + FnOpts.emplace_back(std::make_pair(&Fn, std::move(Opts))); + } + + ModulePassManager PM; + + auto &device_info = MAM.getResult(M); + TargetInfo *target_info = MAM.getResult(M); + assert(target_info && "Missing TargetInfo"); + auto &builtin_info = MAM.getResult(M); + + VectorizationContext Ctx(M, *target_info, builtin_info); + VeczPassMachinery Mach(M.getContext(), target_info->getTargetMachine(), Ctx, + /*verifyEach*/ false, + DebugVeczPipeline + ? compiler::utils::DebugLogging::Normal + : compiler::utils::DebugLogging::None); + Mach.initializeStart(); + Mach.getMAM().registerPass([&device_info] { + return compiler::utils::DeviceInfoAnalysis(device_info); + }); + Mach.initializeFinish(); + + // Forcibly compute the DeviceInfoAnalysis so that cached retrievals work. + PM.addPass( + RequireAnalysisPass()); + + const bool Check = VeczPassPipeline.empty(); + if (Check) { + if (!buildPassPipeline(PM)) { + return PreservedAnalyses::all(); + } + } else { + if (auto Err = Mach.getPB().parsePassPipeline(PM, VeczPassPipeline)) { + // NOTE this is a command line user error print, not a debug print. + // We may want to hoist this out of Vecz once replacing RunVeczPass with + // a passbuilder is resolved. 
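+      // (On a parse failure nothing has been modified yet, so returning
+      // PreservedAnalyses::all() below leaves the module untouched.)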
+ errs() << "vecz pipeline: " << toString(std::move(Err)) << "\n"; + return PreservedAnalyses::all(); + } + } + + // Create the vectorization units and clone the kernels + using ResultTy = + SmallVector, 2>; + SmallDenseMap Results; + for (auto &P : FnOpts) { + Function *Fn = P.first; + ResultTy T; + Results.insert(std::make_pair(Fn, std::move(T))); + for (auto &Opts : P.second) { + // If we've been given an auto width, try and fit it to any requirements + // that the kernel/device places on its sub-groups. + if (Opts.vecz_auto) { + if (auto AutoSGOpts = getAutoSubgroupSizeOpts(*Fn, MAM)) { + Opts = *AutoSGOpts; + } + } + + auto *const VU = + createVectorizationUnit(Ctx, Fn, Opts, Mach.getFAM(), Check); + if (!VU) { + LLVM_DEBUG(llvm::dbgs() << Fn->getName() << " was not vectorized\n"); + continue; + } + Results[Fn].emplace_back(std::make_pair(VU, &Opts)); + + if (auto *const VecFn = vecz::cloneFunctionToVector(*VU)) { + VU->setVectorizedFunction(VecFn); + + // Allows the Vectorization Unit Analysis to work on the vector kernel + Ctx.setActiveVU(VecFn, VU); + } else { + LLVM_DEBUG(llvm::dbgs() << Fn->getName() << " could not be cloned\n"); + } + } + } + + // Vectorize everything + PM.run(M, Mach.getMAM()); + + auto AllOnModule = llvm::PreservedAnalyses::allInSet>(); + auto eraseFailed = [&](VectorizationUnit *VU) { + Function *VectorizedFn = VU->vectorizedFunction(); + if (VectorizedFn) { + // If we fail to vectorize a function, we still cloned and then + // deleted it which affects internal addresses. The module has changed + // and we can't cache any analyses. + Mach.getFAM().invalidate(*VectorizedFn, llvm::PreservedAnalyses::none()); + // Remove the partially-vectorized function if something went wrong. + Ctx.clearActiveVU(VectorizedFn); + VU->setVectorizedFunction(nullptr); + VectorizedFn->eraseFromParent(); + } + MAM.invalidate(M, AllOnModule); + }; + + // Fix up the metadata and clean out any dead kernels + for (auto &P : Results) { + auto &Result = P.second; + for (auto &R : Result) { + VectorizationUnit *VU = R.first; + trackVeczSuccessFailure(*VU); + if (!createVectorizedFunctionMetadata(*VU)) { + LLVM_DEBUG(dbgs() << P.first->getName() << " failed to vectorize\n"); + eraseFailed(VU); + } + } + } + return PreservedAnalyses::none(); +} + +PreservedAnalyses VeczPassOptionsPrinterPass::run(Module &M, + ModuleAnalysisManager &MAM) { + auto getVeczOptions = MAM.getResult(M); + for (auto &F : M.functions()) { + OS << "Function '" << F.getName() << "'"; + llvm::SmallVector Opts; + if (!getVeczOptions(F, MAM, Opts)) { + OS << " will not be vectorized\n"; + continue; + } + + OS << " will be vectorized {\n"; + for (auto &O : Opts) { + OS << " VF = "; + if (O.factor.isScalable()) { + OS << "vscale x "; + } + OS << O.factor.getKnownMinValue(); + + if (O.vecz_auto) { + OS << ", (auto)"; + } + + OS << ", vec-dim = " << O.vec_dim_idx; + + if (O.local_size) { + OS << ", local-size = " << O.local_size; + } + + OS << ", choices = ["; + OS.tell(); + auto AvailChoices = VectorizationChoices::queryAvailableChoices(); + unsigned NumChoices = 0; + + for (auto &C : AvailChoices) { + if (!O.choices.isEnabled(C.number)) { + continue; + } + if (!NumChoices) { + OS << "\n "; + } else { + OS << ","; + } + OS << C.name; + NumChoices++; + } + // Pretty-print the list of choices on one line if empty, else formatted + // across several lines. Always end with a newline, meaning the options + // are closed off with a '}' on the first column. 
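+      // Illustrative output shape (hypothetical function and options):
+      //   Function 'foo' will be vectorized {
+      //     VF = 4, vec-dim = 0, choices = []
+      //   }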
+ if (NumChoices) { + OS << "\n ]\n"; + } else { + OS << "]\n"; + } + } + OS << "}\n"; + } + + return PreservedAnalyses::all(); +} + +std::optional getReqdSubgroupSizeOpts(Function &F) { + if (auto reqd_sg_size = compiler::utils::getReqdSubgroupSize(F)) { + vecz::VeczPassOptions vecz_opts; + // Disable auto - we want a specific width + vecz_opts.vecz_auto = false; + vecz_opts.vec_dim_idx = 0; + // If we can't vectorize to the required sub-group size then we must bail. + if (*reqd_sg_size % compiler::utils::getMuxSubgroupSize(F)) { + return std::nullopt; + } + // Else we must vectorize such that we multiply the existing mux sub-group + // size up to the required one. + vecz_opts.factor = ElementCount::getFixed( + *reqd_sg_size / compiler::utils::getMuxSubgroupSize(F)); + vecz_opts.choices.enable(vecz::VectorizationChoices::eDivisionExceptions); + return vecz_opts; + } + return std::nullopt; +} + +std::optional +getAutoSubgroupSizeOpts(Function &F, ModuleAnalysisManager &AM) { + // If there's a required sub-group size, we must return a vectorization + // factor that gets us there. + if (auto opts = getReqdSubgroupSizeOpts(F)) { + return opts; + } + + auto &M = *F.getParent(); + const auto &GSGI = AM.getResult(M); + + // If the function doesn't use sub-groups (from the user's perspective) then + // we don't need to adhere to a specific sub-group size. + if (!GSGI.usesSubgroups(F)) { + return std::nullopt; + } + + // Use the device's sub-group sizes to determine which to vectorize to. + auto &DI = AM.getResult(M); + + // We don't force devices to support any sub-group sizes. + if (DI.reqd_sub_group_sizes.empty()) { + return std::nullopt; + } + + vecz::VeczPassOptions vecz_opts; + vecz_opts.vec_dim_idx = 0; + // Disable auto - we want a specific width + vecz_opts.vecz_auto = false; + // Enable some default choices + vecz_opts.choices.enable(vecz::VectorizationChoices::eDivisionExceptions); + + // Now try and choose the best width. + std::optional best_width; + const auto mux_sub_group_size = compiler::utils::getMuxSubgroupSize(F); + + auto can_produce_legal_width = [&mux_sub_group_size](unsigned size) { + // We only support vectorization widths where there's a clean multiple, and + // we can vectorize *up* to the desired size - we can't shrink the + // sub-group size by vectorizing. + return size >= mux_sub_group_size && (size % mux_sub_group_size) == 0; + }; + + for (auto size : DI.reqd_sub_group_sizes) { + if (!can_produce_legal_width(size)) { + continue; + } + const unsigned candidate_width = size / mux_sub_group_size; + // Try and choose at least one width. + if (!best_width) { + best_width = candidate_width; + continue; + } + + // Prefer non-scalar widths. + if (best_width == 1 && candidate_width > 1) { + best_width = candidate_width; + continue; + } + + // If we have a required work-group size, prefer one that will fit well + // with that. + if (auto wgs = compiler::utils::parseRequiredWGSMetadata(F)) { + const uint64_t local_size_x = wgs.value()[0]; + const bool best_fits = !(local_size_x % *best_width); + const bool cand_fits = !(local_size_x % candidate_width); + if (!best_fits && cand_fits) { + best_width = candidate_width; + continue; + } else if (best_fits && !cand_fits) { + continue; + } + } + + // Else, prefer powers of two. + if (!isPowerOf2_32(*best_width) && isPowerOf2_32(candidate_width)) { + best_width = candidate_width; + continue; + } + } + + // Return nothing if we couldn't find a good, legal, width. 
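+  // Worked example (illustrative): with a mux sub-group size of 1, device
+  // sizes {8, 16} and a required work-group size of (24, 1, 1), both widths
+  // are legal, but only 8 divides 24 evenly, so 8 is chosen.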
+ if (!best_width) { + return std::nullopt; + } + + vecz_opts.factor = ElementCount::getFixed(*best_width); + + return vecz_opts; +} + +} // namespace vecz diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def new file mode 100644 index 0000000000000..0cba927e215da --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/passes.def @@ -0,0 +1,53 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// This is a simplified version of LLVMs llvm/lib/Passes/PassRegistry.def. It +// outlines all vecz-specific passes (FIXME: not analyses). +#ifndef MODULE_PASS +#define MODULE_PASS(NAME, CREATE_PASS) +#endif +MODULE_PASS("builtin-inlining", BuiltinInliningPass()) +MODULE_PASS("define-builtins", DefineInternalBuiltinsPass()) +#undef MODULE_PASS + +#ifndef FUNCTION_PASS +#define FUNCTION_PASS(NAME, CREATE_PASS) +#endif +FUNCTION_PASS("vecz-mem2reg", BasicMem2RegPass()) +FUNCTION_PASS("pre-linearize", PreLinearizePass()) +FUNCTION_PASS("remove-int-ptr", RemoveIntPtrPass()) +FUNCTION_PASS("squash-small-vecs", SquashSmallVectorsPass()) +FUNCTION_PASS("uniform-reassoc", UniformReassociationPass()) +FUNCTION_PASS("ternary-transform", TernaryTransformPass()) +FUNCTION_PASS("cfg-convert", ControlFlowConversionPass()) +FUNCTION_PASS("cleanup-divergence", DivergenceCleanupPass()) +FUNCTION_PASS("gep-elim", CommonGEPEliminationPass()) +FUNCTION_PASS("scalarize", ScalarizationPass()) +FUNCTION_PASS("mask-memops", SimplifyMaskedMemOpsPass()) +FUNCTION_PASS("packetizer", PacketizationPass()) +FUNCTION_PASS("inline-post-vecz", InlinePostVectorizationPass()) +FUNCTION_PASS("interleave-combine-loads", InterleavedGroupCombinePass(eInterleavedLoad)) +FUNCTION_PASS("interleave-combine-stores", InterleavedGroupCombinePass(eInterleavedStore)) + +FUNCTION_PASS("print", StrideAnalysisPrinterPass(llvm::dbgs())) +#undef FUNCTION_PASS + +#ifndef LOOP_PASS +#define LOOP_PASS(NAME, CREATE_PASS) +#endif +LOOP_PASS("simplify-infinite-loops", SimplifyInfiniteLoopPass()) +LOOP_PASS("vecz-loop-rotate", VeczLoopRotatePass()) +#undef LOOP_PASS diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp new file mode 100644 index 0000000000000..4c2ac445b32c3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/reachability.cpp @@ -0,0 +1,281 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "reachability.h" + +#include +#include +#include +#include +#include + +#include "debugging.h" + +#define DEBUG_TYPE "vecz-reachability" + +// HOW IT WORKS +// +// It builds two complementary topological sorts of the supplied basic blocks, +// which it then uses to filter out obviously unreachable blocks as early as +// possible. Where we have two blocks A and B and B has any topology index +// less than that of A, then B is definitely not reachable from A. However, +// if B has a higher index, it might be (but we have to check to be sure). +// +// For details on the above approach, see "Reachability Queries in Very Large +// Graphs: A Fast Refined Online Search Approach" by +// Renê R. Veloso, Loïc Cerf, Wagner Meira Jr, Mohammed J. Zaki. +// +// It also uses data from the Dominator Tree and Post Dominator Tree, in order +// to skip ahead. If we want to know if B is reachable from A and we know +// that C dominates B, if A->C is not ruled out by the topology indices then we +// know there can be no path from A to B that does NOT go through C, therefore +// we only need to check if C is reachable from A. The same follows in reverse +// for Post Dominators. + +using namespace llvm; + +namespace vecz { + +Reachability::Reachability(DominatorTree &p_DT, PostDominatorTree &p_PDT, + LoopInfo &p_LI) + : DT(p_DT), PDT(p_PDT), LI(p_LI) {} + +void Reachability::update(Function &F) { + if (graph.empty()) { + recalculate(F); + } +} + +void Reachability::clear() { + indexMap.clear(); + graph.clear(); +} + +void Reachability::recalculate(Function &F) { + clear(); + + indexMap.reserve(F.size()); + graph.resize(F.size()); + { + size_t i = 0; + for (auto &BB : F) { + indexMap[&BB] = i++; + } + } + + for (auto &BB : F) { + auto &node = graph[indexMap[&BB]]; + + auto *const loop = LI.getLoopFor(&BB); + auto *const header = loop ? loop->getHeader() : nullptr; + for (BasicBlock *succ : successors(&BB)) { + if (succ == header) { + continue; + } + + const size_t succIndex = indexMap[succ]; + + node.successors.push_back(succIndex); + auto &succNode = graph[succIndex]; + ++succNode.predecessors; + } + std::sort(node.successors.begin(), node.successors.end()); + + if (auto *DTNode = DT.getNode(&BB)) { + if (auto *IDom = DTNode->getIDom()) { + const size_t dom = indexMap[IDom->getBlock()]; + node.dom = dom; + } + } + if (auto *PDTNode = PDT.getNode(&BB)) { + if (auto *IPDom = PDTNode->getIDom()) { + const size_t postDom = indexMap[IPDom->getBlock()]; + node.postDom = postDom; + } + } + } + + std::vector roots; + size_t Xindex = 0; + size_t Yindex = 0; + + // It would be surprising in fact if there was more than one root, because + // we only expect a single entry block for a function, however we deal with + // it for completeness, and in case this is required to be valid for some + // intermediate state. 
+  {
+    size_t i = 0;
+    for (auto &node : graph) {
+      if (node.successors.empty()) {
+        node.postDom = ~size_t(0);
+      }
+      node.predTmp = node.predecessors;
+      if (node.predecessors == 0) {
+        roots.push_back(i);
+      }
+      ++i;
+    }
+  }
+  // A copy of the roots vector so we don't need to build it again when we come
+  // to construct the Y index.
+  std::vector<size_t> rootsY = roots;
+
+  while (!roots.empty()) {
+    const size_t u = roots.back();
+    roots.pop_back();
+
+    auto &uNode = graph[u];
+    uNode.X = Xindex++;
+    for (const size_t v : uNode.successors) {
+      auto &vNode = graph[v];
+      if (--vNode.predTmp == 0) {
+        roots.push_back(v);
+      }
+    }
+  }
+
+  for (auto &node : graph) {
+    node.predTmp = node.predecessors;
+  }
+  roots.swap(rootsY);
+
+  // Y heap represents right-most vertices (max X)
+  auto cmpY = [this](size_t lhs, size_t rhs) -> bool {
+    return graph[lhs].X < graph[rhs].X;
+  };
+
+  // The vector of roots has strictly decreasing X index, so it already has
+  // the property of a max heap. No need to make_heap!
+  while (!roots.empty()) {
+    std::pop_heap(roots.begin(), roots.end(), cmpY);
+    const size_t u = roots.back();
+    roots.pop_back();
+
+    auto &uNode = graph[u];
+    uNode.Y = Yindex++;
+    for (auto vi = uNode.successors.rbegin(), ve = uNode.successors.rend();
+         vi != ve; ++vi) {
+      const size_t v = *vi;
+      auto &vNode = graph[v];
+      if (--vNode.predTmp == 0) {
+        roots.push_back(v);
+        std::push_heap(roots.begin(), roots.end(), cmpY);
+      }
+    }
+  }
+
+  LLVM_DEBUG({
+    size_t I = 0;
+    for (auto &BB : F) {
+      auto &Node = graph[I];
+      dbgs() << BB.getName() << ":\n";
+      dbgs() << "[ " << Node.X << ", " << Node.Y << " ] : ";
+      dbgs() << "( " << Node.dom << ", " << Node.postDom << " ) : ";
+      for (const size_t S : Node.successors) {
+        if (graph[S].X <= graph[I].X) {
+          dbgs() << "!x!";
+        }
+        if (graph[S].Y <= graph[I].Y) {
+          dbgs() << "!y!";
+        }
+        dbgs() << S << "; ";
+      }
+      dbgs() << "\n\n";
+      ++I;
+    }
+  });
+
+  assert(validate() && "Topological indices not valid for reachability graph");
+}
+
+bool Reachability::validate() const {
+  for (auto &node : graph) {
+    for (const size_t s : node.successors) {
+      if (graph[s].X <= node.X || graph[s].Y <= node.Y) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+bool Reachability::isReachableImpl(size_t from, size_t to) const {
+  DenseSet<size_t> visited;
+  std::vector<size_t> worklist;
+
+  while (true) {
+    auto &nodeFrom = graph[from];
+    auto &nodeTo = graph[to];
+
+    if (nodeFrom.X > nodeTo.X || nodeFrom.Y > nodeTo.Y) {
+      return false;
+    }
+
+    const size_t dom = nodeTo.dom;
+    const size_t postDom = nodeFrom.postDom;
+    if (dom == from || postDom == to) {
+      return true;
+    }
+
+    auto &nodeDom = graph[dom];
+    if (nodeFrom.X < nodeDom.X && nodeFrom.Y < nodeDom.Y) {
+      to = dom;
+      continue;
+    }
+
+    if (postDom != ~size_t(0)) {
+      auto &nodePDom = graph[postDom];
+      if (nodePDom.X < nodeTo.X && nodePDom.Y < nodeTo.Y) {
+        from = postDom;
+        continue;
+      }
+    }
+
+    // possible false positive, so check recursively..
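+    // The topological filter only ever proves unreachability; when both
+    // indices still permit a path we must walk the successor lists to
+    // confirm it, hence the worklist below.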
+ for (const size_t succ : nodeFrom.successors) { + if (succ == to) { + return true; + } + auto &nodeSucc = graph[succ]; + if (nodeSucc.X < nodeTo.X && nodeSucc.Y < nodeTo.Y) { + if (visited.insert(succ).second) { + worklist.push_back(succ); + } + } + } + if (worklist.empty()) { + return false; + } + from = worklist.back(); + worklist.pop_back(); + } + return false; +} + +bool Reachability::isReachable(BasicBlock *from, BasicBlock *to) const { + auto fromI = indexMap.find(from); + if (fromI == indexMap.end()) { + return false; + } + + auto toI = indexMap.find(to); + if (toI == indexMap.end()) { + return false; + } + + return from == to || isReachableImpl(fromI->second, toI->second); +} + +} // namespace vecz diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp new file mode 100644 index 0000000000000..6f0c952bf64c4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/simd_packet.cpp @@ -0,0 +1,53 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "simd_packet.h" + +#define DEBUG_TYPE "vecz-simd" + +using namespace llvm; +using namespace vecz; + +llvm::Value *SimdPacket::at(unsigned Index) const { + if (Index >= size()) { + return nullptr; + } else { + return (*this)[Index]; + } +} + +void SimdPacket::set(unsigned Index, Value *V) { + if (Index < size()) { + (*this)[Index] = V; + Mask.enable(Index); + } +} + +SimdPacket &SimdPacket::update(const SimdPacket &Other) { + for (unsigned i = 0; i < size(); i++) { + if (Other.Mask.isEnabled(i)) { + (*this)[i] = Other[i]; + } + } + Mask.Value |= Other.Mask.Value; + return *this; +} + +void PacketMask::enableAll(unsigned NumLanes) { + for (unsigned i = 0; i < NumLanes; i++) { + enable(i); + } +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp new file mode 100644 index 0000000000000..e8c6c086828a8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/basic_mem2reg_pass.cpp @@ -0,0 +1,243 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "debugging.h" +#include "transform/passes.h" + +using namespace llvm; +using namespace vecz; + +#define DEBUG_TYPE "vecz-mem2reg" + +PreservedAnalyses BasicMem2RegPass::run(Function &F, + FunctionAnalysisManager &) { + LLVM_DEBUG(dbgs() << "\n\nVECZ MEM2REG on " << F.getName() << "\n"); + bool modified = false; + if (F.empty()) { + return PreservedAnalyses::all(); + } + + // Find allocas that can be promoted. + SmallVector PromotableAllocas; + BasicBlock &EntryBB = F.getEntryBlock(); + for (Instruction &I : EntryBB) { + if (AllocaInst *Alloca = dyn_cast(&I)) { + if (canPromoteAlloca(Alloca)) { + PromotableAllocas.push_back(Alloca); + } + } + } + + // Promote them. + for (AllocaInst *Alloca : PromotableAllocas) { + if (promoteAlloca(Alloca)) { + LLVM_DEBUG(dbgs() << "VM2R: Promoted :" << *Alloca << "\n"); + Alloca->eraseFromParent(); + modified = true; + } + } + + if (!modified) { + return PreservedAnalyses::all(); + } + + PreservedAnalyses PA; + PA.preserveSet(); + return PA; +} + +bool BasicMem2RegPass::canPromoteAlloca(AllocaInst *Alloca) const { + BasicBlock *ParentBB = Alloca->getParent(); + Function *F = ParentBB->getParent(); + BasicBlock &EntryBB = F->getEntryBlock(); + if (&EntryBB != ParentBB) { + return false; + } + + const unsigned SrcPointeeBits = + Alloca->getAllocatedType()->getPrimitiveSizeInBits(); + + if (SrcPointeeBits == 0) { + return false; + } + + // Validate the alloca's users. + StoreInst *TheStore = nullptr; + SmallPtrSet NonStoreUsers; + for (User *U : Alloca->users()) { + if (StoreInst *Store = dyn_cast(U)) { + // There can be at most one store. + if (TheStore) { + return false; + } + // Stores must be in the entry block. + if (Store->getParent() != &EntryBB) { + return false; + } + // Check if the store is actually storing a value *in* the alloca and not + // using the alloca itself as the value to be stored. For example, in the + // following IR code, the store can be used to promote p_639 but not + // c_640: + // + // %c_640 = alloca %struct.S2, align 16 + // %p_639 = alloca %struct.S2*, align 8 + // store %struct.S2* %c_640, %struct.S2** %p_639, align 8 + // + // Also, if the alloca pointer is stored in some other variable, we can + // not promote the alloca as we need the pointer. + if (Store->getPointerOperand() != Alloca) { + return false; + } + // Everything is fine, use this store + TheStore = Store; + } else if (isa(U)) { + // The loaded type doesn't necessarily equal the alloca type when opaque + // pointers are involved: + // %a = alloca i32 + // %v = load i16, ptr %a + // We can only promote the alloca if we can bitcast between the two + // underlying types as well. + // We could probably zero-extend or trunc if we had to? + const unsigned DstPointeeBits = U->getType()->getPrimitiveSizeInBits(); + if (!DstPointeeBits || SrcPointeeBits != DstPointeeBits) { + return false; + } + NonStoreUsers.insert(U); + } else if (BitCastInst *Cast = dyn_cast(U)) { + // The bitcast must be from one pointer type to another. + PointerType *SrcPtrTy = dyn_cast(Cast->getSrcTy()); + PointerType *DstPtrTy = dyn_cast(Cast->getType()); + if (!SrcPtrTy || !DstPtrTy) { + return false; + } + // The cast must have one load user. 
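+      // Illustrative promotable pattern (typed-pointer form):
+      //   %a = alloca i32
+      //   %c = bitcast i32* %a to float*
+      //   %v = load float, float* %c   ; 32 bits either way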
+      if (!Cast->hasOneUse()) {
+        return false;
+      }
+      User *CastUser = *Cast->user_begin();
+      if (!isa<LoadInst>(CastUser)) {
+        return false;
+      }
+      // Since this is a bitcast, we can only promote the alloca if we can
+      // bitcast between the two underlying types as well.
+      const unsigned DstPointeeBits =
+          CastUser->getType()->getPrimitiveSizeInBits();
+      if (!DstPointeeBits || SrcPointeeBits != DstPointeeBits) {
+        return false;
+      }
+      NonStoreUsers.insert(U);
+    } else {
+      // Do not allow other kinds of users.
+      return false;
+    }
+  }
+
+  // If the alloca has no value stored into it, then there is no value to get
+  // and we can't promote it.
+  if (!TheStore) {
+    return false;
+  }
+
+  // Stores must precede other users.
+  for (Instruction &I : EntryBB) {
+    if (NonStoreUsers.contains(&I)) {
+      return false;
+    } else if (&I == TheStore) {
+      break;
+    }
+  }
+
+  return true;
+}
+
+bool BasicMem2RegPass::promoteAlloca(AllocaInst *Alloca) const {
+  LLVM_DEBUG(dbgs() << "VM2R: NOW AT :" << *Alloca << "\n");
+  // Find the value stored in the alloca.
+  Value *StoredValue = nullptr;
+  SmallVector<Instruction *, 4> ToDelete;
+  for (User *U : Alloca->users()) {
+    if (StoreInst *Store = dyn_cast<StoreInst>(U)) {
+      StoredValue = Store->getValueOperand();
+      ToDelete.push_back(Store);
+      break;
+    }
+  }
+  assert(StoredValue != nullptr && "Could not find value stored in alloca");
+
+  // Replace non-store users with the stored value.
+  for (User *U : Alloca->users()) {
+    if (isa<StoreInst>(U)) {
+      continue;
+    }
+    LoadInst *Load = dyn_cast<LoadInst>(U);
+    Value *NewValue = StoredValue;
+    BitCastInst *Cast = dyn_cast<BitCastInst>(U);
+    if (Cast) {
+      // We've already verified that a bitcast must have a load attached.
+      Load = cast<LoadInst>(*Cast->user_begin());
+      LLVM_DEBUG(dbgs() << "VM2R: Cast :" << *Cast << "\n");
+    }
+    if (!Load) {
+      return false;
+    }
+    LLVM_DEBUG(dbgs() << "VM2R: Load :" << *Load << "\n");
+    // Handle any type changes - not necessarily from the BitCastInst we've
+    // checked above! We've already verified that the loaded type and the
+    // alloca size must be identical...
+    assert(Load->getType()->getPrimitiveSizeInBits() ==
+           Alloca->getAllocatedType()->getPrimitiveSizeInBits());
+    if (Load->getType() != NewValue->getType()) {
+      // ... but we haven't checked that the stored value is the right size:
+      //   %a = alloca i32
+      //   store i16, ptr %a
+      //   %v = load i32, ptr %a
+      // Note: we could do other things if the type sizes didn't match.
+      if (Load->getType()->getPrimitiveSizeInBits() !=
+          NewValue->getType()->getPrimitiveSizeInBits()) {
+        return false;
+      }
+      auto *CI =
+          CastInst::CreateBitOrPointerCast(StoredValue, Load->getType());
+      CI->insertBefore(Load->getIterator());
+      NewValue = CI;
+    }
+    LLVM_DEBUG(dbgs() << "VM2R: Replaced :" << *Load << "\n");
+    LLVM_DEBUG(dbgs() << "          |-> with :" << *NewValue << "\n");
+    Load->replaceAllUsesWith(NewValue);
+    if (Cast) {
+      ToDelete.push_back(Cast);
+    }
+    ToDelete.push_back(Load);
+  }
+
+  // Clean up instructions bottom-up (users first).
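+  // Users are popped before the instructions they use (each load before its
+  // bitcast, the store last), so the use_empty() check below only ever
+  // deletes instructions whose dependents are already gone.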
+  while (!ToDelete.empty()) {
+    Instruction *I = ToDelete.pop_back_val();
+    if (I->use_empty()) {
+      LLVM_DEBUG(dbgs() << "VM2R: Deleted :" << *I << "\n");
+      I->eraseFromParent();
+    }
+  }
+  return true;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
new file mode 100644
index 0000000000000..9e865838c021e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/builtin_inlining_pass.cpp
@@ -0,0 +1,294 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/IR/DataLayout.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/IntrinsicInst.h>
+#include <llvm/IR/LegacyPassManager.h>
+#include <llvm/IR/Module.h>
+#include <llvm/Transforms/IPO/AlwaysInliner.h>
+
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "transform/passes.h"
+
+using namespace llvm;
+using namespace vecz;
+
+PreservedAnalyses BuiltinInliningPass::run(Module &M,
+                                           ModuleAnalysisManager &AM) {
+  bool modified = false;
+  bool needToRunInliner = false;
+  llvm::FunctionAnalysisManager &FAM =
+      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  for (Function &F : M.functions()) {
+    SmallVector<Instruction *, 4> ToDelete;
+    for (BasicBlock &BB : F) {
+      if (!FAM.getResult<VectorizationUnitAnalysis>(F).hasResult()) {
+        continue;
+      }
+      for (Instruction &I : BB) {
+        // Only look at call instructions as those are the only things that
+        // can be builtins.
+        CallInst *CI = dyn_cast<CallInst>(&I);
+        if (!CI) {
+          continue;
+        }
+
+        bool NeedLLVMInline = false;
+        Value *NewCI = processCallSite(CI, NeedLLVMInline);
+        needToRunInliner |= NeedLLVMInline;
+        if ((NewCI == CI) || !NewCI) {
+          continue;
+        }
+
+        if (!CI->getType()->isVoidTy()) {
+          CI->replaceAllUsesWith(NewCI);
+        }
+        ToDelete.push_back(CI);
+        modified = true;
+      }
+    }
+    // Clean up.
+    while (!ToDelete.empty()) {
+      Instruction *I = ToDelete.pop_back_val();
+      I->eraseFromParent();
+    }
+  }
+
+  // Run the LLVM inliner if some calls were marked as needing inlining.
+  if (needToRunInliner) {
+    llvm::legacy::PassManager PM;
+    PM.add(llvm::createAlwaysInlinerLegacyPass());
+    modified |= PM.run(M);
+  }
+
+  // Recursively run the pass to inline any newly introduced functions.
+  if (modified) {
+    run(M, AM);
+  }
+
+  return modified ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+static Value *emitBuiltinMemSet(Function *F, IRBuilder<> &B,
+                                ArrayRef<Value *> Args, llvm::CallBase *CB) {
+  LLVMContext &Context = F->getContext();
+  auto &DL = F->getParent()->getDataLayout();
+  const unsigned PtrBits = DL.getPointerSizeInBits();
+
+  // Check the alignment constraints do not exceed the algorithmic
+  // requirements of doing 64 bits at a time.
+
+  // @llvm.memset defines 0 and 1 to both mean no alignment.
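+  // Illustrative overall shape of the expansion: a constant-length,
+  // well-aligned memset such as memset(p, c, 20) becomes two i64 stores
+  // followed by four i8 stores. When the prerequisites below fail, this
+  // helper returns nullptr and the intrinsic call is left unmodified.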
+  const auto &MSI = cast<MemSetInst>(CB);
+
+  // Note that once LLVM 8.0 is deprecated we can use actual alignment
+  // classes.
+  const Align Alignment = MSI->getDestAlign().valueOrOne();
+  const Align Int64Alignment = DL.getABITypeAlign(B.getInt64Ty());
+  if (Alignment < std::max(Int64Alignment, Align(8u))) {
+    return nullptr;
+  }
+
+  Value *DstPtr = Args[0];
+  Type *Int8Ty = B.getInt8Ty();
+
+  Value *StoredValue = Args[1];
+  const bool IsVolatile = (Args.back() == ConstantInt::getTrue(Context));
+  llvm::StoreInst *MS = nullptr;
+
+  // For nicely named IR instructions.
+  const std::string DstName = DstPtr->getName().str();
+
+  // We can only replace memset instructions if they have a constant length.
+  ConstantInt *CL = dyn_cast<ConstantInt>(Args[2]);
+  if (!CL) {
+    return nullptr;
+  }
+  const int64_t Bytes = CL->getValue().getZExtValue();
+
+  // Unlike memcpy, if we want to use 64-bit stores in memset we need to
+  // construct the 64-bit value from an 8-bit one.
+  // First, check if we can get the value at compile time.
+  ConstantInt *ConstantValue = dyn_cast<ConstantInt>(StoredValue);
+  Value *StoredValue64 = nullptr;
+  if (ConstantValue) {
+    // If we can get the value at compile time, calculate the 64-bit value at
+    // compile time as well.
+    const unsigned IntValue = ConstantValue->getZExtValue();
+    APInt APValue(64, IntValue);
+    for (int i = 1; IntValue && i < 8; ++i) {
+      APValue |= APValue << 8;
+    }
+    StoredValue64 = ConstantInt::get(Context, APValue);
+  } else {
+    // If we can't get the value at compile time, we have to emit instructions
+    // to generate it at runtime.
+    StoredValue64 = B.CreateZExt(StoredValue, Type::getInt64Ty(Context));
+    for (int i = 1; i < 8; ++i) {
+      StoredValue64 = B.CreateOr(
+          StoredValue64,
+          B.CreateShl(StoredValue64,
+                      llvm::ConstantInt::get(Context, llvm::APInt(64, 8))));
+    }
+  }
+  StoredValue64->setName("ms64val");
+
+  // Emit enough stores to replicate the behaviour of memset.
+  int64_t byte = 0;
+  // Initially we use 64-bit stores, in order to avoid emitting too many
+  // instructions.
+  for (; byte <= Bytes - 8; byte += 8) {
+    Value *Idx = B.getIntN(PtrBits, byte);
+    Value *OffsetDstPtr = B.CreateInBoundsGEP(Int8Ty, DstPtr, Idx);
+    MS = B.CreateStore(StoredValue64, OffsetDstPtr, IsVolatile);
+
+    // Set the store's alignment to be the minimum of that from the
+    // instruction and what is required for 8-byte stores.
+    const Align StoreAlign =
+        byte == 0 ? Alignment : std::min(Align(8u), Alignment);
+    MS->setAlignment(StoreAlign);
+  }
+  // ...and then we fill in the remainder with 8-bit stores.
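+  // Worked example: memset(p, 0x2A, 13) widens the byte to the i64 splat
+  // 0x2A2A2A2A2A2A2A2A, emits one i64 store at offset 0, then five i8
+  // stores at offsets 8..12.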
+  for (; byte < Bytes; byte += 1) {
+    Value *Idx = B.getIntN(PtrBits, byte);
+    Value *OffsetDstPtr = B.CreateInBoundsGEP(Int8Ty, DstPtr, Idx, DstName);
+    MS = B.CreateStore(StoredValue, OffsetDstPtr, IsVolatile);
+    MS->setAlignment(llvm::Align(1));
+  }
+
+  return MS;
+}
+
+static Value *emitBuiltinMemCpy(Function *F, IRBuilder<> &B,
+                                ArrayRef<Value *> Args, llvm::CallBase *CB) {
+  LLVMContext &Context = F->getContext();
+  auto &DL = F->getParent()->getDataLayout();
+
+  const auto &MTI = cast<MemTransferInst>(CB);
+  const Align DestAlignment = MTI->getDestAlign().valueOrOne();
+  const Align SourceAlignment = MTI->getSourceAlign().valueOrOne();
+  const Align Int64Alignment = DL.getABITypeAlign(B.getInt64Ty());
+
+  if (DestAlignment < std::max(Int64Alignment, Align(8u))) {
+    return nullptr;
+  }
+
+  if (SourceAlignment < std::max(Int64Alignment, Align(8u))) {
+    return nullptr;
+  }
+
+  const unsigned PtrBits = DL.getPointerSizeInBits();
+
+  Value *DstPtr = Args[0];
+  Value *SrcPtr = Args[1];
+  Type *Int8Ty = B.getInt8Ty();
+
+  const bool IsVolatile = (Args.back() == ConstantInt::getTrue(Context));
+  llvm::StoreInst *MC = nullptr;
+
+  // For nicely named IR instructions.
+  const std::string DstName = DstPtr->getName().str();
+  const std::string SrcName = SrcPtr->getName().str();
+
+  // Get the length as a constant.
+  ConstantInt *CL = dyn_cast<ConstantInt>(Args[2]);
+  // We can only replace memcpy instructions if they have a constant length.
+  if (!CL) {
+    return nullptr;
+  }
+  const int64_t Length = CL->getValue().getSExtValue();
+
+  // Emit enough loads and stores to replicate the behaviour of memcpy.
+  int64_t byte = 0;
+  // Initially we use 64-bit loads and stores, in order to avoid emitting too
+  // many instructions...
+  Type *Int64Ty = B.getInt64Ty();
+
+  for (; byte <= Length - 8; byte += 8) {
+    Value *Idx = B.getIntN(PtrBits, byte);
+    Value *OffsetSrcPtr = B.CreateInBoundsGEP(Int8Ty, SrcPtr, Idx);
+    Value *OffsetDstPtr = B.CreateInBoundsGEP(Int8Ty, DstPtr, Idx);
+    LoadInst *LoadValue =
+        B.CreateLoad(Int64Ty, OffsetSrcPtr, IsVolatile, SrcName);
+    MC = B.CreateStore(LoadValue, OffsetDstPtr, IsVolatile);
+
+    // Set the alignments of the load and store to be the minimum of that
+    // from the instruction and what is required for 8-byte loads/stores.
+    const Align StoreAlign =
+        byte == 0 ? DestAlignment : std::min(Align(8u), DestAlignment);
+    MC->setAlignment(StoreAlign);
+    const Align LoadAlign =
+        byte == 0 ? SourceAlignment : std::min(Align(8u), SourceAlignment);
+    LoadValue->setAlignment(LoadAlign);
+  }
+  // ...and then we fill in the remainder with 8-bit loads and stores.
+  for (; byte < Length; byte += 1) {
+    Value *Idx = B.getIntN(PtrBits, byte);
+    Value *OffsetSrcPtr = B.CreateInBoundsGEP(Int8Ty, SrcPtr, Idx);
+    Value *OffsetDstPtr = B.CreateInBoundsGEP(Int8Ty, DstPtr, Idx, DstName);
+    LoadInst *LoadValue =
+        B.CreateLoad(Int8Ty, OffsetSrcPtr, IsVolatile, SrcName);
+    MC = B.CreateStore(LoadValue, OffsetDstPtr, IsVolatile);
+    LoadValue->setAlignment(llvm::Align(1));
+    MC->setAlignment(llvm::Align(1));
+  }
+
+  return MC;
+}
+
+Value *BuiltinInliningPass::processCallSite(CallInst *CI,
+                                            bool &NeedLLVMInline) {
+  NeedLLVMInline = false;
+
+  Function *Callee = CI->getCalledFunction();
+  if (!Callee) {
+    return CI;
+  }
+
+  // Mark a user function as needing inlining by LLVM, unless it has the
+  // NoInline attribute.
+  if (!Callee->isDeclaration() &&
+      !Callee->hasFnAttribute(Attribute::NoInline)) {
+    CI->addFnAttr(Attribute::AlwaysInline);
+    NeedLLVMInline = true;
+    return CI;
+  }
+
+  // Specially inline some LLVM intrinsics.
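+  // Only @llvm.memcpy and @llvm.memset are special-cased here: expanding
+  // them into plain loads and stores lets later vecz stages mask and
+  // packetize the accesses instead of having to handle the intrinsics
+  // themselves.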
+  if (Callee->isIntrinsic()) {
+    if (Callee->getIntrinsicID() == Intrinsic::memcpy) {
+      IRBuilder<> B(CI);
+      const SmallVector<Value *, 4> Args(CI->args());
+      if (Value *Impl = emitBuiltinMemCpy(Callee, B, Args, CI)) {
+        return Impl;
+      }
+    }
+
+    if (Callee->getIntrinsicID() == Intrinsic::memset) {
+      IRBuilder<> B(CI);
+      const SmallVector<Value *, 4> Args(CI->args());
+      if (Value *Impl = emitBuiltinMemSet(Callee, B, Args, CI)) {
+        return Impl;
+      }
+    }
+  }
+
+  return CI;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/common_gep_elimination_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/common_gep_elimination_pass.cpp
new file mode 100644
index 0000000000000..7a6e7d00fb05e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/common_gep_elimination_pass.cpp
@@ -0,0 +1,112 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/common_gep_elimination_pass.h"
+
+#include <llvm/IR/Dominators.h>
+#include <llvm/IR/Instructions.h>
+
+#include <unordered_map>
+
+#include "analysis/control_flow_analysis.h"
+#include "analysis/divergence_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "ir_cleanup.h"
+#include "vectorization_unit.h"
+
+using namespace llvm;
+using namespace vecz;
+
+char CommonGEPEliminationPass::PassID = 0;
+
+PreservedAnalyses CommonGEPEliminationPass::run(Function &F,
+                                                FunctionAnalysisManager &AM) {
+  const DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+
+  // Redundant GEPs to remove.
+  SmallPtrSet<GetElementPtrInst *, 16> toDelete;
+  // GEPs we come across.
+  std::unordered_multimap<Value *, GetElementPtrInst *> GEPs;
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+        Value *Ptr = GEP->getPointerOperand();
+        // If this is the first time we meet the source of the GEP, just add
+        // it to the multimap and look for another GEP.
+        if (GEPs.find(Ptr) == GEPs.end()) {
+          GEPs.emplace(Ptr, GEP);
+          continue;
+        }
+
+        // The range of values that have the key `Ptr`.
+        auto Range = GEPs.equal_range(Ptr);
+        auto it = Range.first;
+        for (; it != Range.second; it++) {
+          auto *trackedGEP = it->second;
+          if (GEP->getNumIndices() != trackedGEP->getNumIndices()) {
+            continue;
+          }
+
+          // With opaque pointers, we need to check the element types as well.
+          if (GEP->getSourceElementType() !=
+              trackedGEP->getSourceElementType()) {
+            continue;
+          }
+
+          unsigned i = 0;
+          for (; i < GEP->getNumIndices(); i++) {
+            Value *lhs = GEP->getOperand(i + 1);
+            Value *rhs = trackedGEP->getOperand(i + 1);
+
+            // The two GEPs differ at this index, so stop comparing.
+            if (lhs != rhs) {
+              break;
+            }
+          }
+
+          // trackedGEP does the same operation as GEP, so replace GEP
+          // with the already tracked GEP.
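+          // Illustrative example: %p2 recomputes %p1 and is replaced by it,
+          // provided %p1's block dominates %p2's:
+          //   %p1 = getelementptr inbounds i32, ptr %base, i64 %idx
+          //   ...
+          //   %p2 = getelementptr inbounds i32, ptr %base, i64 %idx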
+          if (i == GEP->getNumIndices()) {
+            if (DT.dominates(trackedGEP->getParent(), GEP->getParent())) {
+              GEP->replaceAllUsesWith(trackedGEP);
+              toDelete.insert(GEP);
+              break;
+            }
+          }
+        }
+        // We iterated over all values whose key is Ptr, but haven't found
+        // a matching GEP, so add the latter to the multimap.
+        if (it == Range.second) {
+          GEPs.emplace(Ptr, GEP);
+        }
+      }
+    }
+  }
+
+  // Proceed to remove every duplicate GEP we found.
+  for (auto *GEP : toDelete) {
+    IRCleanup::deleteInstructionNow(GEP);
+  }
+
+  PreservedAnalyses Preserved;
+  Preserved.preserve<CFGAnalysis>();
+  Preserved.preserve<DivergenceAnalysis>();
+  Preserved.preserve<VectorizationUnitAnalysis>();
+  Preserved.preserve<DominatorTreeAnalysis>();
+
+  return Preserved;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
new file mode 100644
index 0000000000000..175e1f043729d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/control_flow_conversion_pass.cpp
@@ -0,0 +1,3306 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/control_flow_conversion_pass.h"
+
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/DenseSet.h>
+#include <llvm/ADT/SmallPtrSet.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/Statistic.h>
+#include <llvm/ADT/Twine.h>
+#include <llvm/Analysis/LoopInfo.h>
+#include <llvm/Analysis/PostDominators.h>
+#include <llvm/Analysis/ValueTracking.h>
+#include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/CFG.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Dominators.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/InstrTypes.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/Type.h>
+#include <llvm/IR/Value.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/Error.h>
+#include <llvm/Support/raw_ostream.h>
+
+#include <memory>
+#include <optional>
+
+#include "analysis/control_flow_analysis.h"
+#include "analysis/divergence_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "control_flow_boscc.h"
+#include "control_flow_roscc.h"
+#include "debugging.h"
+#include "ir_cleanup.h"
+#include "llvm_helpers.h"
+#include "memory_operations.h"
+#include "reachability.h"
+#include "transform/passes.h"
+#include "vecz/vecz_choices.h"
+
+#define DEBUG_TYPE "vecz-cf"
+
+using namespace llvm;
+using namespace vecz;
+
+class ControlFlowConversionState::Impl : public ControlFlowConversionState {
+public:
+  Impl(Function &F, FunctionAnalysisManager &AM)
+      : ControlFlowConversionState(F, AM) {}
+
+  PreservedAnalyses run(Function &, FunctionAnalysisManager &);
+
+private:
+  /// @brief Utility struct used by LinearizeCFG to allow block retargeting
+  /// info to be stored in a single contiguous vector of variable-length
+  /// subvectors. This avoids having to use a vector of vectors, and all
+  /// the individual heap allocations that would involve. Empirically (based
+  /// on UnitCL) we have approximately one new target per Basic Block overall,
+  /// and never more than 2 (which is not to say more than 2 is impossible).
+  /// Since we iterate over all NewTargetInfos linearly, we only need to
+  /// record the number of targets for each block, and not their starting
+  /// indices.
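+  /// For example (illustrative), blocks A, B and C with 2, 0 and 1 new
+  /// targets respectively are stored as infos = [{A,2}, {B,0}, {C,1}] and
+  /// data = [A0, A1, C0], read back by walking both vectors in step.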
+  struct Linearization {
+    struct NewTargetInfo {
+      BasicBlock *BB;
+      size_t numTargets = 0;
+
+      NewTargetInfo(BasicBlock *bb) : BB(bb) {}
+    };
+
+    std::vector<NewTargetInfo> infos;
+    std::vector<BasicBlock *> data;
+
+    void beginBlock(BasicBlock *BB) { infos.emplace_back(BB); }
+    size_t currentSize() const { return infos.back().numTargets; }
+    void push(BasicBlock *BB) {
+      data.push_back(BB);
+      ++infos.back().numTargets;
+    }
+  };
+
+  /// @brief Type that maps exit blocks to exit mask information.
+  using DenseExitPHIMap = SmallDenseMap<const BasicBlock *, PHINode *, 2>;
+  /// @brief Type that maps exiting blocks to update mask information.
+  using DenseExitUpdateMap =
+      SmallDenseMap<const BasicBlock *, BinaryOperator *, 2>;
+
+  struct LoopMasksInfo {
+    /// @brief Keep track of which instances left the loop through which exit
+    /// (persisted throughout the whole loop).
+    DenseExitPHIMap persistedDivergentExitMasks;
+    /// @brief Divergent loop exit masks updated for the current iteration.
+    DenseExitUpdateMap updatedPersistedDivergentExitMasks;
+    /// @brief Combined divergent loop exit masks of the current iteration.
+    Instruction *combinedDivergentExitMask = nullptr;
+    /// @brief Combined divergent loop exit masks of the whole loop.
+    Instruction *persistedCombinedDivergentExitMask = nullptr;
+  };
+
+  /// @brief Convert the function's CFG to data-flow.
+  /// @return true if the function's CFG was converted, false otherwise.
+  bool convertToDataFlow();
+
+  /// @brief Generate masks needed to do control-flow to data-flow conversion.
+  /// @return true if masks were generated successfully, false otherwise.
+  bool generateMasks();
+
+  /// @brief Generate masks for the given block.
+  /// @param[in] BB Block whose masks we are generating.
+  /// @return true if no problem occurred, false otherwise.
+  bool createMasks(BasicBlock &BB);
+
+  /// @brief Create entry mask for the given block.
+  /// @param[in] BB Block whose masks we are generating.
+  /// @return true if no problem occurred, false otherwise.
+  bool createEntryMasks(BasicBlock &BB);
+
+  /// @brief Create exit mask for the given block.
+  /// @param[in] BB Block whose masks we are generating.
+  /// @param[in] isBOSCCEntry Whether BB creates a uniform region.
+  /// @return true if no problem occurred, false otherwise.
+  bool createExitMasks(BasicBlock &BB, bool isBOSCCEntry = false);
+
+  /// @brief Create loop exit masks for the given loop.
+  /// @param[in,out] LTag Information on the loop we are evaluating.
+  /// @return true if no problem occurred, false otherwise.
+  bool createLoopExitMasks(LoopTag &LTag);
+
+  /// @brief Combine all information about instances that left the loop in the
+  /// current iteration.
+  /// @param[in,out] LTag Information on the loop we are evaluating.
+  /// @return true if no problem occurred, false otherwise.
+  bool createCombinedLoopExitMask(LoopTag &LTag);
+
+  /// @brief Apply masks to basic blocks in the function, to prevent
+  /// side-effects for inactive instances.
+  ///
+  /// @return llvm::Error::success if masks were applied successfully, an
+  /// error message explaining the failure otherwise.
+  Error applyMasks();
+
+  /// @brief Apply a mask to the given basic block, to prevent side-effects
+  /// for inactive instances.
+  ///
+  /// @param[in] BB Basic block to apply masks to.
+  /// @param[in] mask Mask to apply.
+  ///
+  /// @return llvm::Error::success if masks were applied successfully, an
+  /// error message explaining the failure otherwise.
+  Error applyMask(BasicBlock &BB, Value *mask);
+
+  /// @brief Emit a call instruction to the masked version of the called
+  /// function.
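+  ///
+  /// For example (illustrative names): under entry mask %m, the call
+  /// "call void @foo(i32 %x)" becomes "call void @foo.masked(i32 %x, i1 %m)"
+  /// so that inactive lanes skip foo's side-effects.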
+  ///
+  /// @param[in] CI The call instruction to create a masked version of
+  /// @param[in] entryBit The Value that determines if the lane is active or
+  /// not.
+  /// @return The call instruction to the masked version.
+  CallInst *emitMaskedVersion(CallInst *CI, Value *entryBit);
+
+  /// @brief Create a masked version of the given function.
+  ///
+  /// The Function (F) to be masked will be extracted from the CallInst and a
+  /// new Function (NewFunction) will be generated. NewFunction takes the
+  /// same arguments as F, plus an additional boolean argument that
+  /// determines if the lane is active or not. If the boolean argument is
+  /// true, then NewFunction will execute F and (if it's not void) return its
+  /// return value. Vararg functions are supported by expanding their
+  /// arguments.
+  ///
+  /// @param[in] CI The call instruction to create a masked version of
+  /// @return The masked function
+  Function *getOrCreateMaskedVersion(CallInst *CI);
+
+  /// @brief A type that maps unmasked instructions onto masked replacements.
+  using DeletionMap = SmallVector<std::pair<Instruction *, Value *>, 4>;
+
+  /// @brief Attempt to apply a mask to an Instruction as a binary operation.
+  ///
+  /// @param[in] I The binary operation to apply the mask to
+  /// @param[in] mask The mask to apply
+  /// @param[out] toDelete mapping of deleted unmasked operations
+  /// @param[out] safeDivisors a cache of re-usable known non-zero divisors
+  /// @return true if it was a BinOp, false otherwise
+  bool tryApplyMaskToBinOp(Instruction &I, Value *mask, DeletionMap &toDelete,
+                           DenseMap<Value *, Value *> &safeDivisors);
+
+  /// @brief Attempt to apply a mask to a memory operation.
+  ///
+  /// @param[in] op The MemOp to apply the mask to
+  /// @param[in] mask The mask to apply to the MemOp
+  /// @param[out] toDelete mapping of deleted unmasked operations
+  /// @return true if the MemOp got masked, false otherwise
+  bool tryApplyMaskToMemOp(MemOp &op, Value *mask, DeletionMap &toDelete);
+
+  /// @brief Attempt to apply a mask to a call instruction.
+  ///
+  /// @param[in] CI The call instruction to apply the mask to
+  /// @param[in] mask The mask to apply
+  /// @param[out] toDelete mapping of deleted unmasked operations
+  /// @return true if it is valid to mask this call, false otherwise
+  bool applyMaskToCall(CallInst *CI, Value *mask, DeletionMap &toDelete);
+
+  /// @brief Attempt to apply a mask to an atomic instruction via a builtin
+  /// call.
+  ///
+  /// @param[in] I The (atomic) instruction to apply the mask to
+  /// @param[in] mask The mask to apply to the masked atomic
+  /// @param[out] toDelete mapping of deleted unmasked operations
+  /// @return true if it is valid to mask this atomic, false otherwise
+  bool applyMaskToAtomic(Instruction &I, Value *mask, DeletionMap &toDelete);
+
+  /// @brief Linearize a CFG.
+  /// @return true if no problem occurred, false otherwise.
+  bool partiallyLinearizeCFG();
+
+  /// @brief Create the reduction functions needed to vectorize the branch.
+  /// @return true on success, false otherwise
+  bool createBranchReductions();
+
+  /// @brief Uniformize every divergent loop.
+  ///
+  /// @return true if no problem occurred, false otherwise.
+  bool uniformizeDivergentLoops();
+
+  /// @brief Assign a divergent loop a single loop exit from which all other
+  /// exits will be rewired.
+  /// @param[in] LTag Tag of the processed loop
+  /// @return true if no problem occurred, false otherwise.
+  bool computeDivergentLoopPureExit(LoopTag &LTag);
+
+  /// @brief Rewire every loop exit block such that the loop can be
+  /// considered uniform.
+  ///
+  /// @param[in] LTag Tag of the processed loop
+  /// @param[in] exitBlocks List of exit blocks before any transformation
+  /// @return true if no problem occurred, false otherwise.
+  bool rewireDivergentLoopExitBlocks(
+      LoopTag &LTag, const SmallVectorImpl<BasicBlock *> &exitBlocks);
+
+  /// @brief Generate blend operations to discard execution of inactive
+  /// instances.
+  /// @param[in] LTag The loop whose live value is being handled.
+  /// @return true if no problem occurred, false otherwise.
+  bool generateDivergentLoopResults(LoopTag &LTag);
+
+  /// @brief Generate loop live value update instructions.
+  /// @param[in] LLV The loop live value we want to generate instructions for.
+  /// @param[in] LTag The loop whose live value is being handled.
+  /// @return true if no problem occurred, false otherwise.
+  bool generateDivergentLoopResultUpdates(Value *LLV, LoopTag &LTag);
+
+  /// @brief Generate blend instructions for loop live values at the latch.
+  /// @param[in] LTag The loop whose live values are being handled.
+  /// @param[in] exitBlocks List of exit blocks before any transformation
+  /// @return true if no problem occurred, false otherwise.
+  bool blendDivergentLoopLiveValues(
+      LoopTag &LTag, const SmallVectorImpl<BasicBlock *> &exitBlocks);
+
+  /// @brief Generate blend instructions for loop exit masks at the latch.
+  ///
+  /// @param[in] LTag Tag of the processed loop
+  /// @param[in] exitEdges List of exit edges before any transformation
+  /// @param[in] exitBlocks List of exit blocks before any transformation
+  /// @return true if no problem occurred, false otherwise.
+  bool blendDivergentLoopExitMasks(
+      LoopTag &LTag, const SmallVectorImpl<Loop::Edge> &exitEdges,
+      const SmallVectorImpl<BasicBlock *> &exitBlocks);
+
+  /// @brief Replace uses of loop values outside of a divergent loop.
+  ///
+  /// @param[in] LTag Tag of the processed loop
+  /// @param[in] from Instruction to be replaced.
+  /// @param[in] to Instruction to replace `from` with.
+  /// @param[in] exitBlocks Exit blocks of the loop.
+  /// @return true if no problem occurred, false otherwise.
+  bool replaceUsesOutsideDivergentLoop(
+      LoopTag &LTag, Value *from, Value *to,
+      const SmallVectorImpl<BasicBlock *> &exitBlocks);
+
+  /// @brief Assign new targets to edges based on the dominance-compact
+  /// ordering.
+  /// @param[out] lin New target information for each BasicBlock
+  /// @return true if no problem occurred, false otherwise.
+  bool computeNewTargets(Linearization &lin);
+
+  /// @brief Linearize the CFG with the newly calculated edges.
+  /// @return true if no problem occurred, false otherwise.
+  bool linearizeCFG();
+
+  /// @brief Generate blend operations to discard execution of inactive
+  /// instances.
+  /// @return true if no problem occurred, false otherwise.
+  bool generateSelects();
+
+  /// @brief Split a phi instruction into several select instructions.
+  /// @param[in,out] PHI The PHI node we want to split.
+  /// @param[in] B The block PHI belongs to.
+  /// @return true if no problem occurred, false otherwise.
+  bool generateSelectFromPHI(PHINode *PHI, BasicBlock *B);
+
+  /// @brief Repair the SSA form. First blend and create new masks from the
+  /// new wires, then blend all the instructions that need blending.
+  /// @return true if no errors occurred.
+  bool repairSSA();
+
+  /// @brief Update the incoming blocks of phi nodes whose predecessors have
+  /// changed whilst rewiring.
+  /// @return true if no errors occurred.
+  bool updatePHIsIncomings();
+
+  /// @brief Blend instructions before their uses if divergence happened
+  /// in between.
+  /// @return true if no errors occurred.
+  bool blendInstructions();
+
+  /// @brief Simplify the mask instructions.
+  /// @return true if no errors occurred.
+  bool simplifyMasks();
+
+  /// @brief Check all blocks have a unique index order.
+  /// @return true if no errors occurred.
+  bool checkBlocksOrder() const;
+
+  /// @brief Upon modifying a mask, we need to update the in-memory masks as
+  /// well.
+  /// @param[in] src The block whose mask changed
+  /// @param[in] from The old mask
+  /// @param[in] to The new mask
+  void replaceMasks(BasicBlock *src, Value *from, Value *to);
+
+  /// @brief Upon removing an instruction, we need to also update our internal
+  /// containers.
+  /// @param[in] from The old value
+  /// @param[in] to The new value
+  void updateMaps(Value *from, Value *to);
+
+  BasicBlock *functionExitBlock = nullptr;
+  DenseSet<Instruction *> blends;
+  DenseMap<const Loop *, LoopMasksInfo> LoopMasks;
+};
+
+STATISTIC(VeczCFGFail,
+          "Number of kernels that failed control flow conversion [ID#L80]");
+
+// Set this to enable all-of masks in the latch of divergent loops. This can
+// be interesting if there exists an intrinsic that, when comparing vector
+// instructions, can immediately stop comparing if one of the operands is
+// false. In counterpart, this makes us update two more values per divergent
+// loop (the values that keep track of which instances left the loop).
+//
+// Because no such intrinsic exists to my knowledge, we don't set this by
+// default.
+#undef ALL_OF_DIVERGENT_LOOP_LATCH
+
+namespace {
+
+BasicBlock::iterator getInsertionPt(BasicBlock &BB) {
+  // We have to insert instructions after any Allocas.
+  auto it = BB.getFirstInsertionPt();
+  while (isa<AllocaInst>(*it)) {
+    ++it;
+  }
+  return it;
+}
+
+Instruction *copyMask(Value *mask, Twine name) {
+  VECZ_ERROR_IF(!mask, "Trying to copy mask with invalid arguments");
+  return BinaryOperator::CreateAnd(mask, getDefaultValue(mask->getType(), 1),
+                                   name);
+}
+
+Instruction *copyEntryMask(Value *mask, BasicBlock &BB) {
+  VECZ_ERROR_IF(!mask, "Trying to copy entry mask with invalid arguments");
+  auto *EM = copyMask(mask, BB.getName() + ".entry_mask");
+  EM->insertBefore(getInsertionPt(BB));
+  return EM;
+}
+
+Instruction *copyExitMask(Value *mask, StringRef base, BasicBlock &BB) {
+  VECZ_ERROR_IF(!mask, "Trying to copy exit mask with invalid arguments");
+  auto *EM = copyMask(mask, base + ".exit_mask");
+  EM->insertBefore(BB.getTerminator()->getIterator());
+  return EM;
+}
+
+/// Wrap a string into an llvm::StringError, pointing to an instruction.
+static inline Error makeStringError(const Twine &message, Instruction &I) {
+  std::string helper_str = message.str();
+  raw_string_ostream helper_stream(helper_str);
+  helper_stream << " " << I;
+  return make_error<StringError>(helper_stream.str(),
+                                 inconvertibleErrorCode());
+}
+
+// A helper method to determine whether a branch condition
+// (expected to be an i1 result of a comparison instruction) is truly uniform.
+static bool isBranchCondTrulyUniform(Value *cond, UniformValueResult &UVR) {
+  const auto *cmp = dyn_cast_if_present<CmpInst>(cond);
+  if (!cmp || cmp->getType()->isVectorTy()) {
+    return false;
+  }
+
+  return UVR.isTrueUniform(cmp);
+}
+}  // namespace
+
+////////////////////////////////////////////////////////////////////////////////
+
+char ControlFlowConversionPass::PassID = 0;
+
+PreservedAnalyses ControlFlowConversionPass::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+  ControlFlowConversionState::Impl state(F, AM);
+  return state.run(F, AM);
+}
+
+ControlFlowConversionState::ControlFlowConversionState(
+    Function &F, FunctionAnalysisManager &AM)
+    : F(F), AM(AM),
+      VU(AM.getResult<VectorizationUnitAnalysis>(F).getVU()),
+      Ctx(AM.getResult<VectorizationContextAnalysis>(F).getContext()) {}
+
+PreservedAnalyses
+ControlFlowConversionState::Impl::run(Function &F,
+                                      FunctionAnalysisManager &AM) {
+  const auto &CFGR = AM.getResult<CFGAnalysis>(F);
+  if (CFGR.getFailed()) {
+    ++VeczCFGFail;
+    return VU.setFailed("Cannot vectorize the CFG for", &F, &F);
+  } else if (!CFGR.isConversionNeeded()) {
+    return PreservedAnalyses::all();
+  }
+  functionExitBlock = CFGR.getExitBlock();
+
+  if (!convertToDataFlow()) {
+    // This pass may leave the function in an invalid state. Instead of doing
+    // so, and hoping that later passes don't throw verification failures back
+    // at us, replace the function body with an unreachable statement. Marking
+    // vectorization as having failed will mean the function will later be
+    // deleted.
+    // Note that this is quite coarse-grained; we could be cleverer, e.g., by
+    // returning whether convertToDataFlow has (potentially) left behind an
+    // invalid function.
+    ++VeczCFGFail;
+    VU.setFailed("Control flow conversion failed for", &F,
+                 VU.scalarFunction());
+    F.deleteBody();
+    BasicBlock *BB = BasicBlock::Create(F.getContext(), "entry", &F);
+    IRBuilder<> IRB(BB);
+    IRB.CreateUnreachable();
+    return PreservedAnalyses::none();
+  }
+
+  PreservedAnalyses Preserved;
+  Preserved.preserve<CFGAnalysis>();
+
+  return Preserved;
+}
+
+bool ControlFlowConversionState::replaceReachableUses(Reachability &RC,
+                                                      Instruction *from,
+                                                      Value *to,
+                                                      BasicBlock *src) {
+  for (auto it = from->use_begin(); it != from->use_end();) {
+    Use &U = *it++;
+    Instruction *user = cast<Instruction>(U.getUser());
+
+    if (user == to) {
+      continue;
+    }
+
+    BasicBlock *blockUse = user->getParent();
+
+    if (PHINode *PHI = dyn_cast<PHINode>(user)) {
+      // Cannot replace a use in a phi node with another phi node in the same
+      // block.
+      if (blockUse == src) {
+        if (isa<PHINode>(to)) {
+          continue;
+        }
+      } else {
+        // We must also check that 'src' can reach the incoming block to be
+        // allowed to replace the incoming value.
+        BasicBlock *incoming = PHI->getIncomingBlock(U);
+        if (!RC.isReachable(src, incoming)) {
+          continue;
+        }
+      }
+    }
+
+    if (auto toI = dyn_cast<Instruction>(to)) {
+      if (toI->getParent() == blockUse) {
+        for (Instruction &I : *src) {
+          // If we found the user before `to`, then skip this user as it lives
+          // before `to` in the same block.
+          if (&I == user) {
+            break;
+          }
+          if (&I == to) {
+            LLVM_DEBUG(dbgs() << "Replace " << *from << " with " << *to
+                              << " in " << *user << "\n");
+            U.set(to);
+            break;
+          }
+        }
+        // We've handled all possible cases if `to` lives in the same block as
+        // `user`, so iterate over a new instruction.
+        continue;
+      }
+    }
+
+    // `to` is in a different block than `user` so just check for reachability
+    // across BasicBlocks and not within them.
+    if (RC.isReachable(src, blockUse)) {
+      LLVM_DEBUG(dbgs() << "Replace " << *from << " with " << *to << " in "
+                        << *user << "\n");
+      U.set(to);
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::convertToDataFlow() {
+  DT = &AM.getResult<DominatorTreeAnalysis>(F);
+  PDT = &AM.getResult<PostDominatorTreeAnalysis>(F);
+  LI = &AM.getResult<LoopAnalysis>(F);
+  UVR = &AM.getResult<UniformValueAnalysis>(F);
+
+  // Make sure every loop has an entry in the masks table before we start.
+  for (auto *L : *LI) {
+    LoopMasks[L];
+  }
+
+  if (!VU.choices().linearizeBOSCC()) {
+    ROSCCGadget ROSCC(*this);
+    ROSCC.run(F);
+  }
+
+  RC = std::make_unique<Reachability>(*DT, *PDT, *LI);
+
+  // We do this after ROSCC, because it may have modified the CFG.
+  DR = &AM.getResult<DivergenceAnalysis>(F);
+
+  if (VU.choices().linearizeBOSCC()) {
+    BOSCC = std::make_unique<BOSCCGadget>(*this);
+    if (!BOSCC->duplicateUniformRegions()) {
+      emitVeczRemarkMissed(&F, VU.scalarFunction(),
+                           "Could not duplicate uniform regions for");
+      return false;
+    }
+  }
+
+  // Reserve space for the masks table and default-construct all entries, to
+  // avoid re-hashing/element relocation on access.
+  MaskInfos.reserve(F.size());
+  for (auto &BB : F) {
+    MaskInfos[&BB];
+  }
+
+  if (!generateMasks()) {
+    emitVeczRemarkMissed(&F, VU.scalarFunction(),
+                         "Could not generate masks for");
+    return false;
+  }
+  if (auto err = applyMasks()) {
+    emitVeczRemarkMissed(&F, VU.scalarFunction(), "Could not apply masks for",
+                         llvm::toString(std::move(err)));
+    return false;
+  }
+
+  if (!partiallyLinearizeCFG()) {
+    emitVeczRemarkMissed(&F, VU.scalarFunction(),
+                         "Could not partially linearize the CFG for");
+    return false;
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::generateMasks() {
+  LLVM_DEBUG(dbgs() << "MASKS GENERATION\n");
+
+  RC->update(F);
+
+  VECZ_FAIL_IF(!createMasks(*functionExitBlock));
+
+  if (BOSCC) {
+    // The BOSCC entry blocks that have not been duplicated need exit masks
+    // towards uniform blocks.
+    SmallVector<BasicBlock *, 4> entryBlocks;
+    BOSCC->getUnduplicatedEntryBlocks(entryBlocks);
+    for (auto *const entry : entryBlocks) {
+      VECZ_FAIL_IF(!createExitMasks(*entry, true));
+    }
+
+    // Link the masks of the predicated regions to the uniform regions.
+    VECZ_FAIL_IF(!BOSCC->linkMasks());
+  }
+
+  for (auto *const LTag : DR->getLoopOrdering()) {
+    VECZ_FAIL_IF(!createLoopExitMasks(*LTag));
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::createMasks(BasicBlock &BB) {
+  // If we have already set the mask for this block, don't do it again.
+  // Uniform blocks are handled separately because of their lack of context.
+  if (MaskInfos[&BB].entryMask) {
+    return true;
+  }
+
+  auto *const LTag = DR->getTag(&BB).loop;
+  auto *const header = LTag ? LTag->header : nullptr;
+  // If BB is a header, we will need the mask from its preheader.
+  // KLOCWORK "NPD.CHECK.MIGHT" possible false positive
+  // LTag is only dereferenced if it's not nullptr, but Klocwork doesn't
+  // follow the logic.
+  if (header == &BB) {
+    BasicBlock *preheader = LTag->preheader;
+    VECZ_FAIL_IF(!createMasks(*preheader));
+  } else {
+    // Otherwise we will need the mask from every incoming edge.
+    for (BasicBlock *pred : predecessors(&BB)) {
+      VECZ_FAIL_IF(!createMasks(*pred));
+    }
+  }
+
+  VECZ_FAIL_IF(!createEntryMasks(BB));
+  VECZ_FAIL_IF(!createExitMasks(BB));
+
+  // If the block is a loop header, its entry mask is a phi function with
+  // incoming values from the preheader and:
+  //   - the latch for divergent loops,
+  //   - nothing else for uniform loops (because if we enter a uniform loop,
+  //     all instances that were active upon entry remain active upon exit).
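+  // For a divergent loop this materializes as (illustrative):
+  //   %header.entry_mask = phi i1 [ %preheader.exit_mask, %preheader ],
+  //                               [ %latch.exit_mask, %latch ]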
+  if (header == &BB) {
+    BasicBlock *latch = LTag->latch;
+    VECZ_FAIL_IF(!createMasks(*latch));
+
+    if (LTag->isLoopDivergent()) {
+      auto *const entryMask = MaskInfos[&BB].entryMask;
+      assert(isa<PHINode>(entryMask) &&
+             "Divergent Loop entry mask must be a PHI Node!");
+      PHINode *phi = cast<PHINode>(entryMask);
+      // If the header has two incoming values, we have already processed it.
+      if (phi->getNumIncomingValues() != 2) {
+        Value *latchMask = MaskInfos[latch].exitMasks[header];
+        phi->addIncoming(latchMask, latch);
+
+        LLVM_DEBUG(dbgs() << "Divergent loop header " << header->getName()
+                          << ": entry mask: " << *phi << "\n");
+      }
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::createEntryMasks(BasicBlock &BB) {
+  auto &maskInfo = MaskInfos[&BB];
+  if (maskInfo.entryMask) {
+    return true;
+  }
+
+  Type *maskTy = Type::getInt1Ty(BB.getContext());
+
+  // If the block is by_all (i.e. executed by all lanes), it will always
+  // execute with a fully active mask. Similarly, if the block is uniform,
+  // its mask is true by definition.
+  if (DR->isByAll(BB) || DR->isUniform(BB)) {
+    maskInfo.entryMask = copyEntryMask(getDefaultValue(maskTy, 1), BB);
+    LLVM_DEBUG(dbgs() << BB.getName() << ": entry mask: "
+                      << *maskInfo.entryMask << "\n");
+    return true;
+  }
+
+  // If the block has only one predecessor, set its entry mask to be its
+  // predecessor's exit mask.
+  const unsigned numPreds = std::distance(pred_begin(&BB), pred_end(&BB));
+  if (numPreds == 1) {
+    BasicBlock *pred = *pred_begin(&BB);
+    maskInfo.entryMask = copyEntryMask(MaskInfos[pred].exitMasks[&BB], BB);
+    LLVM_DEBUG(dbgs() << BB.getName()
+                      << ": entry mask: its single predecessor's exit mask "
+                      << *maskInfo.entryMask << "\n");
+    return true;
+  }
+
+  // If the block is a loop header, its mask is a phi function with incoming
+  // values from the preheader and:
+  //   - the latch for divergent loops,
+  //   - nothing else for uniform loops (because if we enter a uniform loop,
+  //     all instances that were active upon entry remain active upon exit).
+  //
+  // Here we only store the preheader's exit mask; the latch, in case the
+  // loop is divergent, is handled in the caller function.
+  const auto *const LTag = DR->getTag(&BB).loop;
+  if (LTag && LTag->header == &BB) {
+    BasicBlock *preheader = LTag->preheader;
+    VECZ_ERROR_IF(!preheader, "BasicBlock tag is not defined");
+
+    if (LTag->isLoopDivergent()) {
+      PHINode *PHI = PHINode::Create(maskTy, 2, BB.getName() + ".entry_mask");
+      PHI->insertBefore(BB.begin());
+      PHI->addIncoming(MaskInfos[preheader].exitMasks[&BB], preheader);
+      maskInfo.entryMask = PHI;
+      LLVM_DEBUG(dbgs() << "Divergent loop header " << BB.getName()
+                        << ": entry mask: " << *maskInfo.entryMask << "\n");
+    } else {
+      maskInfo.entryMask =
+          copyEntryMask(MaskInfos[preheader].exitMasks[&BB], BB);
+      LLVM_DEBUG(dbgs() << "Uniform loop header " << BB.getName()
+                        << ": entry mask: " << *maskInfo.entryMask << "\n");
+    }
+    return true;
+  }
+
+  // If the dominator of this block is also post-dominated by this block,
+  // then if one is executed, the other must be too. So copy the mask.
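+  // e.g. in a diamond "A -> {B, C} -> D", A dominates D and D
+  // post-dominates A: both run with exactly the same set of active lanes,
+  // so D can simply reuse A's entry mask.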
+ auto *IDom = DT->getNode(&BB)->getIDom(); + while (IDom) { + BasicBlock *DomBB = IDom->getBlock(); + if (DR->getTag(DomBB).loop == LTag && PDT->dominates(&BB, DomBB)) { + maskInfo.entryMask = copyEntryMask(MaskInfos[DomBB].entryMask, BB); + LLVM_DEBUG(dbgs() << "Copied-via-domination " << BB.getName() + << ": entry mask: " << *maskInfo.entryMask << "\n"); + return true; + } + IDom = IDom->getIDom(); + } + + // In any other case, its mask is the disjunction of every incoming edge. + // The union of every predecessor if it is a join point of a varying branch. + if (DR->isBlend(BB)) { + for (auto it = pred_begin(&BB); it != pred_end(&BB); ++it) { + if (it == pred_begin(&BB)) { + maskInfo.entryMask = copyEntryMask(MaskInfos[*it].exitMasks[&BB], BB); + LLVM_DEBUG(dbgs() << "Blend block " << BB.getName() + << ": entry mask: " << *maskInfo.entryMask << "\n"); + } else { + auto InsertPt = std::next(maskInfo.entryMask->getIterator()); + maskInfo.entryMask = BinaryOperator::CreateOr( + maskInfo.entryMask, MaskInfos[*it].exitMasks[&BB], + BB.getName() + ".entry_mask"); + maskInfo.entryMask->insertBefore(InsertPt); + + LLVM_DEBUG(dbgs() << "Blend block " << BB.getName() + << ": entry mask: " << *maskInfo.entryMask << "\n"); + } + } + } else { + // A phi function of the predecessors otherwise. + PHINode *PHI = + PHINode::Create(maskTy, numPreds, BB.getName() + ".entry_mask"); + PHI->insertBefore(BB.begin()); + for (auto it = pred_begin(&BB); it != pred_end(&BB); ++it) { + PHI->addIncoming(MaskInfos[*it].exitMasks[&BB], *it); + } + maskInfo.entryMask = PHI; + LLVM_DEBUG(dbgs() << BB.getName() << ": entry mask: " << *maskInfo.entryMask + << "\n"); + } + + return true; +} + +bool ControlFlowConversionState::Impl::createExitMasks(BasicBlock &BB, + bool isBOSCCEntry) { + assert((!isBOSCCEntry || BOSCC) && + "Creating BOSCC Exit Masks when BOSCC object does not exist!"); + + auto &maskInfo = MaskInfos[&BB]; + + // If BB is a BOSCC entry, we want to compute the uniform exit masks for + // this block. + if (!isBOSCCEntry && !maskInfo.exitMasks.empty()) { + return true; + } + + const unsigned numSucc = std::distance(succ_begin(&BB), succ_end(&BB)); + + // If BB has no successor, there is obviously nothing to do. + if (numSucc == 0) { + return true; + } + + // If BB has only one successor, then the exit mask is the entry mask of BB. + if (numSucc == 1) { + BasicBlock *succ = *succ_begin(&BB); + maskInfo.exitMasks[succ] = + copyExitMask(maskInfo.entryMask, succ->getName(), BB); + LLVM_DEBUG(dbgs() << BB.getName() << ": exit mask to single successor " + << succ->getName() << ": " << *maskInfo.entryMask + << "\n"); + return true; + } + + const bool isVarying = DR->getTag(&BB).hasVaryingBranch(); + + // If BB has more than 1 successor, the exit mask of each successor is the + // conjunction of the entry mask of BB and the condition to jump to the + // successor. 
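+  // i.e. exit_mask(BB -> succ) = entry_mask(BB) & edge_cond, emitted as
+  // "select i1 %entry_mask, i1 %cond, i1 false" so that lanes inactive in
+  // BB can never appear active on one of its outgoing edges.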
+  auto *T = BB.getTerminator();
+  IRBuilder<> B(T);
+
+  if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
+    BasicBlock *trueBB = BI->getSuccessor(0);
+    BasicBlock *falseBB = BI->getSuccessor(1);
+    assert(trueBB && "Could not get successor 0 of branch");
+    assert(falseBB && "Could not get successor 1 of branch");
+
+    if (isBOSCCEntry) {
+      if (BasicBlock *trueBBUniform = BOSCC->getBlock(trueBB)) {
+        trueBB = trueBBUniform;
+      }
+      if (BasicBlock *falseBBUniform = BOSCC->getBlock(falseBB)) {
+        falseBB = falseBBUniform;
+      }
+    }
+
+    Value *cond = BI->getCondition();
+    if (isVarying) {
+      Value *constantFalse = getDefaultValue(cond->getType());
+
+      maskInfo.exitMasks[trueBB] =
+          B.CreateSelect(maskInfo.entryMask, cond, constantFalse,
+                         trueBB->getName() + ".exit_mask");
+
+      // For the false edge, we have to negate the condition.
+      Value *negCond = B.CreateNot(cond, cond->getName() + ".not");
+      maskInfo.exitMasks[falseBB] =
+          B.CreateSelect(maskInfo.entryMask, negCond, constantFalse,
+                         falseBB->getName() + ".exit_mask");
+
+      LLVM_DEBUG(dbgs() << BB.getName() << ": varying exit mask to "
+                        << trueBB->getName() << ": "
+                        << *maskInfo.exitMasks[trueBB] << "\n");
+      LLVM_DEBUG(dbgs() << BB.getName() << ": varying exit mask to "
+                        << falseBB->getName() << ": "
+                        << *maskInfo.exitMasks[falseBB] << "\n");
+    } else {
+      maskInfo.exitMasks[trueBB] = B.CreateSelect(
+          cond, maskInfo.entryMask, getDefaultValue(cond->getType()),
+          trueBB->getName() + ".exit_mask");
+      maskInfo.exitMasks[falseBB] =
+          B.CreateSelect(cond, getDefaultValue(cond->getType()),
+                         maskInfo.entryMask, falseBB->getName() + ".exit_mask");
+
+      LLVM_DEBUG(dbgs() << BB.getName() << ": uniform exit mask to "
+                        << trueBB->getName() << ": "
+                        << *maskInfo.exitMasks[trueBB] << "\n");
+      LLVM_DEBUG(dbgs() << BB.getName() << ": uniform exit mask to "
+                        << falseBB->getName() << ": "
+                        << *maskInfo.exitMasks[falseBB] << "\n");
+    }
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) {
+    Value *cond = SI->getCondition();
+    BasicBlock *defaultDest = SI->getDefaultDest();
+
+    if (isBOSCCEntry) {
+      if (BasicBlock *defaultDestUniform = BOSCC->getBlock(defaultDest)) {
+        defaultDest = defaultDestUniform;
+      }
+    }
+
+    // The default condition is the negation of the disjunction of every case
+    // condition, so that if no case has its condition true, then we choose
+    // the default.
+    Value *caseConds = nullptr;
+    Value *constantFalse = nullptr;
+    for (auto c : SI->cases()) {
+      Value *caseCond = B.CreateICmpEQ(cond, c.getCaseValue());
+      if (!caseConds) {
+        caseConds = caseCond;
+        constantFalse = getDefaultValue(caseCond->getType());
+      } else {
+        caseConds = B.CreateOr(caseConds, caseCond);
+      }
+      BasicBlock *caseBlock = c.getCaseSuccessor();
+      if (isBOSCCEntry) {
+        if (BasicBlock *caseBlockUniform = BOSCC->getBlock(caseBlock)) {
+          caseBlock = caseBlockUniform;
+        }
+      }
+
+      if (isVarying) {
+        maskInfo.exitMasks[caseBlock] =
+            B.CreateSelect(maskInfo.entryMask, caseCond, constantFalse,
+                           caseBlock->getName() + ".exit_mask");
+        LLVM_DEBUG(dbgs() << BB.getName() << ": varying exit mask to "
+                          << caseBlock->getName() << ": "
+                          << *maskInfo.exitMasks[caseBlock] << "\n");
+      } else {
+        maskInfo.exitMasks[caseBlock] =
+            B.CreateSelect(maskInfo.entryMask, caseCond, constantFalse,
+                           caseBlock->getName() + ".exit_mask");
+        LLVM_DEBUG(dbgs() << BB.getName() << ": uniform exit mask to "
+                          << caseBlock->getName() << ": "
+                          << *maskInfo.exitMasks[caseBlock] << "\n");
+      }
+    }
+
+    VECZ_ERROR_IF(!caseConds, "No switch condition was found");
+
+    Value *negCond = B.CreateNot(caseConds, caseConds->getName() + ".not");
+    if (isVarying) {
+      maskInfo.exitMasks[defaultDest] =
+          B.CreateSelect(maskInfo.entryMask, negCond, constantFalse,
+                         defaultDest->getName() + ".exit_mask");
+      LLVM_DEBUG(dbgs() << BB.getName() << ": varying exit mask to "
+                        << defaultDest->getName() << ": "
+                        << *maskInfo.exitMasks[defaultDest] << "\n");
+    } else {
+      maskInfo.exitMasks[defaultDest] =
+          B.CreateSelect(maskInfo.entryMask, negCond, constantFalse,
+                         defaultDest->getName() + ".exit_mask");
+      LLVM_DEBUG(dbgs() << BB.getName() << ": uniform exit mask to "
+                        << defaultDest->getName() << ": "
+                        << *maskInfo.exitMasks[defaultDest] << "\n");
+    }
+  } else {
+    // We should not have a case where we don't have a BranchInst nor a
+    // SwitchInst but more than one successor.
+    return false;
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::createLoopExitMasks(LoopTag &LTag) {
+  auto &LMask = LoopMasks[LTag.loop];
+  // If the loop already has a combined exit mask, we have already processed
+  // it.
+  if (LMask.combinedDivergentExitMask) {
+    return true;
+  }
+
+  Type *maskTy = Type::getInt1Ty(F.getContext());
+  SmallVector<Loop::Edge, 4> exitEdges;
+  LTag.loop->getExitEdges(exitEdges);
+  for (const Loop::Edge &EE : exitEdges) {
+    const auto *const exitingBlock = EE.first;
+    const auto *const exitBlock = EE.second;
+    // Divergent loops need to keep track of which instances left at which
+    // exit.
+    if (LTag.isLoopDivergent() && DR->isDivergent(*exitBlock)) {
+      // The value of the exit mask of a divergent loop is a phi function
+      // between the mask update and the loop exit mask phi.
+      auto *const exitMask =
+          PHINode::Create(maskTy, 2, exitBlock->getName() + ".loop_exit_mask");
+      exitMask->insertBefore(LTag.header->getFirstNonPHIIt());
+      LMask.persistedDivergentExitMasks[exitingBlock] = exitMask;
+      if (BOSCC) {
+        BOSCC->createReference(exitMask, getDefaultValue(maskTy));
+      }
+    }
+  }
+
+  for (Loop *L : LTag.loop->getSubLoops()) {
+    VECZ_FAIL_IF(!createLoopExitMasks(DR->getTag(L)));
+  }
+
+  // If the loop is uniform, all instances that enter the loop will leave it
+  // together.
+  if (!LTag.isLoopDivergent()) {
+    return true;
+  }
+
+  // Check if the exit edge leaves multiple loops, in which case we return
+  // the next inner loop left by it.
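+  // e.g. with loops L1 (outer) and L2 (inner) and a break jumping from a
+  // block of L2 straight out of L1, the edge leaves both loops; when
+  // processing L1 this helper returns L2, whose accumulated update mask
+  // feeds L1's own mask update.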
+  auto nextInnerLoopLeft = [this, &LTag](BasicBlock *exitingBlock,
+                                         BasicBlock *exitBlock) -> Loop * {
+    Loop *innerLoop = nullptr;
+    Loop *loop = DR->getTag(exitingBlock).loop->loop;
+    // Iterate until we reach the current loop.
+    while (loop && loop != LTag.loop) {
+      // If this is an exit edge.
+      if (loop->contains(exitingBlock) && !loop->contains(exitBlock)) {
+        innerLoop = loop;
+      }
+
+      loop = loop->getParentLoop();
+    }
+
+    return innerLoop;
+  };
+
+  for (const Loop::Edge &EE : exitEdges) {
+    BasicBlock *exitingBlock = const_cast<BasicBlock *>(EE.first);
+    BasicBlock *exitBlock = const_cast<BasicBlock *>(EE.second);
+
+    if (DR->isDivergent(*exitBlock)) {
+      PHINode *REM = LMask.persistedDivergentExitMasks[exitingBlock];
+      REM->addIncoming(getDefaultValue(REM->getType()), LTag.preheader);
+
+      const auto *const exitingLTag = DR->getTag(exitingBlock).loop;
+      VECZ_ERROR_IF(!exitingLTag, "Loop tag is not defined");
+
+      // By default, the second operand of the mask update is the exit
+      // condition.
+      auto &exitMasks = MaskInfos[exitingBlock].exitMasks;
+      Value *maskUpdateOperand = exitMasks[exitBlock];
+
+      // If the exit leaves multiple loops and the current loop is not the
+      // innermost left by this exit, set the update mask to be a disjunction
+      // of the exit mask and the accumulated update mask from the next inner
+      // loop left by this exit.
+      if (exitingLTag->loop != LTag.loop) {
+        if (Loop *nestedLoop = nextInnerLoopLeft(exitingBlock, exitBlock)) {
+          maskUpdateOperand =
+              LoopMasks[nestedLoop]
+                  .updatedPersistedDivergentExitMasks[exitingBlock];
+        }
+      }
+
+      BinaryOperator *maskUpdate = BinaryOperator::CreateOr(
+          REM, maskUpdateOperand,
+          exitBlock->getName() + ".loop_exit_mask.update");
+      maskUpdate->insertBefore(exitingBlock->getTerminator()->getIterator());
+
+      LMask.updatedPersistedDivergentExitMasks[exitingBlock] = maskUpdate;
+
+      if (BOSCC) {
+        // The uniform version of a divergent loop exit mask is the edge's
+        // exit mask.
+        BOSCC->addReference(maskUpdate, exitMasks[exitBlock]);
+      }
+
+      // If this is the outermost loop left by this exit, update the exit
+      // mask.
+      if (DR->getTag(exitBlock).outermostExitedLoop == &LTag) {
+        VECZ_ERROR_IF(!isa<Instruction>(exitMasks[exitBlock]),
+                      "Trying to replace uses of a value");
+        VECZ_FAIL_IF(!replaceReachableUses(
+            *RC, cast<Instruction>(exitMasks[exitBlock]), maskUpdate,
+            exitBlock));
+
+        exitMasks[exitBlock] = maskUpdate;
+      }
+
+      REM->addIncoming(maskUpdate, LTag.latch);
+
+      LLVM_DEBUG(dbgs() << "Divergent loop " << LTag.loop->getName()
+                        << ": divergent loop exit edges ["
+                        << exitingBlock->getName() << " -> "
+                        << exitBlock->getName() << "]: exit mask: " << *REM
+                        << "\n");
+      LLVM_DEBUG(dbgs() << "Divergent loop " << LTag.loop->getName()
+                        << ": divergent loop exit edges ["
+                        << exitingBlock->getName() << " -> "
+                        << exitBlock->getName()
+                        << "]: update exit mask: " << *maskUpdate << "\n");
+    }
+  }
+
+  VECZ_FAIL_IF(!createCombinedLoopExitMask(LTag));
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::createCombinedLoopExitMask(
+    LoopTag &LTag) {
+  // Gather all the information about the instances that left the loop in the
+  // current iteration.
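+  // The per-iteration combined mask ORs together every divergent exit
+  // condition taken this time around the loop, while the persisted variant
+  // also ORs in earlier iterations, so the latch knows which lanes remain.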
+  SmallVector<Loop::Edge, 4> exitEdges;
+  auto *const L = LTag.loop;
+  L->getExitEdges(exitEdges);
+  auto &LMask = LoopMasks[L];
+  for (const Loop::Edge &EE : exitEdges) {
+    BasicBlock *exitingBlock = const_cast<BasicBlock *>(EE.first);
+    BasicBlock *exitBlock = const_cast<BasicBlock *>(EE.second);
+    if (DR->isDivergent(*exitBlock)) {
+      if (!LMask.combinedDivergentExitMask) {
+        LMask.combinedDivergentExitMask = copyMask(
+            LMask.updatedPersistedDivergentExitMasks[exitingBlock]->getOperand(
+                1),
+            L->getName() + ".combined_divergent_exit_mask");
+
+        LMask.persistedCombinedDivergentExitMask = copyMask(
+            LMask.updatedPersistedDivergentExitMasks[exitingBlock],
+            L->getName() + ".persisted_combined_divergent_exit_mask");
+      } else {
+        LMask.combinedDivergentExitMask = BinaryOperator::CreateOr(
+            LMask.combinedDivergentExitMask,
+            LMask.updatedPersistedDivergentExitMasks[exitingBlock]->getOperand(
+                1),
+            L->getName() + ".combined_divergent_exit_mask");
+
+        LMask.persistedCombinedDivergentExitMask = BinaryOperator::CreateOr(
+            LMask.persistedCombinedDivergentExitMask,
+            LMask.updatedPersistedDivergentExitMasks[exitingBlock],
+            L->getName() + ".persisted_combined_divergent_exit_mask");
+      }
+      LMask.combinedDivergentExitMask->insertBefore(
+          LTag.latch->getTerminator()->getIterator());
+      LMask.persistedCombinedDivergentExitMask->insertBefore(
+          LTag.latch->getTerminator()->getIterator());
+    }
+  }
+
+  VECZ_ERROR_IF(!LMask.combinedDivergentExitMask ||
+                    !LMask.persistedCombinedDivergentExitMask,
+                "Divergent loop has no loop exit condition");
+
+  LLVM_DEBUG(dbgs() << "Divergent loop " << LTag.loop->getName()
+                    << ": current iteration combined divergent loop exit: "
+                    << *LMask.combinedDivergentExitMask << "\n");
+  LLVM_DEBUG(dbgs() << "Divergent loop " << LTag.loop->getName()
+                    << ": whole loop combined divergent loop exit: "
+                    << *LMask.persistedCombinedDivergentExitMask << "\n");
+
+  return true;
+}
+
+Error ControlFlowConversionState::Impl::applyMasks() {
+  for (auto &BB : F) {
+    // Use masks with instructions that have side-effects.
+    if (!DR->isUniform(BB) && !DR->isByAll(BB)) {
+      auto *const entryMask = MaskInfos[&BB].entryMask;
+      VECZ_ERROR_IF(!entryMask, "BasicBlock should have an entry mask");
+      if (auto err = applyMask(BB, entryMask)) {
+        return err;
+      }
+    }
+  }
+  return Error::success();
+}
+
+Error ControlFlowConversionState::Impl::applyMask(BasicBlock &BB,
+                                                  Value *mask) {
+  // Packetization hasn't happened yet so this had better be a scalar 1-bit
+  // int.
+  assert(mask->getType()->isIntegerTy(1) && "CFG mask type should be int1");
+  // Map the unmasked instruction to the masked one.
+  DeletionMap toDelete;
+  DenseMap<Value *, Value *> safeDivisors;
+
+  for (Instruction &I : BB) {
+    if (tryApplyMaskToBinOp(I, mask, toDelete, safeDivisors)) {
+      continue;
+    }
+    std::optional<MemOp> memOp = MemOp::get(&I);
+    // Turn loads and stores into masked loads and stores.
+    if (memOp && (memOp->isLoad() || memOp->isStore())) {
+      if (!tryApplyMaskToMemOp(*memOp, mask, toDelete)) {
+        return makeStringError("Could not apply mask to MemOp", I);
+      }
+    } else if (auto *CI = dyn_cast<CallInst>(&I)) {
+      // Turn calls into masked calls if possible.
+      if (!applyMaskToCall(CI, mask, toDelete)) {
+        return makeStringError("Could not apply mask to call instruction", I);
+      }
+    } else if (I.isAtomic() && !isa<FenceInst>(&I)) {
+      // Turn atomics into calls to masked builtins if possible.
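+      // e.g. an atomicrmw can be routed through an internal builtin that
+      // takes the mask as an extra operand and only performs the update for
+      // active lanes (see applyMaskToAtomic).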
+      if (!applyMaskToAtomic(I, mask, toDelete)) {
+        return makeStringError("Could not apply mask to atomic instruction", I);
+      }
+    } else if (auto *branch = dyn_cast<BranchInst>(&I)) {
+      // We have to be careful with infinite loops, because if they exist on a
+      // divergent code path, they will always be entered and will hang the
+      // kernel. Therefore, we replace the branch condition with the mask of
+      // the preheader, to ensure they only loop if at least one lane is
+      // actually executed.
+      if (branch->isConditional()) {
+        auto *const cond = dyn_cast<Constant>(branch->getCondition());
+        if (cond && cond->isOneValue()) {
+          auto *const loop = DR->getTag(&BB).loop;
+          if (loop && loop->latch == &BB) {
+            auto *const loopMask = MaskInfos[loop->preheader].entryMask;
+            branch->setCondition(loopMask);
+          }
+        }
+      }
+    }
+  }
+
+  for (auto &pair : toDelete) {
+    Instruction *unmasked = pair.first;
+    Value *masked = pair.second;
+    updateMaps(unmasked, masked);
+    IRCleanup::deleteInstructionNow(unmasked);
+  }
+
+  return Error::success();
+}
+
+CallInst *ControlFlowConversionState::Impl::emitMaskedVersion(CallInst *CI,
+                                                              Value *entryBit) {
+  // Get the masked function.
+  Function *newFunction = Ctx.getOrCreateMaskedFunction(CI);
+  VECZ_FAIL_IF(!newFunction);
+  SmallVector<Value *, 8> fnArgs;
+  for (unsigned i = 0; i < CI->arg_size(); ++i) {
+    fnArgs.push_back(CI->getOperand(i));
+  }
+  fnArgs.push_back(entryBit);
+
+  CallInst *newCI = CallInst::Create(newFunction, fnArgs);
+  newCI->insertBefore(CI->getIterator());
+  newCI->setCallingConv(CI->getCallingConv());
+  newCI->setAttributes(CI->getAttributes());
+
+  return newCI;
+}
+
+bool ControlFlowConversionState::Impl::tryApplyMaskToBinOp(
+    Instruction &I, Value *mask, DeletionMap &toDelete,
+    DenseMap<Value *, Value *> &safeDivisors) {
+  if (auto *binOp = dyn_cast<BinaryOperator>(&I)) {
+    if (!VU.choices().isEnabled(VectorizationChoices::eDivisionExceptions)) {
+      // We don't need to mask division operations if they don't trap.
+      return true;
+    }
+    // We might have to mask integer divides to avoid division errors.
+    // NOTE we don't generate any specific error checks ourselves, on the
+    // assumption that the incoming IR is already guarded against these,
+    // so it is sufficient to use the mask generated from the CFG.
+    bool isUnsigned = false;
+    switch (binOp->getOpcode()) {
+      case Instruction::UDiv:
+      case Instruction::URem:
+        isUnsigned = true;
+        LLVM_FALLTHROUGH;
+      case Instruction::SDiv:
+      case Instruction::SRem: {
+        auto *divisor = binOp->getOperand(1);
+        // No need to mask divides by a constant.
+        if (auto *C = dyn_cast<Constant>(divisor)) {
+          if (C->isZeroValue()) {
+            // Divides by constant zero can be a NOP since there is no
+            // division by zero exception in OpenCL.
+            auto *nop = binOp->getOperand(0);
+            I.replaceAllUsesWith(nop);
+            toDelete.emplace_back(&I, nop);
+          }
+        } else {
+          auto &masked = safeDivisors[divisor];
+          if (!masked) {
+            // NOTE this function does not check for the pattern
+            // "select (x eq 0) 1, x" or equivalent, so we might want to
+            // write it ourselves, but Instruction Combining cleans it up.
+            // NOTE that for a signed division, we also have to consider the
+            // potential overflow situation, which is not so simple.
+            if (isUnsigned &&
+                isKnownNonZero(divisor, F.getParent()->getDataLayout())) {
+              // Static analysis concluded it can't be zero, so we don't need
+              // to do anything.
+              masked = divisor;
+            } else {
+              auto *SI = SelectInst::Create(
+                  mask, divisor, ConstantInt::get(divisor->getType(), 1),
+                  divisor->getName() + ".masked");
+              SI->insertBefore(I.getIterator());
+              masked = SI;
+            }
+          }
+
+          if (masked != divisor) {
+            binOp->setOperand(1, masked);
+          }
+        }
+      } break;
+
+      default:
+        break;
+    }
+    return true;
+  } else {
+    return false;
+  }
+}
+
+bool ControlFlowConversionState::Impl::tryApplyMaskToMemOp(
+    MemOp &memOp, Value *mask, DeletionMap &toDelete) {
+  VECZ_FAIL_IF(!memOp.isLoad() && !memOp.isStore());
+  auto *I = memOp.getInstr();
+  VECZ_FAIL_IF(!I);
+  auto *dataVecTy = dyn_cast<FixedVectorType>(memOp.getDataType());
+  const unsigned dataWidth = dataVecTy ? dataVecTy->getNumElements() : 1;
+  Value *wideMask = mask;
+  if (dataWidth > 1) {
+    // If it's a vector mem-op it gets the same mask for every element.
+    IRBuilder<> B(I);
+    wideMask = B.CreateVectorSplat(dataWidth, mask);
+  }
+
+  // Turn loads and stores into masked loads and stores.
+  if (memOp.isLoadStoreInst()) {
+    // Create a new mem-op the same as the original except for the addition
+    // of the mask.
+    Instruction *newVal = nullptr;
+    if (memOp.isLoad()) {
+      newVal = createMaskedLoad(
+          Ctx, memOp.getDataType(), memOp.getPointerOperand(), wideMask,
+          /*VL*/ nullptr, memOp.getAlignment(), I->getName());
+    } else {
+      newVal = createMaskedStore(
+          Ctx, memOp.getDataOperand(), memOp.getPointerOperand(), wideMask,
+          /*VL*/ nullptr, memOp.getAlignment(), I->getName());
+    }
+    VECZ_FAIL_IF(!newVal);
+
+    newVal->insertBefore(I->getIterator());
+
+    if (!I->getType()->isVoidTy()) {
+      I->replaceAllUsesWith(newVal);
+    }
+    toDelete.emplace_back(I, newVal);
+    return true;
+  }
+
+  if (auto *opMask = memOp.getMaskOperand()) {
+    auto *mask = BinaryOperator::CreateAnd(wideMask, opMask, "composite_mask");
+    mask->insertBefore(I->getIterator());
+    memOp.setMaskOperand(mask);
+    return true;
+  }
+
+  return false;
+}
+
+bool ControlFlowConversionState::Impl::applyMaskToCall(CallInst *CI,
+                                                       Value *mask,
+                                                       DeletionMap &toDelete) {
+  LLVM_DEBUG(dbgs() << "vecz-cf: Now at CallInst " << *CI << "\n");
+  // It might be that we need to mask the function call here because we
+  // won't be able to packetize it later on.
+  Function *callee = CI->getCalledFunction();
+  if (!callee) {
+    callee = dyn_cast<Function>(CI->getCalledOperand()->stripPointerCasts());
+  }
+  VECZ_FAIL_IF(!callee);  // TODO: Support indirect function calls.
+  // Check to see if this is a function that we know we won't be able to
+  // handle in any other way.
+  VECZ_FAIL_IF(callee->cannotDuplicate());
+
+  // Do not mess with internal builtins.
+  if (Ctx.isInternalBuiltin(callee)) {
+    LLVM_DEBUG(dbgs() << "vecz-cf: Called function is an internal builtin\n");
+    return true;
+  }
+
+  // Functions without side-effects do not need to be masked.
+  if (callee->onlyReadsMemory() || callee->doesNotAccessMemory()) {
+    LLVM_DEBUG(
+        dbgs() << "vecz-cf: Called function does not have any side-effects\n");
+    return true;
+  }
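+  // Illustrative sketch (hypothetical names, not from the original source):
+  // when a call with side-effects survives the checks below, emitMaskedVersion
+  // rewrites it so the mask becomes a trailing parameter, e.g.
+  //   %r = call i32 @foo(i32 %x)
+  // becomes something along the lines of
+  //   %r = call i32 @foo.masked(i32 %x, i1 %mask)
+  // where the masked wrapper only performs the work for active lanes.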
+  // Builtins without side effects do not need to be masked.
+  if (const auto builtin = Ctx.builtins().analyzeBuiltin(*callee)) {
+    const auto props = builtin->properties;
+    if (props & compiler::utils::eBuiltinPropertyNoSideEffects) {
+      LLVM_DEBUG(dbgs() << "vecz-cf: Called function is a pure builtin\n");
+      return true;
+    }
+    if (props & compiler::utils::eBuiltinPropertyWorkItem) {
+      LLVM_DEBUG(
+          dbgs() << "vecz-cf: Called function is a workitem ID builtin\n");
+      return true;
+    }
+    if (props & compiler::utils::eBuiltinPropertyExecutionFlow) {
+      LLVM_DEBUG(
+          dbgs() << "vecz-cf: Called function is an execution flow builtin\n");
+      // Masking this kind of builtin (a barrier) is not valid.
+      return false;
+    }
+    // We don't want to mask work-group collective builtins, because they are
+    // barriers (see above). This should actually be a rare situation, as these
+    // builtins are required to be uniform/convergent and so either all
+    // work-items or no work-items should hit them. Most of the time, this
+    // situation relies on the vectorizer failing to trace the branch flow and
+    // failing to realize the conditions are in fact uniform.
+    if (auto info = Ctx.builtins().isMuxGroupCollective(builtin->ID);
+        info && info->isWorkGroupScope()) {
+      LLVM_DEBUG(
+          dbgs() << "vecz-cf: Called function is a work-group collective\n");
+      return true;
+    }
+  }
+
+  // Create the new function and replace the old one with it.
+  CallInst *newCI = emitMaskedVersion(CI, mask);
+  VECZ_FAIL_IF(!newCI);
+  if (!CI->getType()->isVoidTy()) {
+    CI->replaceAllUsesWith(newCI);
+  }
+  toDelete.emplace_back(CI, newCI);
+
+  LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << *CI << "\n");
+  LLVM_DEBUG(dbgs() << "          with " << *newCI << "\n");
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::applyMaskToAtomic(
+    Instruction &I, Value *mask, DeletionMap &toDelete) {
+  LLVM_DEBUG(dbgs() << "vecz-cf: Now at atomic inst " << I << "\n");
+
+  SmallVector<Value *, 8> maskedFnArgs;
+  VectorizationContext::MaskedAtomic MA;
+  MA.VF = ElementCount::getFixed(1);
+  MA.IsVectorPredicated = VU.choices().vectorPredication();
+
+  if (auto *atomicI = dyn_cast<AtomicRMWInst>(&I)) {
+    MA.Align = atomicI->getAlign();
+    MA.BinOp = atomicI->getOperation();
+    MA.IsVolatile = atomicI->isVolatile();
+    MA.Ordering = atomicI->getOrdering();
+    MA.SyncScope = atomicI->getSyncScopeID();
+    MA.ValTy = atomicI->getType();
+    MA.PointerTy = atomicI->getPointerOperand()->getType();
+
+    // Set up the arguments to this function.
+    maskedFnArgs = {atomicI->getPointerOperand(), atomicI->getValOperand(),
+                    mask};
+
+  } else if (auto *cmpxchgI = dyn_cast<AtomicCmpXchgInst>(&I)) {
+    MA.Align = cmpxchgI->getAlign();
+    MA.BinOp = AtomicRMWInst::BAD_BINOP;
+    MA.IsWeak = cmpxchgI->isWeak();
+    MA.IsVolatile = cmpxchgI->isVolatile();
+    MA.Ordering = cmpxchgI->getSuccessOrdering();
+    MA.CmpXchgFailureOrdering = cmpxchgI->getFailureOrdering();
+    MA.SyncScope = cmpxchgI->getSyncScopeID();
+    MA.ValTy = cmpxchgI->getCompareOperand()->getType();
+    MA.PointerTy = cmpxchgI->getPointerOperand()->getType();
+
+    // Set up the arguments to this function.
+    maskedFnArgs = {cmpxchgI->getPointerOperand(),
+                    cmpxchgI->getCompareOperand(), cmpxchgI->getNewValOperand(),
+                    mask};
+  } else {
+    return false;
+  }
+
+  // Create the new function and replace the old one with it.
+  // Get the masked function.
+  Function *maskedAtomicFn = Ctx.getOrCreateMaskedAtomicFunction(
+      MA, VU.choices(), ElementCount::getFixed(1));
+  VECZ_FAIL_IF(!maskedAtomicFn);
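+  // Illustrative sketch (hypothetical builtin name, not from the original
+  // source): an atomic such as
+  //   %old = atomicrmw add ptr %p, i32 %v acq_rel
+  // is replaced below by a call to a masked builtin along the lines of
+  //   %old = call i32 @masked_atomicrmw_add(ptr %p, i32 %v, i1 %mask)
+  // which only performs the atomic operation for active lanes.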
+  // We don't have a vector length just yet - pass in one as a dummy.
+  if (MA.IsVectorPredicated) {
+    maskedFnArgs.push_back(
+        ConstantInt::get(IntegerType::getInt32Ty(I.getContext()), 1));
+  }
+
+  CallInst *maskedCI = CallInst::Create(maskedAtomicFn, maskedFnArgs);
+  VECZ_FAIL_IF(!maskedCI);
+  maskedCI->insertBefore(I.getIterator());
+
+  I.replaceAllUsesWith(maskedCI);
+  toDelete.emplace_back(&I, maskedCI);
+
+  LLVM_DEBUG(dbgs() << "vecz-cf: Replaced " << I << "\n");
+  LLVM_DEBUG(dbgs() << "          with " << *maskedCI << "\n");
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::partiallyLinearizeCFG() {
+  // Two methods are possible to transform the divergent loops into uniform
+  // ones:
+  // 1) rewire the exit edges to the single latch, which means the loop live
+  //    masks have to be updated at each exiting block.
+  // 2) delete the divergent loop exit edges and update the loop live masks at
+  //    the latch.
+  //
+  // The former means more overhead when a loop exit is reached because we
+  // always have to update the masks, but it allows us to retain the exiting
+  // branches.
+  // The latter means we only blend at the latch, thus less overhead at the
+  // loop exits, but if we reach a divergent loop exit, and it happens that all
+  // lanes have exited the loop, we still have to finish the iteration until we
+  // reach the latch and exit the loop.
+  //
+  // We are currently using the latter.
+  VECZ_FAIL_IF(!uniformizeDivergentLoops());
+
+  // ... and actually rewire them.
+  VECZ_FAIL_IF(!linearizeCFG());
+
+  // Transform phi nodes into selects for blocks that got blended.
+  VECZ_FAIL_IF(!generateSelects());
+
+  // Connect BOSCC regions if it is activated.
+  VECZ_FAIL_IF(BOSCC && !BOSCC->connectBOSCCRegions());
+
+  // Repair the CFG because the rewiring broke it.
+  VECZ_FAIL_IF(!repairSSA());
+
+  // Now we create the opaque calls to builtins that compute the real branch
+  // values. This must come before instruction simplification, otherwise LLVM
+  // can fold branch predicates that appear unreachable now, but would later
+  // become vector masks, thus mangling the control flow.
+  VECZ_FAIL_IF(!createBranchReductions());
+
+  // ... and now we can do instruction simplification on the masks and know
+  // they won't be prematurely folded.
+  VECZ_FAIL_IF(!simplifyMasks());
+
+  // Finally, if we used BOSCC it might want to do some tidying up.
+  VECZ_FAIL_IF(BOSCC && !BOSCC->cleanUp());
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::createBranchReductions() {
+  // Try to retrieve the builtin if it already exists.
+  const auto baseName =
+      Twine(VectorizationContext::InternalBuiltinPrefix).concat("divergence");
+  const StringRef nameAny = "_any";
+  const StringRef nameAll = "_all";
+
+  Type *boolTy = Type::getInt1Ty(F.getContext());
+  FunctionType *FT = FunctionType::get(boolTy, {boolTy}, false);
+
+  for (BasicBlock &BB : F) {
+    const bool needsAllOfMask = DR->hasFlag(BB, eBlockNeedsAllOfMask);
+
+    // If the block is uniform and is not a BOSCC indirection, all its lanes
+    // are true or false, not both. Thus, we don't need to packetize the
+    // condition.
+    if (!needsAllOfMask && DR->isUniform(BB)) {
+      continue;
+    }
+
+    auto *TI = BB.getTerminator();
+    if (BranchInst *Branch = dyn_cast<BranchInst>(TI)) {
+      if (Branch->isConditional()) {
+        auto *cond = Branch->getCondition();
+        if (isa<Constant>(cond)) {
+          continue;
+        }
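+        // Illustration (sketch only; the builtin name is derived from the
+        // internal builtin prefix as in the code below): a divergent
+        // conditional branch
+        //   br i1 %cond, label %then, label %else
+        // is rewritten into a reduction over all lanes, e.g.
+        //   %cond_any = call i1 @<prefix>divergence_any(i1 %cond)
+        //   br i1 %cond_any, label %then, label %else
+        // with the "_all" variant used when the block needs an all-of mask.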
+        // On divergent paths, ensure that only active lanes contribute to a
+        // branch condition; merge the branch condition with the active lane
+        // mask. This ensures that disabled lanes don't spuriously contribute a
+        // 'true' value into the reduced branch condition.
+        // Note that the distinction between 'uniform' and 'divergent' isn't
+        // 100% sufficient for our purposes here, because even uniform values
+        // may read undefined/poison values when masked out.
+        // Don't perform this on uniform loops as those may be unconditionally
+        // entered even when no work-items are active. Masking the loop exit
+        // with the entry mask would mean that the loop never exits.
+        // FIXME: Is this missing incorrect branches in uniform blocks/loops?
+        if (auto *LTag = DR->getTag(&BB).loop;
+            DR->isDivergent(BB) && (!LTag || LTag->isLoopDivergent())) {
+          if (!isBranchCondTrulyUniform(cond, *UVR)) {
+            auto *newcond = SelectInst::Create(MaskInfos[&BB].entryMask, cond,
+                                               getDefaultValue(cond->getType()),
+                                               cond->getName() + "_active");
+            newcond->insertBefore(Branch->getIterator());
+            cond = newcond;
+          }
+        }
+
+        const auto &name = needsAllOfMask ? nameAll : nameAny;
+        Function *const F = Ctx.getOrCreateInternalBuiltin(
+            Twine(baseName).concat(name).str(), FT);
+        VECZ_FAIL_IF(!F);
+
+        auto *const newCall =
+            CallInst::Create(F, {cond}, Twine(cond->getName()).concat(name));
+        newCall->insertBefore(Branch->getIterator());
+        Branch->setCondition(newCall);
+      }
+    } else if (isa<SwitchInst>(TI) &&
+               DR->hasFlag(BB, eBlockHasDivergentBranch)) {
+      // Not sure what to actually do with switch instructions.
+      return false;
+    }
+  }
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::uniformizeDivergentLoops() {
+  LLVM_DEBUG(dbgs() << "CFC: UNIFORMIZE DIVERGENT LOOPS\n");
+
+  // For every divergent loop of the function, we want to create a new exit
+  // edge whose source is the latch of the loop. That exit is called "pure".
+  // The target of this edge is a new divergent loop exit that will start a
+  // cascade of if conditions to branch to the original loop exits. The
+  // divergent loop exits will no longer be exits, while the optional loop
+  // exits will retain their branch but they will be rewired to the pure exit.
+  //
+  // Given the following *divergent* loop:
+  //
+  //        preheader
+  //            |
+  //          header <----------.
+  //           /  \             |
+  //         ...  ...           |
+  //         /      \           |
+  //   %exit2.o     ...         |
+  //    /    /        \         |
+  //  %d  %exit1.o    ...       |
+  //       /    \               |
+  //     %b     ...             |
+  //           /   \            |
+  //    %exit2.r   ...          |
+  //      /    \                |
+  //    %c    %latch.r ---------'
+  //           /
+  //    %exit1.r
+  //        |
+  //       %a
+  //
+  // with:
+  //   - %a, %b, %c, %d = a group of non-specific basic blocks
+  //   - %exit*.* = loop exits
+  //   - *.o = optional blocks
+  //   - *.r = divergent blocks
+  //   - %latch.r = the latch of the loop. It is necessarily a divergent
+  //                block because the loop is divergent
+  //
+  // The following transformation is performed:
+  //
+  //        preheader
+  //            |
+  //          header <---------------.
+  //           /  \                  |
+  //         ...  ...                |
+  //         /      \                |
+  //  %exit2.split1.o  ...           |
+  //     |      /        \           |
+  //     \  %exit1.split1.o  ...     |
+  //      \     |             \      |
+  //       \     \            ...    |
+  //        \     \             \    |
+  //         \     \            ...  |
+  //          \     \             \  |
+  //           \     \         %latch.r --'
+  //            \     \            |
+  //             `-----`--> %loop.pure_exit
+  //                       /         |
+  //               %exit1.r    %exit1.else.r
+  //               /          /        |
+  //             %a    %exit2.r   %exit2.else.r
+  //                   /          /        |
+  //                 %c          /         |
+  //                            /          |
+  //             %exit1.split2.o    %exit1.else.o
+  //             /             /           |
+  //           %b   %exit2.split2.o   %exit2.else.o
+  //                /
+  //              %d
+  //
+  // with:
+  //   - %exit*.split1.o = the first half of the original %exit*.o with only
+  //     phi nodes
+  //   - %exit*.split2.o = the second half of the original %exit*.o without
+  //     the phi nodes
+  //   - %loop.pure_exit = a new loop exit starting a cascade of ifs towards
+  //     the original loop exits
+  //   - %exit*.else.* = a new block whose only purpose is to branch to other
+  //     blocks
+  //
+  // Each introduced conditional branch uses the entry mask of the exit block
+  // as the condition.
+  // Each introduced divergent conditional block is marked as div causing,
+  // thus linearizing them.
+  // Each introduced optional conditional block is marked as divergent, thus
+  // retaining the branches and branching to the true path only if any of the
+  // lanes that executed the loop left through the exit the true path targets.
+  //
+  // The state of the loop after the transformation is invalid and relies on
+  // the linearizer to correctly rewire the introduced blocks. The result of
+  // the above transformed loop after linearization will be:
+  //
+  //        preheader
+  //            |
+  //          header <----------------.
+  //           /  \                   |
+  //         ...  ...                 |
+  //         /      \                 |
+  //  %exit2.split1.o  ...            |
+  //     |      \                     |
+  //     |      ...                   |
+  //     |        \                   |
+  //     |        ...                 |
+  //     |        /  \                |
+  //     |  %exit1.split1.o  ...      |
+  //      \     |             \       |
+  //       \    |          %latch.r --'
+  //        \   |             |
+  //         `---> %loop.pure_exit
+  //                      |
+  //                  %exit1.r
+  //                      |
+  //                     %a
+  //                      |
+  //               %exit1.else.r
+  //                      |
+  //                  %exit2.r
+  //                      |
+  //                     %c
+  //                      |
+  //               %exit2.else.r
+  //               /          |
+  //     %exit1.split2.o   %exit1.else.o
+  //      /               /        |
+  //    %b   %exit2.split2.o  %exit2.else.o
+  //         /                    ...
+  //       %d
+  //
+  // Note that only one branch introduced from an optional loop exit
+  // ('%exit2.else.r' and '%exit1.else.o' in this example) can evaluate to
+  // true because as soon as an optional loop exit is taken, all the active
+  // lanes in the loop leave through it.
+  // However, as many as all the branches introduced from divergent loop exits
+  // may evaluate to true. The '...' at the end of the CFG will be replaced by
+  // whatever would originally succeed the original divergent loop exits.
+  bool modified = false;
+  for (auto *const LTag : DR->getLoopOrdering()) {
+    if (LTag->isLoopDivergent()) {
+      Loop *L = LTag->loop;
+
+      // Store the loop exit blocks and edges before doing any modification.
+      SmallVector<BasicBlock *, 4> exitBlocks;
+      SmallVector<Loop::Edge, 4> exitEdges;
+      {
+        L->getExitEdges(exitEdges);
+        // 1) Retrieve the unique loop exit blocks.
+        // 2) Remove any loop exit for which 'L' is not the outermost loop
+        //    left.
+        // 3) Sort the loop exit blocks.
+        //
+        // We can't use the `getUniqueExitBlocks' method because the loop may
+        // not be in a canonical form because of BOSCC.
+        if (BOSCC) {
+          L->getExitBlocks(exitBlocks);
+          SmallPtrSet<BasicBlock *, 4> _uniqueExitBlocks;
+          for (auto it = exitBlocks.begin(); it != exitBlocks.end();) {
+            if (!_uniqueExitBlocks.insert(*it).second) {
+              it = exitBlocks.erase(it);
+            } else {
+              ++it;
+            }
+          }
+        } else {
+          L->getUniqueExitBlocks(exitBlocks);
+        }
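+        // Worked example (hypothetical blocks, for illustration only): with
+        // exits {%e1 (optional, DCBI 5), %e2 (divergent, DCBI 7),
+        // %e3 (divergent, DCBI 3)}, the partition and sorts below yield
+        // [%e3, %e2, %e1]: divergent exits first, each group in increasing
+        // dominance-compact index order.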
+        // Only handle outermost loops left by the exits.
+        exitBlocks.erase(
+            std::remove_if(exitBlocks.begin(), exitBlocks.end(),
+                           [this, LTag](BasicBlock *EB) {
+                             return DR->getTag(EB).outermostExitedLoop != LTag;
+                           }),
+            exitBlocks.end());
+        // Order the loop exit blocks such that:
+        //   - divergent loop exits come first
+        //   - smallest DCBI come first
+        const auto middle = std::partition(
+            exitBlocks.begin(), exitBlocks.end(),
+            [this](BasicBlock *BB) { return DR->isDivergent(*BB); });
+        std::sort(exitBlocks.begin(), middle,
+                  [this](BasicBlock *LHS, BasicBlock *RHS) {
+                    return DR->getTagIndex(LHS) < DR->getTagIndex(RHS);
+                  });
+        std::sort(middle, exitBlocks.end(),
+                  [this](BasicBlock *LHS, BasicBlock *RHS) {
+                    return DR->getTagIndex(LHS) < DR->getTagIndex(RHS);
+                  });
+      }
+
+      if (exitBlocks.empty()) {
+        LLVM_DEBUG(dbgs() << "Loop " << L->getName()
+                          << " has no loop exits eligible for rewiring.\n");
+        continue;
+      }
+
+      VECZ_FAIL_IF(!computeDivergentLoopPureExit(*LTag));
+      VECZ_FAIL_IF(!rewireDivergentLoopExitBlocks(*LTag, exitBlocks));
+
+      VECZ_FAIL_IF(!generateDivergentLoopResults(*LTag));
+      VECZ_FAIL_IF(!blendDivergentLoopLiveValues(*LTag, exitBlocks));
+      VECZ_FAIL_IF(!blendDivergentLoopExitMasks(*LTag, exitEdges, exitBlocks));
+
+      modified = true;
+    }
+  }
+
+  // We have modified the divergent loops into uniform ones, thus changing the
+  // dominance-compact block ordering. We need to recompute it.
+  if (modified) {
+    DT->recalculate(F);
+    PDT->recalculate(F);
+    // And make sure we correctly updated the DomTrees.
+    VECZ_ERROR_IF(!DT->verify(), "DominatorTree incorrectly updated");
+    VECZ_ERROR_IF(!PDT->verify(), "PostDominatorTree incorrectly updated");
+    VECZ_FAIL_IF(!computeBlockOrdering());
+
+    RC->clear();
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::computeDivergentLoopPureExit(
+    LoopTag &LTag) {
+  LLVM_DEBUG(dbgs() << "CFC: COMPUTE PURE EXIT FOR LOOP "
+                    << LTag.loop->getName() << "\n");
+
+  auto *const latchBB = LTag.latch;
+  BasicBlock *pureExit =
+      BasicBlock::Create(F.getContext(), LTag.loop->getName() + ".pure_exit",
+                         &F, latchBB->getNextNode());
+  BasicBlockTag &pureExitTag = DR->getOrCreateTag(pureExit);
+
+  // Set the tags.
+  auto &LMask = LoopMasks[LTag.loop];
+  MaskInfos[pureExit].entryMask = LMask.persistedCombinedDivergentExitMask;
+  pureExitTag.outermostExitedLoop = &LTag;
+
+  auto *const preheaderLoopTag = DR->getTag(LTag.preheader).loop;
+  if (preheaderLoopTag) {
+    pureExitTag.loop = preheaderLoopTag;
+    preheaderLoopTag->loop->addBasicBlockToLoop(pureExit, *LI);
+  }
+  DR->setFlag(*pureExit,
+              static_cast<BlockDivergenceFlag>(
+                  BlockDivergenceFlag::eBlockIsVirtualDivergentLoopExit |
+                  BlockDivergenceFlag::eBlockHasDivergentBranch |
+                  BlockDivergenceFlag::eBlockIsDivergent));
+
+  LTag.pureExit = pureExit;
+
+  LLVM_DEBUG(dbgs() << "Pure exit: " << pureExit->getName() << "\n");
+
+  if (BOSCC) {
+    BOSCC->addInRegions(pureExit, latchBB);
+  }
+
+  auto *latchT = latchBB->getTerminator();
+#ifndef ALL_OF_DIVERGENT_LOOP_LATCH
+  Value *cond = MaskInfos[latchBB].exitMasks[LTag.header];
+  auto *newT = BranchInst::Create(LTag.header, pureExit, cond, latchBB);
+#else
+  // Exit the loop through the single divergent loop exit only if all instances
+  // that entered the loop left it.
+  ICmpInst *cond = new ICmpInst(
+      latchT, CmpInst::ICMP_EQ, LMask.persistedCombinedDivergentExitMask,
+      MaskInfos[LTag.preheader].exitMasks[LTag.header]);
+  auto *newT = BranchInst::Create(pureExit, LTag.header, cond, latchBB);
+  DR->setFlag(*latchBB, eBlockNeedsAllOfMask);
+#endif
+
+  updateMaps(latchT, newT);
+
+  IRCleanup::deleteInstructionNow(latchT);
+
+  MaskInfos[latchBB].exitMasks[pureExit] =
+      LMask.persistedCombinedDivergentExitMask;
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::rewireDivergentLoopExitBlocks(
+    LoopTag &LTag, const SmallVectorImpl<BasicBlock *> &exitBlocks) {
+  LLVM_DEBUG(dbgs() << "CFC: REWIRE EXIT BLOCKS FOR LOOP "
+                    << LTag.loop->getName() << "\n");
+
+  auto removeSuccessor = [this](Instruction *T, unsigned succIdx) {
+    switch (T->getOpcode()) {
+      default:
+        // Any other kind of Terminator cannot be handled and until
+        // proven otherwise, should not.
+        break;
+      case Instruction::Br: {
+        const unsigned keepIdx = succIdx == 0 ? 1 : 0;
+        auto *newT = BranchInst::Create(T->getSuccessor(keepIdx));
+        newT->insertBefore(T->getIterator());
+
+        updateMaps(T, newT);
+
+        IRCleanup::deleteInstructionNow(T);
+        break;
+      }
+      case Instruction::Switch: {
+        SwitchInst *SI = cast<SwitchInst>(T);
+        if (succIdx == 0) {
+          SI->setDefaultDest(SI->getSuccessor(1));
+          SI->removeCase(SI->case_begin());
+        } else {
+          SI->removeCase(std::next(SI->case_begin(), succIdx - 1));
+        }
+        break;
+      }
+      case Instruction::IndirectBr: {
+        IndirectBrInst *IBI = cast<IndirectBrInst>(T);
+        IBI->removeDestination(succIdx);
+        break;
+      }
+    }
+  };
+
+  // 'divergentLE' represents the current virtual divergent loop exit that a
+  // loop exit needs to be rewired to/from.
+  BasicBlock *divergentLE = LTag.pureExit;
+  for (unsigned idx = 0; idx < exitBlocks.size(); ++idx) {
+    BasicBlock *EB = exitBlocks[idx];
+
+    // The target of 'divergentLE'.
+    BasicBlock *target = nullptr;
+
+    // If 'EB' is optional, we split it at the terminator so that the exiting
+    // block keeps its edge towards it. The second half of 'EB' will be
+    // targeted by the cascade if.
+    if (DR->isOptional(*EB)) {
+      LLVM_DEBUG(dbgs() << "Optional loop exit " << EB->getName() << ":\n");
+
+      target =
+          EB->splitBasicBlock(EB->getTerminator(), EB->getName() + ".split");
+      auto &targetTag = DR->getOrCreateTag(target);
+
+      LLVM_DEBUG(dbgs() << "\tSplit " << EB->getName() << " into "
+                        << target->getName() << "\n");
+
+      // Set the tags.
+      // We have to be very careful copying a value from one key to another, in
+      // case one key did not exist, and constructing it caused rehashing.
+      {
+        auto EBmasks = MaskInfos[EB];
+        MaskInfos[target] = std::move(EBmasks);
+      }
+
+      auto *const EBLoopTag = DR->getTag(EB).loop;
+      if (EBLoopTag) {
+        targetTag.loop = EBLoopTag;
+        EBLoopTag->loop->addBasicBlockToLoop(target, *LI);
+      }
+
+      // If 'EB' is the preheader of a loop then 'target' takes its place.
+      for (auto *const ordered : DR->getLoopOrdering()) {
+        if (ordered->preheader == EB) {
+          LLVM_DEBUG(dbgs()
+                     << "\t" << target->getName() << " is now the preheader of "
+                     << ordered->loop->getName() << "\n");
+          ordered->preheader = target;
+        }
+      }
+
+      if (BOSCC) {
+        BOSCC->addReference(target, EB);
+        BOSCC->addInRegions(target, EB);
+      }
+      DR->setFlag(*target, DR->getFlag(*EB));
+
+      // Rewire 'EB' to the pure exit.
+      auto *const pureExit = LTag.pureExit;
+      EB->getTerminator()->setSuccessor(0, pureExit);
+
+      LLVM_DEBUG(dbgs() << "\t" << EB->getName() << " now targets "
+                        << pureExit->getName() << "\n");
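+      // Illustration (hypothetical blocks): an optional exit
+      //   %exit.o:       [ phis... ; br %succ ]
+      // has now become
+      //   %exit.o:       [ phis... ; br %loop.pure_exit ]
+      //   %exit.o.split: [ br %succ ]
+      // so the exiting edge is kept while the cascade can still reach the
+      // original successor through the split half.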
+      // Retain branch for optional loop exits.
+      DR->clearFlag(*divergentLE,
+                    BlockDivergenceFlag::eBlockHasDivergentBranch);
+      // Set all-of mask because the first successor of 'divergentLE' is taken
+      // if no lane exited through the optional loop exit.
+      DR->setFlag(*divergentLE, eBlockNeedsAllOfMask);
+
+      // 'EB' now has only one single exit edge.
+      auto &EBmasks = MaskInfos[EB];
+      EBmasks.exitMasks[pureExit] = EBmasks.entryMask;
+    } else {
+      LLVM_DEBUG(dbgs() << "Divergent loop exit " << EB->getName() << ":\n");
+
+      // Otherwise, the edge exiting-block-to-divergent-exit-block is
+      // removed ...
+      {
+        SmallPtrSet<BasicBlock *, 4> predsToRemove;
+        for (BasicBlock *pred : predecessors(EB)) {
+          const auto *const predLTag = DR->getTag(pred).loop;
+          // All predecessors of the divergent loop exit that belong in a loop
+          // contained in the outermost loop left by that exit need their
+          // edge removed.
+          if (predLTag && LTag.loop->contains(predLTag->loop)) {
+            predsToRemove.insert(pred);
+          }
+        }
+        for (BasicBlock *pred : predsToRemove) {
+          auto *predT = pred->getTerminator();
+          for (unsigned succIdx = 0; succIdx < predT->getNumSuccessors();
+               ++succIdx) {
+            if (predT->getSuccessor(succIdx) == EB) {
+              removeSuccessor(predT, succIdx);
+              LLVM_DEBUG(dbgs() << "\tRemove predecessor: " << pred->getName()
+                                << "\n");
+
+              break;
+            }
+          }
+        }
+        PHINode *PHI = nullptr;
+        while ((PHI = dyn_cast<PHINode>(&EB->front()))) {
+          VECZ_FAIL_IF(!generateSelectFromPHI(PHI, EB));
+        }
+      }
+
+      // ... and the exit block gets targeted by the current divergent loop
+      // exit.
+      target = EB;
+    }
+
+    VECZ_ERROR_IF(!target, "No target was found");
+
+    // If we are processing the last exit block, and it happens to be
+    // divergent, there is no optional loop exit it can branch to, so create an
+    // unconditional branch.
+    if ((idx + 1 == exitBlocks.size()) && DR->isDivergent(*target)) {
+      BranchInst::Create(target, divergentLE);
+      auto &maskInfo = MaskInfos[divergentLE];
+      maskInfo.exitMasks[target] = maskInfo.entryMask;
+
+      LLVM_DEBUG(dbgs() << "\tVirtual Divergent Loop Exit "
+                        << divergentLE->getName()
+                        << ":\n\t\tSuccessor 0: " << target->getName() << "\n");
+    } else {
+      // The DCBI ordering sets the right sibling to be of an index less than
+      // the left sibling if they are on the same level of dominance. For that
+      // reason, we want to set the original loop exit as the right sibling so
+      // that the latter gets processed first while linearizing, and branches
+      // to the left sibling. We thus have to negate the condition to do so.
+      //
+      // The said condition is the entry mask of the exit block, i.e. whether
+      // any exiting block left through it.
+      auto &targetMasks = MaskInfos[target];
+      Instruction *cond = cast<Instruction>(targetMasks.entryMask);
+      // If that entry mask is defined in the loop (if the exit block has only
+      // one predecessor), then we can directly use that mask as the condition.
+      // Otherwise, we must move the latter in the pure exit so that
+      // 'divergentLE' can refer to it.
+      if (cond->getParent() == target) {
+        if (PHINode *PHI = dyn_cast<PHINode>(cond)) {
+          VECZ_FAIL_IF(!generateSelectFromPHI(PHI, target));
+          cond = cast<Instruction>(targetMasks.entryMask);
+        }
+        std::queue<Instruction *> toMove;
+        toMove.push(cond);
+        // Make sure to move all the operands of the condition that are in its
+        // definition block.
+        while (!toMove.empty()) {
+          Instruction *move = toMove.front();
+          toMove.pop();
+          move->moveBefore(*LTag.pureExit, LTag.pureExit->begin());
+          for (Value *op : move->operands()) {
+            if (Instruction *opI = dyn_cast<Instruction>(op)) {
+              if (opI->getParent() == target) {
+                toMove.push(opI);
+              }
+            }
+          }
+        }
+      }
+
+      auto *negCond = BinaryOperator::CreateNot(cond, cond->getName() + ".not",
+                                                divergentLE);
+      BasicBlock *newDivergentLE = BasicBlock::Create(
+          F.getContext(), EB->getName() + ".else", &F, EB->getNextNode());
+      BranchInst::Create(newDivergentLE, target, negCond, divergentLE);
+
+      // The divergentLE block "ought" to exist in the masks map already, but
+      // it is safer to take a local copy and retire `targetMasks`.
+      auto *const targetEntryMask = targetMasks.entryMask;
+
+      // No use of `targetMasks` after this line.
+      auto &divgLEMask = MaskInfos[divergentLE];
+      divgLEMask.exitMasks[target] = targetEntryMask;
+      divgLEMask.exitMasks[newDivergentLE] = negCond;
+
+      LLVM_DEBUG(dbgs() << "\tCreate new virtual divergent loop exit "
+                        << newDivergentLE->getName() << "\n");
+
+      LLVM_DEBUG(
+          dbgs() << "\tVirtual Divergent Loop Exit " << divergentLE->getName()
+                 << ":\n\t\tSuccessor 0: " << target->getName()
+                 << "\n\t\tSuccessor 1: " << newDivergentLE->getName() << "\n");
+
+      auto &newDivergentLETag = DR->getOrCreateTag(newDivergentLE);
+
+      // Set the tags.
+      MaskInfos[newDivergentLE].entryMask = negCond;
+      if (auto *const divLoopTag = DR->getTag(divergentLE).loop) {
+        newDivergentLETag.loop = divLoopTag;
+        newDivergentLETag.loop->loop->addBasicBlockToLoop(newDivergentLE, *LI);
+      }
+
+      DR->setFlag(*newDivergentLE,
+                  static_cast<BlockDivergenceFlag>(
+                      DR->getFlag(*divergentLE) |
+                      BlockDivergenceFlag::eBlockIsVirtualDivergentLoopExit |
+                      BlockDivergenceFlag::eBlockHasDivergentBranch |
+                      BlockDivergenceFlag::eBlockIsDivergent));
+
+      if (BOSCC) {
+        BOSCC->addInRegions(newDivergentLE, LTag.latch);
+      }
+
+      divergentLE = newDivergentLE;
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::generateDivergentLoopResults(
+    LoopTag &LTag) {
+  LLVM_DEBUG(dbgs() << "CFC: GENERATE DIVERGENT LOOP RESULTS FOR "
+                    << LTag.loop->getName() << "\n");
+
+  // First create instructions to save the value of the last iteration ...
+  IRBuilder<> B(LTag.header, LTag.header->getFirstNonPHIIt());
+  for (Value *LLV : LTag.loopLiveValues) {
+    LTag.loopResultPrevs[LLV] =
+        B.CreatePHI(LLV->getType(), 2, LLV->getName() + ".prev");
+    LLVM_DEBUG(dbgs() << "Create result phi: "
+                      << LTag.loopResultPrevs[LLV]->getName() << "\n");
+  }
+
+  // ... then create instructions to retrieve the updated value in the current
+  // iteration.
+  for (Value *LLV : LTag.loopLiveValues) {
+    VECZ_FAIL_IF(!generateDivergentLoopResultUpdates(LLV, LTag));
+  }
+
+  if (BOSCC) {
+    // Clone the loop live values update instructions in the uniform version.
+    if (Loop *uniformL = BOSCC->getLoop(LTag.loop)) {
+      auto *const uniformHeader = DR->getTag(uniformL).header;
+      for (Value *LLV : LTag.loopLiveValues) {
+        BOSCC->addReference(LTag.loopResultUpdates[LLV], LLV);
+        PHINode *LRP = LTag.loopResultPrevs[LLV];
+        // We only need to clone the value of the previous iteration.
+        PHINode *uniformLRP = cast<PHINode>(LRP->clone());
+
+        uniformLRP->setIncomingValue(1, LLV);
+
+        uniformLRP->insertBefore(uniformHeader->getFirstNonPHIIt());
+        BOSCC->createReference(LRP, uniformLRP, true);
+      }
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::generateDivergentLoopResultUpdates(
+    Value *LLV, LoopTag &LTag) {
+  auto &LMask = LoopMasks[LTag.loop];
+  Value *mask = LMask.combinedDivergentExitMask;
+  VECZ_ERROR_IF(!mask, "Divergent loop does not have an exit mask");
+  PHINode *PHI = LTag.loopResultPrevs[LLV];
+  SelectInst *select =
+      SelectInst::Create(mask, LLV, PHI, LLV->getName() + ".update");
+  select->insertBefore(LTag.latch->getTerminator()->getIterator());
+  LTag.loopResultUpdates[LLV] = select;
+
+  // The PHI function of each loop live value has one incoming value from
+  // the preheader if this is the outermost loop, or from the PHI function from
+  // the outer loop otherwise.
+  auto *const ParentL = LTag.loop->getParentLoop();
+  auto *const ParentLT = ParentL ? &DR->getTag(ParentL) : nullptr;
+  if (!ParentLT || !ParentLT->loopResultPrevs.contains(LLV)) {
+    PHI->addIncoming(getDefaultValue(PHI->getType()), LTag.preheader);
+  } else {
+    BasicBlock *LLVDef = cast<Instruction>(LLV)->getParent();
+    if (LLVDef != LTag.header && DR->isReachable(LLVDef, LTag.header)) {
+      PHI->addIncoming(LLV, LTag.preheader);
+    } else {
+      PHI->addIncoming(ParentLT->loopResultPrevs[LLV], LTag.preheader);
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "Create result update: " << *select << "\n");
+
+  // The second incoming value is the updated value from the latch.
+  PHI->addIncoming(select, LTag.latch);
+
+  LLVM_DEBUG(dbgs() << "Update result phi: " << *PHI << "\n");
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::blendDivergentLoopLiveValues(
+    LoopTag &LTag, const SmallVectorImpl<BasicBlock *> &exitBlocks) {
+  LLVM_DEBUG(dbgs() << "CFC: BLEND DIVERGENT LOOP LIVE VALUES FOR "
+                    << LTag.loop->getName() << "\n");
+
+  // Get the exit blocks that were not removed.
+  SmallVector<BasicBlock *, 2> optionalExitBlocks;
+  LTag.loop->getExitBlocks(optionalExitBlocks);
+  // Remove the pure exit from it.
+  for (auto it = optionalExitBlocks.begin(); it != optionalExitBlocks.end();
+       ++it) {
+    if (*it == LTag.pureExit) {
+      (void)optionalExitBlocks.erase(it);
+      break;
+    }
+  }
+
+  for (Value *LLV : LTag.loopLiveValues) {
+    BasicBlock *LLVDef = cast<Instruction>(LLV)->getParent();
+    PHINode *prev = LTag.loopResultPrevs[LLV];
+    SelectInst *update = LTag.loopResultUpdates[LLV];
+
+    VECZ_ERROR_IF(
+        !update,
+        "Divergent loop live value does not have an update instruction");
+    VECZ_ERROR_IF(
+        !prev, "Divergent loop live value does not have a persist instruction");
+
+    PHINode *blend =
+        PHINode::Create(LLV->getType(), 2, LLV->getName() + ".blend");
+    blend->insertBefore(LTag.pureExit->begin());
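+    // Illustrative sketch (hypothetical IR) of how the three values created
+    // for a loop live value %v cooperate:
+    //   header:    %v.prev   = phi [ <default>, %preheader ],
+    //                              [ %v.update, %latch ]
+    //   latch:     %v.update = select i1 %combined_divergent_exit_mask,
+    //                                 %v, %v.prev
+    //   pure exit: %v.blend  = phi [ ..., optional exits ],
+    //                              [ %v.update, %latch ]
+    // so each lane observes the value it had on the iteration it left the
+    // loop.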
+    // Replace all uses outside the loop.
+    VECZ_FAIL_IF(
+        !replaceUsesOutsideDivergentLoop(LTag, LLV, blend, optionalExitBlocks));
+
+    for (BasicBlock *EB : exitBlocks) {
+      if (DR->isOptional(*EB)) {
+        if (!DR->isReachable(LLVDef, EB)) {
+          blend->addIncoming(prev, EB);
+        } else {
+          blend->addIncoming(LLV, EB);
+        }
+      }
+    }
+    blend->addIncoming(update, LTag.latch);
+
+    if (BOSCC) {
+      BOSCC->addReference(blend, update);
+    }
+
+    LLVM_DEBUG(dbgs() << "Create blend " << *blend << " for LLV " << *LLV
+                      << "\n");
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::blendDivergentLoopExitMasks(
+    LoopTag &LTag, const SmallVectorImpl<Loop::Edge> &exitEdges,
+    const SmallVectorImpl<BasicBlock *> &exitBlocks) {
+  LLVM_DEBUG(dbgs() << "CFC: BLEND DIVERGENT LOOP EXIT MASKS FOR "
+                    << LTag.loop->getName() << "\n");
+
+  // Get the exit blocks that were not removed.
+  SmallVector<BasicBlock *, 2> optionalExitBlocks;
+  LTag.loop->getExitBlocks(optionalExitBlocks);
+  // Remove the pure exit from it.
+  for (auto it = optionalExitBlocks.begin(); it != optionalExitBlocks.end();
+       ++it) {
+    if (*it == LTag.pureExit) {
+      (void)optionalExitBlocks.erase(it);
+      break;
+    }
+  }
+
+  auto &LMask = LoopMasks[LTag.loop];
+  for (const Loop::Edge &EE : exitEdges) {
+    BasicBlock *exitingBlock = const_cast<BasicBlock *>(EE.first);
+    BasicBlock *exitBlock = const_cast<BasicBlock *>(EE.second);
+
+    if (DR->isDivergent(*exitBlock)) {
+      PHINode *prev = LMask.persistedDivergentExitMasks[exitingBlock];
+      BinaryOperator *update =
+          LMask.updatedPersistedDivergentExitMasks[exitingBlock];
+
+      VECZ_ERROR_IF(
+          !update,
+          "Divergent loop exit mask does not have an update instruction");
+      VECZ_ERROR_IF(
+          !prev,
+          "Divergent loop exit mask does not have a persist instruction");
+
+      PHINode *blend =
+          PHINode::Create(prev->getType(), 2, prev->getName() + ".blend");
+      blend->insertBefore(LTag.pureExit->begin());
+
+      // Replace all uses outside the loop.
+      VECZ_FAIL_IF(!replaceUsesOutsideDivergentLoop(LTag, update, blend,
+                                                    optionalExitBlocks));
+
+      for (BasicBlock *EB : exitBlocks) {
+        if (DR->isOptional(*EB)) {
+          blend->addIncoming(prev, EB);
+        }
+      }
+      blend->addIncoming(update, LTag.latch);
+
+      if (BOSCC) {
+        BOSCC->addReference(blend, update);
+      }
+
+      LLVM_DEBUG(dbgs() << "Create blend " << *blend << " for loop exit mask "
+                        << *update << "\n");
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::replaceUsesOutsideDivergentLoop(
+    LoopTag &LTag, Value *from, Value *to,
+    const SmallVectorImpl<BasicBlock *> &exitBlocks) {
+  for (auto it = from->use_begin(); it != from->use_end();) {
+    Use &U = *it++;
+    Instruction *user = cast<Instruction>(U.getUser());
+    BasicBlock *blockUse = user->getParent();
+    // Don't replace uses within the loop.
+    if (LTag.loop->contains(blockUse) ||
+        // If the use is in a loop exit block, then 'to' can't reach it.
+        std::count(exitBlocks.begin(), exitBlocks.end(), blockUse)) {
+      continue;
+    }
+    // If the use is in a pure exit block of a divergent loop, don't replace
+    // the use if it comes from an optional exit block of that loop.
+    if (PHINode *PHI = dyn_cast<PHINode>(user)) {
+      const auto *const exitedLoop = DR->getTag(blockUse).outermostExitedLoop;
+      if (exitedLoop && exitedLoop->pureExit == blockUse) {
+        BasicBlock *incoming = PHI->getIncomingBlock(U);
+        if (!exitedLoop->loop->contains(incoming)) {
+          continue;
+        }
+      }
+    }
+    U.set(to);
+    LLVM_DEBUG(dbgs() << "Replace loop value " << *from << " with blend "
+                      << to->getName() << "\n");
+  }
+
+  return true;
+}
+
+namespace {
+using DenseDeferralMap =
+    SmallDenseMap<BasicBlock *, SmallPtrSet<BasicBlock *, 4>, 32>;
+
+void addDeferral(BasicBlock *newSrc, BasicBlock *deferred,
+                 DenseDeferralMap &deferrals) {
+  auto newSrcIt = deferrals.find(newSrc);
+  if (newSrcIt != deferrals.end()) {
+    // If the deferral edge already exists, there is no need to add it again.
+    if (newSrcIt->second.contains(deferred)) {
+      LLVM_DEBUG(dbgs() << "\t\tDeferral (" << newSrc->getName() << ", "
+                        << deferred->getName() << ") already exists\n");
+      return;
+    }
+  }
+  auto deferredIt = deferrals.find(deferred);
+  if (deferredIt != deferrals.end()) {
+    // If the deferral edge already exists the other way around, we don't want
+    // to add it the opposite way, at the risk of creating an infinite loop in
+    // the CFG.
+    if (deferredIt->second.contains(newSrc)) {
+      LLVM_DEBUG(dbgs() << "\t\tOpposite deferral (" << deferred->getName()
+                        << ", " << newSrc->getName() << ") already exists\n");
+      return;
+    }
+  }
+
+  deferrals[newSrc].insert(deferred);
+
+  LLVM_DEBUG(dbgs() << "\t\tAdd deferral (" << newSrc->getName() << ", "
+                    << deferred->getName() << ")\n");
+}
+
+void removeDeferrals(BasicBlock *src, DenseDeferralMap &deferrals) {
+  auto deferredIt = deferrals.find(src);
+  if (deferredIt != deferrals.end()) {
+#ifndef NDEBUG
+    for (BasicBlock *deferred : deferredIt->second) {
+      LLVM_DEBUG(dbgs() << "\tRemove deferral (" << src->getName() << ", "
+                        << deferred->getName() << ")\n");
+    }
+#endif
+    deferrals.erase(deferredIt);
+  }
+}
+}  // namespace
+
+bool ControlFlowConversionState::Impl::computeNewTargets(Linearization &lin) {
+  // The entry block cannot be targeted.
+  const auto &DCBI = DR->getBlockOrdering();
+  const size_t numBlocks = DCBI.size();
+  DenseSet<BasicBlock *> targets(numBlocks - 1);
+  for (const auto &tag : make_range(std::next(DCBI.begin()), DCBI.end())) {
+    targets.insert(tag.BB);
+  }
+
+  DenseDeferralMap deferrals;
+
+  LLVM_DEBUG(dbgs() << "CFC: COMPUTE NEW TARGETS\n");
+
+  // For each basic block, select its new targets based on previous blocks that
+  // have been deferred because of divergence, and their current successors.
+  // Select the target that has the lowest DCBI, i.e. the block whose dominance
+  // encloses or is equal to the other available targets.
+  //
+  // If we assign a target different from the current successor of the block,
+  // we must add a deferral edge from the selected target to the current
+  // successor (that got replaced by the selected target) such that an edge
+  // from the current block to the replaced successor exists in the modified
+  // graph.
+  lin.infos.reserve(numBlocks);
+  lin.data.reserve(numBlocks);
+  for (size_t BBIndex = 0; BBIndex != numBlocks; ++BBIndex) {
+    const auto &BBTag = DR->getBlockTag(BBIndex);
+    BasicBlock *BB = BBTag.BB;
+    lin.beginBlock(BB);
+
+    LLVM_DEBUG(dbgs() << "BB " << BB->getName() << ":\n");
+
+    // Retrieve the rewire list for 'BB'.
+    SmallPtrSet<BasicBlock *, 4> availableTargets;
+    {
+      auto deferredIt = deferrals.find(BB);
+      if (deferredIt != deferrals.end()) {
+        for (BasicBlock *deferred : deferredIt->second) {
+          availableTargets.insert(deferred);
+        }
+      }
+    }
+
+    if (!DR->isDivCausing(*BB) ||
+        // Loop latches must have their branch retained.
+        (BBTag.loop && BBTag.loop->latch == BB)) {
+      // If 'BB' ends in a uniform branch.
+      LLVM_DEBUG(dbgs() << " uniform branch\n");
+
+      // Keep track of what blocks we have targeted in case we have a deferred
+      // block that is a current successor (which could lead to choosing the
+      // same block twice!).
+      SmallPtrSet<BasicBlock *, 2> targeted;
+
+      for (BasicBlock *succ : successors(BB)) {
+        size_t nextIndex = ~size_t(0);
+        for (BasicBlock *deferred : availableTargets) {
+          if (targeted.contains(deferred)) {
+            continue;
+          }
+
+          const size_t deferredIndex = DR->getTagIndex(deferred);
+          if (nextIndex == ~size_t(0) || nextIndex > deferredIndex) {
+            nextIndex = deferredIndex;
+          }
+        }
+
+        const size_t succIndex = DR->getTagIndex(succ);
+        if (!targeted.contains(succ)) {
+          // If we have not found a target or there is a better one.
+          if (nextIndex == ~size_t(0) || nextIndex > succIndex) {
+            nextIndex = succIndex;
+          }
+        }
+
+        VECZ_ERROR_IF(nextIndex == ~size_t(0), "No target was found");
+
+        auto *const next = DR->getBlockTag(nextIndex).BB;
+        lin.push(next);
+        targeted.insert(next);
+
+        LLVM_DEBUG(dbgs() << "\tsuccessor " << lin.currentSize() - 1 << ": "
+                          << next->getName() << "\n");
+
+        // Virtually remove backedges.
+        if (!BBTag.isLoopBackEdge(next)) {
+          targets.erase(next);
+          // Don't add deferred edges to blocks already processed.
+          if (BBIndex < nextIndex) {
+            auto S = availableTargets;
+            S.insert(succ);
+
+            for (BasicBlock *deferred : S) {
+              if (deferred != next) {
+                addDeferral(next, deferred, deferrals);
+              }
+            }
+          }
+        }
+      }
+    } else {
+      LLVM_DEBUG(dbgs() << " divergent branch\n");
+
+      for (BasicBlock *succ : successors(BB)) {
+        availableTargets.insert(succ);
+      }
+
+      size_t nextIndex = ~size_t(0);
+      for (BasicBlock *deferred : availableTargets) {
+        const size_t deferredIndex = DR->getTagIndex(deferred);
+        if (nextIndex == ~size_t(0) || nextIndex > deferredIndex) {
+          LLVM_DEBUG(dbgs()
+                     << (nextIndex == ~size_t(0)
+                             ? "\tchoosing successor: "
+                             : "\tpreferring instead successor: ")
+                     << DR->getBlockTag(deferredIndex).BB->getName() << "\n");
+          nextIndex = deferredIndex;
+        }
+      }
+
+      VECZ_ERROR_IF(nextIndex == ~size_t(0), "No target was found");
+
+      BasicBlock *const next = DR->getBlockTag(nextIndex).BB;
+      lin.push(next);
+
+      // The last eBlockIsVirtualDivergentLoopExit introduced from an optional
+      // loop exit wasn't given a block to branch to, it is thus empty.
+      if (DR->hasFlag(*BB,
+                      BlockDivergenceFlag::eBlockIsVirtualDivergentLoopExit) &&
+          !BB->getTerminator()) {
+        BranchInst::Create(next, BB);
+      }
+
+      LLVM_DEBUG(dbgs() << "\tsuccessor 0: " << next->getName() << "\n");
+
+      // Virtually remove backedges.
+      if (!BBTag.isLoopBackEdge(next)) {
+        targets.erase(next);
+        // Don't add deferred edges to blocks already processed.
+        if (BBIndex < nextIndex) {
+          for (BasicBlock *deferred : availableTargets) {
+            if (deferred != next) {
+              addDeferral(next, deferred, deferrals);
+            }
+          }
+        }
+      }
+    }
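+    // Worked example (hypothetical diamond CFG, for illustration only): given
+    // A -> {B, C}, B -> D, C -> D with a divergent branch in A and DCBI order
+    // A < B < C < D, processing A picks B (lowest index) and defers C, adding
+    // the deferral (B, C); processing B then targets C instead of D and
+    // defers D; finally C targets D, yielding the linearized chain
+    // A -> B -> C -> D.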
+    // Remove the deferrals that involved 'BB'.
+    removeDeferrals(BB, deferrals);
+
+    // clang-format off
+    LLVM_DEBUG(
+      dbgs() << " deferral list:";
+      if (deferrals.empty()) {
+        dbgs() << " (empty)\n";
+      } else {
+        dbgs() << "\n";
+        for (const auto &Pair : deferrals) {
+          for (BasicBlock *Deferred : Pair.second) {
+            LLVM_DEBUG(dbgs() << "\t(" << Pair.first->getName() << ", "
+                              << Deferred->getName() << ")\n");
+          }
+        }
+      }
+    );
+    // clang-format on
+  }
+
+  // There shouldn't remain any deferral edges.
+  VECZ_ERROR_IF(!deferrals.empty(), "Deferrals remain");
+  // All blocks should have been targeted at least once.
+  VECZ_ERROR_IF(!targets.empty(), "Not all blocks have been rewired");
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::linearizeCFG() {
+  LLVM_DEBUG(dbgs() << "CFC: LINEARIZE\n");
+
+  // Compute the new targets ...
+  Linearization lin;
+  VECZ_FAIL_IF(!computeNewTargets(lin));
+
+  auto dataIt = lin.data.begin();
+  for (const auto &newTargetInfo : lin.infos) {
+    BasicBlock *BB = newTargetInfo.BB;
+
+    // Get the new target info for this block.
+    const auto numTargets = newTargetInfo.numTargets;
+    const auto newTargets = dataIt;
+    dataIt += numTargets;
+
+    LLVM_DEBUG(dbgs() << BB->getName() << ":\n");
+
+    auto *T = BB->getTerminator();
+
+    // If we have set a new target that is already a successor of BB, but we
+    // have not set it at the same successor's position, then do it!
+    // This avoids having to update the phi nodes.
+    SmallDenseMap<BasicBlock *, unsigned, 4> successors;
+    for (unsigned idx = 0; idx < T->getNumSuccessors(); ++idx) {
+      BasicBlock *succ = T->getSuccessor(idx);
+      successors.try_emplace(succ, idx);
+    }
+    for (unsigned idx = 0; idx < numTargets; ++idx) {
+      auto succIt = successors.find(newTargets[idx]);
+      // If we have a successor set as a new target ...
+      if (succIt != successors.end()) {
+        // ... but we have not set it at the same position ...
+        if (succIt->second != idx && succIt->second < numTargets) {
+          // ... then swap both blocks.
+          std::swap(newTargets[idx], newTargets[succIt->second]);
+        }
+      }
+    }
+
+    // Now iterate over the new targets to set them as successors of BB if
+    // they were not already.
+    unsigned idx = 0;
+    for (; idx < numTargets; ++idx) {
+      BasicBlock *const newTarget = newTargets[idx];
+
+      VECZ_ERROR_IF(
+          idx >= T->getNumSuccessors(),
+          "BasicBlock should not have more successors after linearization");
+
+      BasicBlock *oldSucc = T->getSuccessor(idx);
+
+      LLVM_DEBUG(dbgs() << "\tOld successor: " << oldSucc->getName() << "\n");
+
+      // If we have set the current successor to be the new target, there is
+      // nothing to do.
+      if (oldSucc == newTarget) {
+        LLVM_DEBUG(dbgs() << "\tUntouched successor: " << oldSucc->getName()
+                          << "\n");
+        continue;
+      }
+
+      // Uniform blocks should not be rewired.
+      VECZ_ERROR_IF(DR->isUniform(*oldSucc),
+                    "Uniform BasicBlock should not have its edge modified");
+
+      // Otherwise update the successor.
+      T->setSuccessor(idx, newTarget);
+      LLVM_DEBUG(dbgs() << "\tAdd successor: " << newTarget->getName()
+                        << "\n");
+    }
+
+    // We have either processed a divergent branch (with only one successor),
+    // or we have processed a uniform branch (with all its successors
+    // untouched).
+    VECZ_ERROR_IF(idx != 1 && idx != T->getNumSuccessors(),
+                  "Number of processed new targets is undefined");
+
+    // Finally, clear the remaining successors that have not been set as new
+    // targets.
+    if (idx != T->getNumSuccessors()) {
+      for (; idx < T->getNumSuccessors(); ++idx) {
+        BasicBlock *succ = T->getSuccessor(idx);
+
+        // Uniform blocks should not be rewired.
+        VECZ_ERROR_IF(DR->isUniform(*succ),
+                      "Uniform BasicBlock should not have its edge modified");
+
+        LLVM_DEBUG(dbgs() << "\tRemove successor: " << succ->getName()
+                          << "\n");
+      }
+
+      auto *newT = BranchInst::Create(T->getSuccessor(0));
+      newT->insertBefore(T->getIterator());
+
+      updateMaps(T, newT);
+
+      IRCleanup::deleteInstructionNow(T);
+    }
+  }
+  assert(dataIt == lin.data.end() &&
+         "Failed to reach end of Linearization data vector!");
+
+  // Updating on-the-fly the DomTree and PostDomTree whilst rewiring the CFG
+  // is extremely tedious, and may not even be possible due to all the invalid
+  // states that happen during it ... Therefore, we have no choice but to
+  // recalculate the DomTree and PostDomTree from scratch.
+  DT->recalculate(F);
+  PDT->recalculate(F);
+  VECZ_ERROR_IF(!DT->verify(), "DominatorTree incorrectly updated");
+  VECZ_ERROR_IF(!PDT->verify(), "PostDominatorTree incorrectly updated");
+  VECZ_FAIL_IF(!computeBlockOrdering());
+  RC->clear();
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::generateSelects() {
+  LLVM_DEBUG(dbgs() << "CFC: GENERATE SELECTS FROM PHI NODES\n");
+  // For each basic block that has only one predecessor and phi nodes, we need
+  // to either blend those phi nodes into select instructions or try to move
+  // the phi nodes up the chain of linearized path.
+  for (const auto &BTag : DR->getBlockOrdering()) {
+    BasicBlock *B = BTag.BB;
+    if (B->hasNPredecessors(1) || DR->isBlend(*B)) {
+      if (PHINode *PHI = dyn_cast<PHINode>(&B->front())) {
+        LLVM_DEBUG(dbgs() << B->getName() << ":\n");
+        const SmallPtrSet<BasicBlock *, 4> incomings(PHI->block_begin(),
+                                                     PHI->block_end());
+        BasicBlock *cur = B;
+        while (cur->hasNPredecessors(1) && !incomings.empty()) {
+          cur = cur->getSinglePredecessor();
+          if (incomings.contains(cur)) {
+            break;
+          }
+        }
+        // Only move the phis up the chain of linearized path:
+        //   - if the block whose phis we are processing is not a blend block
+        //     (because the latter do need to have its phis transformed into
+        //     selects),
+        //   - if the last block of the chain is not an incoming block, and
+        //   - if the last block of the chain is a convergence block.
+        if (!DR->isBlend(*B) && !incomings.contains(cur) &&
+            cur->hasNPredecessorsOrMore(2) && PHI->getNumIncomingValues() > 1) {
+          // All PHI nodes have the same incoming blocks so we update the exit
+          // masks of the incoming blocks of the first PHI node here.
+          for (unsigned i = 0; i < PHI->getNumIncomingValues(); ++i) {
+            auto &maskInfo = MaskInfos[PHI->getIncomingBlock(i)];
+            Value *&exitMask = maskInfo.exitMasks[cur];
+
+            if (!exitMask) {
+              exitMask = maskInfo.exitMasks[B];
+            }
+          }
+
+          while ((PHI = dyn_cast<PHINode>(&B->front()))) {
+            LLVM_DEBUG(dbgs() << "\tMove " << *PHI << " in " << cur->getName()
+                              << "\n");
+            PHI->moveBefore(*cur, cur->begin());
+          }
+        } else {
+          while ((PHI = dyn_cast<PHINode>(&B->front()))) {
+            VECZ_FAIL_IF(!generateSelectFromPHI(PHI, B));
+          }
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::generateSelectFromPHI(PHINode *PHI,
+                                                             BasicBlock *B) {
+  const unsigned phiNumIncVals = PHI->getNumIncomingValues();
+  VECZ_ERROR_IF(phiNumIncVals == 0, "PHINode does not have any incoming value");
+
+  Value *newVal = nullptr;
+  auto &maskInfo = MaskInfos[B];
+  if (PHI == maskInfo.entryMask) {
+    // The entry mask of a blend value should be the disjunction of the
+    // incoming masks, so change it.
+    maskInfo.entryMask = copyEntryMask(PHI->getIncomingValue(0), *B);
+    for (unsigned i = 1; i < phiNumIncVals; i++) {
+      Value *V = PHI->getIncomingValue(i);
+      auto InsertPt = std::next(maskInfo.entryMask->getIterator());
+      maskInfo.entryMask = BinaryOperator::CreateOr(
+          maskInfo.entryMask, V, B->getName() + ".entry_mask");
+      maskInfo.entryMask->insertBefore(InsertPt);
+    }
+    newVal = maskInfo.entryMask;
+  } else {
+    Value *select = PHI->getIncomingValue(0);
+    for (unsigned i = 1; i < phiNumIncVals; i++) {
+      Value *V = PHI->getIncomingValue(i);
+      BasicBlock *PHIB = PHI->getIncomingBlock(i);
+      Value *cond = MaskInfos[PHIB].exitMasks[B];
+      VECZ_ERROR_IF(!cond, "Exit mask does not exist");
+
+      auto InsertPt = B->getFirstInsertionPt();
+      if (i == 1) {
+        if (Instruction *condI = dyn_cast<Instruction>(cond)) {
+          BasicBlock *maskParent = condI->getParent();
+          if (maskParent == B) {
+            InsertPt = std::next(condI->getIterator());
+          }
+        }
+      } else {
+        InsertPt = std::next(cast<Instruction>(select)->getIterator());
+      }
+      auto *selectInst =
+          SelectInst::Create(cond, V, select, PHI->getName() + ".blend");
+      selectInst->insertBefore(InsertPt);
+      select = selectInst;
+    }
+    newVal = select;
+  }
+
+  LLVM_DEBUG(dbgs() << "\tReplace " << *PHI << " with " << *newVal << "\n");
+
+  updateMaps(PHI, newVal);
+
+  PHI->replaceAllUsesWith(newVal);
+
+  IRCleanup::deleteInstructionNow(PHI);
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::repairSSA() {
+  // Check that all the blocks have a unique position.
+  VECZ_FAIL_IF(!checkBlocksOrder());
+  RC->update(F);
+
+  VECZ_FAIL_IF(!updatePHIsIncomings());
+  VECZ_FAIL_IF(!blendInstructions());
+
+  VECZ_ERROR_IF(!DT->verify(), "DominatorTree incorrectly updated");
+  VECZ_ERROR_IF(!PDT->verify(), "PostDominatorTree incorrectly updated");
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::updatePHIsIncomings() {
+  // We need to update the incoming blocks of phi nodes whose predecessors may
+  // have changed since we have not changed the phi nodes during the rewiring.
+  for (const auto &BBTag : DR->getBlockOrdering()) {
+    BasicBlock *BB = BBTag.BB;
+    const SmallPtrSet<BasicBlock *, 4> preds(pred_begin(BB), pred_end(BB));
+    for (auto it = BB->begin(); it != BB->end();) {
+      Instruction &I = *it++;
+      PHINode *PHI = dyn_cast<PHINode>(&I);
+      if (!PHI) {
+        break;
+      }
+
+      const SmallPtrSet<BasicBlock *, 4> incomings(PHI->block_begin(),
+                                                   PHI->block_end());
+
+      // If no predecessor of `BB` is an incoming block of its PHI Node, then
+      // completely transform the PHI Node into multiple select instructions.
+      bool intersect = false;
+      for (BasicBlock *inc : incomings) {
+        for (BasicBlock *pred : preds) {
+          if (pred == inc) {
+            intersect = true;
+            break;
+          }
+        }
+        if (intersect) {
+          break;
+        }
+      }
+      if (!intersect) {
+        VECZ_FAIL_IF(!generateSelectFromPHI(PHI, BB));
+        continue;
+      }
+      // Otherwise, only transform the incoming blocks of predecessors that got
+      // linearized into selects.
+      //
+      // Instruction that will combine the phi node and the select instructions
+      // created from it if some incoming blocks are no longer predecessors.
+      Instruction *newBlend = nullptr;
+      const BasicBlock::iterator InsertPt = getInsertionPt(*BB);
+
+      auto &maskInfo = MaskInfos[BB];
+      const bool isEntryMask = PHI == maskInfo.entryMask;
+      for (unsigned idx = 0; idx < PHI->getNumIncomingValues(); ++idx) {
+        BasicBlock *incoming = PHI->getIncomingBlock(idx);
+        if (preds.contains(incoming)) {
+          continue;
+        }
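+        // Illustrative sketch (hypothetical IR): if %p2 is no longer a
+        // predecessor after linearization, a node such as
+        //   %x = phi i32 [ %a, %p1 ], [ %b, %p2 ]
+        // is rewritten below into a masked blend along the lines of
+        //   %x.blend = select i1 %p2.exit_mask, i32 %b, i32 %x
+        // (or an OR of the incoming masks when %x is the block's entry mask).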
+        // If the incoming block is no longer a predecessor, transform it into
+        // a select instruction, or a binary OR if it is an entry mask.
+        Value *V = PHI->getIncomingValue(idx);
+
+        if (isEntryMask) {
+          // The entry mask of a blend value should be the disjunction of
+          // the incoming masks, so change it.
+          if (!newBlend) {
+            newBlend =
+                BinaryOperator::CreateOr(PHI, V, BB->getName() + ".entry_mask");
+          } else {
+            newBlend = BinaryOperator::CreateOr(newBlend, V,
+                                                BB->getName() + ".entry_mask");
+          }
+          maskInfo.entryMask = newBlend;
+        } else {
+          Value *cond = MaskInfos[incoming].exitMasks[BB];
+          VECZ_ERROR_IF(!cond, "Exit mask does not exist");
+          if (!newBlend) {
+            newBlend =
+                SelectInst::Create(cond, V, PHI, PHI->getName() + ".blend");
+          } else {
+            newBlend = SelectInst::Create(cond, V, newBlend,
+                                          PHI->getName() + ".blend");
+          }
+        }
+        newBlend->insertBefore(InsertPt);
+        PHI->removeIncomingValue(idx--);
+      }
+
+      // If we have created select instructions from `PHI`, update the users
+      // of the latter.
+      if (newBlend) {
+        VECZ_FAIL_IF(!replaceReachableUses(*RC, PHI, newBlend, BB));
+        updateMaps(PHI, newBlend);
+      }
+
+      // And add any new incoming blocks that do not replace any previous.
+      for (BasicBlock *pred : preds) {
+        if (!incomings.contains(pred)) {
+          PHI->addIncoming(getDefaultValue(PHI->getType()), pred);
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+bool ControlFlowConversionState::Impl::blendInstructions() {
+  LLVM_DEBUG(dbgs() << "CFC: BLEND INSTRUCTIONS\n");
+
+  auto addSuccessors = [this](const BasicBlockTag &BTag, BlockQueue &queue,
+                              DenseSet<BasicBlock *> &visited,
+                              const BasicBlockTag &dstTag) {
+    for (BasicBlock *succ : successors(BTag.BB)) {
+      // Allow latch if 'succ' belongs in 'dst's loop and 'dst' is the header
+      // of that loop.
+      const bool allowLatch =
+          dstTag.isLoopHeader() && dstTag.loop->loop->contains(succ);
+
+      if (!allowLatch && BTag.isLoopBackEdge(succ)) {
+        continue;
+      }
+
+      if (allowLatch) {
+        // The fast Reachability calculation can't follow back edges yet.
+        if (!DR->isReachable(succ, dstTag.BB, allowLatch)) {
+          continue;
+        }
+      } else if (!RC->isReachable(succ, dstTag.BB)) {
+        continue;
+      }
+
+      if (visited.insert(succ).second) {
+        LLVM_DEBUG(dbgs() << "\t\t\tInsert " << succ->getName()
+                          << " in the queue\n");
+        queue.push(DR->getTagIndex(succ));
+      }
+    }
+
+    // clang-format off
+    LLVM_DEBUG(
+      dbgs() << "\t\t\tWorklist: [";
+      if (!queue.empty()) {
+        dbgs() << DR->getBlockTag(*queue.begin()).BB->getName();
+        for (auto It = std::next(queue.begin()); It != queue.end(); ++It) {
+          dbgs() << ", " << DR->getBlockTag(*It).BB->getName();
+        }
+        dbgs() << "]\n";
+      }
+    );
+    // clang-format on
+  };
+
+  DenseMap<Instruction *, DenseMap<BasicBlock *, Value *>> blendMap;
+
+  auto getValueOfAt = [&blendMap](Instruction *opDef,
+                                  BasicBlock *B) -> Value * {
+    auto it = blendMap.find(opDef);
+    if (it != blendMap.end()) {
+      auto it2 = it->second.find(B);
+      if (it2 != it->second.end()) {
+        return it2->second;
+      }
+    }
+    return nullptr;
+  };
+
+  auto createBlend = [this, &blendMap, &getValueOfAt](
+                         BasicBlock *B, Instruction *opDef) -> Value * {
+    if (Value *V = getValueOfAt(opDef, B)) {
+      return V;
+    }
+
+    Type *T = opDef->getType();
+    const unsigned numPreds = std::distance(pred_begin(B), pred_end(B));
+    Value *blend = nullptr;
+    PHINode *PHI = PHINode::Create(T, numPreds, opDef->getName() + ".merge");
+    PHI->insertBefore(B->begin());
+
+    const auto *const LTag = DR->getTag(B).loop;
+    bool hasVisitedPred = false;
+    for (BasicBlock *pred : predecessors(B)) {
+      Value *incomingV = nullptr;
+      if (Value *predV = getValueOfAt(opDef, pred)) {
+        incomingV = predV;
+        hasVisitedPred = true;
+      } else {
+        // be the one coming from the preheader if that value dominates the
+        // latch and the latch has no definition of the value we are trying
+        // to blend.
+        if (DR->getTag(pred).isLoopBackEdge(B)) {
+          if (Value *preheaderV = getValueOfAt(opDef, LTag->preheader)) {
+            if (auto *instV = dyn_cast<Instruction>(preheaderV)) {
+              if (DT->dominates(instV->getParent(), pred)) {
+                incomingV = preheaderV;
+              }
+            } else {
+              incomingV = preheaderV;
+            }
+          }
+        }
+      }
+
+      if (!incomingV) {
+        incomingV = getDefaultValue(T);
+      }
+      PHI->addIncoming(incomingV, pred);
+    }
+    if (!hasVisitedPred) {
+      IRCleanup::deleteInstructionNow(PHI);
+      return nullptr;
+    }
+
+    if (PHI->hasConstantValue()) {
+      blend = PHI->getIncomingValue(0);
+      IRCleanup::deleteInstructionNow(PHI);
+    } else {
+      blend = PHI;
+      blends.insert(PHI);
+    }
+
+    blendMap[opDef][B] = blend;
+
+    return blend;
+  };
+
+  // Manually set the entry point of persisted loop live values and persisted
+  // loop exit masks.
+  for (auto *const LTag : DR->getLoopOrdering()) {
+    auto *const header = LTag->header;
+    for (Value *LLV : LTag->loopLiveValues) {
+      Instruction *LLVI = cast<Instruction>(LLV);
+      if (LLVI->getParent() != header) {
+        blendMap[LLVI][header] = LTag->loopResultPrevs[LLV];
+      }
+    }
+
+    auto &LMask = LoopMasks[LTag->loop];
+    for (auto &UPREM : LMask.updatedPersistedDivergentExitMasks) {
+      if (UPREM.first != header) {
+        blendMap[UPREM.second][header] =
+            LMask.persistedDivergentExitMasks[UPREM.first];
+      }
+    }
+  }
+
+  SmallPtrSet<Value *, 16> spareBlends;
+
+  for (const auto &dstTag : DR->getBlockOrdering()) {
+    BasicBlock *dst = dstTag.BB;
+    LLVM_DEBUG(dbgs() << "Blending instructions used in " << dst->getName()
+                      << ":\n");
+    for (Instruction &I : *dst) {
+      // Don't try to blend a blend value.
+      if (blends.contains(&I)) {
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "\tInstruction " << I << ":\n");
+
+      for (unsigned idx = 0; idx < I.getNumOperands(); ++idx) {
+        Instruction *opDef = dyn_cast<Instruction>(I.getOperand(idx));
+        if (!opDef) {
+          continue;
+        }
+
+        BasicBlock *src = opDef->getParent();
+
+        LLVM_DEBUG(dbgs() << "\t\tOperand " << *opDef << "\n\t\tdefined in "
+                          << src->getName() << ":\n");
+
+        blendMap[opDef][src] = opDef;
+
+        // There exist two possible ways to exit the blending early:
+        // - the current block dominates 'dst';
+        // - the current block dominates the incoming block of the phi node
+        //   'I' we are blending in 'dst'.
+        //
+        // 'dst' can freely access the values of 'src'.
+        if (DT->dominates(src, dst)) {
+          LLVM_DEBUG(dbgs() << "\t\t\tDefinition dominates use\n");
+          continue;
+        }
+        // The incoming block of this phi node is dominated by the definition
+        // block of the incoming value.
+        BasicBlock *incoming = nullptr;
+        if (PHINode *PHI = dyn_cast<PHINode>(&I)) {
+          incoming = PHI->getIncomingBlock(idx);
+          if (DT->dominates(src, incoming)) {
+            LLVM_DEBUG(dbgs() << "\t\t\tDefinition dominates use\n");
+            continue;
+          }
+        }
+
+        DenseSet<BasicBlock *> visited;
+        BlockQueue queue(*DR);
+
+        const auto &srcTag = DR->getTag(src);
+
+        addSuccessors(srcTag, queue, visited, dstTag);
+
+        auto *const srcLoop = srcTag.loop;
+        if (srcLoop && srcLoop->isLoopDivergent()) {
+          if (dst != srcLoop->header) {
+            auto &srcMasks = LoopMasks[srcLoop->loop];
+            const auto &headerTag = DR->getTag(srcLoop->header);
+
+            // If 'opDef' is an updated loop exit mask, set an entry point in
+            // the loop header.
+ auto UPREMIt = + srcMasks.updatedPersistedDivergentExitMasks.find(src); + if (UPREMIt != srcMasks.updatedPersistedDivergentExitMasks.end()) { + if (UPREMIt->second == opDef) { + LLVM_DEBUG(dbgs() + << "\t\t\tFound persisted value of the operand: " + << srcMasks.persistedDivergentExitMasks[src] + << "\n"); + addSuccessors(headerTag, queue, visited, dstTag); + } + } + // If 'opDef' is a loop live value, set an entry point in the loop + // header. + if (srcLoop->loopLiveValues.contains(opDef)) { + LLVM_DEBUG(dbgs() + << "\t\t\tFound persisted value of the operand: " + << srcLoop->loopResultPrevs[opDef] << "\n"); + addSuccessors(headerTag, queue, visited, dstTag); + } + } + } + + while (!queue.empty()) { + const BasicBlockTag &curTag = queue.pop(); + BasicBlock *const cur = curTag.BB; + + LLVM_DEBUG(dbgs() << "\t\t\tPopping " << cur->getName() << "\n"); + + // We have reached 'dst' without finding a block that dominates it, + // we need to create a phi node if the user is not one, and replace + // the operand with the last blended value. + if (cur == dst) { + LLVM_DEBUG(dbgs() << "\t\t\tReached destination: "); + VECZ_ERROR_IF(!queue.empty(), "Blocks remain in the queue"); + if (PHINode *PHI = dyn_cast(&I)) { + BasicBlock *incoming = PHI->getIncomingBlock(idx); + Value *V = getValueOfAt(opDef, incoming); + VECZ_ERROR_IF(!V, "No blend value was found"); + I.setOperand(idx, V); + } else { + Value *blend = createBlend(cur, opDef); + VECZ_ERROR_IF(!blend, "No blend value was found"); + spareBlends.erase(blend); + I.setOperand(idx, blend); + } + LLVM_DEBUG(dbgs() << "new operand: " << *I.getOperand(idx) << "\n"); + break; + } + + const bool curDomDst = DT->dominates(cur, dst); + const bool curDomInc = incoming && DT->dominates(cur, incoming); + const bool srcDomCur = DT->dominates(src, cur); + + auto &opDefBlend = blendMap[opDef]; + // If either condition is true, we can early exit: + // - 'dst' can freely access the values of 'cur', + // - 'incoming' can freely access the values of 'cur'. + if ((curDomDst || curDomInc) && queue.empty()) { + LLVM_DEBUG(dbgs() << "\t\t\tBlock " << cur->getName() + << " dominates destination: "); + if (srcDomCur) { + auto *const blend = opDefBlend[src]; + opDefBlend[cur] = blend; + I.setOperand(idx, blend); + } else { + auto *const blend = createBlend(cur, opDef); + VECZ_ERROR_IF(!blend, "No blend value was found"); + spareBlends.erase(blend); + I.setOperand(idx, blend); + } + LLVM_DEBUG(dbgs() << "new operand: " << *I.getOperand(idx) << "\n"); + break; + } + + addSuccessors(curTag, queue, visited, dstTag); + + // 'cur' can freely access 'opDef'. + if (srcDomCur) { + // DANGER! operator[] returns a reference, which may be invalidated + // by a second call to it. Therefore we have to copy the value via + // a temporary variable. + auto *const blendSrc = opDefBlend[src]; + opDefBlend[cur] = blendSrc; + continue; + } + + // 'cur' does not have a blend value of 'opDef' so create one. 
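+          // For intuition, an illustrative sketch (not from the original
+          // sources): in the diamond
+          //        A
+          //       / \
+          //      B   C
+          //       \ /
+          //        D
+          // a definition %v in B does not dominate a use in D, so the
+          // traversal eventually pops D and a blend
+          //   %v.merge = phi [ %v, B ], [ <default>, C ]
+          // is created there (see createBlend above) to stand in for %v.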
+ Value *blend = createBlend(cur, opDef); + VECZ_ERROR_IF(!blend, "No blend value was found"); + if (isa(blend)) { + spareBlends.insert(blend); + } + } + } + } + } + + for (auto *blend : spareBlends) { + auto *I = cast(blend); + if (I->use_empty()) { + IRCleanup::deleteInstructionNow(I); + } + } + + return true; +} + +bool ControlFlowConversionState::Impl::simplifyMasks() { + const SimplifyQuery Q(F.getParent()->getDataLayout(), nullptr, DT); + + // We might like to just look at the masks pointed to by the block/loop tags, + // however linearization and/or BOSCC can sometimes delete them from under + // our nose so it's only safe just to go through all the boolean operations + // and see if we can simplify any of them. + for (const auto &BBTag : DR->getBlockOrdering()) { + SmallVector toDelete; + for (auto &I : *BBTag.BB) { + if (isa(&I) || (I.getType()->getScalarSizeInBits() == 1 && + (isa(&I) || + isa(&I) || isa(&I)))) { + if (I.use_empty()) { + toDelete.push_back(&I); + } else { + Value *simpleMask = simplifyInstruction(&I, Q); + if (simpleMask && simpleMask != &I) { + I.replaceAllUsesWith(simpleMask); + toDelete.push_back(&I); + } + } + } + } + for (auto *I : toDelete) { + IRCleanup::deleteInstructionNow(I); + } + } + + return true; +} + +bool ControlFlowConversionState::computeBlockOrdering() { + LLVM_DEBUG(dbgs() << "CFC: COMPUTE BLOCK ORDERING\n"); + RC->clear(); + return DR->computeBlockOrdering(*DT); +} + +bool ControlFlowConversionState::Impl::checkBlocksOrder() const { + const auto &DCBI = DR->getBlockOrdering(); + VECZ_ERROR_IF(F.size() != DCBI.size(), + "Worklist does not contain all blocks"); + + uint32_t next = 0u; + for (const auto &BBTag : DCBI) { + VECZ_ERROR_IF(BBTag.pos != next, + "BasicBlock indices not in consecutive order"); + ++next; + } + + return true; +} + +void ControlFlowConversionState::Impl::updateMaps(Value *from, Value *to) { + // Because we keep track of mapping values between uniform and predicated + // version, since we replace 'from' with 'to', we also have to update + // the hashtable. + if (BOSCC) { + BOSCC->updateValue(from, to); + } + + // Because we keep track of loop live values, since we replace 'from' with + // 'to', we also have to update the hashset. + for (auto *const LTag : DR->getLoopOrdering()) { + if (LTag->loopLiveValues.erase(from)) { + LTag->loopLiveValues.insert(to); + auto LRPIt = LTag->loopResultPrevs.find(from); + if (LRPIt != LTag->loopResultPrevs.end()) { + PHINode *from = LRPIt->second; + LTag->loopResultPrevs.erase(LRPIt); + LTag->loopResultPrevs[to] = from; + } + auto LRUIt = LTag->loopResultUpdates.find(from); + if (LRUIt != LTag->loopResultUpdates.end()) { + SelectInst *select = LRUIt->second; + LTag->loopResultUpdates.erase(LRUIt); + LTag->loopResultUpdates[to] = select; + } + } + } +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp new file mode 100644 index 0000000000000..3953e4257e1a9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/inline_post_vectorization_pass.cpp @@ -0,0 +1,132 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/inline_post_vectorization_pass.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "vecz/vecz_choices.h" + +using namespace llvm; +using namespace vecz; + +namespace { +/// @brief Process a call site, inlining it or marking it as needing inlining +/// if required. +/// +/// @param[in] CI Call site to inspect. +/// @param[out] NeedLLVMInline Whether the call site needs LLVM inlining. +/// @param[in] BI Builtin database. +/// +/// @return New return value for the call instruction. +Value *processCallSite(CallInst *CI, bool &NeedLLVMInline, + compiler::utils::BuiltinInfo &BI) { + NeedLLVMInline = false; + + Function *Callee = CI->getCalledFunction(); + if (!Callee) { + return CI; + } + + // Mark called function as needing inlining by LLVM, unless it has the + // NoInline attribute + if (!Callee->isDeclaration() && + !Callee->hasFnAttribute(Attribute::NoInline)) { + CI->addFnAttr(Attribute::AlwaysInline); + NeedLLVMInline = true; + return CI; + } + + // Emit builtins inline when they have no vector/scalar equivalent. + IRBuilder<> B(CI); + const auto Builtin = BI.analyzeBuiltin(*Callee); + if (Builtin && Builtin->properties & + compiler::utils::eBuiltinPropertyInlinePostVectorization) { + const SmallVector Args(CI->args()); + if (Value *Impl = BI.emitBuiltinInline(Callee, B, Args)) { + VECZ_ERROR_IF( + Impl->getType() != CI->getType(), + "The inlined function type must match that of the original function"); + return Impl; + } + } + + return CI; +} + +} // namespace + +PreservedAnalyses +InlinePostVectorizationPass::run(Function &F, FunctionAnalysisManager &AM) { + bool modified = false; + bool needToRunInliner = false; + auto &BI = + AM.getResult(F).getContext().builtins(); + + SmallVector ToDelete; + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + // Look for calls to builtins with no vector/scalar equivalent. + CallInst *CI = dyn_cast(&I); + if (!CI) { + continue; + } + + bool NeedLLVMInline = false; + Value *NewCI = processCallSite(CI, NeedLLVMInline, BI); + needToRunInliner |= NeedLLVMInline; + if ((NewCI == CI) || !NewCI) { + continue; + } + + if (!CI->getType()->isVoidTy()) { + CI->replaceAllUsesWith(NewCI); + } + ToDelete.push_back(CI); + modified = true; + } + } + + // Clean up. + while (!ToDelete.empty()) { + Instruction *I = ToDelete.pop_back_val(); + I->eraseFromParent(); + } + + // Run the LLVM inliner if some calls were marked as needing inlining. + if (needToRunInliner) { + llvm::legacy::PassManager PM; + PM.add(llvm::createAlwaysInlinerLegacyPass()); + PM.run(*F.getParent()); + modified = true; + } + + // Recursively run the pass to inline any newly introduced functions. 
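+  // Note: the recursion below only happens when this round actually inlined
+  // a call or emitted a builtin body. E.g. (hypothetical names) inlining
+  // foo() may expose a call to bar(); the next round handles bar(), and the
+  // round after that finds nothing left to do and stops.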
+ if (modified) { + run(F, AM); + } + + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp new file mode 100644 index 0000000000000..4235885c8a564 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/instantiation_pass.cpp @@ -0,0 +1,351 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/instantiation_pass.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "analysis/instantiation_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "llvm_helpers.h" +#include "memory_operations.h" +#include "transform/packetization_helpers.h" +#include "transform/packetizer.h" +#include "vectorization_context.h" +#include "vecz/vecz_choices.h" + +#define DEBUG_TYPE "vecz-instantiation" + +#undef VECZ_FAIL +#define VECZ_FAIL() return packetizer.getEmptyRange(); + +using namespace vecz; +using namespace llvm; + +STATISTIC(VeczInstantiated, "Number of instructions instantiated [ID#I00]"); +STATISTIC(VeczPacketizeFailInstantiate, + "Packetize: instantiation failures [ID#P84]"); + +InstantiationPass::InstantiationPass(Packetizer &pp) + : Ctx(pp.context()), packetizer(pp) {} + +PacketRange InstantiationPass::instantiate(Value *V) { + VECZ_FAIL_IF(packetizer.width().isScalable()); + if (auto info = packetizer.getPacketized(V)) { + const unsigned SimdWidth = packetizer.width().getFixedValue(); + return info.getAsPacket(SimdWidth); + } + + // Handle uniform values first, which instantiate to the same value for all + // items. + auto *Ins = dyn_cast(V); + if (Ins && packetizer.uniform().isMaskVarying(V)) { + const PacketRange P = simdBroadcast(Ins); + if (!P) { + emitVeczRemark(&packetizer.function(), V, + "Failed to broadcast Mask Varying instruction"); + VECZ_FAIL(); + } + return assignInstance(P, V); + } + + if (!packetizer.uniform().isVarying(V)) { + return assignInstance(broadcast(V), V); + } + + if (Ins) { + return instantiateInstruction(Ins); + } + + VECZ_STAT_FAIL_IF(true, VeczPacketizeFailInstantiate); +} + +PacketRange InstantiationPass::instantiateInternal(Value *V) { + if (packetizer.uniform().isVarying(V)) { + // The packetizer will call back into the instantiator when it needs to + VECZ_FAIL_IF(packetizer.width().isScalable()); + const unsigned SimdWidth = packetizer.width().getFixedValue(); + return packetizer.packetize(V).getAsPacket(SimdWidth); + } else { + return instantiate(V); + } +} + +PacketRange InstantiationPass::instantiateInstruction(Instruction *Ins) { + // Figure out what kind of instruction it is and try to instantiate it. 
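+  // Illustrative example (not from the original sources): with SimdWidth 4,
+  //   %sum = add i32 %a, %b
+  // is instantiated by cloning into four scalar adds, one per lane, whereas
+  // calls and allocas take the dedicated paths below.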
+  switch (Ins->getOpcode()) {
+    default:
+      // No special handling for this Instruction, so just clone it across
+      // the lanes.
+      break;
+
+    case Instruction::Call:
+      return assignInstance(instantiateCall(cast<CallInst>(Ins)), Ins);
+
+    case Instruction::Alloca:
+      return assignInstance(instantiateAlloca(cast<AllocaInst>(Ins)), Ins);
+  }
+
+  return assignInstance(instantiateByCloning(Ins), Ins);
+}
+
+PacketRange InstantiationPass::assignInstance(const PacketRange P, Value *V) {
+  if (!P) {
+    emitVeczRemarkMissed(&packetizer.function(), V, "Could not instantiate");
+    VECZ_STAT_FAIL_IF(!P, VeczPacketizeFailInstantiate);
+  } else {
+    ++VeczInstantiated;
+  }
+  return P;
+}
+
+PacketRange InstantiationPass::broadcast(Value *V) {
+  VECZ_FAIL_IF(packetizer.width().isScalable());
+  const unsigned SimdWidth = packetizer.width().getFixedValue();
+  PacketRange P = packetizer.createPacket(V, SimdWidth);
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    P[i] = V;
+  }
+  return P;
+}
+
+PacketRange InstantiationPass::instantiateCall(CallInst *CI) {
+  VECZ_FAIL_IF(packetizer.width().isScalable());
+  const unsigned SimdWidth = packetizer.width().getFixedValue();
+  // Handle special call instructions that return a lane ID.
+  const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  const auto Builtin = BI.analyzeBuiltinCall(*CI, packetizer.dimension());
+  if (Builtin &&
+      Builtin->properties & compiler::utils::eBuiltinPropertyWorkItem) {
+    const auto Uniformity = Builtin->uniformity;
+    if (Uniformity == compiler::utils::eBuiltinUniformityNever) {
+      // Can't handle these (global/local linear ID, probably).
+      VECZ_FAIL();
+    } else if (Uniformity & compiler::utils::eBuiltinUniformityInstanceID) {
+      Type *RetTy = CI->getType();
+      PacketRange P = packetizer.createPacket(CI, SimdWidth);
+      VECZ_FAIL_IF(!P);
+      IRBuilder<> B(CI);
+      for (unsigned j = 0; j < SimdWidth; j++) {
+        P[j] = B.CreateAdd(CI, ConstantInt::get(RetTy, j));
+      }
+      packetizer.deleteInstructionLater(CI);
+      return P;
+    }
+  }
+
+  // We can't instantiate noduplicate functions.
+  VECZ_FAIL_IF(CI->hasFnAttr(Attribute::NoDuplicate));
+
+  packetizer.deleteInstructionLater(CI);
+  // Check if the instruction has any uses or not, and also if we want to
+  // instantiate call instructions with loops or not.
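+  // Sketch of the two strategies that follow (illustrative): a call whose
+  // result is used, e.g. %r = call i32 @f(i32 %x), must yield one value per
+  // lane and is unrolled into SimdWidth separate calls; a use-less call may
+  // instead be replayed in a small runtime loop when the choices allow it,
+  // which keeps code size independent of the SIMD width.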
+  if (CI->hasNUsesOrMore(1) ||
+      !packetizer.choices().instantiateCallsInLoops()) {
+    // Instantiate each lane as a separate call.
+    SmallVector<PacketRange> OpPackets;
+    for (unsigned i = 0; i < CI->arg_size(); i++) {
+      Value *Op = CI->getArgOperand(i);
+      const PacketRange OpPacket = instantiateInternal(Op);
+      VECZ_FAIL_IF(!OpPacket);
+      OpPackets.push_back(OpPacket);
+    }
+    PacketRange P = packetizer.createPacket(CI, SimdWidth);
+    VECZ_FAIL_IF(!P);
+    IRBuilder<> B(CI);
+    for (unsigned j = 0; j < SimdWidth; j++) {
+      SmallVector<Value *> Ops;
+      for (unsigned i = 0; i < CI->arg_size(); i++) {
+        Ops.push_back(OpPackets[i][j]);
+      }
+      auto *NewCI = B.CreateCall(CI->getFunctionType(), CI->getCalledOperand(),
+                                 Ops, CI->getName());
+      NewCI->setCallingConv(CI->getCallingConv());
+      NewCI->setAttributes(CI->getAttributes());
+      P[j] = NewCI;
+    }
+    return P;
+  } else {
+    // Instantiate in a loop.
+    BasicBlock *BeforeCI = CI->getParent();
+    BasicBlock *AfterCI = SplitBlock(BeforeCI, CI);
+    BasicBlock *LoopHeader = BasicBlock::Create(
+        CI->getContext(), "instloop.header", CI->getFunction(), AfterCI);
+    BasicBlock *LoopBody = BasicBlock::Create(
+        CI->getContext(), "instloop.body", CI->getFunction(), AfterCI);
+
+    // Change the branch instruction from BeforeCI -> AfterCI to
+    // BeforeCI -> LoopHeader.
+    BeforeCI->getTerminator()->setSuccessor(0, LoopHeader);
+
+    IRBuilder<> B(LoopHeader);
+    // Create the induction variable.
+    PHINode *Ind = B.CreatePHI(B.getInt32Ty(), 2, "instance");
+
+    // Create the conditional jump based on the current iteration number.
+    Value *ICmp = B.CreateICmpULT(Ind, B.getInt32(SimdWidth));
+    B.CreateCondBr(ICmp, LoopBody, AfterCI);
+
+    B.SetInsertPoint(LoopBody);
+    SmallVector<Value *> Operands;
+    for (auto &Arg : CI->args()) {
+      // We call the packetizer explicitly, instead of calling the
+      // instantiator, because we need a packetized value and not an
+      // instantiated one.
+      Value *Packetized = packetizer.packetize(Arg).getAsValue();
+      VECZ_FAIL_IF(!Packetized);
+      VECZ_ERROR_IF(!Packetized->getType()->isVectorTy(),
+                    "The packetized Value has to be of a vector type");
+      Operands.push_back(Packetized);
+    }
+    // Each Op is an element extracted from a packetized instruction.
+    SmallVector<Value *> Ops;
+    for (unsigned i = 0; i < Operands.size(); ++i) {
+      Ops.push_back(B.CreateExtractElement(Operands[i], Ind));
+    }
+    // Create the function call.
+    auto CO = CI->getCalledOperand();
+    FunctionType *FTy = CI->getFunctionType();
+    CallInst *NewCI = B.CreateCall(FTy, CO, Ops);
+    NewCI->setCallingConv(CI->getCallingConv());
+    NewCI->setAttributes(CI->getAttributes());
+    // Increment the induction variable and jump back to the loop header.
+    Value *IndInc = B.CreateAdd(Ind, B.getInt32(1), "");
+    B.CreateBr(LoopHeader);
+
+    // Set the operands of the phi node in the loop header.
+    Ind->addIncoming(B.getInt32(0), BeforeCI);
+    Ind->addIncoming(IndInc, LoopBody);
+
+    // Set the Packet, even though we are not going to be using this value
+    // (we have checked that the call has 0 users), so we don't need to
+    // populate it.
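+    // Schematically, the generated control flow is (illustrative):
+    //   BeforeCI:          br instloop.header
+    //   instloop.header:   %instance = phi [ 0, BeforeCI ], [ %inc, body ]
+    //                      %cond = icmp ult i32 %instance, SimdWidth
+    //                      br %cond, instloop.body, AfterCI
+    //   instloop.body:     <extract lane %instance of each arg, make call>
+    //                      %inc = add i32 %instance, 1
+    //                      br instloop.header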
+ return packetizer.createPacket(CI, SimdWidth); + } +} + +PacketRange InstantiationPass::instantiateAlloca(AllocaInst *Alloca) { + VECZ_FAIL_IF(packetizer.width().isScalable()); + const unsigned SimdWidth = packetizer.width().getFixedValue(); + PacketRange P = packetizer.createPacket(Alloca, SimdWidth); + VECZ_FAIL_IF(!P); + IRBuilder<> B(Alloca); + for (unsigned i = 0; i < SimdWidth; i++) { + Type *Ty = Alloca->getAllocatedType(); + AllocaInst *New = B.CreateAlloca(Ty, nullptr, Alloca->getName()); + New->setAlignment(Alloca->getAlign()); + + P[i] = New; + } + packetizer.deleteInstructionLater(Alloca); + return P; +} + +PacketRange InstantiationPass::instantiateByCloning(Instruction *I) { + VECZ_FAIL_IF(packetizer.width().isScalable()); + auto SimdWidth = packetizer.width().getFixedValue(); + PacketRange P = packetizer.createPacket(I, SimdWidth); + if (!P || P.at(SimdWidth - 1)) { + return P; + } + + // Clone breadth first so that the packet is complete before fixing up the + // operands, that way we get less stack-thrashing, especially when there + // is a circular dependency. + SmallVector Clones; + for (decltype(SimdWidth) i = 0; i < SimdWidth; ++i) { + if (P.at(i)) { + Clones.push_back(nullptr); + continue; + } + Instruction *Clone = I->clone(); + Clone->insertBefore(I->getIterator()); + P[i] = Clone; + Clones.push_back(Clone); + } + + for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) { + Value *V = I->getOperand(i); + if (isa(V) || isa(V)) { + continue; + } + + if (const auto OpP = instantiateInternal(V)) { + for (decltype(SimdWidth) lane = 0; lane < SimdWidth; ++lane) { + if (auto *Clone = Clones[lane]) { + if (auto *At = OpP.at(lane)) { + Clone->setOperand(i, At); + } + } + } + } else { + VECZ_FAIL(); + } + } + + packetizer.deleteInstructionLater(I); + return P; +} + +PacketRange InstantiationPass::simdBroadcast(Instruction *I) { + VECZ_FAIL_IF(packetizer.width().isScalable()); + auto SimdWidth = packetizer.width().getFixedValue(); + PacketRange P = packetizer.createPacket(I, SimdWidth); + if (!P || P.at(0)) { + return P; + } + + for (auto &i : P) { + i = I; + } + + auto Op = MemOp::get(I); + if (!Op || !Op->getMaskOperand()) { + return P; + } + + if (auto *MaskInst = dyn_cast(Op->getMaskOperand())) { + const auto MP = instantiateInternal(MaskInst); + VECZ_FAIL_IF(!MP); + + auto W = SimdWidth; + SmallVector Reduce; + for (decltype(SimdWidth) i = 0; i < SimdWidth; i++) { + Reduce.push_back(MP.at(i)); + } + + IRBuilder<> B(buildAfter(Reduce.back(), packetizer.function())); + while ((W >>= 1)) { + for (decltype(W) i = 0; i < W; ++i) { + Reduce[i] = B.CreateOr(Reduce[i], Reduce[i + W], "any_of_mask"); + } + } + Op->setMaskOperand(Reduce.front()); + } + + return P; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp new file mode 100644 index 0000000000000..f2308d7bb050f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/interleaved_group_combine_pass.cpp @@ -0,0 +1,548 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/interleaved_group_combine_pass.h" + +#include +#include +#include +#include +#include +#include + +#include + +#include "analysis/uniform_value_analysis.h" +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "ir_cleanup.h" +#include "memory_operations.h" + +#define DEBUG_TYPE "vecz" + +using namespace llvm; +using namespace vecz; + +char InterleavedGroupCombinePass::PassID = 0; + +struct GroupMemberInfo { + int64_t Offset; + int64_t Order; + CallInst *MemOp; + Value *Ptr; + Type *DataTy; +}; + +/// @brief Information about an interleaved operation. +struct InterleavedGroupCombinePass::InterleavedOpInfo { + /// @brief Interleaved operation. + CallInst *Op; + /// @brief Kind of interleaved operation. + InterleavedOperation Kind; + /// @brief Interleaved stride. + int Stride; + /// @brief Whether the operation was removed or not. + bool Removed; +}; + +struct InterleavedGroupCombinePass::InterleavedGroupInfo { + BasicBlock *BB = nullptr; + SmallVector Data; + SmallVector Info; + Value *Base = nullptr; + unsigned Stride = 0; + int Offset = 0; + InterleavedOperation Kind = eInterleavedInvalid; + + void clear() { + BB = nullptr; + Data.clear(); + Info.clear(); + Base = nullptr; + Stride = 0; + Offset = 0; + Kind = eInterleavedInvalid; + } + + bool isConsecutive() const { + auto InfoIt = Info.begin(); + auto InfoE = Info.end(); + assert(InfoIt != InfoE); + int ExpectedOffset = Info.front().Offset; + for (++InfoIt; InfoIt != InfoE; ++InfoIt) { + if (InfoIt->Offset != ++ExpectedOffset) { + return false; + } + } + return true; + } + + bool canDeinterleaveMask(const Instruction &Mask) const; +}; + +namespace { + +bool canSwap(Instruction *IA, Instruction *IB) { + // we need to check for usage-relations here, because a load instruction + // might depend on a mask calculation and its uses that might end up + // swapped + for (auto *const Op : IB->operand_values()) { + if (isa(Op)) { + // GEPs get eliminated later so ignore them for now + continue; + } + if (Op == IA) { + return false; + } + } + + if (IA->mayReadOrWriteMemory()) { + if (isa(IB)) { + // can't swap any memory operation with a fence + return false; + } + } else { + // if either instruction is not a memory operation, we can swap them. + return true; + } + + if (IB->mayReadOrWriteMemory()) { + if (isa(IA)) { + return false; + } + } else { + return true; + } + + // can't swap a write with a write, or a write with a read, + // but it should be ok to swap two reads + if (IA->mayWriteToMemory() || IB->mayWriteToMemory()) { + return false; + } + + return true; +} + +bool canMoveUp(const SmallVectorImpl &Group, Instruction *IB) { + auto Ig = Group.rbegin(); + auto Ie = Group.rend(); + Instruction *IA = IB; + + // It looks through all preceding instructions, skipping over any that are + // already in the Group, until it reaches the first member of the group, + // terminating if it can't move IB through the current instruction. 
+ // If it reaches the first member of the Group, it is safe to move IB there. + while ((IA = IA->getPrevNode())) { + if (IA == *Ig) { + if (++Ig == Ie) { + // we met every group member so we're done + return true; + } + } else if (!canSwap(IA, IB)) { + return false; + } + } + // if we get here, it means we didn't pass any of the other group members, + // which shouldn't be able to happen. + assert(false); + return false; +} + +bool canMoveDown(const SmallVectorImpl &Group, Instruction *IA) { + auto Ig = Group.rbegin(); + auto Ie = Group.rend(); + Instruction *IB = IA; + + // It looks through all following instructions, skipping over any that are + // already in the Group, until it reaches the first member of the group, + // terminating if it can't move IA through the current instruction. + // If it reaches the first member of the Group, it is safe to move IA there. + while ((IB = IB->getNextNode())) { + if (IB == *Ig) { + if (++Ig == Ie) { + // we met every group member so we're done + return true; + } + } else if (!canSwap(IA, IB)) { + return false; + } + } + // if we get here, it means we didn't pass any of the other group members, + // which shouldn't be able to happen. + assert(false); + return false; +} + +} // namespace + +bool InterleavedGroupCombinePass::InterleavedGroupInfo::canDeinterleaveMask( + const Instruction &Mask) const { + // If the mask definition is not in the same block as the group members, it + // is safe to de-interleave. + if (Mask.getParent() != BB) { + return true; + } + + SmallPtrSet Ops; + for (auto &Op : Mask.operands()) { + if (auto *OpI = dyn_cast(Op.get())) { + // We only care about operands in the same basic block, since otherwise + // they cannot be group members or in between group members. + if (OpI->getParent() == BB) { + Ops.insert(OpI); + } + } + } + + // If the mask has no dependency on anything in the group basic block, it is + // safe to de-interleave. + if (Ops.empty()) { + return true; + } + + // Note that the mask can hardly depend on the last group member, since it is + // itself an operand of this member. + Instruction *IA = cast(Data.back()); + + // It looks through all instructions from the last member of the group + // back to the first, looking to see if the mask depends on any of them. + // If it reaches the first member of the Group, it is safe to move the mask. + // If it finds any of the mask's own operands as group members or in + // between group members, the mask cannot be (trivially) moved. + while (IA) { + if (Ops.contains(IA)) { + // We found something the mask depends on, so we can't de-interleave... + return false; + } else if (IA == Data.front()) { + // we met every group member so we're done + return true; + } + IA = IA->getPrevNode(); + } + + // the mask definition was before every group member + return true; +} + +PreservedAnalyses +InterleavedGroupCombinePass::run(Function &F, FunctionAnalysisManager &AM) { + auto &Ctx = AM.getResult(F).getContext(); + IRCleanup IC; + + const bool IsLoad = + (Kind == eInterleavedLoad) || (Kind == eMaskedInterleavedLoad); + + LLVM_DEBUG(dbgs() << "vecz: InterleavedGroupCombinePass on " << F.getName() + << "\n"); + + scalarEvolution = &AM.getResult(F); + + UniformValueResult &UVR = AM.getResult(F); + const auto &DL = F.getParent()->getDataLayout(); + std::vector InterleavedOps; + for (BasicBlock &BB : F) { + // Look for interleaved operations. 
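+    // For example (illustrative): vectorizing accesses to the .x field of a
+    //   struct { float x; float y; }
+    // array typically produces a load with constant stride 2, reading every
+    // other element (ptr[0], ptr[2], ptr[4], ...); such operations are
+    // collected here as candidates for combining.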
+ for (Instruction &I : BB) { + CallInst *CI = dyn_cast(&I); + if (!CI) { + continue; + } + + std::optional Op = MemOp::get(CI); + // We can't optimize interleaved memops if we don't know the stride at + // runtime, since we need to check if the stride and the group size match. + if (!Op || !Op->isStrideConstantInt()) { + continue; + } + const int64_t Stride = Op->getStrideAsConstantInt(); + if ((Stride == 0) || (Stride == 1)) { + continue; + } + Value *Mask = Op->getMaskOperand(); + InterleavedOpInfo Info; + + const bool OpIsLoad = Op->isLoad(); + if (OpIsLoad) { + Info.Kind = Mask ? eMaskedInterleavedLoad : eInterleavedLoad; + } else { + Info.Kind = Mask ? eMaskedInterleavedStore : eInterleavedStore; + } + Info.Op = CI; + Info.Stride = Stride; + Info.Removed = false; + + // only add the interleaved operation kinds we actually care about + if (IsLoad == OpIsLoad) { + InterleavedOps.push_back(Info); + } + } + if (!InterleavedOps.empty()) { + if (Kind == eInterleavedStore) { + // stores are collated downwards, so reverse the list.. + std::reverse(InterleavedOps.begin(), InterleavedOps.end()); + } + + InterleavedGroupInfo Group; + Group.BB = &BB; + + while (findGroup(InterleavedOps, UVR, Group)) { + // Loads have their uses afterwards, while stores use preceding values. + // Group.Info is in forwards order for Loads, reverse order for Stores. + IRBuilder<> B(Group.Info.front().MemOp); + + Value *Base = Group.Base; + if (Kind == eInterleavedLoad && Group.Offset != 0) { + auto *EltTy = Group.Info.front().DataTy->getScalarType(); + // if it's a Load group that was out of order, we have to use the + // sequentially first GEP in order to preserve use-def ordering, + // which means we have to offset it with an additional GEP and + // hope this optimizes out later. + // Note that this is not necessary for Stores, since instructions + // are inserted at the last Store. + Base = Group.Info.front().Ptr; + auto *Offset = ConstantInt::getSigned( + DL.getIntPtrType(Base->getType()), Group.Offset); + + Base = B.CreateInBoundsGEP(EltTy, Base, Offset, "reorder_offset"); + } + + SmallVector Masks; + if (Group.Kind == eMaskedInterleavedStore || + Group.Kind == eMaskedInterleavedLoad) { + Masks.reserve(Group.Data.size()); + for (auto *V : Group.Data) { + std::optional Op = MemOp::get(cast(V)); + assert(Op && "Unanalyzable interleaved access?"); + Masks.push_back(Op->getMaskOperand()); + } + } + if (Ctx.targetInfo().optimizeInterleavedGroup( + B, Group.Kind, Group.Data, Masks, Base, Group.Stride)) { + for (Value *V : Group.Data) { + if (Instruction *Ins = dyn_cast(V)) { + IC.deleteInstructionLater(Ins); + } + } + } + + // Remove the group no matter whether we optimized it or not. Otherwise + // we will just iterate indefinitely. + for (const auto &Info : Group.Info) { + InterleavedOps[Info.Order].Removed = true; + } + } + InterleavedOps.clear(); + } + } + IC.deleteInstructions(); + + LLVM_DEBUG(dbgs() << "vecz: InterleavedGroupCombinePass done!\n"); + + PreservedAnalyses Preserved; + Preserved.preserve(); + Preserved.preserve(); + Preserved.preserve(); + + return Preserved; +} + +bool InterleavedGroupCombinePass::findGroup( + const std::vector &Ops, UniformValueResult &UVR, + InterleavedGroupInfo &Group) { + VECZ_FAIL_IF(Ops.empty()); + // this check keeps clang-tidy happy + VECZ_FAIL_IF(Kind != eInterleavedStore && Kind != eInterleavedLoad); + + auto &SE = *scalarEvolution; + + for (unsigned i = 0; i < Ops.size(); i++) { + // Extract the first memory instruction at the given offset. 
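+    // A worked example of the grouping below (illustrative): four stride-4
+    // float loads off one base at element offsets 0, 1, 2 and 3 form a
+    // consecutive group, which the target may then replace with a single
+    // contiguous load plus de-interleaving shuffles.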
+ const InterleavedOpInfo &Info0 = Ops[i]; + if (Info0.Removed) { + continue; + } + + Type *DataType0 = nullptr; + Value *Ptr0 = nullptr; + if (Kind == eInterleavedStore) { + DataType0 = Info0.Op->getOperand(0)->getType(); + Ptr0 = Info0.Op->getOperand(1); + } else if (Kind == eInterleavedLoad) { + DataType0 = Info0.Op->getType(); + Ptr0 = Info0.Op->getOperand(0); + } + + const IRBuilder<> B(cast(Info0.Op)); + Value *Base0 = UVR.extractMemBase(Ptr0); + if (!Base0) { + continue; + } + + PointerType *PtrTy = dyn_cast(Ptr0->getType()); + if (!PtrTy) { + continue; + } + + Type *EleTy = DataType0->getScalarType(); + const unsigned Align = EleTy->getScalarSizeInBits() / 8; + assert(Align != 0 && + "interleaved memory operation with zero-sized elements"); + + Group.clear(); + Group.Data.push_back(Info0.Op); + Group.Info.emplace_back(GroupMemberInfo{0, i, Info0.Op, Ptr0, DataType0}); + Group.Kind = Info0.Kind; + + // Try to find others that have the same stride and base pointer. + for (unsigned j = i + 1; j < Ops.size(); j++) { + const InterleavedOpInfo &InfoN = Ops[j]; + if (InfoN.Removed) { + continue; + } + + if (Group.Kind != InfoN.Kind) { + continue; + } + + Type *DataTypeN = nullptr; + Value *PtrN = nullptr; + if (Kind == eInterleavedStore) { + DataTypeN = InfoN.Op->getOperand(0)->getType(); + PtrN = InfoN.Op->getOperand(1); + } else if (Kind == eInterleavedLoad) { + DataTypeN = InfoN.Op->getType(); + PtrN = InfoN.Op->getOperand(0); + } + + if ((InfoN.Stride != Info0.Stride) || (DataTypeN != DataType0)) { + continue; + } + + const IRBuilder<> B(cast(InfoN.Op)); + Value *BaseN = UVR.extractMemBase(PtrN); + if (!BaseN || BaseN != Base0) { + continue; + } + + const SCEV *PtrDiff = SE.getMinusSCEV(SE.getSCEV(PtrN), SE.getSCEV(Ptr0)); + const auto *ConstDiff = dyn_cast(PtrDiff); + if (!ConstDiff) { + continue; + } + + // Note that the offset calculated here is a byte offset + int64_t Offset = ConstDiff->getAPInt().getSExtValue(); + if (Offset % Align == 0) { + // only add them to the group if it is possible to collate them together + // at the same place in the function + bool CanMove = false; + if (Kind == eInterleavedLoad) { + CanMove = canMoveUp(Group.Data, cast(InfoN.Op)); + + if (InfoN.Kind == eMaskedInterleavedLoad) { + std::optional Op = MemOp::get(InfoN.Op); + assert(Op && "Unanalyzable load?"); + if (auto *MaskInst = dyn_cast(Op->getMaskOperand())) { + CanMove &= Group.canDeinterleaveMask(*MaskInst); + } + } + } else if (Kind == eInterleavedStore) { + CanMove = canMoveDown(Group.Data, cast(InfoN.Op)); + } + + if (CanMove) { + Offset /= Align; + Group.Data.push_back(InfoN.Op); + Group.Info.emplace_back( + GroupMemberInfo{Offset, j, InfoN.Op, PtrN, DataTypeN}); + } + } + } + + if (Group.Data.size() > 1) { + auto InfoB = Group.Info.begin(); + auto InfoE = Group.Info.end(); + + if (Kind == eInterleavedStore) { + // In the case of stores, the instructions are processed in reverse + // order, so this just puts them back in forwards order + std::reverse(InfoB, InfoE); + } + + // Sort the group members in order of their offsets. Use a stable sort + // so that any duplicates don't get re-ordered (important for stores). + std::stable_sort( + InfoB, InfoE, + [](const GroupMemberInfo &a, const GroupMemberInfo &b) -> bool { + return a.Offset < b.Offset; + }); + + // If the same offset occurs several times, we can still de-interleave + // the unique ones, and maybe catch the rest the next time round. 
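+      // E.g. (illustrative) offsets {0, 1, 1, 2, 3}: std::unique keeps
+      // {0, 1, 2, 3} for this round, and the duplicate operation at offset 1
+      // is not marked Removed, so it is considered again on a later
+      // iteration.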
+ InfoE = + Group.Info.erase(std::unique(InfoB, InfoE, + [](const GroupMemberInfo &a, + const GroupMemberInfo &b) -> bool { + return a.Offset == b.Offset; + }), + InfoE); + + if (Group.Info.size() <= 1) { + // This could happen if our entire group has the same address, in + // which case "std::unique" removes all but the first element and we + // don't have a Group anymore. + continue; + } + + const unsigned Stride = Info0.Stride; + Group.Stride = Stride; + // If the group is bigger than the stride we can still de-interleave the + // first "Stride" members + if (Group.Info.size() > Stride) { + Group.Info.resize(Stride); + InfoB = Group.Info.begin(); + InfoE = Group.Info.end(); + } + + if (!Group.isConsecutive()) { + // The group of memory instructions was not consecutive, try further. + continue; + } + + // Everything is fine, return this group in offset-sorted order. + { + Group.Data.resize(Group.Info.size()); + auto InfoIt = InfoB; + for (auto &Op : Group.Data) { + assert(InfoIt != InfoE); + Op = (InfoIt++)->MemOp; + } + } + + Group.Base = Group.Info.front().Ptr; + Group.Offset = Group.Info.front().Offset; + + // Put the Info list back into original Ops vector order + // (reverse order for Stores) + std::sort(InfoB, InfoE, + [](const GroupMemberInfo &a, const GroupMemberInfo &b) -> bool { + return a.Order < b.Order; + }); + return true; + } + } + return false; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp new file mode 100644 index 0000000000000..b1274d91cf196 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/loop_rotate_custom_pass.cpp @@ -0,0 +1,40 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/passes.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Scalar/LoopRotation.h" + +using namespace llvm; + +llvm::PreservedAnalyses +vecz::VeczLoopRotatePass::run(llvm::Loop &L, llvm::LoopAnalysisManager &LAM, + llvm::LoopStandardAnalysisResults &AR, + llvm::LPMUpdater &LU) { + // Only process loops whose latch cannot exit the loop and its predecessors + // cannot either. 
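+  // In other words: rotation (see LoopRotatePass below) is only attempted
+  // for loops that still test for exit away from the latch, e.g. a
+  // while-style loop
+  //   header(exit test) -> body -> latch -> header
+  // which rotation turns into do-while form; loops that already exit at the
+  // latch, or at a predecessor of it, are returned unchanged.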
+ if (L.isLoopExiting(L.getLoopLatch())) { + return PreservedAnalyses::all(); + } + + for (BasicBlock *pred : predecessors(L.getLoopLatch())) { + if (L.contains(pred) && L.isLoopExiting(pred)) { + return PreservedAnalyses::all(); + } + } + + return LoopRotatePass().run(L, LAM, AR, LU); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp new file mode 100644 index 0000000000000..cbeb82b3c47f6 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_helpers.cpp @@ -0,0 +1,728 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file contains all the code to perform, on demand, the plumbing between +// values that have been vectorized, vector-widened, instantiated, or +// semi-widened/instantiated (otherwise known as Vector Sub-Widening), +// including the broadcast of uniform values, scatters, gathers, vector splits +// and concatenations. + +#include "transform/packetization_helpers.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "debugging.h" +#include "transform/packetizer.h" +#include "vectorization_context.h" +#include "vectorization_unit.h" +#include "vecz/vecz_target_info.h" + +#define DEBUG_TYPE "vecz-packetization" + +using namespace llvm; +using namespace vecz; + +namespace { +Value *scalableBroadcastHelper(Value *subvec, ElementCount factor, + const vecz::TargetInfo &TI, IRBuilder<> &B, + bool URem); + +// Helper to broadcast a fixed vector thus: +// -> vscale x 1 -> +Value *createScalableBroadcastOfFixedVector(const vecz::TargetInfo &TI, + IRBuilder<> &B, Value *subvec, + ElementCount factor) { + assert(factor.isScalable()); + return scalableBroadcastHelper(subvec, factor, TI, B, /*URem*/ true); +} + +// Helper to broadcast a scalable vector thus: +// -> x 2 +Value *createFixedBroadcastOfScalableVector(const vecz::TargetInfo &TI, + IRBuilder<> &B, Value *subvec, + ElementCount factor) { + assert(!factor.isScalable()); + return scalableBroadcastHelper(subvec, factor, TI, B, /*URem*/ false); +} +} // namespace + +namespace vecz { +IRBuilder<> buildAfter(Value *V, Function &F, bool IsPhi) { + if (auto *const I = dyn_cast(V)) { + BasicBlock::iterator Next = I->getIterator(); + const BasicBlock::iterator End = Next->getParent()->end(); + do { + ++Next; + } while (!IsPhi && (Next != End) && + (isa(Next) || isa(Next))); + // If there is debug info between this instruction and the next, insert + // before the debug info. This is required for PHIs and makes sense for + // other instructions too. + Next.setHeadBit(true); + return {I->getParent(), Next}; + } + // Else find the first point in the function after any allocas. 
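+  // (V is not an Instruction on this path, e.g. an Argument or Constant, so
+  // any entry-block point after the allocas dominates every possible use.)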
+ auto it = F.getEntryBlock().begin(); + while (isa(*it)) { + ++it; + } + return {&F.getEntryBlock(), it}; +} + +static Constant *getShuffleMask(ShuffleVectorInst *shuffle) { + // The mask value seems not to be a proper operand for LLVM 11. + // NOTE this is marked as "temporary" in the docs! + return shuffle->getShuffleMaskForBitcode(); +} + +Value *createOptimalShuffle(IRBuilder<> &B, Value *srcA, Value *srcB, + const SmallVectorImpl &mask, + const Twine &name) { + const auto &maskC = mask; + auto *shuffleA = dyn_cast(srcA); + // If we have a unary shuffle of a shuffle, we can just pre-shuffle the masks + if (shuffleA && isa(srcB)) { + auto *const srcMask = getShuffleMask(shuffleA); + auto *const newMask = ConstantExpr::getShuffleVector( + srcMask, PoisonValue::get(srcMask->getType()), maskC); + + return B.CreateShuffleVector(shuffleA->getOperand(0), + shuffleA->getOperand(1), newMask, name); + } + + auto *shuffleB = dyn_cast(srcB); + + if (shuffleA && shuffleB) { + auto *const shuffleSrcA = shuffleA->getOperand(0); + auto *const shuffleSrcB = shuffleA->getOperand(1); + + // If we have a shuffle of two shuffles with identical source operands, + // we can just pre-shuffle their masks together. + if (shuffleB->getOperand(0) == shuffleSrcA && + shuffleB->getOperand(1) == shuffleSrcB) { + auto *const srcMaskA = getShuffleMask(shuffleA); + auto *const srcMaskB = getShuffleMask(shuffleB); + auto *const newMask = + ConstantExpr::getShuffleVector(srcMaskA, srcMaskB, maskC); + + return B.CreateShuffleVector(shuffleSrcA, shuffleSrcB, newMask, name); + } + } + + // If either operand is a unary shuffle, we can pull a few more tricks.. + // For instance: + // + // shuffle(shuffle(A, poison, maskA), shuffle(B, poison, maskB), maskC) + // => shuffle(A, B, shuffle(maskA, adjust(maskB), maskC)) + // where "adjust" refers to adjusting the mask values to refer to the second + // source vector by adding the width of the first operand to the indices. + // + // If either source operand is something other than a unary shuffle, we can + // "pretend" it is a NOP shuffle of that operand (i.e. a mask of <0, 1, 2..>) + // and proceed as before, absorbing the unary shuffle from the other operand. + if (shuffleA && !isa(shuffleA->getOperand(1))) { + shuffleA = nullptr; + } + if (shuffleB && !isa(shuffleB->getOperand(1))) { + shuffleB = nullptr; + } + + if (shuffleA || shuffleB) { + // We can absorb one or two unary shuffles into the new shuffle.. + auto *const shuffleAsrc = shuffleA ? shuffleA->getOperand(0) : srcA; + auto *const shuffleBsrc = shuffleB ? shuffleB->getOperand(0) : srcB; + const auto srcASize = + cast(shuffleAsrc->getType())->getNumElements(); + const auto srcBSize = + cast(shuffleBsrc->getType())->getNumElements(); + if (srcASize == srcBSize) { + Constant *srcMaskA = nullptr; + Constant *srcMaskB = nullptr; + + if (shuffleA) { + srcMaskA = getShuffleMask(shuffleA); + } else { + // if one operand is not a shuffle, we can make a pretend shuffle.. + SmallVector newMaskA; + for (unsigned i = 0; i < srcASize; ++i) { + newMaskA.push_back(B.getInt32(i)); + } + srcMaskA = ConstantVector::get(newMaskA); + } + + if (shuffleB) { + auto *const maskB = getShuffleMask(shuffleB); + + // adjust the second mask to refer to the second vector.. + srcMaskB = ConstantExpr::getAdd( + maskB, ConstantVector::getSplat( + multi_llvm::getVectorElementCount(maskB->getType()), + B.getInt32(srcASize))); + } else { + // if one operand is not a shuffle, we can make a pretend shuffle.. 
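+        // E.g. (illustrative) with 4-element sources: if srcB is not a
+        // shuffle, pretend it is shuffle(srcB, poison, <4, 5, 6, 7>), i.e.
+        // an identity mask already adjusted to index the second operand.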
+ SmallVector newMaskB; + for (unsigned i = 0; i < srcBSize; ++i) { + newMaskB.push_back(B.getInt32(i + srcASize)); + } + srcMaskB = ConstantVector::get(newMaskB); + } + + auto *const newMask = + ConstantExpr::getShuffleVector(srcMaskA, srcMaskB, maskC); + + return B.CreateShuffleVector(shuffleAsrc, shuffleBsrc, newMask, name); + } + } + + // No more optimal alternative, just build a new one + return B.CreateShuffleVector(srcA, srcB, maskC, name); +} + +bool createSubSplats(const vecz::TargetInfo &TI, IRBuilder<> &B, + SmallVectorImpl &srcs, unsigned subWidth) { + // Scalable sub-splats must be handled specially. + if (isa(srcs.front()->getType())) { + if (srcs.size() != 1) { + return false; + } + Value *&val = srcs.front(); + val = createFixedBroadcastOfScalableVector( + TI, B, val, ElementCount::getFixed(subWidth)); + return val != nullptr; + } + + auto *const vecTy = dyn_cast(srcs.front()->getType()); + + if (!vecTy) { + return false; + } + + const unsigned srcWidth = vecTy->getNumElements(); + + // Build shuffle mask to widen the vector condition. + SmallVector mask; + for (unsigned i = 0; i < srcWidth; ++i) { + for (unsigned j = 0; j < subWidth; ++j) { + mask.push_back(i); + } + } + + auto *poison = PoisonValue::get(srcs.front()->getType()); + for (auto &src : srcs) { + src = createOptimalShuffle(B, src, poison, mask); + } + return true; +} + +Value *createMaybeVPReduction(IRBuilderBase &B, Value *Val, RecurKind Kind, + Value *VL) { + assert(isa(Val->getType()) && "Must be vector type"); + // If VL is null, it's not a vector-predicated reduction. + if (!VL) { + return createSimpleReduction(B, Val, Kind); + } + auto IntrinsicOp = Intrinsic::not_intrinsic; + switch (Kind) { + default: + break; + case RecurKind::None: + return nullptr; + case RecurKind::Add: + IntrinsicOp = Intrinsic::vp_reduce_add; + break; + case RecurKind::Mul: + IntrinsicOp = Intrinsic::vp_reduce_mul; + break; + case RecurKind::Or: + IntrinsicOp = Intrinsic::vp_reduce_or; + break; + case RecurKind::And: + IntrinsicOp = Intrinsic::vp_reduce_and; + break; + case RecurKind::Xor: + IntrinsicOp = Intrinsic::vp_reduce_xor; + break; + case RecurKind::FAdd: + IntrinsicOp = Intrinsic::vp_reduce_fadd; + break; + case RecurKind::FMul: + IntrinsicOp = Intrinsic::vp_reduce_fmul; + break; + case RecurKind::SMin: + IntrinsicOp = Intrinsic::vp_reduce_smin; + break; + case RecurKind::SMax: + IntrinsicOp = Intrinsic::vp_reduce_smax; + break; + case RecurKind::UMin: + IntrinsicOp = Intrinsic::vp_reduce_umin; + break; + case RecurKind::UMax: + IntrinsicOp = Intrinsic::vp_reduce_umax; + break; + case RecurKind::FMin: + IntrinsicOp = Intrinsic::vp_reduce_fmin; + break; + case RecurKind::FMax: + IntrinsicOp = Intrinsic::vp_reduce_fmax; + break; + } + + auto *const F = Intrinsic::getOrInsertDeclaration( + B.GetInsertBlock()->getModule(), IntrinsicOp, Val->getType()); + assert(F && "Could not declare vector-predicated reduction intrinsic"); + + auto *const VecTy = cast(Val->getType()); + auto *const NeutralVal = + compiler::utils::getNeutralVal(Kind, VecTy->getElementType()); + auto *const Mask = createAllTrueMask(B, VecTy->getElementCount()); + return B.CreateCall(F, {NeutralVal, Val, Mask, VL}); +} + +Value *getGatherIndicesVector(IRBuilder<> &B, Value *Indices, Type *Ty, + unsigned FixedVecElts, const Twine &N) { + auto *const Steps = B.CreateStepVector(Ty); + + const auto EltCount = multi_llvm::getVectorElementCount(Ty); + auto *const ElTy = multi_llvm::getVectorElementType(Ty); + + auto *const FixedVecEltsSplat = + 
+      B.CreateVectorSplat(EltCount, ConstantInt::get(ElTy, FixedVecElts));
+  auto *const StepsMul = B.CreateMul(Steps, FixedVecEltsSplat);
+  return B.CreateAdd(StepsMul, Indices, N);
+}
+
+Value *createAllTrueMask(IRBuilderBase &B, ElementCount EC) {
+  return ConstantInt::getTrue(VectorType::get(B.getInt1Ty(), EC));
+}
+
+Value *createIndexSequence(IRBuilder<> &Builder, VectorType *VecTy,
+                           const Twine &Name) {
+  auto EC = VecTy->getElementCount();
+  if (EC.isScalable()) {
+    // FIXME: This intrinsic works on fixed-length types too: should we
+    // migrate to using it starting from LLVM 13?
+    return Builder.CreateStepVector(VecTy, Name);
+  }
+
+  SmallVector<Constant *> Indices;
+  auto *EltTy = VecTy->getElementType();
+  for (unsigned i = 0, e = EC.getFixedValue(); i != e; i++) {
+    Indices.push_back(ConstantInt::get(EltTy, i));
+  }
+  return ConstantVector::get(Indices);
+}
+
+} // namespace vecz
+
+PacketRange PacketInfo::getRange(std::vector<Value *> &d,
+                                 unsigned width) const {
+  auto found = packets.find(width);
+  if (found != packets.end()) {
+    return PacketRange(d, found->second, width);
+  } else {
+    return PacketRange(d);
+  }
+}
+
+Value *Packetizer::Result::getAsValue() const {
+  if (!scalar || !info) {
+    return nullptr;
+  }
+
+  if (info->vector) {
+    return info->vector;
+  }
+
+  const auto numInstances = info->numInstances;
+  if (numInstances == 0) {
+    return broadcast(1).info->vector;
+  }
+
+  const auto packet = getRange(numInstances);
+  assert(packet && "Packet doesn't exist when it should");
+
+  // If the instantiator broadcast the value, it will have set its own packet,
+  // so we fix that here.
+  bool splat = true;
+  for (auto *v : packet) {
+    if (v != scalar) {
+      splat = false;
+      break;
+    }
+  }
+
+  if (splat) {
+    info->numInstances = 0;
+    return broadcast(1).info->vector;
+  }
+
+  Type *const eleTy = packet.front()->getType();
+  assert(!eleTy->isVoidTy() && "Should not be getting a vector of voids");
+
+  auto name = scalar->getName();
+
+  if (FixedVectorType::isValidElementType(eleTy)) {
+    Value *gather =
+        PoisonValue::get(FixedVectorType::get(eleTy, packet.size()));
+
+    IRBuilder<> B(buildAfter(packet.back(), packetizer.F));
+    for (unsigned i = 0; i < packet.size(); i++) {
+      gather = B.CreateInsertElement(gather, packet.at(i), B.getInt32(i),
+                                     Twine(name, ".gather"));
+    }
+    info->vector = gather;
+  } else if (eleTy->isVectorTy()) {
+    // Gather an instantiated vector by concatenating all the lanes.
+    auto parts = narrow(2);
+    auto *vecTy = cast<FixedVectorType>(parts.front()->getType());
+    const unsigned fullWidth = vecTy->getNumElements() * 2;
+
+    SmallVector<int> mask;
+    for (size_t j = 0; j < fullWidth; ++j) {
+      mask.push_back(j);
+    }
+
+    IRBuilder<> B(buildAfter(parts[1], packetizer.F));
+    info->vector = B.CreateShuffleVector(parts[0], parts[1], mask,
+                                         Twine(name, ".concatenate"));
+  } else {
+    Value *gather = PoisonValue::get(ArrayType::get(eleTy, packet.size()));
+
+    IRBuilder<> B(buildAfter(packet.back(), packetizer.F));
+    for (unsigned i = 0; i < packet.size(); i++) {
+      gather =
+          B.CreateInsertValue(gather, packet.at(i), i, Twine(name, ".gather"));
+    }
+    info->vector = gather;
+  }
+  return info->vector;
+}
+
+PacketRange Packetizer::Result::getAsPacket(unsigned width) const {
+  if (!scalar || !info) {
+    return PacketRange(packetizer.packetData);
+  }
+
+  if (const auto range = getRange(width)) {
+    return range;
+  }
+
+  auto numInstances = info->numInstances;
+  if (numInstances == 0) {
+    return broadcast(width).getRange(width);
+  }
+
+  if (numInstances != 1) {
+    if (numInstances < width) {
+      return widen(width);
+    } else if (numInstances > width) {
+      return narrow(width);
+    } else {
+      assert(false &&
+             "Supposedly unreachable condition in Packetizer::Result");
+    }
+  }
+
+  if (!info->vector) {
+    return PacketRange(packetizer.packetData);
+  }
+
+  auto packet = createPacket(width);
+
+  Value *vec = info->vector;
+  if (auto *const vecTy = dyn_cast<VectorType>(vec->getType())) {
+    assert(isa<FixedVectorType>(vecTy) && "Must be a fixed vector type here!");
+    const unsigned scalarWidth =
+        cast<FixedVectorType>(vecTy)->getNumElements() / width;
+    if (scalarWidth > 1 || scalar->getType()->isVectorTy()) {
+      auto *const poison = PoisonValue::get(vec->getType());
+
+      // Build shuffle mask to perform the subvector extracts.
+      IRBuilder<> B(buildAfter(vec, packetizer.F));
+      for (size_t i = 0, k = 0; i < width; ++i) {
+        SmallVector<int> mask;
+        for (size_t j = 0; j < scalarWidth; ++j, ++k) {
+          mask.push_back(k);
+        }
+        packet[i] = createOptimalShuffle(B, vec, poison, mask,
+                                         Twine(scalar->getName(), ".split"));
+      }
+    } else {
+      IRBuilder<> B(buildAfter(vec, packetizer.F));
+      for (unsigned i = 0; i < width; i++) {
+        packet[i] = B.CreateExtractElement(vec, B.getInt32(i));
+      }
+    }
+  } else {
+    // Check the type itself here, since vecTy is null on this branch.
+    assert(isa<ArrayType>(vec->getType()) && "Must be an array here!");
+    IRBuilder<> B(buildAfter(vec, packetizer.F));
+    for (unsigned i = 0; i < width; i++) {
+      packet[i] = B.CreateExtractValue(vec, i);
+    }
+  }
+  return packet;
+}
+
+void Packetizer::Result::getPacketValues(
+    SmallVectorImpl<Value *> &vals) const {
+  assert(info && "No packet info for this packetization result");
+  const auto width = info->numInstances;
+  if (width != 0) {
+    getPacketValues(width, vals);
+  }
+}
+
+void Packetizer::Result::getPacketValues(
+    unsigned width, SmallVectorImpl<Value *> &vals) const {
+  assert(width != 0 && "Can't get a zero width packet");
+  if (width == 1) {
+    if (auto *const val = getAsValue()) {
+      vals.push_back(val);
+    }
+  } else {
+    auto p = getAsPacket(width);
+    vals.assign(p.begin(), p.end());
+  }
+}
+
+PacketRange Packetizer::Result::createPacket(unsigned width) const {
+  assert(info && "Can't create a packet on a fail state");
+  assert(!info->packets.contains(width) &&
+         "Shouldn't create the same packet twice");
+
+  const auto start = packetizer.packetData.size();
+  packetizer.packetData.resize(start + width, nullptr);
+  info->packets[width] = start;
+  return PacketRange(packetizer.packetData, start, width);
+}
+
+PacketRange Packetizer::Result::getRange(unsigned width) const {
+  return info->getRange(packetizer.packetData, width);
+}
+
+// Makes a wider packet by splitting the sub-vectors.
+PacketRange Packetizer::Result::widen(unsigned width) const {
+  const auto numInstances = info->numInstances;
+  const auto parts = getRange(numInstances);
+  auto *const vecTy = dyn_cast<FixedVectorType>(parts.front()->getType());
+  assert(vecTy && "Expected a fixed vector type");
+
+  auto packet = createPacket(width);
+  const auto origWidth = vecTy->getNumElements();
+  const auto newWidth = (origWidth * numInstances) / width;
+  const auto name = scalar->getName();
+
+  auto *it = parts.begin();
+  IRBuilder<> B(buildAfter(parts.back(), packetizer.F));
+  if (newWidth > 1) {
+    auto *const poison = PoisonValue::get(vecTy);
+
+    // Build shuffle mask to perform the subvector extracts.
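+    // Worked example (illustrative): widening 2 instances of <4 x i32> to
+    // width 4 gives newWidth == 2, so the masks <0,1> and <2,3> split each
+    // original part into two sub-vectors.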
+    for (size_t i = 0, origIdx = 0; i < width; ++i) {
+      if (origIdx == origWidth) {
+        origIdx = 0;
+        ++it;
+      }
+      SmallVector<int> mask;
+      for (size_t j = 0; j < newWidth; ++j, ++origIdx) {
+        mask.push_back(origIdx);
+      }
+      packet[i] =
+          createOptimalShuffle(B, *it, poison, mask, Twine(name, ".split"));
+    }
+  } else {
+    for (size_t i = 0, origIdx = 0; i < width; ++i, ++origIdx) {
+      if (origIdx == origWidth) {
+        origIdx = 0;
+        ++it;
+      }
+      packet[i] = B.CreateExtractElement(*it, B.getInt32(origIdx),
+                                         Twine(name, ".split"));
+    }
+  }
+  return packet;
+}
+
+// Makes a narrower packet by concatenating the sub-vectors.
+PacketRange Packetizer::Result::narrow(unsigned width) const {
+  if (const auto range = getRange(width)) {
+    return range;
+  }
+
+  // Narrow recursively.
+  const auto parts = narrow(width * 2);
+  assert(parts && "Error during packet narrowing");
+
+  auto packet = createPacket(width);
+  auto *const ty = parts.front()->getType();
+  auto *const vecTy = dyn_cast<FixedVectorType>(ty);
+  if (!vecTy) {
+    // Build vectors out of pairs of scalar values.
+    const auto name = scalar->getName();
+    IRBuilder<> B(buildAfter(parts.back(), packetizer.F));
+    Value *poison = PoisonValue::get(FixedVectorType::get(ty, 2));
+    for (size_t i = 0, pairIdx = 0; i < width; ++i, pairIdx += 2) {
+      Value *in = B.CreateInsertElement(poison, parts[pairIdx], B.getInt32(0),
+                                        Twine(name, ".gather"));
+      packet[i] = B.CreateInsertElement(in, parts[pairIdx + 1], B.getInt32(1),
+                                        Twine(name, ".gather"));
+    }
+    return packet;
+  }
+
+  const unsigned fullWidth = vecTy->getNumElements() * 2;
+
+  SmallVector<int> mask;
+  for (size_t j = 0; j < fullWidth; ++j) {
+    mask.push_back(j);
+  }
+
+  // Build wider vectors by concatenating pairs of sub-vectors.
+  const auto name = scalar->getName();
+  IRBuilder<> B(buildAfter(parts.back(), packetizer.F));
+  for (size_t i = 0, pairIdx = 0; i < width; ++i, pairIdx += 2) {
+    packet[i] = createOptimalShuffle(B, parts[pairIdx], parts[pairIdx + 1],
+                                     mask, Twine(name, ".concatenate"));
+  }
+  return packet;
+}
+
+namespace {
+// This helper creates the following sequence to broadcast a fixed-length
+// vector to a scalable one, or to broadcast a scalable vector by a fixed
+// amount, barring any optimizations we can perform for broadcasting a splat
+// vector.
+// The general idea is first to store the subvector to a stack 'alloca', then
+// use a gather operation with a vector of pointers created using a step vector
+// modulo the fixed amount.
+// Note that other sequences are possible, such as a series of blend
+// operations. This could perhaps be a target choice.
+Value *scalableBroadcastHelper(Value *subvec, ElementCount factor,
+                               const vecz::TargetInfo &TI, IRBuilder<> &B,
+                               bool URem) {
+  auto *ty = subvec->getType();
+  const auto subVecEltCount = multi_llvm::getVectorElementCount(ty);
+  assert(subVecEltCount.isScalable() ^ factor.isScalable() &&
+         "Must either broadcast fixed vector by scalable factor or scalable "
+         "vector by fixed factor");
+  auto *const wideTy = getWideType(ty, factor);
+  auto wideEltCount = multi_llvm::getVectorElementCount(wideTy);
+
+  // If this vector is a constant splat, just splat it to the wider scalable
+  // type.
+  if (auto *const cvec = dyn_cast<Constant>(subvec)) {
+    if (auto *const splat = cvec->getSplatValue()) {
+      return ConstantVector::getSplat(wideEltCount, splat);
+    }
+  }
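+  // Illustrative sketch of the general (non-splat) gather emitted below,
+  // e.g. broadcasting a fixed <2 x i32> subvector <a,b> by a scalable
+  // factor: the wide result is loaded through a vector of pointers formed
+  // from a step vector, using either the remainder (URem) or the quotient
+  // of the step by the fixed amount, yielding <a,b,a,b,...> or
+  // <a,...,a,b,...,b> respectively (the exact layout is the target's
+  // broadcast flavour).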
+  // Or if it's a splat value, re-splat it. Note we do Constants separately
+  // above as it generates more canonical code, e.g., a splat of 0 becomes
+  // zeroinitializer rather than an insertelement/shufflevector sequence.
+  if (const auto *const splat = getSplatValue(subvec)) {
+    return B.CreateVectorSplat(wideEltCount, const_cast<Value *>(splat));
+  }
+
+  // Compiler support for masked.gather on i1 vectors is lacking, so emit this
+  // operation as the equivalent i8 vector instead.
+  const bool upcast_i1_as_i8 = ty->getScalarType()->isIntegerTy(1);
+  if (upcast_i1_as_i8) {
+    auto *const int8Ty = Type::getInt8Ty(B.getContext());
+    ty = llvm::VectorType::get(int8Ty, subVecEltCount);
+    subvec = B.CreateSExt(subvec, ty);
+  }
+
+  Value *gather =
+      URem ? TI.createOuterScalableBroadcast(B, subvec, /*VL*/ nullptr, factor)
+           : TI.createInnerScalableBroadcast(B, subvec, /*VL*/ nullptr, factor);
+
+  // If we've been performing this broadcast as i8, now's the time to truncate
+  // back down to i1.
+  if (upcast_i1_as_i8) {
+    gather = B.CreateTrunc(gather, wideTy);
+  }
+
+  return gather;
+}
+} // namespace
+
+const Packetizer::Result &Packetizer::Result::broadcast(unsigned width) const {
+  const auto factor = packetizer.width().divideCoefficientBy(width);
+  auto *const ty = scalar->getType();
+  assert(!ty->isVoidTy() && "Should not be broadcasting a void type");
+
+  if (width != 1 && !factor.isScalable() && factor.getFixedValue() == 1) {
+    // Pure instantiation broadcast.
+    for (auto &v : createPacket(width)) {
+      v = scalar;
+    }
+    return *this;
+  }
+
+  auto &F = packetizer.F;
+  Value *result = nullptr;
+  const auto &TI = packetizer.context().targetInfo();
+  if (isa<PoisonValue>(scalar)) {
+    result = PoisonValue::get(getWideType(ty, factor));
+  } else if (isa<UndefValue>(scalar)) {
+    result = PoisonValue::get(getWideType(ty, factor));
+  } else if (ty->isVectorTy() && factor.isScalable()) {
+    IRBuilder<> B(buildAfter(scalar, F));
+    result = createScalableBroadcastOfFixedVector(TI, B, scalar, factor);
+  } else if (ty->isVectorTy()) {
+    auto *const vecTy = cast<FixedVectorType>(ty);
+    const unsigned scalarWidth = vecTy->getNumElements();
+
+    const unsigned simdWidth = factor.getFixedValue();
+
+    // Build shuffle mask to perform the splat.
+    SmallVector<int> mask;
+    for (size_t i = 0; i < simdWidth; ++i) {
+      for (size_t j = 0; j < scalarWidth; ++j) {
+        mask.push_back(j);
+      }
+    }
+
+    IRBuilder<> B(buildAfter(scalar, packetizer.F));
+    result = createOptimalShuffle(B, scalar, PoisonValue::get(ty), mask,
+                                  Twine(scalar->getName(), ".broadcast"));
+  } else if (auto *const C = dyn_cast<Constant>(scalar)) {
+    result = ConstantVector::getSplat(factor, C);
+  } else {
+    IRBuilder<> B(buildAfter(scalar, packetizer.F));
+    result = B.CreateVectorSplat(factor, scalar);
+  }
+
+  if (!result) {
+    // Failed to broadcast this value; return the empty result.
+    return *this;
+  }
+
+  if (width == 1) {
+    info->vector = result;
+  } else {
+    for (auto &v : createPacket(width)) {
+      v = result;
+    }
+  }
+  return *this;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
new file mode 100644
index 0000000000000..e45e2d91bf9d5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetization_pass.cpp
@@ -0,0 +1,80 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/packetization_pass.h"
+
+#include <llvm/ADT/Statistic.h>
+#include <llvm/Support/Debug.h>
+#include <llvm/Support/raw_ostream.h>
+
+#include "analysis/control_flow_analysis.h"
+#include "analysis/divergence_analysis.h"
+#include "analysis/simd_width_analysis.h"
+#include "analysis/stride_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "transform/packetizer.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_target_info.h"
+
+#define DEBUG_TYPE "vecz-packetization"
+
+using namespace vecz;
+using namespace llvm;
+
+STATISTIC(VeczPacketizeFail,
+          "Number of kernels that failed to packetize [ID#P80]");
+STATISTIC(VeczSimdAnalysisFail, "Number of kernels that SIMD Width Analysis "
+                                "suggested not to packetize [ID#P81]");
+
+char PacketizationPass::PassID = 0;
+
+PreservedAnalyses PacketizationPass::run(Function &F,
+                                         llvm::FunctionAnalysisManager &AM) {
+  VectorizationUnit &VU = AM.getResult<VectorizationUnitAnalysis>(F).getVU();
+
+  if (!VU.width().isScalable()) {
+    const unsigned SimdWidth = VU.width().getFixedValue();
+    if (VU.autoWidth() && VU.context().targetInfo().getTargetMachine()) {
+      LLVM_DEBUG(dbgs() << "vecz: Original SIMD width: " << SimdWidth << "\n");
+      const unsigned NewSimdWidth = AM.getResult<SimdWidthAnalysis>(F).value;
+      LLVM_DEBUG(dbgs() << "vecz: Re-determined SIMD width: " << NewSimdWidth
+                        << "\n");
+
+      if (NewSimdWidth <= 1u) {
+        ++VeczSimdAnalysisFail;
+        return VU.setFailed("SIMD Width Analysis suggested not to packetize");
+      }
+
+      if (NewSimdWidth < SimdWidth) {
+        VU.setWidth(ElementCount::getFixed(NewSimdWidth));
+      }
+    }
+  }
+
+  if (!Packetizer::packetize(F, AM, VU.width(), VU.dimension())) {
+    ++VeczPacketizeFail;
+    return VU.setFailed("packetization failed");
+  }
+
+  PreservedAnalyses Preserved;
+  Preserved.preserve<CFGAnalysis>();
+  Preserved.preserve<DivergenceAnalysis>();
+  Preserved.preserve<VectorizationUnitAnalysis>();
+  Preserved.preserve<VectorizationContextAnalysis>();
+  return Preserved;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
new file mode 100644
index 0000000000000..5e0a1fbc7e12e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/packetizer.cpp
@@ -0,0 +1,4050 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/packetizer.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "analysis/instantiation_analysis.h" +#include "analysis/packetization_analysis.h" +#include "analysis/stride_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "llvm_helpers.h" +#include "memory_operations.h" +#include "transform/instantiation_pass.h" +#include "transform/packetization_helpers.h" +#include "vectorization_context.h" +#include "vectorization_unit.h" +#include "vecz/vecz_choices.h" +#include "vecz/vecz_target_info.h" + +#define DEBUG_TYPE "vecz-packetization" + +using namespace vecz; +using namespace llvm; + +STATISTIC(VeczPacketized, "Number of instructions packetized [ID#P00]"); +STATISTIC(VeczPacketizeFailCall, + "Packetize: missing function declarations [ID#P81]"); +STATISTIC(VeczPacketizeFailType, + "Packetize: inconsistent vector parameters [ID#P87]"); +STATISTIC(VeczPacketizeFailPtr, + "Packetize: inconsistent pointer parameters [ID#P88]"); +STATISTIC(VeczPacketizeFailStride, + "Packetize: non-constant strides in pointer parameters [ID#P8A]"); + +// Just a little macro that can return an empty SmallVector, as a drop-in +// replacement for VECZ_FAIL_IF.. +#define PACK_FAIL_IF(cond) \ + do { \ + if (cond) { \ + return {}; \ + } \ + } while (false) + +namespace { +// Returns a type equivalent to the input type plus padding. +// This converts a <3 x Ty> into a <4 x Ty>, leaving other types unchanged. +Type *getPaddedType(Type *Ty) { + if (auto *VecTy = dyn_cast(Ty)) { + if (VecTy->getNumElements() == 3) { + return VectorType::get(VecTy->getElementType(), + ElementCount::getFixed(4)); + } + } + return Ty; +} +} // namespace + +using ValuePacket = SmallVector; + +/// @brief Private implementation of the Packetizer. +/// It inherits its own outer class, which has only private constructors. This +/// allows us to pass it by reference to functions that need to access the +/// Packetizer, while also ensuring that a Packetizer cannot be created except +/// as the base class of its own implementation. +class Packetizer::Impl : public Packetizer { +public: + Impl(llvm::Function &F, llvm::FunctionAnalysisManager &AM, ElementCount Width, + unsigned Dim); + Impl() = delete; + Impl(const Packetizer &) = delete; + Impl(Packetizer &&) = delete; + ~Impl(); + + bool packetize(); + + /// @brief Handle packetization failure. This method ensures that + /// packetization failure does not leave behind invalid IR. + void onFailure(); + + /// @brief Packetize the given value from the function. + /// + /// @param[in] V Value to packetize. + /// + /// @return Packetized value. + Result packetize(Value *V); + + /// @brief Packetize the given value and return the packet by values + /// + /// @param[in] V Value to packetize. + /// + /// @return Packetized values. + ValuePacket packetizeAndGet(Value *V); + + /// @brief Packetize the given value to a specified packet width, and return + /// the packet by values + /// + /// @param[in] V Value to packetize. + /// @param[in] Width the requested packet width + /// + /// @return Packetized values. 
+ ValuePacket packetizeAndGet(Value *V, unsigned Width); + + /// @brief Helper to produce a Result from a Packet + Packetizer::Result + getPacketizationResult(Instruction *I, const SmallVectorImpl &Packet, + bool UpdateStats = false); + + /// @brief Packetize the given value from the function, only if it is a + /// varying value. Ensures Mask Varying values are handled correctly. + /// + /// @param[in] V Value to packetize. + /// + /// @return Packetized value if varying, or the original value if Uniform. + Value *packetizeIfVarying(Value *V); + + /// @brief Packetize a uniform value by broadcasting to all vector lanes. + /// + /// @param[in] V Value to broadcast + /// + /// @return Packetized instruction + Result broadcast(Value *V); + /// @brief Reduce a varying boolean condition to a scalar + /// + /// @param[in] cond Condition to packetize. + /// @param[in] terminator Terminator instruction. + /// @param[in] allOf Whether to create a all of mask, or any of. + /// + /// @return reduced boolean value. + Value *reduceBranchCond(Value *cond, Instruction *terminator, bool allOf); + /// @brief Compute the ideal packet width for subwidening the given type + /// + /// @param[in] ty Type of the value to subwiden + /// @param[in] limit The maximum vector width we allow + /// + /// @return width of the packet to create + unsigned getPacketWidthForType(Type *ty, unsigned limit = ~0u) const; + /// @brief Packetize an instruction. + /// + /// @param[in] Ins Instruction to packetize. + /// + /// @return Packetized instructions. + Result packetizeInstruction(Instruction *Ins); + /// @brief Packetize a mask-varying instruction. + /// + /// @param[in] I Instruction to packetize. + /// + /// @return Packetized instruction. + Value *packetizeMaskVarying(Instruction *I); + /// @brief Packetize a mask-varying subgroup/workgroup reduction. + /// + /// @param[in] I Instruction to packetize. + /// + /// @return Packetized instruction. + Value *packetizeGroupReduction(Instruction *I); + /// @brief Packetize a subgroup/workgroup broadcast. + /// + /// @param[in] I Instruction to packetize. + /// + /// @return Packetized instruction. + Value *packetizeGroupBroadcast(Instruction *I); + /// @brief Returns true if the instruction is any subgroup shuffle. + /// + /// @param[in] I Instruction to query. + /// + /// @return The group collective data if the instruction is a call to any of + /// the mux subgroup shuffle builtins; std::nullopt otherwise. + std::optional + isSubgroupShuffleLike(Instruction *I); + /// @brief Packetize a sub-group shuffle builtin + /// + /// Note - not any shuffle-like operation, but specifically the 'shuffle' + /// builtin. + /// + /// @param[in] Ins Instruction to packetize. + /// + /// @return Packetized instructions. + Value *packetizeSubgroupShuffle(Instruction *Ins); + /// @brief Packetize a sub-group shuffle-xor builtin + /// + /// Note - not any shuffle-like operation, but specifically the 'shuffle_xor' + /// builtin. + /// + /// @param[in] Ins Instruction to packetize. + /// @param[in] ShuffleXor Shuffle to packetize. + /// + /// @return Packetized instructions. + Result + packetizeSubgroupShuffleXor(Instruction *Ins, + compiler::utils::GroupCollective ShuffleXor); + /// @brief Packetize a sub-group shuffle-up or shuffle-down builtin + /// + /// Note - not any shuffle-like operation, but specifically the 'shuffle_up' + /// and 'shuffle_down' builtins. + /// + /// @param[in] Ins Instruction to packetize. + /// @param[in] ShuffleUpDown Shuffle to packetize. 
+ /// + /// @return Packetized instructions. + Result packetizeSubgroupShuffleUpDown( + Instruction *Ins, compiler::utils::GroupCollective ShuffleUpDown); + + /// @brief Packetize PHI node. + /// + /// @param[in] Phi PHI Node to packetize. + /// + /// @return Packetized values. + ValuePacket packetizePHI(PHINode *Phi); + /// @brief Packetize a call instruction. + /// + /// @param[in] CI Call Instruction to packetize. + /// + /// @return Packetized values. + ValuePacket packetizeCall(CallInst *CI); + /// @brief Packetize a subgroup/workgroup scan. + /// + /// @param[in] CI CallInst to packetize. + /// @param[in] Scan type of scan to packetized. + /// + /// @return Packetized values. + ValuePacket packetizeGroupScan(CallInst *CI, + compiler::utils::GroupCollective Scan); + /// @brief Perform post-packetization tasks for the given scalar value. + /// + /// @param[in] Scalar Scalar value to assign a vectorized value. + /// @param[in] Vectorized Packetized value to assign. + /// + /// @return Packetized values. + Result assign(Value *Scalar, Value *Vectorized); + /// @brief Vectorize an instruction. + /// + /// @param[in] Ins Instruction to packetize. + /// + /// @return Packetized instruction. + Value *vectorizeInstruction(Instruction *Ins); + /// @brief Packetize a load instruction. + /// + /// @param[in] Load Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeLoad(LoadInst *Load); + /// @brief Packetize a store instruction. + /// + /// @param[in] Store Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeStore(StoreInst *Store); + /// @brief Packetize a memory operation. + /// + /// @param[in] Op Memory operation to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeMemOp(MemOp &Op); + /// @brief Packetize a masked atomicrmw or cmpxchg operation. + /// + /// @param[in] CI Masked atomic builtin call to packetize. + /// @param[in] AtomicInfo Information about the masked atomic. + /// + /// @return Packetized instruction. + ValuePacket + packetizeMaskedAtomic(CallInst &CI, + VectorizationContext::MaskedAtomic AtomicInfo); + /// @brief Packetize a GEP instruction. + /// + /// @param[in] GEP Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeGEP(GetElementPtrInst *GEP); + /// @brief Packetize a cast instruction. + /// + /// @param[in] CastI Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeCast(CastInst *CastI); + /// @brief Packetize a binary operator instruction. + /// + /// @param[in] BinOp Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeBinaryOp(BinaryOperator *BinOp); + /// @brief Packetize a freeze instruction. + /// + /// @param[in] FreezeI Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeFreeze(FreezeInst *FreezeI); + /// @brief Packetize an atomic cmpxchg instruction. + /// + /// @param[in] AtomicI Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeAtomicCmpXchg(AtomicCmpXchgInst *AtomicI); + /// @brief Packetize a unary operator instruction. + /// + /// @param[in] UnOp Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeUnaryOp(UnaryOperator *UnOp); + /// @brief Packetize an integer compare instruction. + /// + /// @param[in] Cmp Instruction to packetize. + /// + /// @return Packetized instruction. 
+ ValuePacket packetizeICmp(ICmpInst *Cmp); + /// @brief Packetize a floating-point compare instruction. + /// + /// @param[in] Cmp Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeFCmp(FCmpInst *Cmp); + /// @brief Packetize a select instruction. + /// + /// @param[in] Select Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeSelect(SelectInst *Select); + /// @brief Packetize a return instruction. + /// + /// @param[in] Return Instruction to packetize. + /// + /// @return Packetized instruction. + Value *vectorizeReturn(ReturnInst *Return); + /// @brief Packetize a call instruction. + /// + /// @param[in] CI Instruction to packetize. + /// + /// @return Packetized instruction. + Value *vectorizeCall(CallInst *CI); + /// @brief Packetize a call to a work-group builtin. + /// + /// @param[in] CI Instruction to packetize. + /// @param[in] Builtin Builtin identifier. + /// + /// @return Packetized instruction. + Value *vectorizeWorkGroupCall(CallInst *CI, + const compiler::utils::BuiltinCall &Builtin); + /// @brief Packetize an alloca instruction. + /// + /// @param[in] Alloca Instruction to packetize. + /// + /// @return Packetized instruction. + Value *vectorizeAlloca(AllocaInst *Alloca); + /// @brief Packetize an extract value instruction. + /// + /// @param[in] ExtractElement Instruction to packetize. + /// + /// @return Packetized instruction. + Value *vectorizeExtractValue(ExtractValueInst *ExtractElement); + /// @brief Packetize an insert element instruction. + /// + /// @param[in] InsertElement Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeInsertElement(InsertElementInst *InsertElement); + /// @brief Packetize an extract element instruction. + /// + /// @param[in] ExtractElement Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeExtractElement(ExtractElementInst *ExtractElement); + /// @brief Packetize an insert value instruction. + /// + /// Only packetizes inserts into literal struct types. + /// + /// @param[in] InsertValue Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeInsertValue(InsertValueInst *InsertValue); + /// @brief Packetize an extract value instruction. + /// + /// Only packetizes extracts from literal struct types. + /// + /// @param[in] ExtractValue Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeExtractValue(ExtractValueInst *ExtractValue); + /// @brief Packetize a shuffle vector instruction. + /// + /// @param[in] Shuffle Instruction to packetize. + /// + /// @return Packetized instruction. + ValuePacket packetizeShuffleVector(ShuffleVectorInst *Shuffle); + /// @brief Preserves debug information attached to old scalar instruction, + /// updating the debug info type to match the vector width. + /// + /// @param[in] Scalar Scalar instruction before packetization. + /// @param[in] Packet Packetized instruction. + void vectorizeDI(Instruction *Scalar, Value *Packet); + + /// @brief Helps handle instructions that cannot be packetized. + std::unique_ptr Instantiator; + + /// @brief List of phi nodes that can be used by passes to defer the + /// processing of these nodes. 
+  std::vector<PHINode *> pendingPhis;
+
+  /// @brief The target transform info
+  const TargetTransformInfo TTI;
+};
+
+Packetizer::Packetizer(llvm::Function &F, llvm::FunctionAnalysisManager &AM,
+                       ElementCount Width, unsigned Dim)
+    : AM(AM), VU(AM.getResult<VectorizationUnitAnalysis>(F).getVU()),
+      Ctx(AM.getResult<VectorizationContextAnalysis>(F).getContext()),
+      Choices(VU.choices()), UVR(AM.getResult<UniformValueAnalysis>(F)),
+      SAR(AM.getResult<StrideAnalysis>(F)),
+      PAR(AM.getResult<PacketizationAnalysis>(F)), F(F), SimdWidth(Width),
+      Dimension(Dim) {}
+
+Packetizer::Impl::Impl(llvm::Function &F, llvm::FunctionAnalysisManager &AM,
+                       ElementCount Width, unsigned Dim)
+    : Packetizer(F, AM, Width, Dim), TTI(Ctx.getTargetTransformInfo(F)) {
+  Instantiator.reset(new InstantiationPass(*this));
+}
+
+Packetizer::Impl::~Impl() = default;
+
+bool Packetizer::packetize(llvm::Function &F, llvm::FunctionAnalysisManager &AM,
+                           ElementCount Width, unsigned Dim) {
+  Impl impl(F, AM, Width, Dim);
+  const bool Res = impl.packetize();
+  if (!Res) {
+    impl.onFailure();
+  }
+  return Res;
+}
+
+bool Packetizer::Impl::packetize() {
+  LLVM_DEBUG(if (PAR.isEmpty()) {
+    llvm::dbgs() << "No vector leaves in function "
+                 << VU.scalarFunction()->getName() << "\n";
+  });
+
+  // If requested, set up the base vector length for this kernel based on the
+  // number of remaining work items: the local size minus the local id. Since
+  // VP intrinsics are undefined for %evl values larger than the actual vector
+  // width, we also constrain it based on the vectorization width.
+  BasicBlock &EntryBB = F.getEntryBlock();
+  IRBuilder<> B(&*EntryBB.getFirstInsertionPt());
+
+  if (Choices.vectorPredication()) {
+    auto &M = *F.getParent();
+    auto *const I32Ty = Type::getInt32Ty(F.getContext());
+    auto *const LocalIdFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+        compiler::utils::eMuxBuiltinGetLocalId, M);
+    auto *const LocalSizeFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+        compiler::utils::eMuxBuiltinGetLocalSize, M);
+    assert(LocalIdFn && LocalSizeFn && "Unable to create mux builtins");
+    auto *const ID =
+        B.CreateCall(LocalIdFn, B.getInt32(VU.dimension()), "local.id");
+    ID->setAttributes(LocalIdFn->getAttributes());
+    ID->setCallingConv(LocalIdFn->getCallingConv());
+    auto *const Size =
+        B.CreateCall(LocalSizeFn, B.getInt32(VU.dimension()), "local.size");
+    Size->setAttributes(LocalSizeFn->getAttributes());
+    Size->setCallingConv(LocalSizeFn->getCallingConv());
+    VECZ_FAIL_IF(!ID || !Size);
+
+    VL = B.CreateSub(Size, ID, "work.remaining", /*HasNUW*/ true,
+                     /*HasNSW*/ true);
+
+    if (auto *RVVVL = Ctx.targetInfo().createVPKernelWidth(
+            B, VL, /*WidestType*/ 32, VU.width())) {
+      VL = RVVVL;
+    } else {
+      auto *const VectorLength =
+          B.CreateElementCount(VL->getType(), VU.width());
+      VL = B.CreateIntrinsic(Intrinsic::umin, {VL->getType()},
+                             {VL, VectorLength});
+
+      VL = B.CreateTrunc(VL, I32Ty);
+    }
+  }
+
+  // Manifest the memory operation stride values as actual `llvm::Value`s.
+  SAR.manifestAll(B);
+
+  // Pre-process the arguments first to replace any placeholders with their
+  // proper vector values, and convert pointer return arguments to vector of
+  // pointers where required.
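+  // For illustration (hypothetical IR): a pointer-return argument %arg with
+  // pointee type i32 at a fixed width of 4 is rewritten through
+  //   %index.vec = <i32 0, i32 1, i32 2, i32 3>
+  //   %gep = getelementptr i32, ptr %arg, <4 x i32> %index.vec
+  // so that each lane sees its own return slot.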
+ { + Value *idxVector = nullptr; + for (const auto &TargetArg : VU.arguments()) { + if (auto *const Placeholder = TargetArg.Placeholder) { + auto &info = packets[Placeholder]; + info.vector = TargetArg.NewArg; + info.numInstances = 1; + } else if (TargetArg.PointerRetPointeeTy && + PAR.needsPacketization(TargetArg.NewArg)) { + if (!idxVector) { + idxVector = createIndexSequence( + B, VectorType::get(B.getInt32Ty(), SimdWidth), "index.vec"); + } + + // This implementation looks unlikely to be correct, but for + // now we just maintain the original behaviour, until we have a better + // idea of what is going on or whether any of this is still needed. + // This case will never be encountered during kernel vectorization. + auto *const Arg = TargetArg.NewArg; + auto *const EleTy = TargetArg.PointerRetPointeeTy; + auto &info = packets[Arg]; + info.vector = B.CreateGEP(EleTy, Arg, idxVector); + info.numInstances = 1; + } + } + } + + // Build an ordered list of the instructions to packetize, in depth first + // order so that we don't have to recurse too much. We build the list first + // because packetization of calls can produce loops, which messes up our + // iteration over the basic blocks of the function. + std::vector ordered; + for (auto *BB : depth_first(&F)) { + for (auto &I : *BB) { + if (PAR.needsPacketization(&I)) { + ordered.push_back(&I); + } + } + } + + for (auto *const I : ordered) { + if (!packetize(I)) { + emitVeczRemarkMissed(&F, I, "Could not packetize"); + VECZ_FAIL(); + } + } + + // Packetize remaining phi nodes until they have all been packetized. + // Packetizing one phi node may involve the packetization of another node. + // Some nodes might need to be instantiated instead of being packetized, but + // we are handling this here because the instantiation pass is not run as a + // standalone pass. + // Note: pendingPhis *may* change as we progress through this loop, by + // calling packetize(Incoming). Therefore we can't cache the vector size when + // setting up the loop. + for (unsigned i = 0; i < pendingPhis.size(); i++) { + PHINode *Phi = pendingPhis[i]; + auto &info = packets[Phi]; + assert(info.numInstances > 0 && "A PHI pending packetization has no stub"); + if (info.numInstances == 1) { + auto *NewPhi = cast(info.vector); + for (unsigned i = 0; i < Phi->getNumIncomingValues(); ++i) { + Value *Incoming = Phi->getIncomingValue(i); + BasicBlock *BB = Phi->getIncomingBlock(i); + Value *VecIncoming = packetize(Incoming).getAsValue(); + VECZ_FAIL_IF(!VecIncoming); + NewPhi->addIncoming(VecIncoming, BB); + } + } else { + const auto PhiPacket = info.getRange(packetData); + for (unsigned i = 0; i < Phi->getNumIncomingValues(); ++i) { + Value *Incoming = Phi->getIncomingValue(i); + BasicBlock *BB = Phi->getIncomingBlock(i); + auto PackIncoming = packetize(Incoming).getAsPacket(PhiPacket.size()); + for (unsigned j = 0; j < PhiPacket.size(); ++j) { + auto *NewPhi = cast(PhiPacket.at(j)); + auto *Incoming = PackIncoming.at(j); + VECZ_FAIL_IF(!NewPhi); + VECZ_FAIL_IF(!Incoming); + NewPhi->addIncoming(Incoming, BB); + } + } + } + IC.deleteInstructionLater(Phi); + } + + auto *insertPt = &*EntryBB.begin(); + for (auto &I : EntryBB) { + auto *const alloca = dyn_cast(&I); + if (!alloca) { + insertPt = I.getNextNode(); + continue; + } + + while (isa(insertPt)) { + insertPt = insertPt->getNextNode(); + } + + // It's possible for some uses of the alloca to be packetized and others + // not. 
For instance, where we have a store to a constant address, since + // the execution order of work items is undefined, the data operand need + // not be packetized, and we can end up with uses of the scalar alloca + // still present in the vector function. In such a case we can replace it + // with the first element of the packetized alloca. + if (auto res = getPacketized(alloca)) { + SmallVector vals; + res.getPacketValues(vals); + if (vals.empty()) { + // It is a broadcast value, so we don't need to do anything. + continue; + } + auto *element0 = vals.front(); + + if (!isa(element0)) { + assert(isa(element0) && "vecz: expected GEP"); + auto *const GEP = cast(element0); + // If the alloca was packetized, it will be indexed by a GEP. + // We only need the original, un-indexed pointer. + alloca->replaceAllUsesWith(GEP->getPointerOperand()); + continue; + } + + if (element0->getType()->isVectorTy()) { + B.SetInsertPoint(insertPt); + element0 = B.CreateExtractElement(element0, B.getInt32(0)); + } + alloca->replaceAllUsesWith(element0); + continue; + } + + // We have to widen allocas if they are varying, regardless of the result + // of the packetization analysis, because they need enough storage for all + // lanes, even though they are only accessed through a scalar pointer. + // We do this last, otherwise it messes with the stride analysis etc. + // Only non-instantiated allocas should be left by now. + if (!UVR.isVarying(alloca)) { + continue; + } + // Array allocas need to be instantiated. + assert(!alloca->isArrayAllocation() && + "vecz: unexpected array alloca; should have been instantiated"); + + B.SetInsertPoint(alloca); + auto *const dataTy = alloca->getAllocatedType(); + if (dataTy->isVectorTy() || VectorType::isValidElementType(dataTy)) { + // We can vectorize or vector widen this type. + auto *const newAlloca = + B.CreateAlloca(getWideType(getPaddedType(dataTy), SimdWidth)); + newAlloca->setAlignment(alloca->getAlign()); + newAlloca->takeName(alloca); + + // Absorb other bitcasts (e.g. i8* for lifetime instrinsics, or bitcasts + // back to vector type for contiguous loads/stores) + bool needCast = false; + auto *const newTy = newAlloca->getType(); + for (const Use &U : alloca->uses()) { + auto *const user = dyn_cast(U.getUser()); + if (!user) { + needCast = true; + continue; + } + + auto *const dstTy = user->getType(); + if (dstTy == newTy) { + // Bitcasts totally redundant + user->replaceAllUsesWith(newAlloca); + } else { + // Bitcast into different bitcast + B.SetInsertPoint(user); + user->replaceAllUsesWith(B.CreateBitCast(newAlloca, user->getType())); + } + IC.deleteInstructionLater(cast(user)); + } + + if (needCast) { + // Insert the bitcast after all the allocas + B.SetInsertPoint(insertPt); + auto *const scalarPtr = + B.CreatePointerCast(newAlloca, alloca->getType()); + alloca->replaceAllUsesWith(scalarPtr); + } + } else { + // We couldn't vectorize the type, so create an array instead. + VECZ_FAIL_IF(SimdWidth.isScalable()); + const unsigned fixedWidth = SimdWidth.getFixedValue(); + + AllocaInst *const wideAlloca = + B.CreateAlloca(dataTy, getSizeInt(B, fixedWidth), alloca->getName()); + auto align = alloca->getAlign(); + + // Make sure the alloca has an alignment at least as wide as any of the + // packetized loads or stores using it. 
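+      // E.g. (illustrative): a scalar i32 alloca with align 4 whose users
+      // were packetized into a <4 x i32> store naturally aligned to 16 bytes
+      // must be given align 16 here, or the vector access would be
+      // under-aligned.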
+ SmallVector users; + for (const Use &U : alloca->uses()) { + users.push_back(cast(U.getUser())); + } + while (!users.empty()) { + auto *const user = users.pop_back_val(); + if (isa(user) || isa(user)) { + for (const Use &U : user->uses()) { + users.push_back(cast(U.getUser())); + } + } else if (auto memop = MemOp::get(user)) { + const auto memAlign = memop->getAlignment(); + if (memAlign > align.value()) { + align = Align(memAlign); + } + } + } + + wideAlloca->setAlignment(align); + wideAlloca->takeName(alloca); + + // It's just a direct replacement. + alloca->replaceAllUsesWith(wideAlloca); + } + + // Note that we don't assign the widened allocas a packet, because they + // are not really being packetized. The problem is, a packetized alloca + // would be expected to be a vector of pointers to scalars, not a scalar + // pointer to a vector. Only instantiation can create such a packet. + IC.deleteInstructionLater(alloca); + } + + const compiler::utils::NameMangler Mangler(&F.getContext()); + + // Handle __mux_get_sub_group_size specially (i.e., not in BuiltinInfo) since + // inlining it requires extra vectorization context, such as the vectorization + // width and choices; this inlining is too tightly coupled to the vectorizer + // context to exist in a generic sense. + for (auto &BB : F) { + for (auto &I : BB) { + CallInst *CI = dyn_cast(&I); + if (!CI) { + continue; + } + + auto *const Callee = CI->getCalledFunction(); + if (!Callee) { + continue; + } + auto B = Ctx.builtins().analyzeBuiltin(*Callee); + if (!B) { + continue; + } + if (B->ID == compiler::utils::eMuxBuiltinGetSubGroupSize) { + auto *const replacement = [this](CallInst *CI) -> Value * { + // The vectorized sub-group size is the mux sub-group reduction sum + // of all of the vectorized sub-group sizes: + // | mux 0 | mux 1 | + // | < a,b,c,d > | < e,f,g > (vl=3) | + // The total sub-group size above is 4 + 3 => 7. + // Note that this expects that the mux sub-group consists entirely of + // equivalently vectorized kernels. + Value *VecgroupSize; + IRBuilder<> B(CI); + auto *const I32Ty = B.getInt32Ty(); + if (VL) { + VecgroupSize = VL; + } else { + VecgroupSize = B.CreateElementCount(I32Ty, SimdWidth); + } + assert(VecgroupSize && "Could not determine vector group size"); + + auto *ReduceFn = Ctx.builtins().getOrDeclareMuxBuiltin( + compiler::utils::eMuxBuiltinSubgroupReduceAdd, *F.getParent(), + {I32Ty}); + assert(ReduceFn && "Could not get reduction builtin"); + + return B.CreateCall(ReduceFn, VecgroupSize, "subgroup.size"); + }(CI); + CI->replaceAllUsesWith(replacement); + IC.deleteInstructionLater(CI); + } + } + } + + IC.deleteInstructions(); + return true; +} + +void Packetizer::Impl::onFailure() { + // On failure, clean up pending Phis, which may still be invalid in that they + // have no incoming operands. For simplicity, just erase and replace all of + // them with poison: the failed vectorized function will be removed anyway. + for (auto *Phi : pendingPhis) { + auto &info = packets[Phi]; + assert(info.numInstances > 0 && "A PHI pending packetization has no stub"); + if (info.numInstances == 1) { + IRCleanup::deleteInstructionNow(cast(info.vector)); + } else { + const auto PhiPacket = info.getRange(packetData); + for (unsigned j = 0; j < PhiPacket.size(); ++j) { + IRCleanup::deleteInstructionNow(cast(PhiPacket.at(j))); + } + } + } +} + +Packetizer::Result Packetizer::packetize(Value *V) { + // This is safe because we only ever create an instance of Impl, never an + // instance of the base class. 
+ return static_cast(this)->packetize(V); +} + +Packetizer::Result Packetizer::getPacketized(Value *V) { + auto found = packets.find(V); + auto *info = found != packets.end() ? &found->second : nullptr; + return Packetizer::Result(*this, V, info); +} + +PacketRange Packetizer::createPacket(Value *V, unsigned width) { + auto &info = packets[V]; + info.numInstances = width; + return Result(*this, V, &info).createPacket(width); +} + +Packetizer::Result Packetizer::Impl::getPacketizationResult( + Instruction *I, const SmallVectorImpl &Packet, bool UpdateStats) { + if (Packet.empty()) { + return Result(*this); + } + auto PacketWidth = Packet.size(); + + // If there's only one value in the packet, we can assign the new packetized + // value to the old instruction directly. + if (PacketWidth == 1) { + Value *Vec = Packet.front(); + if (Vec != I) { + // Only delete if the vectorized value is different from the scalar. + IC.deleteInstructionLater(I); + } + vectorizeDI(I, Vec); + return assign(I, Vec); + } + + // Otherwise we have to create a 'Result' out of the packetized values. + IC.deleteInstructionLater(I); + auto &Info = packets[I]; + auto Res = Result(*this, I, &Info); + auto P = Res.createPacket(PacketWidth); + for (unsigned i = 0; i < PacketWidth; ++i) { + P[i] = Packet[i]; + } + + if (UpdateStats) { + ++VeczPacketized; + } + Info.numInstances = PacketWidth; + return Res; +} + +Value *Packetizer::Impl::reduceBranchCond(Value *cond, Instruction *terminator, + bool allOf) { + // Get the branch condition at its natural packet width + auto conds = packetizeAndGet(cond); + VECZ_FAIL_IF(conds.empty()); + + // Branches can only take a scalar mask. The new branch condition is true + // only if the original condition is true for any lane (or for all lanes if + // the condition is used in a BOSCC block indirection.) + IRBuilder<> B(terminator); + const auto name = cond->getName(); + + // Reduce the packet to a single value + auto w = conds.size(); + + if (VL && w != 1) { + emitVeczRemarkMissed(&F, cond, + "Can not vector-predicate packets larger than 1"); + return nullptr; + } + + while ((w >>= 1)) { + for (decltype(w) i = 0; i < w; ++i) { + conds[i] = + allOf ? B.CreateAnd(conds[i], conds[i + w], Twine(name, ".all_of")) + : B.CreateOr(conds[i], conds[i + w], Twine(name, ".any_of")); + } + } + + const RecurKind kind = allOf ? RecurKind::And : RecurKind::Or; + + // VP reduction intrinsics didn't make it into LLVM 13 so we have to make do + // by pre-sanitizing the input such that elements past VL get the identity + // value. + Value *&f = conds.front(); + + return createMaybeVPReduction(B, f, kind, VL); +} + +Packetizer::Result Packetizer::Impl::assign(Value *Scalar, Value *Vectorized) { + if (!Vectorized) { + emitVeczRemarkMissed(&F, Scalar, "Failed to vectorize"); + return Packetizer::Result(*this); + } else { + ++VeczPacketized; + auto &info = packets[Scalar]; + info.vector = Vectorized; + info.numInstances = 1; + return Packetizer::Result(*this, Scalar, &info); + } +} + +Value *Packetizer::Impl::packetizeIfVarying(Value *V) { + if (UVR.isVarying(V)) { + return packetize(V).getAsValue(); + } else if (UVR.isMaskVarying(V)) { + VECZ_FAIL_IF(!packetize(V)); + } + return V; +} + +Packetizer::Result Packetizer::Impl::packetize(Value *V) { + // Do not packetize the same value twice. + if (const auto res = getPacketized(V)) { + return res; + } + // Now check whether this value is actually packetizable. 
+  if (!Ctx.targetInfo().canPacketize(V, SimdWidth)) {
+    return Packetizer::Result(*this);
+  }
+
+  if (!isa<Instruction>(V)) {
+    return broadcast(V);
+  }
+
+  auto *const Ins = cast<Instruction>(V);
+
+  if (auto *const Branch = dyn_cast<BranchInst>(Ins)) {
+    if (Branch->isConditional()) {
+      // Varying reductions need to be packetized.
+      auto *newCond = packetize(Branch->getCondition()).getAsValue();
+      if (!newCond) {
+        return Packetizer::Result(*this);
+      }
+
+      // Packetization should normally have produced a reduction to scalar.
+      // However, when Packetize Uniform is on, a uniform branch won't have
+      // a divergence reduction so it will need reducing manually here.
+      if (newCond->getType()->isVectorTy()) {
+        IRBuilder<> B(Branch);
+        const RecurKind kind = RecurKind::Or;
+        newCond = createMaybeVPReduction(B, newCond, kind, VL);
+      }
+
+      Branch->setCondition(newCond);
+    }
+    return broadcast(Ins);
+  }
+
+  if (isa<SwitchInst>(Ins)) {
+    // We can't handle varying switches.
+    return Packetizer::Result(*this);
+  }
+
+  if (UVR.isMaskVarying(Ins)) {
+    if (auto *const res = packetizeMaskVarying(Ins)) {
+      return broadcast(res);
+    }
+    // Fall back on instantiation if the instruction could not be packetized.
+    Instantiator->instantiate(Ins);
+    return getPacketized(Ins);
+  }
+
+  if (auto *reduction = packetizeGroupReduction(Ins)) {
+    return broadcast(reduction);
+  }
+
+  if (auto *brdcast = packetizeGroupBroadcast(Ins)) {
+    return broadcast(brdcast);
+  }
+
+  if (auto shuffle = isSubgroupShuffleLike(Ins)) {
+    switch (shuffle->Op) {
+      default:
+        break;
+      case compiler::utils::GroupCollective::OpKind::Shuffle:
+        if (auto *s = packetizeSubgroupShuffle(Ins)) {
+          return broadcast(s);
+        }
+        break;
+      case compiler::utils::GroupCollective::OpKind::ShuffleXor:
+        if (auto s = packetizeSubgroupShuffleXor(Ins, *shuffle)) {
+          return s;
+        }
+        break;
+      case compiler::utils::GroupCollective::OpKind::ShuffleUp:
+      case compiler::utils::GroupCollective::OpKind::ShuffleDown:
+        if (auto s = packetizeSubgroupShuffleUpDown(Ins, *shuffle)) {
+          return s;
+        }
+        break;
+    }
+    // We can't packetize all sub-group shuffle-like operations, but we also
+    // can't vectorize or instantiate them - so provide a diagnostic saying as
+    // much.
+    emitVeczRemarkMissed(&F, Ins, "Could not packetize sub-group shuffle");
+    return Packetizer::Result(*this);
+  }
+
+  // Check if we should broadcast the instruction.
+  // Broadcast uniform instructions, unless we want to packetize uniform
+  // instructions as well. We can assume that isMaskVarying is false at this
+  // point.
+  bool shouldBroadcast = !UVR.isVarying(Ins) && !Choices.packetizeUniform();
+  // Or unless this instruction is in a loop and we want to packetize uniform
+  // instructions in loops.
+  if (shouldBroadcast && Choices.packetizeUniformInLoops()) {
+    const LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+    shouldBroadcast = !LI.getLoopFor(Ins->getParent());
+  }
+
+  // The packetization of a mask-varying value takes care of its own broadcast.
+  if (shouldBroadcast) {
+    // Insert broadcast instructions after the instruction to broadcast.
+    return broadcast(Ins);
+  }
+
+  if (const auto res = packetizeInstruction(Ins)) {
+    return res;
+  }
+  // Fall back on instantiation if the instruction could not be packetized,
+  // unless we're vector predicating.
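+  // (Illustrative rationale: instantiation emits one scalar clone per lane,
+  // which presumes a lane count known at compile time; with an active vector
+  // length VL the lane count is only known at runtime, so we bail out.)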
+  if (VL) {
+    return Packetizer::Result(*this);
+  }
+  Instantiator->instantiate(Ins);
+  return getPacketized(Ins);
+}
+
+ValuePacket Packetizer::Impl::packetizeAndGet(Value *v) {
+  ValuePacket results;
+  if (auto res = packetize(v)) {
+    res.getPacketValues(results);
+  }
+  return results;
+}
+
+ValuePacket Packetizer::Impl::packetizeAndGet(Value *v, unsigned w) {
+  ValuePacket results;
+  if (auto res = packetize(v)) {
+    res.getPacketValues(w, results);
+  }
+  return results;
+}
+
+Packetizer::Result Packetizer::Impl::broadcast(Value *V) {
+  return Result(*this, V, &packets[V]);
+}
+
+unsigned Packetizer::Impl::getPacketWidthForType(Type *ty,
+                                                 unsigned limit) const {
+  if (SimdWidth.isScalable()) {
+    return 1;
+  }
+
+  const unsigned simdWidth = SimdWidth.getFixedValue();
+  unsigned maxWidth = 0;
+
+  if (!Choices.targetIndependentPacketization()) {
+    maxWidth = std::min(limit, Ctx.targetInfo().getVectorWidthForType(
+                                   TTI, *ty->getScalarType()));
+
+    // We let the target return a value wider than the SIMD Width, but not
+    // narrower.
+    if (maxWidth) {
+      maxWidth = std::max(simdWidth, maxWidth);
+    }
+  }
+
+  if (maxWidth == 0) {
+    maxWidth = std::max(simdWidth, 16u);
+  }
+
+  unsigned elts = 1;
+  if (ty->isVectorTy()) {
+    auto *vecTy = cast<FixedVectorType>(ty);
+    elts = vecTy->getNumElements();
+  }
+
+  const unsigned fullWidth = elts * simdWidth;
+  if (fullWidth <= maxWidth) {
+    return 1;
+  }
+
+  // Round up to the next power of two. This should only be needed if the
+  // type was a 3-vector. Note that we don't really expect huge values here:
+  // over 16 is still currently not officially supported, over 256 would be
+  // astonishing, and over 65536 would be inconceivable, so we don't bother
+  // to >> 16.
+  unsigned width = (fullWidth / maxWidth) - 1;
+  width |= width >> 1;
+  width |= width >> 2;
+  width |= width >> 4;
+  width |= width >> 8;
+
+  // Can't have a packet wider than the simdWidth.
+  return std::min(width + 1, simdWidth);
+}
+
+Packetizer::Result Packetizer::Impl::packetizeInstruction(Instruction *Ins) {
+  ValuePacket results;
+
+  // Figure out what kind of instruction it is and try to vectorize it.
+  switch (Ins->getOpcode()) {
+    default:
+      if (Ins->isBinaryOp()) {
+        results = packetizeBinaryOp(cast<BinaryOperator>(Ins));
+      } else if (Ins->isCast()) {
+        results = packetizeCast(cast<CastInst>(Ins));
+      } else if (Ins->isUnaryOp()) {
+        results = packetizeUnaryOp(cast<UnaryOperator>(Ins));
+      }
+      break;
+
+    case Instruction::PHI:
+      results = packetizePHI(cast<PHINode>(Ins));
+      break;
+    case Instruction::GetElementPtr:
+      results = packetizeGEP(cast<GetElementPtrInst>(Ins));
+      break;
+    case Instruction::Store:
+      results = packetizeStore(cast<StoreInst>(Ins));
+      break;
+    case Instruction::Load:
+      results = packetizeLoad(cast<LoadInst>(Ins));
+      break;
+    case Instruction::Call:
+      results = packetizeCall(cast<CallInst>(Ins));
+      break;
+    case Instruction::ICmp:
+      results = packetizeICmp(cast<ICmpInst>(Ins));
+      break;
+    case Instruction::FCmp:
+      results = packetizeFCmp(cast<FCmpInst>(Ins));
+      break;
+    case Instruction::Select:
+      results = packetizeSelect(cast<SelectInst>(Ins));
+      break;
+    case Instruction::InsertElement:
+      results = packetizeInsertElement(cast<InsertElementInst>(Ins));
+      break;
+    case Instruction::ExtractElement:
+      results = packetizeExtractElement(cast<ExtractElementInst>(Ins));
+      break;
+    case Instruction::InsertValue:
+      results = packetizeInsertValue(cast<InsertValueInst>(Ins));
+      break;
+    case Instruction::ExtractValue:
+      results = packetizeExtractValue(cast<ExtractValueInst>(Ins));
+      break;
+    case Instruction::ShuffleVector:
+      results = packetizeShuffleVector(cast<ShuffleVectorInst>(Ins));
+      break;
+    case Instruction::Freeze:
+      results = packetizeFreeze(cast<FreezeInst>(Ins));
+      break;
+    case Instruction::AtomicCmpXchg:
+      results = packetizeAtomicCmpXchg(cast<AtomicCmpXchgInst>(Ins));
+      break;
+  }
+
+  if (auto res = getPacketizationResult(Ins, results, /*update stats*/ true)) {
+    return res;
+  }
+
+  if (auto *vec = vectorizeInstruction(Ins)) {
+    return assign(Ins, vec);
+  }
+
+  return Packetizer::Result(*this, Ins, nullptr);
+}
+
+Value *Packetizer::Impl::packetizeGroupReduction(Instruction *I) {
+  auto *const CI = dyn_cast<CallInst>(I);
+  if (!CI) {
+    return nullptr;
+  }
+  const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  Function *callee = CI->getCalledFunction();
+  if (!callee) {
+    return nullptr;
+  }
+
+  const auto Builtin = BI.analyzeBuiltin(*callee);
+  if (!Builtin) {
+    return nullptr;
+  }
+  const auto Info = BI.isMuxGroupCollective(Builtin->ID);
+
+  if (!Info || (!Info->isSubGroupScope() && !Info->isWorkGroupScope()) ||
+      (!Info->isAnyAll() && !Info->isReduction())) {
+    return nullptr;
+  }
+
+  const bool isWorkGroup = Info->isWorkGroupScope();
+  const unsigned argIdx = isWorkGroup ? 1 : 0;
+
+  SmallVector<Value *> opPackets;
+  IRBuilder<> B(CI);
+  auto *const argTy = CI->getArgOperand(argIdx)->getType();
+  auto packetWidth = getPacketWidthForType(argTy);
+
+  // Don't vector predicate if we have to split into multiple packets. The
+  // introduction of instructions to manage the splitting up of our VL into N
+  // chunks is likely to kill performance anyway.
+  if (VL && packetWidth != 1) {
+    emitVeczRemarkMissed(&F, CI,
+                         "Can not vector-predicate packets larger than 1");
+    return nullptr;
+  }
+
+  auto op = packetize(CI->getArgOperand(argIdx));
+
+  // Reduce the packet values in-place.
+  // TODO: can we add 'reassoc' to the floating-point reductions to absolve
+  // them of ordering?
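+  // Illustrative example (hypothetical values): with packetWidth == 4, the
+  // reduction loop below halves the packet in place,
+  //   [p0, p1, p2, p3] -> [p0 op p2, p1 op p3] -> [(p0 op p2) op (p1 op p3)]
+  // leaving a single vector for the final reduction to scalar.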
+ op.getPacketValues(packetWidth, opPackets); + + assert((!VL || packetWidth) && + "Should have bailed if dealing with more than one VP packet"); + + // According to the OpenCL Spec, we are allowed to rearrange the operation + // order of a workgroup/subgroup reduction any way we like (even though + // floating point addition is not associative so might not produce exactly + // the same result), so we reduce to a single vector first, if necessary, and + // then do a single reduction to scalar. This is more efficient than doing + // multiple reductions to scalar and then BinOp'ing multiple scalars + // together. + // + // Reduce to a single vector. + while ((packetWidth >>= 1)) { + for (decltype(packetWidth) i = 0; i < packetWidth; ++i) { + Value *const lhs = opPackets[i]; + Value *const rhs = opPackets[i + packetWidth]; + opPackets[i] = compiler::utils::createBinOpForRecurKind(B, lhs, rhs, + Info->Recurrence); + } + } + + // Reduce to a scalar. + Value *v = createMaybeVPReduction(B, opPackets.front(), Info->Recurrence, VL); + + // We leave the original reduction function and divert the vectorized + // reduction through it, giving us a reduction over the full apparent + // sub-group or work-group size (vecz * mux). + CI->setOperand(argIdx, v); + + return CI; +} + +Value *Packetizer::Impl::packetizeGroupBroadcast(Instruction *I) { + auto *const CI = dyn_cast(I); + if (!CI) { + return nullptr; + } + const compiler::utils::BuiltinInfo &BI = Ctx.builtins(); + Function *callee = CI->getCalledFunction(); + if (!callee) { + return nullptr; + } + const auto Builtin = BI.analyzeBuiltin(*callee); + if (!Builtin) { + return nullptr; + } + + bool isWorkGroup = false; + if (auto Info = BI.isMuxGroupCollective(Builtin->ID)) { + if (!Info->isBroadcast() || + (!Info->isSubGroupScope() && !Info->isWorkGroupScope())) { + return nullptr; + } + isWorkGroup = Info->isWorkGroupScope(); + } else { + return nullptr; + } + + IRBuilder<> B(CI); + + const unsigned argIdx = isWorkGroup ? 1 : 0; + auto *const src = CI->getArgOperand(argIdx); + + auto op = packetize(src); + PACK_FAIL_IF(!op); + + // If the source operand happened to be a broadcast value already, we can use + // it directly. + if (op.info->numInstances == 0) { + IC.deleteInstructionLater(CI); + CI->replaceAllUsesWith(src); + return src; + } + + auto *idx = CI->getArgOperand(argIdx + 1); + // We need to sanitize the input index so that it stays within the range of + // one vectorized group. + Value *idxFactor = B.CreateElementCount(idx->getType(), SimdWidth); + auto *const vecIdx = B.CreateURem(idx, idxFactor); + + Value *val = nullptr; + // Optimize the constant fixed-vector case, where we can choose the exact + // subpacket to extract from directly. + if (isa(vecIdx) && !SimdWidth.isScalable()) { + ValuePacket opPackets; + op.getPacketValues(opPackets); + auto factor = SimdWidth.divideCoefficientBy(opPackets.size()); + const unsigned subvecSize = factor.getFixedValue(); + assert(subvecSize > 0 && "Subvector size cannot be zero"); + const unsigned idxVal = cast(vecIdx)->getZExtValue(); + // If individual elements are scalar (through instantiation, say) then just + // use the desired packet directly. + if (subvecSize == 1) { + val = opPackets[idxVal]; + } else { + // Else extract from the correct packet, adjusting the index as we go. 
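+      // E.g. (illustrative): VF == 8 split into two <4 x T> packets with
+      // idxVal == 6 reads opPackets[6 / 4 == 1], element 6 % 4 == 2.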
+ val = B.CreateExtractElement( + opPackets[idxVal / subvecSize], + ConstantInt::get(vecIdx->getType(), idxVal % subvecSize)); + } + } else { + val = B.CreateExtractElement(op.getAsValue(), vecIdx); + } + + // We leave the original broadcast function and divert the vectorized + // broadcast through it, giving us a broadcast over the full apparent + // sub-group or work-group size (vecz * mux). + CI->setOperand(argIdx, val); + if (!isWorkGroup) { + // For sub-groups, we need to normalize the sub-group ID into the range of + // mux sub-groups. + // |-----------------|-----------------| + // | broadcast(X, 6) | broadcast(A, 6) | + // VF=4 |-----------------|-----------------| + // | b(, 6) | b(, 6) | + // |-----------------|-----------------| + // M=I/4 | 1 | 1 | + // V=I%4 | 2 | 2 | + // |-----------------|-----------------| + // | [V] | [V] | + // | Z | C | + // |-----------------|-----------------| + // | broadcast(Z, M) | broadcast(C, M) | + // res | C | C | + // splat | | | + // |-----------------|-----------------| + auto *const muxIdx = B.CreateUDiv(idx, idxFactor); + CI->setOperand(argIdx + 1, muxIdx); + } + + return CI; +} + +std::optional +Packetizer::Impl::isSubgroupShuffleLike(Instruction *I) { + auto *const CI = dyn_cast(I); + if (!CI) { + return std::nullopt; + } + const compiler::utils::BuiltinInfo &BI = Ctx.builtins(); + Function *callee = CI->getCalledFunction(); + if (!callee) { + return std::nullopt; + } + + const auto Builtin = BI.analyzeBuiltin(*callee); + if (!Builtin) { + return std::nullopt; + } + + const auto Info = BI.isMuxGroupCollective(Builtin->ID); + + if (Info && Info->isSubGroupScope() && Info->isShuffleLike()) { + return Info; + } + + return std::nullopt; +} + +Value *Packetizer::Impl::packetizeSubgroupShuffle(Instruction *I) { + auto *const CI = cast(I); + + // We don't support scalable vectorization of sub-group shuffles. + if (SimdWidth.isScalable()) { + return nullptr; + } + + auto *const Data = CI->getArgOperand(0); + auto *const Idx = CI->getArgOperand(1); + + auto PackData = packetize(Data); + if (!PackData) { + return nullptr; + } + + // If the data operand happened to be a broadcast value already, we can use + // it directly. + if (PackData.info->numInstances == 0) { + IC.deleteInstructionLater(CI); + CI->replaceAllUsesWith(Data); + return Data; + } + + // We can't packetize varying shuffle indices yet. + if (UVR.isVarying(Idx)) { + return nullptr; + } + + IRBuilder<> B(CI); + + // We need to sanitize the input index so that it stays within the range of + // one vectorized group. + const unsigned VF = SimdWidth.getFixedValue(); + auto *const VecIdxFactor = ConstantInt::get(Idx->getType(), VF); + // This index is the element of the vector-group which holds the desired + // data, per mux sub-group. + // , : idx 1 -> vector element 1, idx 2 -> vector element 0. + auto *const VecIdx = B.CreateURem(Idx, VecIdxFactor); + // This index is the mux sub-group in which the desired data resides. + // , : idx 1 -> mux sub-group 0, idx 3 -> mux sub-group 1. + auto *const MuxIdx = B.CreateUDiv(Idx, VecIdxFactor); + + Value *VecData = PackData.getAsValue(); + + // Note: in each illustrative example, imagine two invocations across a + // single mux sub-groups, each being vectorized by 4; in other words, 8 + // 'original' invocations to a sub-group, running in two vectorized + // invocations. + if (auto *const DataVecTy = dyn_cast(Data->getType()); + !DataVecTy) { + // The vectorized shuffle is producing a scalar (assuming uniform indices, + // see above). 
Imagine i=6 (6 % 4 = 2 and 6 / 4 = 1): + // | shuffle(X, 6) | shuffle(A, 6) | + // VF=4 |-----------------|-----------------| + // | s(, 2) | s(, 2) | + // elt 2 | Z | C | + // shuff | shuffle(Z, 1) | shuffle(C, 1) | + // | C | C | + // bcast | | | + // This way we can see how each of the 8 invocations end up with the 6th + // element of the total sub-group. + VecData = B.CreateExtractElement(VecData, VecIdx, "vec.extract"); + } else if (auto *const CIdx = dyn_cast(VecIdx)) { + // The shuffle produces a vector, and we have a constant shuffle index - we + // can extract a subvector easily. + // Imagine i=6 (6 % 4 = 2 and 6 / 4 = 1): + // | shuffle(, 6) | shuffle(, 6) | + // VF=4 |-------------------------|-------------------------| + // | s(, 2) | s(, 2) | + // vec 2 | | | + // shuff | shuffle(, 1) | shuffle(, 1) | + // | | | + // bcast | | | + // This way we can see how each of the 8 invocations end up with the 6th + // element of the total sub-group, which is a two-element vector. + + // Note: the subvector vector index type has to be i64. Scale it up by the + // size of the vector we're extracting: the index is the *element* from + // which to extract - it is not implicitly scaled by the vector size. + auto *const ExtractIdx = B.getInt64( + CIdx->getZExtValue() * DataVecTy->getElementCount().getFixedValue()); + VecData = B.CreateExtractVector(Data->getType(), VecData, ExtractIdx, + "vec.extract"); + } else { + // This is as above, but the process of extracting the initial vector is + // more complicated - we have to manually extract and insert each element. + // It's possible that for some targets and for some combinations of vector + // width and vectorization factor, that going through memory would be + // faster. + Value *ExtractedVec = PoisonValue::get(DataVecTy); + const unsigned DataNumElts = DataVecTy->getElementCount().getFixedValue(); + auto *const BaseIdx = B.CreateMul(VecIdx, B.getInt32(DataNumElts)); + for (unsigned i = 0; i < DataNumElts; i++) { + auto *const SubIdx = B.CreateAdd(BaseIdx, B.getInt32(i)); + auto *const Elt = B.CreateExtractElement(VecData, SubIdx); + ExtractedVec = B.CreateInsertElement(ExtractedVec, Elt, B.getInt32(i)); + } + VecData = ExtractedVec; + } + + // We leave the original shuffle function and divert the vectorized + // shuffle through it, giving us a shuffle over the full apparent + // sub-group size (vecz * mux). + CI->setOperand(0, VecData); + CI->setOperand(1, MuxIdx); + + return CI; +} + +Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleXor( + Instruction *I, compiler::utils::GroupCollective ShuffleXor) { + auto *const CI = cast(I); + + // We don't support scalable vectorization of sub-group shuffles. + if (SimdWidth.isScalable()) { + return Packetizer::Result(*this); + } + const unsigned VF = SimdWidth.getFixedValue(); + + auto *const Data = CI->getArgOperand(0); + auto *const Val = CI->getArgOperand(1); + + auto PackData = packetize(Data); + if (!PackData) { + return Packetizer::Result(*this); + } + + // If the data operand happened to be a broadcast value already, we can use + // it directly. + if (PackData.info->numInstances == 0) { + IC.deleteInstructionLater(CI); + CI->replaceAllUsesWith(Data); + return PackData; + } + + auto PackVal = packetize(Val); + if (!PackVal) { + return Packetizer::Result(*this); + } + + // With the packetize operands in place, we have to perform the actual + // shuffling operation. Since we are one layer higher than the mux + // sub-groups, our IDs do not easily translate to the mux level. 
Therefore we
+  // perform each shuffle using the regular 'shuffle' and do the XOR of the
+  // IDs ourselves.
+
+  // Note: in this illustrative example, imagine two invocations across a
+  // single mux sub-group, each being vectorized by 4; in other words, 8
+  // 'original' invocations to a sub-group, running in two vectorized
+  // invocations. Imagine value = 5:
+  //                | shuffle(X, 5)        | shuffle(A, 5)        |
+  // VF=4           |----------------------|----------------------|
+  //                | s(<X,Y,Z,W>, 5)      | s(<A,B,C,D>, 5)      |
+  // SG IDs         | 0,1,2,3              | 4,5,6,7              |
+  // SG IDs^5       | 5,4,7,6              | 1,0,3,2              |
+  // I=(SG IDs^5)/4 | 1,1,1,1              | 0,0,0,0              |
+  // J=(SG IDs^5)%4 | 1,0,3,2              | 1,0,3,2              |
+  // <data>[J]      | Y,X,W,Z              | B,A,D,C              |
+  // Mux-shuffle[I] | [Y,B][1],[X,A][1],.. | [Y,B][0],[X,A][0],.. |
+  //                | B,A,D,C              | Y,X,W,Z              |
+  IRBuilder<> B(CI);
+
+  auto *const SubgroupLocalIDFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+      compiler::utils::eMuxBuiltinGetSubGroupLocalId, *F.getParent(),
+      {CI->getType()});
+  assert(SubgroupLocalIDFn);
+
+  auto *const SubgroupLocalID =
+      B.CreateCall(SubgroupLocalIDFn, {}, "sg.local.id");
+  const auto Builtin =
+      Ctx.builtins().analyzeBuiltinCall(*SubgroupLocalID, Dimension);
+  if (!Builtin) {
+    return Packetizer::Result(*this);
+  }
+
+  // Vectorize the sub-group local ID
+  auto *const VecSubgroupLocalID =
+      vectorizeWorkGroupCall(SubgroupLocalID, *Builtin);
+  if (!VecSubgroupLocalID) {
+    return Packetizer::Result(*this);
+  }
+  VecSubgroupLocalID->setName("vec.sg.local.id");
+
+  // The value is always i32, as is the sub-group local ID. Vectorizing both
+  // of them should result in the same vector type, with as many elements as
+  // the vectorization factor.
+  auto *const VecVal = PackVal.getAsValue();
+
+  assert(VecVal->getType() == VecSubgroupLocalID->getType() &&
+         VecVal->getType()->isVectorTy() &&
+         cast<VectorType>(VecVal->getType())
+                 ->getElementCount()
+                 .getKnownMinValue() == VF &&
+         "Unexpected vectorization of sub-group shuffle xor");
+
+  // Perform the XOR of the sub-group IDs with the 'value', as per the
+  // semantics of the builtin.
+  auto *const XoredID = B.CreateXor(VecSubgroupLocalID, VecVal);
+
+  // We need to sanitize the input index so that it stays within the range of
+  // one vectorized group.
+  auto *const VecIdxFactor = ConstantInt::get(SubgroupLocalID->getType(), VF);
+
+  // Bring this ID into the range of 'mux' sub-groups by dividing it by the
+  // vector size.
+  auto *const MuxXoredID =
+      B.CreateUDiv(XoredID, B.CreateVectorSplat(VF, VecIdxFactor));
+  // And into the range of the vector group
+  auto *const VecXoredID =
+      B.CreateURem(XoredID, B.CreateVectorSplat(VF, VecIdxFactor));
+
+  // Now we perform each per-lane shuffle through the regular mux 'shuffle'
+  // builtin.
+  auto RegularShuffle = ShuffleXor;
+  RegularShuffle.Op = compiler::utils::GroupCollective::OpKind::Shuffle;
+
+  auto RegularShuffleID = Ctx.builtins().getMuxGroupCollective(RegularShuffle);
+  assert(RegularShuffleID);
+
+  auto *const RegularShuffleFn = Ctx.builtins().getOrDeclareMuxBuiltin(
+      *RegularShuffleID, *F.getParent(), {CI->getType()});
+  assert(RegularShuffleFn);
+
+  auto *const VecData = PackData.getAsValue();
+  Value *CombinedShuffle = PoisonValue::get(VecData->getType());
+
+  for (unsigned i = 0; i < VF; i++) {
+    auto *Idx = B.getInt32(i);
+    // Get the XORd index local to the vector group that this vector group
+    // element wants to shuffle with.
+    auto *const VecGroupIdx = B.CreateExtractElement(VecXoredID, Idx);
+    // Grab that element. It may be a vector, in which case we must extract
+    // each element individually. 
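+    // (Illustrative recap of the index math above, per lane, as scalar
+    //  pseudocode: xored = sg_local_id ^ value; VecGroupIdx = xored % VF;
+    //  MuxID = xored / VF.)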
+ Value *DataElt = nullptr; + if (auto *DataVecTy = dyn_cast(Data->getType()); !DataVecTy) { + DataElt = B.CreateExtractElement(VecData, VecGroupIdx); + } else { + DataElt = PoisonValue::get(DataVecTy); + auto VecWidth = DataVecTy->getElementCount().getFixedValue(); + // VecGroupIdx is the 'base' of the subvector, whose elements are stored + // sequentially from that point. + auto *const VecVecGroupIdx = + B.CreateMul(VecGroupIdx, B.getInt32(VecWidth)); + for (unsigned j = 0; j != VecWidth; j++) { + auto *const Elt = B.CreateExtractElement( + VecData, B.CreateAdd(VecVecGroupIdx, B.getInt32(j))); + DataElt = B.CreateInsertElement(DataElt, Elt, B.getInt32(j)); + } + } + assert(DataElt); + // Shuffle it across the mux sub-group. + auto *const MuxID = B.CreateExtractElement(MuxXoredID, Idx); + auto *const Shuff = B.CreateCall(RegularShuffleFn, {DataElt, MuxID}); + // Combine that back into the final shuffled vector. + if (auto *DataVecTy = dyn_cast(Data->getType()); !DataVecTy) { + CombinedShuffle = B.CreateInsertElement(CombinedShuffle, Shuff, Idx); + } else { + auto VecWidth = DataVecTy->getElementCount().getFixedValue(); + CombinedShuffle = B.CreateInsertVector( + CombinedShuffle->getType(), CombinedShuffle, Shuff, + B.getInt64(static_cast(i) * VecWidth)); + } + } + + IC.deleteInstructionLater(CI); + return assign(CI, CombinedShuffle); +} + +Packetizer::Result Packetizer::Impl::packetizeSubgroupShuffleUpDown( + Instruction *I, compiler::utils::GroupCollective ShuffleUpDown) { + const bool IsDown = + ShuffleUpDown.Op == compiler::utils::GroupCollective::OpKind::ShuffleDown; + assert((IsDown || ShuffleUpDown.Op == + compiler::utils::GroupCollective::OpKind::ShuffleUp) && + "Invalid shuffle kind"); + + auto *const CI = cast(I); + + // We don't support scalable vectorization of sub-group shuffles. + if (SimdWidth.isScalable()) { + return Packetizer::Result(*this); + } + const unsigned VF = SimdWidth.getFixedValue(); + + // LHS is 'current' for a down-shuffle, and 'previous' for an up-shuffle. + auto *const LHSOp = CI->getArgOperand(0); + // RHS is 'next' for a down-shuffle, and 'current' for an up-shuffle. + auto *const RHSOp = CI->getArgOperand(1); + auto *const DeltaOp = CI->getArgOperand(2); + + auto PackDelta = packetize(DeltaOp); + if (!PackDelta) { + return Packetizer::Result(*this); + } + + auto PackLHS = packetize(LHSOp); + if (!PackLHS) { + return Packetizer::Result(*this); + } + + auto PackRHS = packetize(RHSOp); + if (!PackRHS) { + return Packetizer::Result(*this); + } + + auto *const LHSPackVal = PackLHS.getAsValue(); + auto *const RHSPackVal = PackRHS.getAsValue(); + assert(LHSPackVal && RHSPackVal && + LHSPackVal->getType() == RHSPackVal->getType()); + + // Remember in the example below that the builtins take *deltas* which add + // onto the mux sub-group local ID. Therefore a delta of 2 returns different + // data for each of the mux sub-group elements. 
+ // |----------------------------|----------------------------| + // | shuffle_down(A, X, 2) | shuffle_down(E, I, 2) | + // VF=4 |----------------------------|----------------------------| + // | s(, , 2) | s(, , 2) | + // SGIds | 0,1,2,3 | 4,5,6,7 | + // SGIds+D | 2,3,4,5 | 6,7,8,9 | + // MuxSGIds | 0,0,0,0 | 1,1,1,1 | + // |----------------------------|----------------------------| + // M=(SGIds+D)/VF | 0,0,1,1 | 1,1,2,2 | + // V=(SGIds+D)%VF | 2,3,0,1 | 2,3,0,1 | + // |----------------------------|----------------------------| + // M - MuxSGIds | 0,0,1,1 | 0,0,1,1 | + // |----------------------------|----------------------------| + // Shuff[0] | s(, , 0) | s(, , 0) | + // Data returned | 0+0 => 0 => | 1+0 => 1 => | + // Shuff[0][V[0]] | [2] = C | [2] = G | + // |----------------------------|----------------------------| + // Shuff[1] | s(, , 0) | s(, , 0) | + // Data returned | 0+0 => 0 => | 1+0 => 1 => | + // Shuff[1][V[1]] | [3] = D | [3] = H | + // |----------------------------|----------------------------| + // Shuff[2] | s(, , 1) | s(, , 1) | + // Data returned | 0+1 => 1 => | 1+1 => 2 => 0 => | + // Shuff[2][V[2]] | [0] = E | [0] = X | + // |----------------------------|----------------------------| + // Shuff[3] | s(, , 1) | s(, , 1) | + // Data returned | 0+1 => 1 => | 1+1 => 2 => 0 => | + // Shuff[3][V[3]] | [1] = F | [1] = Y | + // |----------------------------|----------------------------| + // Result | C,D,E,F | G,H,X,Y | + IRBuilder<> B(CI); + + // Grab the packetized/vectorized sub-group local IDs + auto *const SubgroupLocalIDFn = Ctx.builtins().getOrDeclareMuxBuiltin( + compiler::utils::eMuxBuiltinGetSubGroupLocalId, *F.getParent(), + {CI->getType()}); + assert(SubgroupLocalIDFn); + + auto *const SubgroupLocalID = + B.CreateCall(SubgroupLocalIDFn, {}, "sg.local.id"); + const auto Builtin = + Ctx.builtins().analyzeBuiltinCall(*SubgroupLocalID, Dimension); + if (!Builtin) { + return Packetizer::Result(*this); + } + + // Vectorize the sub-group local ID + auto *const VecSubgroupLocalID = + vectorizeWorkGroupCall(SubgroupLocalID, *Builtin); + if (!VecSubgroupLocalID) { + return Packetizer::Result(*this); + } + VecSubgroupLocalID->setName("vec.sg.local.id"); + + auto *const DeltaVal = PackDelta.getAsValue(); + + // The delta is always i32, as is the sub-group local ID. Vectorizing both of + // them should result in the same vector type, with as many elements as the + // vectorization factor. + assert(DeltaVal->getType() == VecSubgroupLocalID->getType() && + DeltaVal->getType()->isVectorTy() && + cast(DeltaVal->getType()) + ->getElementCount() + .getKnownMinValue() == VF && + "Unexpected vectorization of sub-group shuffle up/down"); + + // Produce the sum of the sub-group IDs with the 'delta', as per the + // semantics of the builtin. + auto *const IDPlusDelta = IsDown ? B.CreateAdd(VecSubgroupLocalID, DeltaVal) + : B.CreateSub(VecSubgroupLocalID, DeltaVal); + + // We need to sanitize the input indices so that they stay within the range + // of one vectorized group. + auto *const VecIdxFactor = ConstantInt::get(SubgroupLocalID->getType(), VF); + + // Bring this ID into the range of 'mux' sub-groups by dividing it by the + // vector size. We have to do this differently for 'up' and 'down' shuffles + // because the 'up' shuffles use signed indexing, and we need to round down + // to negative infinity to get the right sub-group delta. 
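+  // For example, with VF=4 an 'up' target ID of -2 must land in mux
+  // sub-group -1, element 2; truncating division would instead give
+  // (-2/4, -2%4) == (0, -2), which is out of range (illustrative).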
+ Value *MuxAbsoluteIDs = nullptr; + Value *VecEltIDs = nullptr; + if (IsDown) { + MuxAbsoluteIDs = + B.CreateUDiv(IDPlusDelta, B.CreateVectorSplat(VF, VecIdxFactor)); + // And into the range of the vector group + VecEltIDs = + B.CreateURem(IDPlusDelta, B.CreateVectorSplat(VF, VecIdxFactor)); + } else { + // Note that shuffling up is more complicated, owing to the signed + // sub-group local IDs. + // The steps are identical to the example outlined above, except both the + // division and modulo operations performed on the sub-group IDs have to + // floor towards negative infinity. That is, we want to see: + // |----------------------------|---------------------------| + // | shuffle_up(A, X, 2) | shuffle_up(E, I, 2) | + // VF=4 |----------------------------|---------------------------| + // | s(, , 2) | s(, , 2)| + // SGIds | 0,1,2,3 | 4,5,6,7 | + // SGIds-D | -2,-1,0,1 | 2,3,4,5 | + // MuxSGIds | 0,0,0,0 | 1,1,1,1 | + // |----------------------------|---------------------------| + // both flooring: | | | + // M=(SGIds-D)/VF | -1,-1,0,0 | 0,0,1,1 | + // V=(SGIds-D)%VF | 2,3,0,1 | 2,3,0,1 | + // |----------------------------|---------------------------| + // MuxSGIds - M | 1,1,0,0 | 1,1,0,0 | + // |----------------------------|---------------------------| + // + // We use the following formulae for division and modulo: + // int div_floor(int x, int y) { + // int q = x/y; + // int r = x%y; + // if ((r!=0) && ((r<0) != (y<0))) --q; + // return q; + // } + // int mod_floor(int x, int y) { + // int r = x%y; + // if ((r!=0) && ((r<0) != (y<0))) { r += y; } + // return r; + // } + // We note also that the conditions are equal between the two operations, + // and that the condition is equivalent to: + // if ((r!=0) && ((x ^ y) < 0)) { ... } + // (see https://alive2.llvm.org/ce/z/ebGrdL) + auto *X = IDPlusDelta; + auto *Y = B.CreateVectorSplat(VF, VecIdxFactor); + auto *const Quotient = B.CreateSDiv(X, Y, "quotient"); + auto *const Remainder = B.CreateSRem(X, Y, "remainder"); + + auto *const ArgXor = B.CreateXor(X, Y, "arg.xor"); + auto *const One = ConstantInt::get(ArgXor->getType(), 1); + auto *const Zero = ConstantInt::get(ArgXor->getType(), 0); + auto *const ArgSignDifferent = + B.CreateICmpSLT(ArgXor, Zero, "signs.different"); + auto *const RemainderIsNotZero = + B.CreateICmpNE(Remainder, Zero, "remainder.nonzero"); + auto *const ConditionHolds = + B.CreateAnd(RemainderIsNotZero, ArgSignDifferent, "condition.holds"); + auto *const QuotientMinus1 = B.CreateSub(Quotient, One, "quotient.minus.1"); + auto *const RemainderPlusY = B.CreateAdd(Remainder, Y, "remainder.plus.y"); + + MuxAbsoluteIDs = B.CreateSelect(ConditionHolds, QuotientMinus1, Quotient); + VecEltIDs = B.CreateSelect(ConditionHolds, RemainderPlusY, Remainder); + } + + // We've produced the 'absolute' mux sub-group local IDs for the data we want + // to access in each shuffle, but we want to get back to 'relative' IDs in + // the form of deltas. Splat the mux sub-group local ID. + auto *const SplatSubgroupLocalID = + B.CreateVectorSplat(VF, SubgroupLocalID, "splat.sg.local.id"); + auto *DeltaLHS = MuxAbsoluteIDs; + auto *DeltaRHS = SplatSubgroupLocalID; + if (!IsDown) { + // For 'up' shuffles, we invert the operation as the deltas are implicitly + // negative. See above. 
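+    // (That is, the mux-level delta becomes MuxSGIds - M, matching the
+    // 'MuxSGIds - M' row in the table above and keeping the delta
+    // non-negative.)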
+ std::swap(DeltaLHS, DeltaRHS); + } + auto *const MuxDeltas = + B.CreateSub(DeltaLHS, DeltaRHS, "mux.sg.local.id.deltas"); + + auto ShuffleID = Ctx.builtins().getMuxGroupCollective(ShuffleUpDown); + assert(ShuffleID); + auto *const ShuffleFn = Ctx.builtins().getOrDeclareMuxBuiltin( + *ShuffleID, *F.getParent(), {LHSPackVal->getType()}); + assert(ShuffleFn); + + SmallVector Results(VF); + for (unsigned i = 0; i != VF; i++) { + auto *const MuxDelta = B.CreateExtractElement(MuxDeltas, B.getInt32(i)); + auto *const Shuffle = + B.CreateCall(ShuffleFn, {LHSPackVal, RHSPackVal, MuxDelta}); + + Value *Elt = nullptr; + auto *const Idx = B.CreateExtractElement(VecEltIDs, B.getInt32(i)); + if (auto *DataVecTy = dyn_cast(LHSOp->getType()); !DataVecTy) { + Elt = B.CreateExtractElement(Shuffle, Idx); + } else { + // For vector data types we need to extract consecutive elements starting + // at the sub-vector whose index is Idx. + Elt = PoisonValue::get(DataVecTy); + auto VecWidth = DataVecTy->getElementCount().getFixedValue(); + // Idx is the 'base' of the subvector, whose elements are stored + // sequentially from that point. + auto *const VecVecGroupIdx = B.CreateMul(Idx, B.getInt32(VecWidth)); + for (unsigned j = 0; j != VecWidth; j++) { + auto *const E = B.CreateExtractElement( + Shuffle, B.CreateAdd(VecVecGroupIdx, B.getInt32(j))); + Elt = B.CreateInsertElement(Elt, E, B.getInt32(j)); + } + } + Results[i] = Elt; + } + + IC.deleteInstructionLater(CI); + return getPacketizationResult(I, Results); +} + +Value *Packetizer::Impl::packetizeMaskVarying(Instruction *I) { + if (auto memop = MemOp::get(I)) { + auto *const mask = memop->getMaskOperand(); + if (!mask) { + return nullptr; + } + + Value *vecMask = nullptr; + + const MemOpDesc desc = memop->getDesc(); + const bool isVector = desc.getDataType()->isVectorTy(); + + // If only the mask operand is varying, we do not need to vectorize the + // MemOp itself, only reduce the mask with an OR. + if (!isVector) { + vecMask = packetize(mask).getAsValue(); + } else { + // If it's a vector, and the mask is splatted, then packetize the + // splatted value, reduce it, then re-splat it as a vector. Otherwise, we + // send it to the instantiator. + auto *const splatVal = getSplatValue(mask); + if (!splatVal) { + return nullptr; + } + vecMask = packetize(splatVal).getAsValue(); + } + + VECZ_FAIL_IF(!vecMask); + + // Build the reduction right after the vector to reduce register + // pressure, and to make it easier for CSE/GVN to combine them if there + // are multiple uses of the same value (we could cache these?) + auto *maskInst = dyn_cast(vecMask); + IRBuilder<> B = [&] { + if (maskInst) { + return buildAfter(maskInst, F); + } else { + return IRBuilder<>(I); + } + }(); + + Value *anyOfMask = createMaybeVPReduction(B, vecMask, RecurKind::Or, VL); + anyOfMask->setName("any_of_mask"); + + if (isVector) { + anyOfMask = B.CreateVectorSplat( + multi_llvm::getVectorElementCount(desc.getDataType()), anyOfMask); + } + + memop->setMaskOperand(anyOfMask); + + return I; + } + + auto *const CI = dyn_cast(I); + if (!CI) { + return nullptr; + } + + Function *callee = CI->getCalledFunction(); + + // Handle internal builtins. + if (Ctx.isInternalBuiltin(callee)) { + // Handle lane mask reductions. + // We treat these as Mask Varying instructions since their single argument + // represents a lane mask and their result is a reduction over all lanes, + // which means it is effectively uniform. 
We don't actually have to check + // that they are mask varying, because that is the only possible uniformity + // value of these function calls. + compiler::utils::Lexer L(callee->getName()); + VECZ_FAIL_IF(!L.Consume(VectorizationContext::InternalBuiltinPrefix)); + bool any = false; + bool divergence = false; + if (L.Consume("divergence_any")) { + divergence = true; + } else if (L.Consume("divergence_all")) { + any = true; + divergence = true; + } + + if (divergence) { + IC.deleteInstructionLater(CI); + auto *const reduce = reduceBranchCond(CI->getOperand(0), CI, any); + CI->replaceAllUsesWith(reduce); + return reduce; + } + } + + return nullptr; +} + +ValuePacket Packetizer::Impl::packetizePHI(PHINode *Phi) { + ValuePacket results; + auto *const ty = Phi->getType(); + + auto *wideTy = ty; + unsigned packetWidth = 0; + if (auto structTy = dyn_cast(ty); + ty->isVectorTy() || VectorType::isValidElementType(ty) || + (structTy && structTy->isLiteral())) { + packetWidth = getPacketWidthForType(ty); + wideTy = + getWideType(Phi->getType(), SimdWidth.divideCoefficientBy(packetWidth)); + } else { + // It's not a type we can widen, but we can save the instantiator the job.. + if (SimdWidth.isScalable()) { + // as long as we aren't requesting a scalable vectorization factor.. + return results; + } + packetWidth = SimdWidth.getFixedValue(); + } + + IRBuilder<> B(buildAfter(Phi, F, true)); + auto numVals = Phi->getNumIncomingValues(); + auto name = Phi->getName(); + for (unsigned i = 0; i < packetWidth; ++i) { + results.push_back(B.CreatePHI(wideTy, numVals, name)); + } + + // To avoid cycles in the use/def chain, packetize the incoming values later. + // This allows packetizing phi uses by creating an 'empty' phi placeholder. + pendingPhis.push_back(Phi); + return results; +} + +ValuePacket Packetizer::Impl::packetizeCall(CallInst *CI) { + ValuePacket results; + + Function *Callee = CI->getCalledFunction(); + if (!Callee) { + return results; + } + + IRBuilder<> B(CI); + // Handle LLVM intrinsics. + if (Callee->isIntrinsic()) { + auto IntrID = Intrinsic::ID(Callee->getIntrinsicID()); + if (IntrID == llvm::Intrinsic::lifetime_end || + IntrID == llvm::Intrinsic::lifetime_start) { + auto *ptr = CI->getArgOperand(CI->arg_size() - 1); + if (auto *const bcast = dyn_cast(ptr)) { + ptr = bcast->getOperand(0); + } + + if (auto *const alloca = dyn_cast(ptr)) { + if (!needsInstantiation(Ctx, *alloca)) { +#if LLVM_VERSION_GREATER_EQUAL(23, 0) + const bool HaveSizeArg = false; +#elif LLVM_VERSION_GREATER_EQUAL(22, 0) + // TODO Remove runtime check when we no longer need to worry about + // older LLVM 22 snapshots. + const bool HaveSizeArg = CI->arg_size() == 2; +#else + const bool HaveSizeArg = true; +#endif + if (HaveSizeArg) { + // If it's an alloca we can widen, we can just change the size + const llvm::TypeSize allocSize = + Ctx.dataLayout()->getTypeAllocSize(alloca->getAllocatedType()); + const auto lifeSize = + allocSize.isScalable() || SimdWidth.isScalable() + ? -1 + : allocSize.getKnownMinValue() * + SimdWidth.getKnownMinValue(); + CI->setOperand( + 0, ConstantInt::get(CI->getOperand(0)->getType(), lifeSize)); + } + results.push_back(CI); + } + } + return results; + } + + auto Builtin = Ctx.builtins().analyzeBuiltin(*Callee); + if (!Builtin || !(Builtin->properties & + compiler::utils::eBuiltinPropertyVectorEquivalent)) { + return results; + } + + // Only floating point intrinsics need this to be set to CI. + // The IR Builder helpfully crashes when we pass it unnecessarily. 
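+    // (That is, fastMathSrc lets CreateIntrinsic copy fast-math flags from
+    // the original call; it must stay null for non-FP intrinsics.)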
+    Instruction *fastMathSrc = isa<FPMathOperator>(CI) ? CI : nullptr;
+
+    // Using a native array with hard coded size for simplicity; make sure
+    // to increase this if intrinsics with more operands are to be handled.
+    size_t constexpr maxOperands = 3;
+    // Some LLVM intrinsics, such as abs, have arguments that are constants
+    // defined as llvm_i1_ty, which means those operands can't be packetized.
+    // As a temporary solution, we use this vector so that every case can
+    // independently specify which operands must be skipped.
+    SmallVector<bool, maxOperands> operandsToSkip(maxOperands, false);
+    switch (IntrID) {
+    case Intrinsic::abs:
+    case Intrinsic::ctlz:
+    case Intrinsic::cttz:
+      // def abs [LLVMMatchType<0>, llvm_i1_ty]
+      operandsToSkip = {false, true};
+      break;
+    default:
+      break;
+    }
+
+    auto *const ty = CI->getType();
+    auto packetWidth = getPacketWidthForType(ty);
+    auto *const wideTy =
+        getWideType(ty, SimdWidth.divideCoefficientBy(packetWidth));
+
+    const auto n = CI->arg_size();
+    assert(n <= maxOperands && "Intrinsic has too many arguments");
+
+    SmallVector<Value *, 4> opPackets[maxOperands];
+    for (auto i = decltype(n){0}; i < n; ++i) {
+      auto *argOperand = CI->getArgOperand(i);
+
+      if (operandsToSkip[i]) {
+        assert(isa<Constant>(argOperand) && "Operand should be a Constant");
+        opPackets[i].resize(packetWidth);
+        std::fill(opPackets[i].begin(), opPackets[i].end(), argOperand);
+      } else {
+        auto op = packetize(CI->getArgOperand(i));
+        if (!op) {
+          return results;
+        }
+        op.getPacketValues(packetWidth, opPackets[i]);
+        PACK_FAIL_IF(opPackets[i].empty());
+      }
+    }
+
+    const auto name = CI->getName();
+    Type *const types[1] = {wideTy};  // because LLVM 13 is a numpty
+    Value *opVals[maxOperands];
+    for (unsigned i = 0; i < packetWidth; ++i) {
+      for (unsigned j = 0; j < n; ++j) {
+        opVals[j] = opPackets[j][i];
+      }
+
+      results.push_back(B.CreateIntrinsic(
+          IntrID, types, ArrayRef<Value *>(opVals, n), fastMathSrc, name));
+    }
+    return results;
+  }
+
+  // Handle internal builtins.
+  if (Ctx.isInternalBuiltin(Callee)) {
+    // Handle masked loads and stores.
+    if (auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked)) {
+      if (MaskedOp->isMaskedMemOp()) {
+        return packetizeMemOp(*MaskedOp);
+      }
+    }
+    if (auto AtomicInfo = Ctx.isMaskedAtomicFunction(*Callee)) {
+      return packetizeMaskedAtomic(*CI, *AtomicInfo);
+    }
+  }
+
+  const auto Builtin = Ctx.builtins().analyzeBuiltin(*Callee);
+
+  // Handle scans, which defer to internal builtins.
+  if (Builtin) {
+    if (auto Info = Ctx.builtins().isMuxGroupCollective(Builtin->ID)) {
+      if (Info->isScan()) {
+        return packetizeGroupScan(CI, *Info);
+      }
+    }
+
+    // Handle external builtins.
+    const auto Props = Builtin->properties;
+    if (Props & compiler::utils::eBuiltinPropertyExecutionFlow ||
+        Props & compiler::utils::eBuiltinPropertyWorkItem) {
+      return results;
+    }
+  }
+
+  auto *const ty = CI->getType();
+
+  // Our builtins are only defined up to a width of 16, so we will not
+  // vectorize above that. Inspect the operands as well in case they are
+  // wider; for instance, for a convert from float to i8 we would rather
+  // widen according to the float than the i8, so we don't create too wide a
+  // vector of floats.
+  auto packetWidth = getPacketWidthForType(ty, 16u);
+  for (const auto &op : CI->data_ops()) {
+    auto *const vTy = op.get()->getType();
+    if (!vTy->isPointerTy()) {
+      packetWidth = std::max(packetWidth, getPacketWidthForType(vTy, 16u));
+    }
+  }
+
+  auto factor = SimdWidth.divideCoefficientBy(packetWidth);
+
+  // Try to find a unit for this builtin. 
+ auto CalleeVec = Ctx.getVectorizedFunction(*Callee, factor); + if (!CalleeVec) { + // No vectorization strategy found. Fall back on Instantiation. + return results; + } + + // Packetize call operands. + // But not if they have pointer return arguments (handled in vectorizeCall). + for (const auto &TargetArg : CalleeVec.args) { + PACK_FAIL_IF(TargetArg.kind == VectorizationResult::Arg::POINTER_RETURN); + } + + auto *const vecTy = dyn_cast(ty); + const unsigned scalarWidth = vecTy ? vecTy->getNumElements() : 1; + unsigned i = 0; + SmallVector, 4> opPackets; + for (const auto &TargetArg : CalleeVec.args) { + opPackets.emplace_back(); + + // Handle scalar arguments. + Value *scalarOp = CI->getArgOperand(i); + if (TargetArg.kind == VectorizationResult::Arg::SCALAR) { + for (unsigned j = 0; j < packetWidth; ++j) { + opPackets.back().push_back(scalarOp); + } + i++; + continue; + } + + // Vectorize scalar operands. + auto op = packetize(CI->getOperand(i)); + PACK_FAIL_IF(!op); + + // The vector versions of some builtins can have a mix of vector and scalar + // arguments. We need to widen any scalar arguments by sub-splatting. + auto *const scalarTy = scalarOp->getType(); + auto *const argTy = TargetArg.type; + if (vecTy && !scalarTy->isVectorTy()) { + PACK_FAIL_IF(argTy->getScalarType() != scalarTy); + + op.getPacketValues(packetWidth, opPackets.back()); + PACK_FAIL_IF(opPackets.back().empty()); + + // Widen the scalar operands. + PACK_FAIL_IF( + !createSubSplats(Ctx.targetInfo(), B, opPackets.back(), scalarWidth)); + } else { + // Make sure the type is correct for vector arguments. + Type *wideTy = getWideType(scalarOp->getType(), factor); + PACK_FAIL_IF(argTy != wideTy); + + op.getPacketValues(packetWidth, opPackets.back()); + PACK_FAIL_IF(opPackets.back().empty()); + } + i++; + } + + auto numArgs = opPackets.size(); + SmallVector opVals; + opVals.resize(numArgs); + + auto *vecFn = CalleeVec.get(); + for (unsigned i = 0; i < packetWidth; ++i) { + for (unsigned j = 0; j < numArgs; ++j) { + opVals[j] = opPackets[j][i]; + } + + CallInst *newCI = B.CreateCall(vecFn, opVals, CI->getName()); + newCI->setCallingConv(CI->getCallingConv()); + results.push_back(newCI); + } + + return results; +} + +ValuePacket +Packetizer::Impl::packetizeGroupScan(CallInst *CI, + compiler::utils::GroupCollective Scan) { + ValuePacket results; + + Function *callee = CI->getCalledFunction(); + if (!callee) { + return results; + } + + compiler::utils::NameMangler mangler(&CI->getContext()); + + const unsigned ArgOffset = Scan.isWorkGroupScope() ? 1 : 0; + + // The operands and types for the internal builtin + SmallVector Ops = { + packetize(CI->getArgOperand(ArgOffset)).getAsValue()}; + SmallVector Tys = {getWideType(CI->getType(), SimdWidth)}; + + const bool isInclusive = + Scan.Op == compiler::utils::GroupCollective::OpKind::ScanInclusive; + StringRef op = "add"; + // min/max scans are prefixed with s/u if they are signed/unsigned integer + // operations. The value 'None' here represents an operation where the sign + // of the operands is unimportant, such as floating-point operations, or + // integer addition. 
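+  // For example, an inclusive signed-integer min scan selects "smin" below,
+  // producing an internal builtin named along the lines of
+  // "__vecz_b_sub_group_scan_inclusive_smin_<type mangling>" (illustrative;
+  // the exact name is assembled from InternalBuiltinPrefix further down).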
+ bool opIsSignedInt = false; + + switch (Scan.Recurrence) { + default: + assert(false && "Impossible subgroup scan kind"); + return results; + case llvm::RecurKind::Add: + case llvm::RecurKind::FAdd: + op = "add"; + break; + case llvm::RecurKind::SMin: + op = "smin"; + opIsSignedInt = true; + break; + case llvm::RecurKind::UMin: + op = "umin"; + break; + case llvm::RecurKind::FMin: + op = "min"; + break; + case llvm::RecurKind::SMax: + op = "smax"; + opIsSignedInt = true; + break; + case llvm::RecurKind::UMax: + op = "umax"; + break; + case llvm::RecurKind::FMax: + op = "max"; + break; + case llvm::RecurKind::Mul: + case llvm::RecurKind::FMul: + op = "mul"; + break; + case llvm::RecurKind::And: + op = "and"; + break; + case llvm::RecurKind::Or: + op = "or"; + break; + case llvm::RecurKind::Xor: + op = "xor"; + break; + } + + // Now create the mangled builtin function name. + SmallString<128> NameSV; + raw_svector_ostream O(NameSV); + + // We don't bother with VP for fixed vectors, because it doesn't save us + // anything. + const bool VP = VL && SimdWidth.isScalable(); + + O << VectorizationContext::InternalBuiltinPrefix << "sub_group_scan_" + << (isInclusive ? "inclusive" : "exclusive") << "_" << op + << (VP ? "_vp" : "") << "_"; + + const compiler::utils::TypeQualifiers VecQuals( + compiler::utils::eTypeQualNone, opIsSignedInt + ? compiler::utils::eTypeQualSignedInt + : compiler::utils::eTypeQualNone); + if (!mangler.mangleType(O, Tys[0], VecQuals)) { + return results; + } + + // VP operations mangle the extra i32 VL operand. + if (VP) { + Ops.push_back(VL); + Tys.push_back(VL->getType()); + const compiler::utils::TypeQualifiers VLQuals( + compiler::utils::eTypeQualNone); + if (!mangler.mangleType(O, Tys[1], VLQuals)) { + return results; + } + } + + auto *VecgroupScanFnTy = FunctionType::get(Tys[0], Tys, /*isVarArg*/ false); + auto *const VecgroupFn = + Ctx.getOrCreateInternalBuiltin(NameSV, VecgroupScanFnTy); + + IRBuilder<> B(CI); + + auto *VectorScan = B.CreateCall(VecgroupFn, Ops); + + // We've currently got a scan over each vector group, but the full group scan + // is further multiplied by the group size (either the work-group size or the + // 'mux' hardware sub-group size). For example, we may have a vectorization + // factor sized group of 4 and a group size of 2. Together the full group + // size to the user is 4*2 = 8. + // In terms of invocations, we've essentially currently got: + // (invocation 0) + // (invocation 1) + // These two iterations need to be further scanned over the group + // size. We do this by adding the identity to the first invocation, the + // result of the scan over the first invocation to the second, etc. This is + // an exclusive scan over the *reduction* of the input vector: + // (invocation 0) + // (invocation 1) + // -> reduction + // (a0+a1+a2+a3) (invocation 0) + // (a4+a5+a6+a7) (invocation 1) + // -> exclusive group scan + // I (invocation 0) + // (a0+a1+a2+a3) (invocation 1) + // -> adding that to the result of the vector scan: + // (invocation 0) + // <(a0+a1+a2+a3)+a4, (a0+a1+a2+a3)+a4+a5, (invocation 1) + // (a0+a1+a2+a3)+a4+a5+a6, (a0+a1+a2+a3)+a4+a5+a6+a7> + // When viewed as a full 8-element vector, this is our final scan. + // Thus we essentially keep the original group scan, but change it to be an + // exclusive one. + auto *Reduction = Ops.front(); + Reduction = createMaybeVPReduction(B, Reduction, Scan.Recurrence, VL); + + // Now we defer to an *exclusive* scan over the group. 
+ auto ExclScan = Scan; + ExclScan.Op = compiler::utils::GroupCollective::OpKind::ScanExclusive; + + auto ExclScanID = Ctx.builtins().getMuxGroupCollective(ExclScan); + assert(ExclScanID); + + auto *const ExclScanFn = Ctx.builtins().getOrDeclareMuxBuiltin( + *ExclScanID, *F.getParent(), {CI->getType()}); + assert(ExclScanFn); + + SmallVector ExclScanOps = {Reduction}; + if (Scan.isWorkGroupScope()) { + // Forward on the current barrier ID. + ExclScanOps.insert(ExclScanOps.begin(), CI->getArgOperand(0)); + } + auto *const ExclScanCI = B.CreateCall(ExclScanFn, ExclScanOps); + + Value *const Splat = B.CreateVectorSplat(SimdWidth, ExclScanCI); + + auto *const Result = compiler::utils::createBinOpForRecurKind( + B, VectorScan, Splat, Scan.Recurrence); + + results.push_back(Result); + return results; +} + +Value *Packetizer::Impl::vectorizeInstruction(Instruction *Ins) { + if (needsInstantiation(Ctx, *Ins)) { + return nullptr; + } + + // Figure out what kind of instruction it is and try to vectorize it. + Value *Result = nullptr; + switch (Ins->getOpcode()) { + default: + break; + case Instruction::Call: + Result = vectorizeCall(cast(Ins)); + break; + case Instruction::Ret: + Result = vectorizeReturn(cast(Ins)); + break; + case Instruction::Alloca: + Result = vectorizeAlloca(cast(Ins)); + break; + case Instruction::ExtractValue: + Result = vectorizeExtractValue(cast(Ins)); + break; + } + + if (Result) { + vectorizeDI(Ins, Result); + } + return Result; +} + +ValuePacket Packetizer::Impl::packetizeLoad(LoadInst *Load) { + if (auto Op = MemOp::get(Load)) { + return packetizeMemOp(*Op); + } + return ValuePacket{}; +} + +ValuePacket Packetizer::Impl::packetizeStore(StoreInst *Store) { + if (auto Op = MemOp::get(Store)) { + return packetizeMemOp(*Op); + } + return ValuePacket{}; +} + +ValuePacket Packetizer::Impl::packetizeMemOp(MemOp &op) { + ValuePacket results; + + // Determine the stride of the memory operation. + // Vectorize the pointer if there is no valid stride. + Value *ptr = op.getPointerOperand(); + assert(ptr && "Could not get pointer operand of Op"); + + auto *const dataTy = op.getDataType(); + if (!dataTy->isVectorTy() && !VectorType::isValidElementType(dataTy)) { + return results; + } + + if (auto *const vecTy = dyn_cast(dataTy)) { + const auto elts = vecTy->getNumElements(); + if (elts & (elts - 1)) { + // If the data type is a vector with number of elements not a power of 2, + // it is not safe to widen, because of alignment padding. Reject it and + // let instantiation deal with it.. + return results; + } + } + + const auto packetWidth = getPacketWidthForType(dataTy); + // Note: NOT const because LLVM 11 can't multiply a const ElementCount. + auto factor = SimdWidth.divideCoefficientBy(packetWidth); + + if (factor.isScalar()) { + // not actually widening anything here, so just instantiate it + return results; + } + + if (VL && packetWidth != 1) { + emitVeczRemarkMissed(&F, op.getInstr(), + "Can not vector-predicate packets larger than 1"); + return {}; + } + + IRBuilder<> B(op.getInstr()); + IC.deleteInstructionLater(op.getInstr()); + + const auto name = op.getInstr()->getName(); + auto *const mask = op.getMaskOperand(); + auto *const data = op.getDataOperand(); + auto *const stride = SAR.buildMemoryStride(B, ptr, dataTy); + + auto *const vecPtrTy = dyn_cast(dataTy); + + // If we're vector-predicating a vector access, scale the vector length up by + // the original number of vector elements. + // Adjust the MemOp so that it is VL-predicated, if we must. 
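+  // E.g. a vector length of 3 work-items, each accessing a <4 x float>,
+  // covers 3 * 4 = 12 effective scalar lanes (illustrative).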
+  Value *EVL = VL;
+  if (vecPtrTy && VL) {
+    EVL = B.CreateMul(VL, B.getInt32(vecPtrTy->getNumElements()));
+  }
+
+  auto *const constantStrideVal = dyn_cast_or_null<ConstantInt>(stride);
+  const int constantStride =
+      constantStrideVal ? constantStrideVal->getSExtValue() : 0;
+  const bool validStride =
+      stride && (!constantStrideVal || constantStride != 0);
+  if (!validStride) {
+    if (dataTy->isPointerTy()) {
+      // We do not have vector-of-pointers support in Vecz builtins, hence
+      // instantiate instead of packetize.
+      return results;
+    }
+
+    const bool scalable = SimdWidth.isScalable();
+    if (!mask && dataTy->isVectorTy() && !scalable) {
+      // Unmasked scatter/gathers are better off instantiated..
+      return results;
+    }
+
+    // Assume that individual masked loads/stores are more efficient when the
+    // type does not fit into a native integer. Since instantiation is never an
+    // option for scalable vectors, we do not consider this option.
+    if (vecPtrTy && !scalable &&
+        !Ctx.dataLayout()->fitsInLegalInteger(
+            dataTy->getPrimitiveSizeInBits())) {
+      return results;
+    }
+
+    auto ptrPacket = packetizeAndGet(ptr, packetWidth);
+    PACK_FAIL_IF(ptrPacket.empty());
+
+    auto *const scalarTy = dataTy->getScalarType();
+    auto *const ptrTy = cast<PointerType>(ptr->getType()->getScalarType());
+
+    // When scattering/gathering with a vector type, we can cast it to a
+    // vector of pointers to the scalar type and widen it into a vector
+    // of pointers to all the individual elements, and then gather/scatter
+    // using that.
+    if (vecPtrTy && scalable) {
+      // Scalable requires special codegen that avoids shuffles, but the idea
+      // is the same.
+      // We only handle the one packet right now.
+      PACK_FAIL_IF(ptrPacket.size() != 1);
+      const auto scalarWidth = vecPtrTy->getNumElements();
+      Value *&vecPtr = ptrPacket.front();
+      const ElementCount wideEC = factor * scalarWidth;
+      // Sub-splat the pointers such that we get, e.g.:
+      // <A,B> -> x4 -> <A,A,A,A,B,B,B,B>
+      const bool success =
+          createSubSplats(Ctx.targetInfo(), B, ptrPacket, scalarWidth);
+      PACK_FAIL_IF(!success);
+      auto *const newPtrTy = llvm::VectorType::get(ptrTy, wideEC);
+      // Bitcast the above sub-splat to purely scalar pointers
+      vecPtr = B.CreateBitCast(vecPtr, newPtrTy);
+      // Create an index sequence to start the offsetting process
+      Value *idxVector = createIndexSequence(
+          B, VectorType::get(B.getInt32Ty(), wideEC), "index.vec");
+      PACK_FAIL_IF(!idxVector);
+      // Modulo the indices 0,1,2,.. 
with the original vector width, producing,
+      // e.g., for the above: <0,1,2,3,0,1,2,3>
+      auto *const subVecEltsSplat =
+          B.CreateVectorSplat(wideEC, B.getInt32(scalarWidth));
+      idxVector = B.CreateURem(idxVector, subVecEltsSplat);
+      // Index into the pointer vector with the offsets, e.g.,:
+      // <A,A+1,A+2,A+3,B,B+1,B+2,B+3>
+      vecPtr = B.CreateInBoundsGEP(scalarTy, vecPtr, idxVector);
+    } else if (vecPtrTy && !scalable) {
+      const auto simdWidth = factor.getFixedValue();
+      const auto scalarWidth = vecPtrTy->getNumElements();
+
+      // Build shuffle mask to widen the pointer
+      SmallVector<Constant *, 16> indices;
+      SmallVector<int, 16> widenMask;
+      for (size_t i = 0; i < simdWidth; ++i) {
+        for (size_t j = 0; j < scalarWidth; ++j) {
+          widenMask.push_back(i);
+          indices.push_back(B.getInt32(j));
+        }
+      }
+
+      auto *const newPtrTy = FixedVectorType::get(ptrTy, simdWidth);
+
+      auto *const idxVector = ConstantVector::get(indices);
+      auto *const poison = PoisonValue::get(newPtrTy);
+      for (auto &vecPtr : ptrPacket) {
+        vecPtr = B.CreateBitCast(vecPtr, newPtrTy);
+        vecPtr = B.CreateShuffleVector(vecPtr, poison, widenMask);
+        vecPtr = B.CreateInBoundsGEP(scalarTy, vecPtr, idxVector);
+      }
+    }
+
+    ValuePacket dataPacket;
+    if (data) {
+      auto src = packetize(data);
+      PACK_FAIL_IF(!src);
+      src.getPacketValues(packetWidth, dataPacket);
+      PACK_FAIL_IF(dataPacket.empty());
+    } else {
+      dataPacket.resize(packetWidth, nullptr);
+    }
+
+    // Vector-predicated scatters/gathers are always masked.
+    ValuePacket maskPacket(packetWidth, nullptr);
+    auto *const packetVecTy = getWideType(dataTy, factor);
+    if (mask || EVL) {
+      if (!mask) {
+        // If there's no mask then just splat a trivial one.
+        auto *const trueMask = createAllTrueMask(
+            B, multi_llvm::getVectorElementCount(packetVecTy));
+        std::fill(maskPacket.begin(), maskPacket.end(), trueMask);
+      } else {
+        maskPacket = packetizeAndGet(mask, packetWidth);
+        PACK_FAIL_IF(maskPacket.empty());
+      }
+    }
+
+    // Gather load or scatter store.
+    for (unsigned i = 0; i != packetWidth; ++i) {
+      if (op.isLoad()) {
+        auto *gather =
+            createGather(Ctx, packetVecTy, ptrPacket[i], maskPacket[i], EVL,
+                         op.getAlignment(), name);
+        PACK_FAIL_IF(!gather);
+        gather->insertBefore(op.getInstr()->getIterator());
+        results.push_back(gather);
+      } else {
+        auto *scatter =
+            createScatter(Ctx, dataPacket[i], ptrPacket[i], maskPacket[i], EVL,
+                          op.getAlignment(), name);
+        PACK_FAIL_IF(!scatter);
+        scatter->insertBefore(op.getInstr()->getIterator());
+        results.push_back(scatter);
+      }
+    }
+  } else if (!constantStrideVal || constantStride != 1) {
+    if (dataTy->isPointerTy() || dataTy->isVectorTy()) {
+      // No builtins for memops on pointer types, and we can't do interleaved
+      // memops over vector types.
+      return results;
+    }
+
+    ValuePacket dataPacket;
+    if (data) {
+      auto src = packetize(data);
+      PACK_FAIL_IF(!src);
+      src.getPacketValues(packetWidth, dataPacket);
+      PACK_FAIL_IF(dataPacket.empty());
+    } else {
+      dataPacket.resize(packetWidth, nullptr);
+    }
+
+    Value *packetStride = nullptr;
+    if (packetWidth != 1) {
+      // Make sure the stride is at least as wide as a GEP index needs to be
+      const unsigned indexBits = Ctx.dataLayout()->getIndexSizeInBits(
+          ptr->getType()->getPointerAddressSpace());
+      unsigned strideBits = stride->getType()->getPrimitiveSizeInBits();
+      auto *const elementStride =
+          (indexBits > strideBits)
+              ? 
B.CreateSExt(stride, B.getIntNTy((strideBits = indexBits))) + : stride; + + const auto simdWidth = factor.getFixedValue(); + packetStride = + B.CreateMul(elementStride, B.getIntN(strideBits, simdWidth), + Twine(name, ".packet_stride")); + } + + // Vector-predicated interleaved operations are always masked. + ValuePacket maskPacket(packetWidth, nullptr); + auto *const packetVecTy = getWideType(dataTy, factor); + if (mask || EVL) { + if (!mask) { + // If there's no mask then just splat a trivial one. + auto *const trueMask = createAllTrueMask( + B, multi_llvm::getVectorElementCount(packetVecTy)); + std::fill(maskPacket.begin(), maskPacket.end(), trueMask); + } else { + maskPacket = packetizeAndGet(mask, packetWidth); + PACK_FAIL_IF(maskPacket.empty()); + } + } + + // Interleaved (strided) load or store. + for (unsigned i = 0; i != packetWidth; ++i) { + if (i != 0) { + ptr = B.CreateInBoundsGEP(dataTy, ptr, packetStride, + Twine(name, ".incr")); + } + if (op.isLoad()) { + auto *newLoad = + createInterleavedLoad(Ctx, packetVecTy, ptr, stride, maskPacket[i], + EVL, op.getAlignment(), name); + newLoad->insertBefore(op.getInstr()->getIterator()); + results.push_back(newLoad); + } else { + auto *newStore = + createInterleavedStore(Ctx, dataPacket[i], ptr, stride, + maskPacket[i], EVL, op.getAlignment(), name); + newStore->insertBefore(op.getInstr()->getIterator()); + results.push_back(newStore); + } + } + } else { + ValuePacket dataPacket; + if (data) { + auto src = packetize(data); + PACK_FAIL_IF(!src); + src.getPacketValues(packetWidth, dataPacket); + PACK_FAIL_IF(dataPacket.empty()); + } else if (mask) { + // don't need the data packet for unmasked stores + dataPacket.resize(packetWidth, nullptr); + } + + Value *packetStride = nullptr; + if (packetWidth != 1) { + const auto simdWidth = factor.getFixedValue(); + packetStride = B.getInt64(simdWidth); + } + + // Calculate the alignment. The MemOp's alignment is the original + // alignment, but may be overaligned. After vectorization it can't be + // larger than the pointee element type. + unsigned alignment = op.getAlignment(); + const unsigned sizeInBits = + dataTy->getPrimitiveSizeInBits().getKnownMinValue(); + alignment = std::min(alignment, std::max(sizeInBits, 8u) / 8u); + + // Regular load or store. + if (mask) { + const bool isVectorMask = mask->getType()->isVectorTy(); + auto maskPacket = packetizeAndGet(mask, packetWidth); + PACK_FAIL_IF(maskPacket.empty()); + + // If the original instruction was a vector but the mask was a scalar i1, + // we have to broadcast the mask elements across the data vector. + auto *const vecTy = dyn_cast(dataTy); + if (vecTy && !isVectorMask) { + PACK_FAIL_IF(factor.isScalable()); + const unsigned simdWidth = factor.getFixedValue(); + const unsigned scalarWidth = vecTy->getNumElements(); + + // Build shuffle mask to widen the vector condition. 
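+        // E.g. VF=4 over <2 x T> data: widenMask = 0,0,1,1,2,2,3,3, repeating
+        // each scalar condition bit across its sub-vector's lanes
+        // (illustrative).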
+ SmallVector widenMask; + for (size_t i = 0; i < simdWidth; ++i) { + for (size_t j = 0; j < scalarWidth; ++j) { + widenMask.push_back(i); + } + } + + auto *const poison = PoisonValue::get(maskPacket.front()->getType()); + for (auto &vecMask : maskPacket) { + vecMask = createOptimalShuffle(B, vecMask, poison, widenMask); + } + } + + for (unsigned i = 0; i != packetWidth; ++i) { + if (i != 0) { + ptr = B.CreateInBoundsGEP(dataTy, ptr, packetStride, + Twine(name, ".incr")); + } + if (op.isLoad()) { + auto *newLoad = + createMaskedLoad(Ctx, getWideType(dataTy, factor), ptr, + maskPacket[i], EVL, op.getAlignment(), name); + newLoad->insertBefore(op.getInstr()->getIterator()); + results.push_back(newLoad); + } else { + auto *newStore = + createMaskedStore(Ctx, dataPacket[i], ptr, maskPacket[i], EVL, + op.getAlignment(), name); + newStore->insertBefore(op.getInstr()->getIterator()); + results.push_back(newStore); + } + } + } else { + const TargetInfo &VTI = Ctx.targetInfo(); + if (op.isLoad()) { + auto *const one = B.getInt64(1); + for (unsigned i = 0; i != packetWidth; ++i) { + if (i != 0) { + ptr = B.CreateInBoundsGEP(dataTy, ptr, packetStride, + Twine(name, ".incr")); + } + results.push_back(VTI.createLoad(B, getWideType(dataTy, factor), ptr, + one, alignment, EVL)); + } + } else { + auto *const one = B.getInt64(1); + for (unsigned i = 0; i != packetWidth; ++i) { + if (i != 0) { + ptr = B.CreateInBoundsGEP(dataTy, ptr, packetStride, + Twine(name, ".incr")); + } + results.push_back( + VTI.createStore(B, dataPacket[i], ptr, one, alignment, EVL)); + } + } + } + } + + // Transfer attributes from an old call instruction to a new one. + if (CallInst *oldCI = op.getCall()) { + for (auto *r : results) { + if (CallInst *newCI = dyn_cast_or_null(r)) { + newCI->setCallingConv(oldCI->getCallingConv()); + } + } + } + return results; +} + +ValuePacket Packetizer::Impl::packetizeMaskedAtomic( + CallInst &CI, VectorizationContext::MaskedAtomic AtomicInfo) { + ValuePacket results; + + const bool IsCmpXchg = AtomicInfo.isCmpXchg(); + + Value *const ptrArg = CI.getArgOperand(0); + Value *const valOrCmpArg = CI.getArgOperand(1); + Value *const maskArg = CI.getArgOperand(2 + IsCmpXchg); + + assert(AtomicInfo.ValTy == valOrCmpArg->getType() && "AtomicInfo mismatch"); + const auto packetWidth = getPacketWidthForType(valOrCmpArg->getType()); + + if (VL && packetWidth != 1) { + emitVeczRemarkMissed(&F, &CI, + "Can not vector-predicate packets larger than 1"); + return {}; + } + + ValuePacket valOrCmpPacket; + const Result valResult = packetize(valOrCmpArg); + PACK_FAIL_IF(!valResult); + valResult.getPacketValues(packetWidth, valOrCmpPacket); + PACK_FAIL_IF(valOrCmpPacket.empty()); + + ValuePacket newValPacket; + if (IsCmpXchg) { + Value *const newValArg = CI.getArgOperand(2); + const Result newValResult = packetize(newValArg); + PACK_FAIL_IF(!newValResult); + newValResult.getPacketValues(packetWidth, newValPacket); + PACK_FAIL_IF(newValPacket.empty()); + } + + ValuePacket ptrPacket; + const Result ptrResult = packetize(ptrArg); + PACK_FAIL_IF(!ptrResult); + ptrResult.getPacketValues(packetWidth, ptrPacket); + PACK_FAIL_IF(ptrPacket.empty()); + + ValuePacket maskPacket; + const Result maskResult = packetize(maskArg); + PACK_FAIL_IF(!maskResult); + maskResult.getPacketValues(packetWidth, maskPacket); + PACK_FAIL_IF(maskPacket.empty()); + + IRBuilder<> B(&CI); + IC.deleteInstructionLater(&CI); + + for (unsigned i = 0; i != packetWidth; ++i) { + auto *const ptr = ptrPacket[i]; + auto *const valOrCmp = valOrCmpPacket[i]; 
+ + AtomicInfo.ValTy = valOrCmp->getType(); + AtomicInfo.PointerTy = ptr->getType(); + auto *maskedAtomicF = + Ctx.getOrCreateMaskedAtomicFunction(AtomicInfo, Choices, SimdWidth); + PACK_FAIL_IF(!maskedAtomicF); + + SmallVector args = {ptr, valOrCmp}; + if (IsCmpXchg) { + args.push_back(newValPacket[i]); + } + args.push_back(maskPacket[i]); + if (AtomicInfo.IsVectorPredicated) { + assert(VL && "Missing vector length"); + args.push_back(VL); + } + + results.push_back(B.CreateCall(maskedAtomicF, args)); + } + + return results; +} + +void Packetizer::Impl::vectorizeDI(Instruction *, Value *) { + // FIXME: Reinstate support for vectorizing debug info + return; +} + +ValuePacket Packetizer::Impl::packetizeGEP(GetElementPtrInst *GEP) { + ValuePacket results; + Value *pointer = GEP->getPointerOperand(); + if (isa(pointer)) { + return results; + } + + if (isa(GEP->getType())) { + // instantiate vector GEPs, for safety + return results; + } + + // Work out the packet width from the pointed to type, rather than the + // pointer type itself, because this is the width the memops will be using. + auto *const ty = GEP->getSourceElementType(); + const auto packetWidth = getPacketWidthForType(ty); + + // It is legal to create a GEP with a mixture of scalar and vector operands. + // If any operand is a vector, the result will be a vector of pointers. + ValuePacket pointerPacket; + if (UVR.isVarying(pointer)) { + auto res = packetize(pointer); + PACK_FAIL_IF(!res); + res.getPacketValues(packetWidth, pointerPacket); + PACK_FAIL_IF(pointerPacket.empty()); + } else { + for (unsigned i = 0; i != packetWidth; ++i) { + pointerPacket.push_back(pointer); + } + } + + // Packetize the GEP indices. + SmallVector, 4> opPackets; + for (unsigned i = 0, n = GEP->getNumIndices(); i != n; i++) { + Value *idx = GEP->getOperand(i + 1); + opPackets.emplace_back(); + + // Handle constant indices + if (isa(idx)) { + for (unsigned j = 0; j < packetWidth; ++j) { + opPackets.back().push_back(idx); + } + } else { + auto op = packetize(idx); + PACK_FAIL_IF(!op); + op.getPacketValues(packetWidth, opPackets.back()); + PACK_FAIL_IF(opPackets.back().empty()); + } + } + + IRBuilder<> B(GEP); + IC.deleteInstructionLater(GEP); + + const bool inBounds = GEP->isInBounds(); + const auto name = GEP->getName(); + + const auto numIndices = opPackets.size(); + SmallVector opVals; + opVals.resize(numIndices); + for (unsigned i = 0; i < packetWidth; ++i) { + for (unsigned j = 0; j < numIndices; ++j) { + opVals[j] = opPackets[j][i]; + } + + if (inBounds) { + results.push_back( + B.CreateInBoundsGEP(ty, pointerPacket[i], opVals, name)); + } else { + results.push_back(B.CreateGEP(ty, pointerPacket[i], opVals, name)); + } + } + return results; +} + +ValuePacket Packetizer::Impl::packetizeBinaryOp(BinaryOperator *BinOp) { + ValuePacket results; + auto packetWidth = getPacketWidthForType(BinOp->getType()); + + auto LHS = packetizeAndGet(BinOp->getOperand(0), packetWidth); + auto RHS = packetizeAndGet(BinOp->getOperand(1), packetWidth); + PACK_FAIL_IF(LHS.empty() || RHS.empty()); + + auto opcode = BinOp->getOpcode(); + auto name = BinOp->getName(); + IRBuilder<> B(BinOp); + if (VL) { + auto *const VecTy = LHS[0]->getType(); + // Support for VP legalization is still lacking so fall back to non-VP + // operations in other cases. This support will improve over time. 
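+    // E.g. when legal, an i32 'add' at <vscale x 4 x i32> becomes a call to
+    // llvm.vp.add.nxv4i32(lhs, rhs, <all-true mask>, EVL) (illustrative).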
+ if (Ctx.targetInfo().isVPVectorLegal(F, VecTy)) { + PACK_FAIL_IF(packetWidth != 1); + auto VPId = VPIntrinsic::getForOpcode(opcode); + PACK_FAIL_IF(VPId == Intrinsic::not_intrinsic); + auto *const Mask = createAllTrueMask( + B, multi_llvm::getVectorElementCount(LHS[0]->getType())); + // Scale the base length by the number of vector elements, where + // appropriate. + Value *EVL = VL; + if (auto *const VecTy = dyn_cast(BinOp->getType())) { + EVL = B.CreateMul( + EVL, + B.getInt32( + multi_llvm::getVectorElementCount(VecTy).getKnownMinValue())); + } + auto *const NewBinOp = B.CreateIntrinsic(VPId, {LHS[0]->getType()}, + {LHS[0], RHS[0], Mask, EVL}); + NewBinOp->copyIRFlags(BinOp, true); + NewBinOp->copyMetadata(*BinOp); + results.push_back(NewBinOp); + return results; + } + // If we haven't matched [us]div or [us]rem then we may be executing + // out-of-bounds elements if we don't predicate. Since this isn't safe, + // bail. + PACK_FAIL_IF( + opcode == BinaryOperator::UDiv || opcode == BinaryOperator::SDiv || + opcode == BinaryOperator::URem || opcode == BinaryOperator::SRem); + } + for (unsigned i = 0; i < packetWidth; ++i) { + auto *const NewV = B.CreateBinOp(opcode, LHS[i], RHS[i], name); + if (auto *const NewBinOp = dyn_cast(NewV)) { + NewBinOp->copyIRFlags(BinOp, true); + NewBinOp->copyMetadata(*BinOp); + } + results.push_back(NewV); + } + return results; +} + +ValuePacket Packetizer::Impl::packetizeFreeze(FreezeInst *FreezeI) { + ValuePacket results; + auto resC = packetize(FreezeI->getOperand(0)); + PACK_FAIL_IF(!resC); + + SmallVector src; + resC.getPacketValues(src); + PACK_FAIL_IF(src.empty()); + + const auto packetWidth = src.size(); + const auto name = FreezeI->getName(); + + IRBuilder<> B(FreezeI); + for (unsigned i = 0; i < packetWidth; ++i) { + results.push_back(B.CreateFreeze(src[i], name)); + } + return results; +} + +ValuePacket +Packetizer::Impl::packetizeAtomicCmpXchg(AtomicCmpXchgInst *AtomicI) { + ValuePacket results; + + VectorizationContext::MaskedAtomic MA; + MA.VF = SimdWidth; + MA.IsVectorPredicated = VU.choices().vectorPredication(); + + MA.Align = AtomicI->getAlign(); + MA.BinOp = AtomicRMWInst::BAD_BINOP; + MA.IsWeak = AtomicI->isWeak(); + MA.IsVolatile = AtomicI->isVolatile(); + MA.Ordering = AtomicI->getSuccessOrdering(); + MA.CmpXchgFailureOrdering = AtomicI->getFailureOrdering(); + MA.SyncScope = AtomicI->getSyncScopeID(); + + IRBuilder<> B(AtomicI); + + // Set up the arguments to this function + Value *Ptr = packetize(AtomicI->getPointerOperand()).getAsValue(); + Value *Cmp = packetize(AtomicI->getCompareOperand()).getAsValue(); + Value *New = packetize(AtomicI->getNewValOperand()).getAsValue(); + + MA.ValTy = Cmp->getType(); + MA.PointerTy = Ptr->getType(); + + auto *const TrueMask = createAllTrueMask(B, SimdWidth); + SmallVector MaskedFnArgs = {Ptr, Cmp, New, TrueMask}; + if (VL) { + MaskedFnArgs.push_back(VL); + } + + Function *MaskedAtomicFn = + Ctx.getOrCreateMaskedAtomicFunction(MA, VU.choices(), SimdWidth); + PACK_FAIL_IF(!MaskedAtomicFn); + + CallInst *MaskedCI = B.CreateCall(MaskedAtomicFn, MaskedFnArgs); + + results.push_back(MaskedCI); + + return results; +} + +ValuePacket Packetizer::Impl::packetizeUnaryOp(UnaryOperator *UnOp) { + ValuePacket results; + + auto opcode = UnOp->getOpcode(); + + auto packetWidth = getPacketWidthForType(UnOp->getType()); + auto src = packetizeAndGet(UnOp->getOperand(0), packetWidth); + PACK_FAIL_IF(src.empty()); + + auto name = UnOp->getName(); + IRBuilder<> B(UnOp); + for (unsigned i = 0; i < packetWidth; ++i) { + 
Value *New = B.CreateUnOp(opcode, src[i], name); + auto *NewUnOp = cast(New); + NewUnOp->copyIRFlags(UnOp, true); + results.push_back(NewUnOp); + } + return results; +} + +ValuePacket Packetizer::Impl::packetizeCast(CastInst *CastI) { + ValuePacket results; + + auto *const ty = CastI->getType(); + auto packetWidth = std::max(getPacketWidthForType(ty), + getPacketWidthForType(CastI->getSrcTy())); + + auto src = packetizeAndGet(CastI->getOperand(0), packetWidth); + PACK_FAIL_IF(src.empty()); + + auto *const wideTy = + getWideType(ty, SimdWidth.divideCoefficientBy(packetWidth)); + auto name = CastI->getName(); + IRBuilder<> B(CastI); + for (unsigned i = 0; i < packetWidth; ++i) { + results.push_back(B.CreateCast(CastI->getOpcode(), src[i], wideTy, name)); + } + return results; +} + +ValuePacket Packetizer::Impl::packetizeICmp(ICmpInst *Cmp) { + ValuePacket results; + auto packetWidth = getPacketWidthForType(Cmp->getOperand(0)->getType()); + + auto LHS = packetizeAndGet(Cmp->getOperand(0), packetWidth); + auto RHS = packetizeAndGet(Cmp->getOperand(1), packetWidth); + PACK_FAIL_IF(LHS.empty() || RHS.empty()); + + auto pred = Cmp->getPredicate(); + auto name = Cmp->getName(); + IRBuilder<> B(Cmp); + for (unsigned i = 0; i < packetWidth; ++i) { + auto *const NewICmp = B.CreateICmp(pred, LHS[i], RHS[i], name); + if (isa(NewICmp)) { + cast(NewICmp)->copyIRFlags(Cmp, true); + } + results.push_back(NewICmp); + } + return results; +} + +ValuePacket Packetizer::Impl::packetizeFCmp(FCmpInst *Cmp) { + ValuePacket results; + auto packetWidth = getPacketWidthForType(Cmp->getOperand(0)->getType()); + + auto LHS = packetizeAndGet(Cmp->getOperand(0), packetWidth); + auto RHS = packetizeAndGet(Cmp->getOperand(1), packetWidth); + PACK_FAIL_IF(LHS.empty() || RHS.empty()); + + auto pred = Cmp->getPredicate(); + auto name = Cmp->getName(); + IRBuilder<> B(Cmp); + for (unsigned i = 0; i < packetWidth; ++i) { + auto *NewICmp = cast(B.CreateFCmp(pred, LHS[i], RHS[i], name)); + NewICmp->copyIRFlags(Cmp, true); + results.push_back(NewICmp); + } + return results; +} + +ValuePacket Packetizer::Impl::packetizeSelect(SelectInst *Select) { + ValuePacket results; + auto *const ty = Select->getType(); + if (!ty->isVectorTy() && !VectorType::isValidElementType(ty)) { + // Selects can work on struct/aggregate types, but we can't widen them.. + return results; + } + + auto packetWidth = getPacketWidthForType(ty); + auto vecT = packetizeAndGet(Select->getOperand(1), packetWidth); + auto vecF = packetizeAndGet(Select->getOperand(2), packetWidth); + PACK_FAIL_IF(vecT.empty() || vecF.empty()); + + auto *cond = Select->getOperand(0); + auto resC = packetize(cond); + PACK_FAIL_IF(!resC); + + IRBuilder<> B(Select); + const bool isVectorSelect = cond->getType()->isVectorTy(); + SmallVector vecC; + if (UVR.isVarying(cond)) { + resC.getPacketValues(packetWidth, vecC); + PACK_FAIL_IF(vecC.empty()); + + // If the original select returns a vector, but the condition was scalar, + // and its packet members are widened, we have to sub-broadcast it across + // the lanes of the original vector. 
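+    // E.g. a select returning <2 x float> at VF=4: a packetized scalar
+    // condition <c0,c1,c2,c3> must become <c0,c0,c1,c1,c2,c2,c3,c3> to line
+    // up with the widened data (illustrative).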
+ if (!isVectorSelect && vecC.front()->getType()->isVectorTy()) { + if (auto *vecTy = dyn_cast(Select->getType())) { + PACK_FAIL_IF(!createSubSplats(Ctx.targetInfo(), B, vecC, + vecTy->getNumElements())); + } + } + } else if (isVectorSelect) { + // If the condition is a uniform vector, get its broadcast packets + resC.getPacketValues(packetWidth, vecC); + PACK_FAIL_IF(vecC.empty()); + } else { + // If the condition is a uniform scalar, we can just use it as is + vecC.assign(packetWidth, cond); + } + + auto name = Select->getName(); + for (unsigned i = 0; i < packetWidth; ++i) { + results.push_back(B.CreateSelect(vecC[i], vecT[i], vecF[i], name)); + } + return results; +} + +Value *Packetizer::Impl::vectorizeReturn(ReturnInst *Return) { + IRBuilder<> B(Return); + Value *Op = packetize(Return->getOperand(0)).getAsValue(); + VECZ_FAIL_IF(!Op); + IC.deleteInstructionLater(Return); + return B.CreateRet(Op); +} + +Value *Packetizer::Impl::vectorizeCall(CallInst *CI) { + Function *Callee = CI->getCalledFunction(); + VECZ_STAT_FAIL_IF(!Callee, VeczPacketizeFailCall); + + IRBuilder<> B(CI); + // Handle LLVM intrinsics. + if (Callee->isIntrinsic()) { + Value *Result = nullptr; + auto IntrID = Intrinsic::ID(Callee->getIntrinsicID()); + if (IntrID == Intrinsic::fmuladd || IntrID == Intrinsic::fma) { + SmallVector Ops; + SmallVector Tys; + for (unsigned i = 0; i < 3; ++i) { + Value *P = packetize(CI->getOperand(i)).getAsValue(); + VECZ_FAIL_IF(!P); + Ops.push_back(P); + } + Tys.push_back(getWideType(CI->getType(), SimdWidth)); + Result = B.CreateIntrinsic(IntrID, Tys, Ops, CI, CI->getName()); + } + + if (Result) { + IC.deleteInstructionLater(CI); + return Result; + } + } + + // Handle internal builtins. + if (Ctx.isInternalBuiltin(Callee)) { + // These should have been handled by packetizeCall, if not, off to the + // instantiator they go... + if (auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked)) { + if (MaskedOp->isMaskedMemOp()) { + return nullptr; + } + } + } + + if (VectorizationContext::isVector(*CI)) { + return nullptr; + } + + // Handle external builtins. + const compiler::utils::BuiltinInfo &BI = Ctx.builtins(); + if (const auto Builtin = BI.analyzeBuiltinCall(*CI, Dimension)) { + if (Builtin->properties & compiler::utils::eBuiltinPropertyExecutionFlow) { + return nullptr; + } + if (Builtin->properties & compiler::utils::eBuiltinPropertyWorkItem) { + return vectorizeWorkGroupCall(CI, *Builtin); + } + } + + // Try to find a unit for this builtin. + auto CalleeVec = Ctx.getVectorizedFunction(*Callee, SimdWidth); + if (!CalleeVec) { + // No vectorization strategy found. Fall back on Instantiation. + return nullptr; + } + IC.deleteInstructionLater(CI); + + // Vectorize call operands. + unsigned i = 0; + AllocaInst *PointerRetAlloca = nullptr; + Value *PointerRetAddr = nullptr; + int PointerRetStride = 0; + SmallVector Ops; + for (const auto &TargetArg : CalleeVec.args) { + // Handle scalar arguments. + Value *ScalarOp = CI->getArgOperand(i); + Type *ScalarTy = ScalarOp->getType(); + if (TargetArg.kind == VectorizationResult::Arg::POINTER_RETURN) { + // 'Pointer return' arguments that are not sequential need to be handled + // specially. 
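+      // E.g. a builtin such as fract(x, float *out): the vectorized callee
+      // expects a pointer to <VF x float>, so a strided destination is given
+      // a temporary alloca whose contents are written back after the call
+      // with an interleaved store (illustrative).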
+      auto *const PtrTy = dyn_cast<PointerType>(ScalarOp->getType());
+      auto *const PtrEleTy = TargetArg.pointerRetPointeeTy;
+      Value *Stride = SAR.buildMemoryStride(B, ScalarOp, PtrEleTy);
+      VECZ_STAT_FAIL_IF(!Stride, VeczPacketizeFailStride);
+      bool hasConstantStride = false;
+      int64_t ConstantStride = 0;
+      if (ConstantInt *CInt = dyn_cast<ConstantInt>(Stride)) {
+        ConstantStride = CInt->getSExtValue();
+        hasConstantStride = true;
+      }
+      VECZ_STAT_FAIL_IF(!hasConstantStride || ConstantStride < 1,
+                        VeczPacketizeFailStride);
+      if (ConstantStride == 1) {
+        Ops.push_back(B.CreateBitCast(ScalarOp, TargetArg.type));
+        i++;
+        continue;
+      }
+      // Create an alloca in the function's entry block. The alloca will be
+      // passed instead of the original pointer. After the function call, the
+      // value from the alloca will be loaded sequentially and stored to the
+      // original address using an interleaved store.
+      VECZ_STAT_FAIL_IF(!PtrTy || PointerRetAddr, VeczPacketizeFailPtr);
+      BasicBlock *BB = CI->getParent();
+      VECZ_FAIL_IF(!BB);
+      Function *F = BB->getParent();
+      VECZ_FAIL_IF(!F);
+      BasicBlock &EntryBB = F->getEntryBlock();
+      B.SetInsertPoint(&*EntryBB.getFirstInsertionPt());
+      Type *AllocaTy = getWideType(PtrEleTy, SimdWidth);
+      PointerRetAlloca = B.CreateAlloca(AllocaTy, nullptr, "ptr_ret_temp");
+      Value *NewOp = B.CreateAddrSpaceCast(PointerRetAlloca, PtrTy);
+      PointerRetAddr = ScalarOp;
+      PointerRetStride = ConstantStride;
+      Ops.push_back(NewOp);
+      i++;
+      continue;
+    } else if (TargetArg.kind != VectorizationResult::Arg::VECTORIZED) {
+      Ops.push_back(ScalarOp);
+      i++;
+      continue;
+    }
+
+    // Make sure the type is correct for vector arguments.
+    auto VectorTy = dyn_cast<VectorType>(TargetArg.type);
+    VECZ_STAT_FAIL_IF(!VectorTy || VectorTy->getElementType() != ScalarTy,
+                      VeczPacketizeFailType);
+
+    // Vectorize scalar operands.
+    Value *VecOp = packetize(ScalarOp).getAsValue();
+    VECZ_FAIL_IF(!VecOp);
+    Ops.push_back(VecOp);
+    i++;
+  }
+
+  CallInst *NewCI = B.CreateCall(CalleeVec.get(), Ops, CI->getName());
+  NewCI->setCallingConv(CI->getCallingConv());
+  if (PointerRetAddr) {
+    // Load the 'pointer return' value from the alloca and store it to the
+    // original address using an interleaved store.
+    LoadInst *PointerRetResult =
+        B.CreateLoad(PointerRetAlloca->getAllocatedType(), PointerRetAlloca);
+    Value *Stride = getSizeInt(B, PointerRetStride);
+    auto *Store =
+        createInterleavedStore(Ctx, PointerRetResult, PointerRetAddr, Stride,
+                               /*Mask*/ nullptr, /*EVL*/ nullptr,
+                               PointerRetAlloca->getAlign().value());
+    if (!Store) {
+      return nullptr;
+    }
+    Store->insertBefore(B.GetInsertPoint());
+  }
+  return NewCI;
+}
+
+Value *Packetizer::Impl::vectorizeWorkGroupCall(
+    CallInst *CI, const compiler::utils::BuiltinCall &Builtin) {
+  // Insert instructions after the call to the builtin, since they reference
+  // the result of that call.
+  IRBuilder<> B(buildAfter(CI, F));
+
+  // Do not vectorize ranks other than the vectorization dimension; the value
+  // of get_global_id with the other ranks is uniform.
+
+  Value *IDToSplat = CI;
+  // Multiply the sub-group local ID by the vectorization factor, to vectorize
+  // across the entire sub-group size.
+  // For example, with a vector width of 4 and a mux sub-group size of 2, the
+  // apparent sub-group size is 8 and the sub-group IDs are:
+  //     | mux sub group 0 | mux sub group 1 |
+  //     |-----------------|-----------------|
+  //     |  0   1   2   3  |  4   5   6   7  |
+  if (Builtin.ID == compiler::utils::eMuxBuiltinGetSubGroupLocalId) {
+    IDToSplat = B.CreateMul(
+        IDToSplat, B.CreateElementCount(IDToSplat->getType(), SimdWidth));
+  }
+
+  // Broadcast the builtin's return value.
+  Value *Splat = B.CreateVectorSplat(SimdWidth, IDToSplat);
+
+  // Add an index sequence [0, 1, 2, ...] to the value unless uniform.
+  const auto Uniformity = Builtin.uniformity;
+  if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID ||
+      Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
+    Value *StepVector =
+        createIndexSequence(B, cast<VectorType>(Splat->getType()), "index.vec");
+    VECZ_FAIL_IF(!StepVector);
+
+    Value *Result = B.CreateAdd(Splat, StepVector);
+
+    if (Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
+      Value *Rank = CI->getArgOperand(0);
+
+      // If the Rank is varying, we need to packetize it as well!
+      if (UVR.isVarying(Rank)) {
+        Rank = packetize(Rank).getAsValue();
+        VECZ_FAIL_IF(!Rank);
+      }
+      Value *dim = ConstantInt::get(Rank->getType(), Dimension);
+      Value *Test = B.CreateICmpEQ(Rank, dim);
+      Result = B.CreateSelect(Test, Result, Splat, "maybe_rank");
+    }
+    return Result;
+  } else if (Uniformity == compiler::utils::eBuiltinUniformityNever) {
+    VECZ_FAIL();
+  }
+
+  return Splat;
+}
+
+Value *Packetizer::Impl::vectorizeAlloca(AllocaInst *alloca) {
+  // We create an array allocation here, because the resulting value needs to
+  // represent a vector of pointers, not a pointer to vector. As such, it's a
+  // bit of a trick to handle scalable vectorization factors, since that would
+  // require creating instructions *before* the alloca to get the array
+  // length, which could be a surprise to some of our later passes that expect
+  // allocas to be grouped at the top of the first Basic Block. This is not an
+  // LLVM requirement, however, so it should be investigated.
+  //
+  // Note that normally, an alloca would not be packetized anyway: access is
+  // contiguous, Load and Store operations don't need to packetize their
+  // pointer operand, and the alloca would be widened after packetization,
+  // which has no trouble with scalables. This function is required for the
+  // case that some pointer-dependent instruction unexpectedly fails to
+  // packetize and falls back to instantiation, in which case we need a
+  // pointer per lane. In actual fact, "normal" alloca vectorization is not
+  // very common, since such allocas tend to be easy to remove by the
+  // Mem-to-Reg pass, so this "edge case" is actually the most likely.
+  //
+  VECZ_FAIL_IF(SimdWidth.isScalable());
+  const unsigned fixedWidth = SimdWidth.getFixedValue();
+  IRBuilder<> B(alloca);
+  auto *const ty = alloca->getAllocatedType();
+  AllocaInst *wideAlloca =
+      B.CreateAlloca(ty, getSizeInt(B, fixedWidth), alloca->getName());
+  wideAlloca->setAlignment(alloca->getAlign());
+
+  // Put the GEP after all allocas (see the sketch below).
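The widening performed here can be summarized as: allocate `W` elements instead of one, then GEP over a constant index vector so the result is one pointer per lane. A hedged standalone sketch, fixed width only (matching the `VECZ_FAIL_IF(SimdWidth.isScalable())` guard above); names are illustrative:

```cpp
#include <llvm/IR/Constants.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Instructions.h>

// Illustration: a GEP whose index operand is a vector yields a vector of
// pointers, one per SIMD lane, into the widened alloca.
static llvm::Value *perLanePointers(llvm::IRBuilder<> &B,
                                    llvm::AllocaInst *wide, llvm::Type *eltTy,
                                    unsigned W) {
  llvm::SmallVector<llvm::Constant *, 8> lanes;
  auto *i64 = llvm::Type::getInt64Ty(B.getContext());
  for (unsigned i = 0; i != W; ++i) {
    lanes.push_back(llvm::ConstantInt::get(i64, i));
  }
  llvm::Value *indices = llvm::ConstantVector::get(lanes); // <0,1,...,W-1>
  return B.CreateInBoundsGEP(eltTy, wide, indices, "lanes");
}
```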
+ Instruction *insertPt = alloca; + while (isa(*insertPt)) { + insertPt = insertPt->getNextNode(); + } + B.SetInsertPoint(insertPt); + deleteInstructionLater(alloca); + + auto *const idxTy = Ctx.dataLayout()->getIndexType(wideAlloca->getType()); + Value *const indices = + createIndexSequence(B, VectorType::get(idxTy, SimdWidth)); + + return B.CreateInBoundsGEP(ty, wideAlloca, ArrayRef{indices}, + Twine(alloca->getName(), ".lanes")); +} + +Value *Packetizer::Impl::vectorizeExtractValue(ExtractValueInst *ExtractValue) { + IRBuilder<> B(buildAfter(ExtractValue, F)); + + Value *Aggregate = + packetize(ExtractValue->getAggregateOperand()).getAsValue(); + SmallVector Indices; + Indices.push_back(0); + for (auto Index : ExtractValue->indices()) { + Indices.push_back(Index); + } + + SmallVector Extracts; + + VECZ_FAIL_IF(SimdWidth.isScalable()); + auto Width = SimdWidth.getFixedValue(); + + // Check that the width is non-zero so the zeroth element is initialized. + VECZ_FAIL_IF(Width < 1); + + for (decltype(Width) i = 0; i < Width; i++) { + Indices[0] = i; + Extracts.push_back(B.CreateExtractValue(Aggregate, Indices)); + } + + Type *CompositeTy = getWideType(Extracts[0]->getType(), SimdWidth); + Value *Result = PoisonValue::get(CompositeTy); + for (decltype(Width) i = 0; i < Width; i++) { + Result = B.CreateInsertElement(Result, Extracts[i], B.getInt32(i)); + } + + return Result; +} + +ValuePacket +Packetizer::Impl::packetizeInsertElement(InsertElementInst *InsertElement) { + ValuePacket results; + Value *Result = nullptr; + + Value *Into = InsertElement->getOperand(0); + assert(Into && "Could not get operand 0 of InsertElement"); + const auto ScalarWidth = multi_llvm::getVectorNumElements(Into->getType()); + + Value *Elt = InsertElement->getOperand(1); + Value *Index = InsertElement->getOperand(2); + assert(Elt && "Could not get operand 1 of InsertElement"); + assert(Index && "Could not get operand 2 of InsertElement"); + + if (SimdWidth.isScalable()) { + auto packetWidth = getPacketWidthForType(Into->getType()); + auto intoVals = packetizeAndGet(Into, packetWidth); + // Scalable vectorization (currently) only ever generates 1 packet + PACK_FAIL_IF(intoVals.size() != 1); + Value *packetizedInto = intoVals.front(); + + auto eltPacketWidth = getPacketWidthForType(Elt->getType()); + auto eltVals = packetizeAndGet(Elt, eltPacketWidth); + // Scalable vectorization (currently) only ever generates 1 packet + PACK_FAIL_IF(eltVals.size() != 1); + Value *packetizedElt = eltVals.front(); + + Value *packetizedIndices = packetizeIfVarying(Index); + + auto *packetizedEltTy = packetizedElt->getType(); + auto *packetizedIntoTy = packetizedInto->getType(); + auto *scalarTy = packetizedEltTy->getScalarType(); + + // Compiler support for masked.gather/riscv.vrgather* on i1 vectors is + // lacking, so emit this operation as the equivalent i8 vector instead. 
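The i1-to-i8 workaround referenced in this comment (and again in `packetizeExtractElement` further down) follows a sign-extend/operate/truncate pattern. A minimal sketch, with the hypothetical `doGather` standing in for the target-specific operation:

```cpp
#include <llvm/IR/DerivedTypes.h>
#include <llvm/IR/IRBuilder.h>

// Widen an i1 vector to i8 before an element-manipulation operation with
// poor i1 support, then truncate the result back to i1.
static llvm::Value *gatherAsI8(llvm::IRBuilder<> &B, llvm::Value *i1Vec,
                               llvm::Value *(*doGather)(llvm::IRBuilder<> &,
                                                        llvm::Value *)) {
  auto *origTy = llvm::cast<llvm::VectorType>(i1Vec->getType());
  auto *i8Ty = llvm::VectorType::get(B.getInt8Ty(), origTy->getElementCount());
  llvm::Value *wide = B.CreateSExt(i1Vec, i8Ty); // i1 -> i8
  llvm::Value *res = doGather(B, wide);          // operate on i8 lanes
  return B.CreateTrunc(res, origTy);             // i8 -> i1
}
```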
+    auto *const origPacketizedIntoTy = packetizedIntoTy;
+    const bool upcastI1AsI8 = scalarTy->isIntegerTy(1);
+    IRBuilder<> B(buildAfter(InsertElement, F));
+    if (upcastI1AsI8) {
+      auto *const int8Ty = Type::getInt8Ty(F.getContext());
+      packetizedIntoTy = llvm::VectorType::get(
+          int8Ty, multi_llvm::getVectorElementCount(packetizedIntoTy));
+      packetizedEltTy = llvm::VectorType::get(
+          int8Ty, multi_llvm::getVectorElementCount(packetizedEltTy));
+      packetizedElt = B.CreateSExt(packetizedElt, packetizedEltTy);
+      packetizedInto = B.CreateSExt(packetizedInto, packetizedIntoTy);
+    }
+
+    // If we're vector predicating, scale the vector length up by the original
+    // number of vector elements.
+    auto *const EVL = VL ? B.CreateMul(VL, B.getInt32(ScalarWidth)) : nullptr;
+
+    auto *packetizedInsert = Ctx.targetInfo().createScalableInsertElement(
+        B, Ctx, InsertElement, packetizedElt, packetizedInto, packetizedIndices,
+        EVL);
+
+    // If we've been performing this broadcast as i8, now's the time to
+    // truncate back down to i1.
+    if (upcastI1AsI8) {
+      packetizedInsert = B.CreateTrunc(packetizedInsert, origPacketizedIntoTy);
+    }
+
+    IC.deleteInstructionLater(InsertElement);
+    results.push_back(packetizedInsert);
+    return results;
+  }
+
+  auto Width = SimdWidth.getFixedValue();
+
+  IRBuilder<> B(buildAfter(InsertElement, F));
+
+  const auto Name = InsertElement->getName();
+  if (auto *CIndex = dyn_cast<ConstantInt>(Index)) {
+    auto IdxVal = CIndex->getZExtValue();
+
+    auto packetWidth = getPacketWidthForType(Into->getType());
+    PACK_FAIL_IF(packetWidth == Width);
+
+    auto Intos = packetizeAndGet(Into, packetWidth);
+    PACK_FAIL_IF(Intos.empty());
+
+    auto res = packetize(Elt);
+    PACK_FAIL_IF(!res);
+
+    if (res.info->numInstances == 0) {
+      // If the element was broadcast, it's better just to create more insert
+      // element instructions.
+      const auto instanceWidth =
+          multi_llvm::getVectorNumElements(Intos.front()->getType());
+      for (unsigned i = 0; i < packetWidth; ++i) {
+        results.push_back(Intos[i]);
+        for (unsigned j = IdxVal; j < instanceWidth; j += ScalarWidth) {
+          results.back() =
+              B.CreateInsertElement(results.back(), Elt, B.getInt32(j), Name);
+        }
+      }
+      return results;
+    }
+
+    SmallVector<Value *> Elts;
+    res.getPacketValues(packetWidth, Elts);
+    PACK_FAIL_IF(Elts.empty());
+
+    const auto *VecTy = cast<FixedVectorType>(Intos.front()->getType());
+    const unsigned VecWidth = VecTy->getNumElements();
+    PACK_FAIL_IF(VecWidth == ScalarWidth);
+    {
+      // We can only shuffle two vectors of the same size, so redistribute
+      // the packetized elements vector.
+      SmallVector<int> Mask;
+      for (size_t i = 0; i < VecWidth; ++i) {
+        Mask.push_back(i / ScalarWidth);
+      }
+
+      auto *Undef = PoisonValue::get(Elts.front()->getType());
+      for (unsigned i = 0; i < packetWidth; ++i) {
+        results.push_back(createOptimalShuffle(B, Elts[i], Undef, Mask, Name));
+      }
+    }
+    if (isa<UndefValue>(Into)) {
+      // Inserting into nothing, so we can just use it as is.
+ return results; + } else { + SmallVector Mask; + for (size_t i = 0; i < VecWidth; ++i) { + int j = VecWidth + i; + if (i == IdxVal) { + j = i; + IdxVal += ScalarWidth; + } + Mask.push_back(j); + } + + for (unsigned i = 0; i < packetWidth; ++i) { + results[i] = createOptimalShuffle(B, results[i], Intos[i], Mask, Name); + } + return results; + } + } else { + Into = packetize(Into).getAsValue(); + PACK_FAIL_IF(!Into); + Value *Elts = packetizeIfVarying(Elt); + PACK_FAIL_IF(!Elts); + Value *Indices = packetizeIfVarying(Index); + PACK_FAIL_IF(!Indices); + + Result = Into; + if (Indices != Index) { + Type *IdxTy = Index->getType(); + SmallVector Offsets; + for (size_t i = 0; i < Width; ++i) { + Offsets.push_back(ConstantInt::get(IdxTy, i * ScalarWidth)); + } + Value *Add = B.CreateAdd(Indices, ConstantVector::get(Offsets)); + + for (size_t i = 0; i < Width; ++i) { + Value *ExtractElt = + (Elts != Elt) ? B.CreateExtractElement(Elts, B.getInt32(i)) : Elt; + Value *ExtractIdx = B.CreateExtractElement(Add, B.getInt32(i)); + Result = B.CreateInsertElement(Result, ExtractElt, ExtractIdx, Name); + } + } else { + for (size_t i = 0; i < Width; ++i) { + Value *ExtractElt = + (Elts != Elt) ? B.CreateExtractElement(Elts, B.getInt32(i)) : Elt; + Value *InsertIdx = B.CreateAdd(Index, B.getInt32(i * ScalarWidth)); + Result = B.CreateInsertElement(Result, ExtractElt, InsertIdx, Name); + } + } + } + IC.deleteInstructionLater(InsertElement); + results.push_back(Result); + return results; +} + +ValuePacket +Packetizer::Impl::packetizeExtractElement(ExtractElementInst *ExtractElement) { + ValuePacket results; + Value *Result = nullptr; + + Value *Src = ExtractElement->getOperand(0); + Value *Index = ExtractElement->getOperand(1); + assert(Src && "Could not get operand 0 of ExtractElement"); + assert(Index && "Could not get operand 1 of ExtractElement"); + + if (SimdWidth.isScalable()) { + auto packetWidth = getPacketWidthForType(Src->getType()); + auto srcVals = packetizeAndGet(Src, packetWidth); + // Scalable vectorization (currently) only ever generates 1 packet + PACK_FAIL_IF(srcVals.size() != 1); + Value *packetizedSrc = srcVals.front(); + + Value *packetizedIndices = packetizeIfVarying(Index); + + Value *packetizedExtract = [&]() { + IRBuilder<> B(buildAfter(ExtractElement, F)); + + auto *narrowTy = getWideType(ExtractElement->getType(), SimdWidth); + auto *const origNarrowTy = narrowTy; + auto *origSrc = ExtractElement->getOperand(0); + auto *origTy = origSrc->getType(); + auto *eltTy = origTy->getScalarType()->getScalarType(); + + // Compiler support for masked.gather/riscv.vrgather* on i1 + // vectors is lacking, so emit this operation as the equivalent + // i8 vector instead. 
+ const bool upcastI1AsI8 = eltTy->isIntegerTy(/*BitWidth*/ 1); + if (upcastI1AsI8) { + auto *const int8Ty = B.getInt8Ty(); + auto *wideTy = llvm::VectorType::get( + int8Ty, + multi_llvm::getVectorElementCount(packetizedSrc->getType())); + narrowTy = llvm::VectorType::get( + int8Ty, multi_llvm::getVectorElementCount(narrowTy)); + packetizedSrc = B.CreateSExt(packetizedSrc, wideTy); + } + + Value *extract = Ctx.targetInfo().createScalableExtractElement( + B, Ctx, ExtractElement, narrowTy, packetizedSrc, packetizedIndices, + VL); + + // If we've been performing this broadcast as i8, now's the time to + // truncate back down to i1 + if (extract && upcastI1AsI8) { + extract = B.CreateTrunc(extract, origNarrowTy); + } + + return extract; + }(); + PACK_FAIL_IF(!packetizedExtract); + + IC.deleteInstructionLater(ExtractElement); + results.push_back(packetizedExtract); + return results; + } + + auto Width = SimdWidth.getFixedValue(); + + const auto ScalarWidth = multi_llvm::getVectorNumElements(Src->getType()); + + IRBuilder<> B(buildAfter(ExtractElement, F)); + const auto Name = ExtractElement->getName(); + if (auto *CIndex = dyn_cast(Index)) { + auto IdxVal = CIndex->getZExtValue(); + + auto packetWidth = getPacketWidthForType(ExtractElement->getType()); + auto srcVals = packetizeAndGet(Src, packetWidth); + PACK_FAIL_IF(srcVals.empty()); + + auto resultWidth = Width / packetWidth; + if (packetWidth == 1) { + srcVals.push_back(PoisonValue::get(srcVals.front()->getType())); + } else { + resultWidth *= 2; + } + + SmallVector Mask; + for (size_t i = 0, j = IdxVal; i < resultWidth; ++i, j += ScalarWidth) { + Mask.push_back(j); + } + + for (unsigned i = 0; i < packetWidth; i += 2) { + results.push_back( + createOptimalShuffle(B, srcVals[i], srcVals[i + 1], Mask, Name)); + } + return results; + } else { + Value *Sources = packetizeIfVarying(Src); + PACK_FAIL_IF(!Sources); + Value *Indices = packetizeIfVarying(Index); + PACK_FAIL_IF(!Indices); + + Result = + PoisonValue::get(getWideType(ExtractElement->getType(), SimdWidth)); + if (Indices != Index) { + Type *IdxTy = Index->getType(); + SmallVector Offsets; + for (unsigned i = 0; i < Width; ++i) { + Offsets.push_back(ConstantInt::get(IdxTy, i * ScalarWidth)); + } + + if (Sources != Src) { + Indices = B.CreateAdd(Indices, ConstantVector::get(Offsets)); + } + + for (unsigned i = 0; i < Width; ++i) { + Value *ExtractIdx = B.CreateExtractElement(Indices, B.getInt32(i)); + Value *ExtractElt = B.CreateExtractElement(Sources, ExtractIdx); + Result = B.CreateInsertElement(Result, ExtractElt, B.getInt32(i), Name); + } + } else { + for (unsigned i = 0, j = 0; i < Width; ++i, j += ScalarWidth) { + Value *ExtractIdx = (Sources != Src && i != 0) + ? 
B.CreateAdd(Index, B.getInt32(j)) + : Index; + Value *ExtractElt = B.CreateExtractElement(Sources, ExtractIdx); + Result = B.CreateInsertElement(Result, ExtractElt, B.getInt32(i), Name); + } + } + } + IC.deleteInstructionLater(ExtractElement); + results.push_back(Result); + return results; +} + +ValuePacket +Packetizer::Impl::packetizeInsertValue(InsertValueInst *InsertValue) { + ValuePacket results; + + Value *const Val = InsertValue->getInsertedValueOperand(); + Value *const Aggregate = InsertValue->getAggregateOperand(); + + // We can only packetize literal struct types + if (auto *StructTy = dyn_cast(Aggregate->getType()); + !StructTy || !StructTy->isLiteral()) { + return results; + } + + Value *PackAggregate = packetizeIfVarying(Aggregate); + PACK_FAIL_IF(!PackAggregate); + + Value *PackVal = packetizeIfVarying(Val); + PACK_FAIL_IF(!PackVal); + + const bool IsValVarying = Val != PackVal; + const bool IsAggregateVarying = Aggregate != PackAggregate; + if (!IsAggregateVarying && IsValVarying) { + // If the aggregate wasn't varying but the value was + PackAggregate = packetize(Aggregate).getAsValue(); + } else if (IsAggregateVarying && !IsValVarying) { + // If the aggregate was varying but the value wasn't + PackVal = packetize(Val).getAsValue(); + } else if (!IsAggregateVarying && !IsValVarying) { + // If both were uniform + return results; + } + + IRBuilder<> B(buildAfter(InsertValue, F)); + + results.push_back( + B.CreateInsertValue(PackAggregate, PackVal, InsertValue->getIndices())); + + IC.deleteInstructionLater(InsertValue); + return results; +} + +ValuePacket +Packetizer::Impl::packetizeExtractValue(ExtractValueInst *ExtractValue) { + ValuePacket results; + + Value *const Aggregate = ExtractValue->getAggregateOperand(); + // We can only packetize literal struct types + if (auto *StructTy = dyn_cast(Aggregate->getType()); + !StructTy || !StructTy->isLiteral()) { + return results; + } + + Value *PackAggregate = packetizeIfVarying(Aggregate); + PACK_FAIL_IF(!PackAggregate); + + IRBuilder<> B(buildAfter(ExtractValue, F)); + + results.push_back( + B.CreateExtractValue(PackAggregate, ExtractValue->getIndices())); + + IC.deleteInstructionLater(ExtractValue); + return results; +} + +ValuePacket +Packetizer::Impl::packetizeShuffleVector(ShuffleVectorInst *Shuffle) { + Value *const srcA = Shuffle->getOperand(0); + Value *const srcB = Shuffle->getOperand(1); + assert(srcA && "Could not get operand 0 from Shuffle"); + assert(srcB && "Could not get operand 1 from Shuffle"); + auto *const ty = Shuffle->getType(); + auto *const tyA = srcA->getType(); + auto packetWidth = + std::max(getPacketWidthForType(ty), getPacketWidthForType(tyA)); + + ValuePacket results; + IRBuilder<> B(buildAfter(Shuffle, F)); + const auto scalarWidth = multi_llvm::getVectorNumElements(tyA); + + if (SimdWidth.isScalable()) { + PACK_FAIL_IF(packetWidth != 1); + if (auto *const SplatVal = getSplatValue(Shuffle)) { + // Handle splats as a special case. + auto Splats = packetizeAndGet(SplatVal); + PACK_FAIL_IF(!createSubSplats(Ctx.targetInfo(), B, Splats, scalarWidth)); + return Splats; + } else { + // It isn't safe to do it if it's not a power of 2. + PACK_FAIL_IF(!isPowerOf2_32(scalarWidth)); + const TargetInfo &VTI = Ctx.targetInfo(); + + const auto dstScalarWidth = multi_llvm::getVectorNumElements(ty); + const auto fullWidth = SimdWidth * dstScalarWidth; + + // If we're vector-predicating a vector access, scale the vector length + // up by the original number of vector elements. + auto *const EVL = + VL ? 
B.CreateMul(VL, B.getInt32(dstScalarWidth)) : nullptr; + + auto *const mask = Shuffle->getShuffleMaskForBitcode(); + auto *const vecMask = + VTI.createOuterScalableBroadcast(B, mask, EVL, SimdWidth); + + auto *const idxVector = + createIndexSequence(B, VectorType::get(B.getInt32Ty(), fullWidth)); + + // We need to create offsets into the source operand subvectors, to add + // onto the broadcast shuffle mask, so that each subvector of the + // destination indices into the corresponding subvector of the source. + // That is, for a source vector width of `n` we need the indices + // `[0, n, 2*n, 3*n ...]`, which correspond to the indices of the first + // element of each subvector of the packetized source. For a destination + // vector of width `m` we need `m` instances of each index. + // + // We can compute the offset vector as `offset[i] = floor(i / m) * n`. + Value *offset = nullptr; + if (dstScalarWidth == scalarWidth) { + // If the source and destination are the same size, we have a special + // case and can mask off the LSBs of the index vector instead. i.e. + // `offset[i] = i & -n` + // For instance, for `n == 4` we have offset indices: + // [0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, ... ]. + offset = B.CreateAnd( + idxVector, + ConstantVector::getSplat(fullWidth, B.getInt32(-scalarWidth))); + } else { + auto *const subVecID = B.CreateUDiv( + idxVector, + ConstantVector::getSplat(fullWidth, B.getInt32(dstScalarWidth))); + offset = B.CreateMul(subVecID, ConstantVector::getSplat( + fullWidth, B.getInt32(scalarWidth))); + } + + auto *const vecA = packetizeAndGet(srcA, 1).front(); + if (isa(srcB)) { + auto *const adjust = B.CreateAdd(vecMask, offset, "shuffleMask"); + auto *const shuffleA = VTI.createVectorShuffle(B, vecA, adjust, EVL); + results.push_back(shuffleA); + } else { + // For a two-source shuffle, we shuffle each source separately and then + // select between the results. It might sound tempting to concatenate + // the sources first and use a single shuffle, but since the results + // need to be interleaved, it makes the mask computation somewhat more + // complicated, with indices dependent on the vector scale factor. + auto *const vecB = packetizeAndGet(srcB, 1).front(); + + auto *const whichCmp = B.CreateICmpUGE( + vecMask, + ConstantVector::getSplat(fullWidth, B.getInt32(scalarWidth))); + auto *const safeMask = B.CreateAnd( + vecMask, + ConstantVector::getSplat(fullWidth, B.getInt32(scalarWidth - 1))); + + auto *const adjust = B.CreateAdd(safeMask, offset, "shuffleMask"); + auto *const shuffleA = VTI.createVectorShuffle(B, vecA, adjust, EVL); + auto *const shuffleB = VTI.createVectorShuffle(B, vecB, adjust, EVL); + results.push_back(B.CreateSelect(whichCmp, shuffleB, shuffleA)); + } + + return results; + } + } + + auto srcsA = packetizeAndGet(srcA, packetWidth); + auto srcsB = packetizeAndGet(srcB, packetWidth); + PACK_FAIL_IF(srcsA.empty() || srcsB.empty()); + + auto width = SimdWidth.getFixedValue() / packetWidth; + + // Because up to and including LLVM 10, the IR Builder accepts a mask as a + // vector of uint32_t, but getShuffleMask returns an array of ints. So + // we do it this way. + const auto &origMask = Shuffle->getShuffleMask(); + SmallVector mask(origMask.begin(), origMask.end()); + + // Adjust any indices that select from the second source vector + const auto adjust = + isa(srcB) ? 
-scalarWidth : (width - 1) * scalarWidth; + for (auto &idx : mask) { + if (idx != -1 && idx >= int(scalarWidth)) { + idx += adjust; + } + } + + // Duplicate the mask over the vectorized width + const auto size = mask.size(); + mask.reserve(size * width); + for (unsigned i = 1, k = 0; i < width; ++i, k += size) { + for (unsigned j = 0; j < size; ++j) { + auto maskElem = mask[k + j]; + if (maskElem != -1) { + maskElem += scalarWidth; + } + mask.push_back(maskElem); + } + } + + const auto name = Shuffle->getName(); + for (unsigned i = 0; i < packetWidth; ++i) { + results.push_back(createOptimalShuffle(B, srcsA[i], srcsB[i], mask, name)); + } + return results; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp new file mode 100644 index 0000000000000..a496b0fdb44c1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/passes.cpp @@ -0,0 +1,180 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/passes.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "analysis/control_flow_analysis.h" +#include "analysis/divergence_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "ir_cleanup.h" +#include "memory_operations.h" +#include "vectorization_unit.h" +#include "vecz/vecz_target_info.h" + +#define DEBUG_TYPE "vecz" + +using namespace llvm; + +namespace vecz { +PreservedAnalyses DivergenceCleanupPass::run(Function &F, + FunctionAnalysisManager &AM) { + UniformValueResult &UVR = AM.getResult(F); + + for (BasicBlock &BB : F) { + auto *TI = BB.getTerminator(); + if (BranchInst *Branch = dyn_cast(TI)) { + if (!Branch->isConditional()) { + continue; + } + + if (auto *const call = dyn_cast(Branch->getCondition())) { + compiler::utils::Lexer L(call->getCalledFunction()->getName()); + if (L.Consume(VectorizationContext::InternalBuiltinPrefix) && + L.Consume("divergence_")) { + // uniform reductions can just disappear + auto *const newCond = call->getOperand(0); + if (!UVR.isVarying(newCond)) { + Branch->setCondition(newCond); + if (call->use_empty()) { + UVR.remove(call); + call->eraseFromParent(); + } + } + } + } + } + } + + return PreservedAnalyses::all(); +} + +//////////////////////////////////////////////////////////////////////////////// + +/// @brief Try to replace or remove masked memory operations that are trivially +/// not needed or can be converted to non-masked operations. 
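Before the implementation, here is the shape of the two constant-mask rewrites in miniature, assuming a masked-load call `CI` whose mask has already been proven constant (the names and parameters here are assumptions for illustration, not the pass's actual API):

```cpp
#include <llvm/IR/Constants.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Instructions.h>

// All-zero mask: no lane executes, so a masked load's result is never
// meaningful. All-ones mask: every lane executes, so an ordinary aligned
// load suffices.
static void simplifyConstantMaskedLoad(llvm::CallInst *CI,
                                       llvm::Constant *CMask, llvm::Value *Ptr,
                                       uint64_t Alignment) {
  if (CMask->isZeroValue()) {
    CI->replaceAllUsesWith(llvm::PoisonValue::get(CI->getType()));
  } else if (CMask->isAllOnesValue()) {
    llvm::IRBuilder<> B(CI);
    llvm::Value *Load =
        B.CreateAlignedLoad(CI->getType(), Ptr, llvm::Align(Alignment));
    Load->takeName(CI);
    CI->replaceAllUsesWith(Load);
  }
  // The now-dead masked call would be erased by the caller.
}
```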
+PreservedAnalyses SimplifyMaskedMemOpsPass::run(Function &F,
+                                                FunctionAnalysisManager &AM) {
+  auto &Ctx = AM.getResult<VectorizationContextAnalysis>(F).getContext();
+
+  const TargetInfo &VTI = Ctx.targetInfo();
+  std::vector<Instruction *> ToDelete;
+  for (Function &Builtin : F.getParent()->functions()) {
+    std::optional<MemOpDesc> BuiltinDesc =
+        MemOpDesc::analyzeMaskedMemOp(Builtin);
+    if (!BuiltinDesc) {
+      continue;
+    }
+    for (User *U : Builtin.users()) {
+      CallInst *CI = dyn_cast<CallInst>(U);
+      if (!CI) {
+        continue;
+      }
+      Function *Parent = CI->getParent()->getParent();
+      if (Parent != &F) {
+        continue;
+      }
+      auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked);
+      if (!MaskedOp || !MaskedOp->isMaskedMemOp()) {
+        continue;
+      }
+      Value *Mask = MaskedOp->getMaskOperand();
+      Constant *CMask = dyn_cast<Constant>(Mask);
+      if (!CMask) {
+        continue;
+      }
+
+      // Handle special constants.
+      if (CMask->isZeroValue()) {
+        // A null mask means no lane executes the memory operation.
+        if (BuiltinDesc->isLoad()) {
+          CI->replaceAllUsesWith(PoisonValue::get(BuiltinDesc->getDataType()));
+        }
+        ToDelete.push_back(CI);
+      } else if (CMask->isAllOnesValue()) {
+        // An 'all ones' mask means all lanes execute the memory operation.
+        IRBuilder<> B(CI);
+        Value *Data = MaskedOp->getDataOperand();
+        Value *Ptr = MaskedOp->getPointerOperand();
+        Type *DataTy = MaskedOp->getDataType();
+        auto Alignment = BuiltinDesc->getAlignment();
+        if (MaskedOp->isLoad()) {
+          Value *Load = nullptr;
+          if (DataTy->isVectorTy()) {
+            // Skip this optimization for scalable vectors for now. It's
+            // theoretically possible to perform, but without scalable-vector
+            // builtins we can't test it; leave any theoretical
+            // scalable-vector masked mem operation unoptimized.
+            if (isa<ScalableVectorType>(DataTy)) {
+              continue;
+            }
+            Load =
+                VTI.createLoad(B, CI->getType(), Ptr, B.getInt64(1), Alignment);
+          } else {
+            Load = B.CreateAlignedLoad(CI->getType(), Ptr, Align(Alignment),
+                                       /*isVolatile*/ false, CI->getName());
+          }
+          Load->takeName(CI);
+          CI->replaceAllUsesWith(Load);
+        } else {
+          if (DataTy->isVectorTy()) {
+            // Skip this optimization for scalable vectors for now. It's
+            // theoretically possible to perform, but without scalable-vector
+            // builtins we can't test it; leave any theoretical
+            // scalable-vector masked mem operation unoptimized.
+            if (isa<ScalableVectorType>(DataTy)) {
+              continue;
+            }
+            VTI.createStore(B, Data, Ptr, B.getInt64(1),
+                            BuiltinDesc->getAlignment());
+          } else {
+            B.CreateAlignedStore(Data, Ptr, Align(Alignment));
+          }
+        }
+        ToDelete.push_back(CI);
+      }
+    }
+  }
+
+  // Clean up.
+  while (!ToDelete.empty()) {
+    Instruction *I = ToDelete.back();
+    IRCleanup::deleteInstructionNow(I);
+    ToDelete.pop_back();
+  }
+
+  PreservedAnalyses Preserved;
+  Preserved.preserve();
+  Preserved.preserve();
+  Preserved.preserve();
+  Preserved.preserve();
+  return Preserved;
+}
+
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
new file mode 100644
index 0000000000000..b72ab38121384
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/pre_linearize_pass.cpp
@@ -0,0 +1,353 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This pass aims to optimize the CFG by hoisting instructions out of triangle
+// or diamond patterns (i.e. "if" or "if..else" constructs) where it determines
+// that executing all the instructions in all branch targets is cheaper than
+// actually branching. This is especially the case when BOSCC is active, as the
+// BOSCC gadget introduces potentially-expensive AND/OR reduction operations
+// in order to branch to the uniform version of each Basic Block. To that end,
+// the pass needs to use the Uniform Value Analysis result, since only varying
+// branch conditions will be affected by BOSCC in such a way. We also need
+// access to the Target Transform Info result from the Vectorization Unit in
+// order to make target-dependent cost-based decisions.
+//
+// This pass only hoists instructions out of conditional blocks, and does not
+// directly modify the CFG, so the CFG Simplification pass is intended to be
+// run afterwards, in order to eliminate the now-redundant Basic Blocks and
+// transform PHI nodes into select instructions. Therefore, the
+// pre-linearization pass is implemented as an llvm::FunctionPass so it can
+// be run in the middle of the Vecz Preparation Pass.
+//
+// Pre-Linearization is currently unable to hoist memory operations, since
+// doing so would require the correct masked versions to be generated, which
+// would require a lot of special extra handling.
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "transform/passes.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_choices.h"
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+bool isTrivialBlock(const llvm::BasicBlock &BB) {
+  for (const auto &I : BB) {
+    if (I.mayReadOrWriteMemory() || I.mayHaveSideEffects() ||
+        llvm::isa(&I)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// This is an estimate of the cycle count for executing the entire block,
+// not including the terminating branch instruction, obtained by summing
+// the cost (Reciprocal Throughput) of each individual instruction.
+// This assumes sequential execution (no Instruction Level Parallelism)
+// and takes no account of Data Hazards etc., so it is not guaranteed to be
+// entirely accurate.
+InstructionCost calculateBlockCost(const BasicBlock &BB,
+                                   const TargetTransformInfo &TTI) {
+  InstructionCost cost;
+  for (const auto &I : BB) {
+    if (I.isTerminator()) {
+      break;
+    }
+
+    InstructionCost inst_cost =
+        TTI.getInstructionCost(&I, TargetTransformInfo::TCK_RecipThroughput);
+
+    // When a vector instruction is encountered, we multiply by the vector
+    // width, because it will either be scalarized into that many individual
+    // instructions during scalarization, or packetized by duplication.
+    // This works on the assumption that throughput does not depend on the
+    // vector width. This calculation may need refining in future.
+    if (I.getType()->isVectorTy()) {
+      inst_cost *= multi_llvm::getVectorNumElements(I.getType());
+    }
+
+    cost += inst_cost;
+  }
+  return cost;
+}
+
+// Creates a temporary function in order to build a target-dependent vector
+// AND reduction inside it, so that the cost of that reduction can be
+// calculated.
+InstructionCost calculateBoolReductionCost(LLVMContext &context,
+                                           Module *module,
+                                           const TargetTransformInfo &TTI,
+                                           llvm::ElementCount width) {
+  Type *cond_ty = VectorType::get(Type::getInt1Ty(context), width);
+
+  FunctionType *new_fty =
+      FunctionType::get(Type::getVoidTy(context), {cond_ty}, false);
+
+  // LLVM 11 requires the function to be in a valid (existing) module in
+  // order to create a simple vector reduction with the specified opcode.
+  auto *F = Function::Create(new_fty, Function::InternalLinkage, "tmp", module);
+  auto *BB = BasicBlock::Create(context, "reduce", F);
+  IRBuilder<> B(BB);
+  createSimpleReduction(B, &*F->arg_begin(), RecurKind::And);
+  const InstructionCost cost = calculateBlockCost(*BB, TTI);
+
+  // We don't really need that function in the module anymore, because its
+  // only purpose was to be used for analysis, so we go ahead and remove it.
+  F->removeFromParent();
+  delete F;
+  return cost;
+}
+
+bool hoistInstructions(BasicBlock &BB, BranchInst &Branch, bool exceptions) {
+  const auto &DL = BB.getModule()->getDataLayout();
+  const bool TrueBranch = (Branch.getSuccessor(0) == &BB);
+  DenseMap<Value *, Value *> safeDivisors;
+
+  bool modified = false;
+  while (!BB.front().isTerminator()) {
+    auto &I = BB.front();
+    I.moveBefore(*Branch.getParent(), Branch.getIterator());
+    modified = true;
+
+    if (!exceptions) {
+      // We don't need to mask division operations if they don't trap.
+      continue;
+    }
+
+    if (!isa<BinaryOperator>(&I)) {
+      // Only binary operators (divisions) can need guarding.
+      continue;
+    }
+    auto *binOp = cast<BinaryOperator>(&I);
+    // It is potentially dangerous to hoist division operations, since
+    // the RHS could be zero or INT_MIN on some lanes, unless it's a
+    // constant.
+    bool isUnsigned = false;
+    switch (binOp->getOpcode()) {
+      default:
+        break;
+      case Instruction::UDiv:
+      case Instruction::URem:
+        isUnsigned = true;
+        LLVM_FALLTHROUGH;
+      case Instruction::SDiv:
+      case Instruction::SRem: {
+        auto *divisor = binOp->getOperand(1);
+        if (auto *C = dyn_cast<Constant>(divisor)) {
+          if (C->isZeroValue()) {
+            // Division by constant zero can be a NOP, since there is no
+            // division-by-zero exception in OpenCL.
+            I.replaceAllUsesWith(binOp->getOperand(0));
+            I.eraseFromParent();
+          }
+        } else {
+          // If the divisor could be illegal, we need to guard it with a
+          // select instruction generated from the branch condition.
+          auto &masked = safeDivisors[divisor];
+          if (!masked) {
+            // NOTE this function does not check for the pattern
+            // "select (x eq 0) 1, x" or equivalent, so we might want to
+            // write it ourselves, but Instruction Combining cleans it
+            // up. NOTE that for a signed division, we also have to
+            // consider the potential overflow situation, which is not
+            // so simple.
+            if (isUnsigned && isKnownNonZero(divisor, DL)) {
+              // Static analysis concluded it can't be zero, so we don't
+              // need to do anything.
+              masked = divisor;
+            } else {
+              Value *one = ConstantInt::get(divisor->getType(), 1);
+              Value *cond = Branch.getCondition();
+
+              Instruction *SI;
+              if (TrueBranch) {
+                SI = SelectInst::Create(cond, divisor, one,
+                                        divisor->getName() + ".hoist_guard");
+              } else {
+                SI = SelectInst::Create(cond, one, divisor,
+                                        divisor->getName() + ".hoist_guard");
+              }
+              SI->insertBefore(I.getIterator());
+              masked = SI;
+            }
+          }
+
+          if (masked != divisor) {
+            binOp->setOperand(1, masked);
+          }
+        }
+      } break;
+    }
+  }
+  return modified;
+}
+} // namespace
+
+PreservedAnalyses PreLinearizePass::run(Function &F,
+                                        FunctionAnalysisManager &AM) {
+  VectorizationUnitAnalysis::Result R =
+      AM.getResult<VectorizationUnitAnalysis>(F);
+  const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+  const VectorizationUnit &VU = R.getVU();
+
+  bool modified = false;
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  const bool div_exceptions =
+      VU.choices().isEnabled(VectorizationChoices::eDivisionExceptions);
+
+  InstructionCost boscc_cost;
+  const UniformValueResult *UVR = nullptr;
+  if (VU.choices().linearizeBOSCC()) {
+    boscc_cost = calculateBoolReductionCost(F.getContext(), F.getParent(), TTI,
+                                            VU.width());
+    UVR = &AM.getResult<UniformValueAnalysis>(F);
+  }
+
+  auto dfo = depth_first(&F.getEntryBlock());
+  SmallVector<BasicBlock *> blocks(dfo.begin(), dfo.end());
+
+  DenseMap<BasicBlock *, BasicBlock *> single_succs;
+  for (auto *BB : blocks) {
+    single_succs[BB] = BB->getSingleSuccessor();
+  }
+
+  for (auto BBit = blocks.rbegin(), BBe = blocks.rend(); BBit != BBe; ++BBit) {
+    BasicBlock *BB = *BBit;
+
+    // Check that all hoistable successor blocks are in the same loop.
+    Loop *block_loop = LI.getLoopFor(BB);
+
+    if (succ_size(BB) >= 2) {
+      bool simple = true;
+      SmallPtrSet<BasicBlock *, 4> targets;
+      for (auto *succ : successors(BB)) {
+        if (BasicBlock *target = single_succs[succ]) {
+          targets.insert(target);
+        }
+      }
+
+      SmallVector<BasicBlock *, 4> hoistable;
+      SmallPtrSet<BasicBlock *, 4> new_succs;
+      for (auto *succ : successors(BB)) {
+        if (!targets.contains(succ)) {
+          if (single_succs[succ] == nullptr || pred_size(succ) != 1 ||
+              LI.getLoopFor(succ) != block_loop || !isTrivialBlock(*succ)) {
+            simple = false;
+            break;
+          }
+          hoistable.push_back(succ);
+        } else {
+          // These "bypass" successors are going to stay where they are.
+          new_succs.insert(succ);
+        }
+      }
+      if (!simple || hoistable.empty()) {
+        continue;
+      }
+
+      // The cost of a "bypass" branch is essentially zero. This occurs in a
+      // "triangle" type control structure (i.e. an if with no else).
+      InstructionCost min_cost = new_succs.empty() ? InstructionCost::getMax()
+                                                   : InstructionCost::getMin();
+
+      // The total cost of executing every successor sequentially.
+      InstructionCost total_cost = 0;
+
+      for (auto *succ : hoistable) {
+        const InstructionCost block_cost = calculateBlockCost(*succ, TTI);
+        if (block_cost < min_cost) {
+          min_cost = block_cost;
+        }
+        total_cost += block_cost;
+        new_succs.insert(single_succs[succ]);
+      }
+
+      // One of the successors was going to get executed anyway, so we can
+      // discount the cost of the cheapest one from the total cost.
+      total_cost -= min_cost;
+
+      // The unconditional branches of the successors are going to get
+      // removed if we hoist the contents. We will only execute one successor,
+      // so assume the first successor's branch is representative.
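The cost comparison assembled over the next few statements can be condensed into a toy model. For a diamond (if/else) with block costs {6, 4} and no bypass edge, the effective cost is 10 - 4 = 6, so hoisting pays off whenever the removed branches cost at least 6. A plain-integer sketch (not using `llvm::InstructionCost`):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// blockCosts is assumed non-empty, mirroring the `hoistable.empty()` guard
// above. A bypass edge costs nothing, so with one present the discount is 0.
bool shouldHoist(const std::vector<uint64_t> &blockCosts, bool hasBypass,
                 uint64_t branchCost) {
  uint64_t total = 0;
  uint64_t cheapest = hasBypass ? 0 : UINT64_MAX;
  for (uint64_t c : blockCosts) {
    total += c;
    cheapest = std::min(cheapest, c);
  }
  return total - cheapest <= branchCost; // one block would have run anyway
}
```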
+      auto *succ_term = hoistable.front()->getTerminator();
+      InstructionCost branch_cost =
+          TTI.getInstructionCost(succ_term,
+                                 TargetTransformInfo::TCK_RecipThroughput) +
+          TTI.getInstructionCost(succ_term, TargetTransformInfo::TCK_Latency);
+
+      // If all our successors branch to the same target, the conditional
+      // branch is going to disappear as well, so we can add that to the cost
+      // of the successors' branches in our analysis.
+      auto *T = BB->getTerminator();
+      if (new_succs.size() == 1) {
+        branch_cost +=
+            TTI.getInstructionCost(T, TargetTransformInfo::TCK_RecipThroughput);
+        branch_cost +=
+            TTI.getInstructionCost(T, TargetTransformInfo::TCK_Latency);
+
+        // BOSCC will incur an additional cost on varying branches.
+        if (UVR && UVR->isVarying(T)) {
+          branch_cost += boscc_cost;
+        }
+      }
+
+      // If the cost of executing everything is less than the cost of the
+      // branches that would get removed, then it is beneficial to hoist.
+      // If the costs are the same, then we might as well make the CFG simpler!
+      if (total_cost <= branch_cost) {
+        // The Lower Switch pass ought to guarantee we can only get branch
+        // instructions here, but in case it didn't, we don't want to crash.
+        if (auto *const Branch = dyn_cast<BranchInst>(T)) {
+          for (auto *succ : hoistable) {
+            modified |= hoistInstructions(*succ, *Branch, div_exceptions);
+          }
+
+          if (new_succs.size() == 1) {
+            // We are not going to modify the CFG while we are working on it,
+            // because that is very complex, so we leave it to the Simplify
+            // CFG pass that runs after us and will do a better job. So here
+            // we can just pretend we modified it.
+            single_succs[BB] = *new_succs.begin();
+          }
+        }
+      }
+    }
+  }
+
+  if (!modified) {
+    return PreservedAnalyses::all();
+  }
+  return PreservedAnalyses::none();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
new file mode 100644
index 0000000000000..d59a65037555b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/printf_scalarizer.cpp
@@ -0,0 +1,391 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/printf_scalarizer.h" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#define DEBUG_TYPE "VECZ-PRINTF-SCALARIZER" + +using namespace llvm; + +namespace vecz { + +GlobalVariable *GetFormatStringAsValue(Value *op) { + if (isa(op)) { + auto const_string = cast(op); + if (const_string->getOpcode() != Instruction::GetElementPtr) { + return nullptr; + } + return dyn_cast(const_string->getOperand(0)); + } + + if (isa(op)) { + auto gep_string = cast(op); + return dyn_cast(gep_string->getPointerOperand()); + } + + return dyn_cast(op); +} + +std::string GetFormatStringAsString(Value *op) { + if (!op || !isa(op)) { + return ""; + } + + auto *string_global = cast(op); + + if (!string_global->hasInitializer()) { + return ""; + } + + Constant *const string_const = string_global->getInitializer(); + + if (!isa(string_const)) { + return ""; + } + + auto *array_string = cast(string_const); + + if (!array_string->isString()) { + return ""; + } + + return array_string->getAsString().str(); +} + +static bool IncrementPtr(const char **fmt) { + if (*(++(*fmt)) == '\0') { + return true; + } + return false; +} + +GlobalVariable * +GetNewFormatStringAsGlobalVar(Module &module, + GlobalVariable *const string_value, + const std::string &new_format_string) { + const ArrayRef Elts((const uint8_t *)new_format_string.data(), + new_format_string.size()); + Constant *new_format_string_const = + ConstantDataArray::get(module.getContext(), Elts); + + const bool is_constant = string_value->isConstant(); + const bool is_externally_initialized = false; + const uint32_t addr_space = string_value->getType()->getPointerAddressSpace(); + const GlobalValue::LinkageTypes linkage_type = string_value->getLinkage(); + const GlobalValue::ThreadLocalMode thread_local_mode = + string_value->getThreadLocalMode(); + + GlobalVariable *new_var = new GlobalVariable( + module, new_format_string_const->getType(), is_constant, linkage_type, + new_format_string_const, Twine(string_value->getName() + "_"), + string_value, thread_local_mode, addr_space, is_externally_initialized); + + new_var->setAlignment(MaybeAlign(string_value->getAlignment())); + new_var->setUnnamedAddr(string_value->getUnnamedAddr()); + + return new_var; +} + +EnumPrintfError ScalarizeAndCheckFormatString(const std::string &str, + std::string &new_str) { + // Set some sensible defaults in case we return error + new_str = ""; + + const char *fmt = str.c_str(); + + while (*fmt != '\0') { + if (*fmt != '%') { + new_str += *fmt; + } else { + std::string specifier_string(1, *fmt); + + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + + // Parse (zero or more) Flags + const char *flag_chars = "-+ #0"; + while (strchr(flag_chars, *fmt)) { + specifier_string += *fmt; + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + } + + // Parse (optional) Width + if (*fmt == '*') { + specifier_string += *fmt; + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + } else if (isdigit(*fmt)) { + while (isdigit(*fmt)) { + specifier_string += *fmt; + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character 
in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + } + } + + // Parse (optional) Precision + if (*fmt == '.') { + specifier_string += *fmt; + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + + while (isdigit(*fmt)) { + specifier_string += *fmt; + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + } + } + + uint32_t vector_length = 1u; + const bool is_vector = *fmt == 'v'; + // Parse (optional) Vector Specifier + if (is_vector) { + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + switch (*fmt) { + default: + LLVM_DEBUG(dbgs() << "Unexpected character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + case '1': + // Must be 16, else error + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() + << "Expected vector width of 16 in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + if (*fmt != '6') { + LLVM_DEBUG(dbgs() + << "Expected vector width of 16 in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + vector_length = 16u; + break; + case '2': + vector_length = 2u; + break; + case '3': + vector_length = 3u; + // Lookahead for vectors of width 32. We know that we won't go out + // of bounds because worst case scenario there should be a null byte + // after the '3'. + if (*(fmt + 1) == '2') { + IncrementPtr(&fmt); + vector_length = 32u; + } + break; + case '4': + vector_length = 4u; + break; + case '6': + // Must be 64, else error + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() + << "Expected vector width of 64 in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + if (*fmt != '4') { + LLVM_DEBUG(dbgs() + << "Expected vector width of 64 in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + vector_length = 64u; + break; + case '8': + vector_length = 8u; + break; + } + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + } + + // Parse Length Modifier + const char *length_modifier_chars = "hljztL"; + // Length Modifier is required with Vector Specifier + bool has_used_l_length_modifier = false; + const bool has_supplied_length_modifier = + strchr(length_modifier_chars, *fmt); + if (is_vector && !has_supplied_length_modifier) { + LLVM_DEBUG( + dbgs() << "Expected vector width specifier in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + + if (has_supplied_length_modifier) { + bool consume_next_char = true; + switch (*fmt) { + default: + // The 'j', 'z', 't', and 'L' length modifiers are not supported by + // OpenCL C. 
+ LLVM_DEBUG(dbgs() << "Unsupported length modifier '" << *fmt + << "'specifier in format string \"" << str.c_str() + << "\""); + return kPrintfError_invalidFormatString; + case 'h': + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + if (*fmt == 'h') { + specifier_string += "hh"; + } else if (*fmt == 'l') { + // Native printf doesn't recognize 'hl' so we don't + // add it to the new format string. Luckily, 'hl' + // is sizeof(int) - the same as the default on + // native printf! + + // Additionally, 'hl' modifier may only be used in + // conjunction with the vector specifier + if (!is_vector) { + LLVM_DEBUG(dbgs() + << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + } else { + specifier_string += 'h'; + // We've already incremented the ptr and we found nothing; don't + // do it again + consume_next_char = false; + } + break; + case 'l': + specifier_string += *fmt; + // Check ahead to see if the user is using the invalid 'll' length + // modifier + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + if (*fmt == 'l') { + LLVM_DEBUG(dbgs() + << "The 'll' length specifier is invalid in OpenCL " + "printf\n > " + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + // We've already incremented the ptr; don't do it again + + // The 'l' specifier for the OpenCL printf expects 64 bits + // integers, check if the system's long are actually 64 bits wide + // and if not upgrade the format specifier to 'll'. + // + // FIXME: This only works for host based devices, which is fine for + // our current printf implementation, but it should really be + // removed once we have a proper printf implementation. + if (sizeof(long) != 8) { + specifier_string += 'l'; + } + + consume_next_char = false; + has_used_l_length_modifier = true; + break; + } + if (consume_next_char) { + if (IncrementPtr(&fmt)) { + LLVM_DEBUG(dbgs() << "Unexpected \\0 character in format string \"" + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + } + } + + // Parse Specifier + specifier_string += *fmt; + + switch (*fmt) { + default: + break; + case 'n': + // The 'n' conversion specifier is not supported by OpenCL C. + LLVM_DEBUG( + dbgs() << "The 'n' conversion specifier is invalid in OpenCL " + "printf\n > " + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + case 's': // Intentional fall-through + case 'c': + // The 'l' length modifier followed by the 'c' or 's' conversion + // specifiers is not supported by OpenCL C. + if (has_used_l_length_modifier) { + LLVM_DEBUG(dbgs() + << "The 'l' length modifier followed by the 'c' or " + "'s' conversion is invalid in OpenCL printf\n > " + << str.c_str() << "\""); + return kPrintfError_invalidFormatString; + } + break; + } + + // Output the %specifier for each element of the vector, + // and for every element but the last, follow it by a "," string. 
+ for (uint32_t i = 0; i < vector_length; ++i) { + new_str += specifier_string; + + if (i < (vector_length - 1)) { + new_str += ","; + } + } + } + ++fmt; + } + + new_str += '\0'; + + return kPrintfError_success; +} +} // namespace vecz diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp new file mode 100644 index 0000000000000..419f41649c58d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/remove_intptr_pass.cpp @@ -0,0 +1,125 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "transform/passes.h" + +#define DEBUG_TYPE "vecz" + +using namespace llvm; +using namespace vecz; + +/// @brief remove IntPtrs where possible. +PreservedAnalyses RemoveIntPtrPass::run(Function &F, + FunctionAnalysisManager &) { + static const StringRef name = "remove_intptr"; + + SmallVector casts; + for (auto &BB : F) { + for (auto &I : BB) { + if (auto *int_ptr = dyn_cast(&I)) { + casts.push_back(int_ptr); + } + } + } + + if (casts.empty()) { + return PreservedAnalyses::all(); + } + + while (!casts.empty()) { + PtrToIntInst *int_ptr = casts.back(); + casts.pop_back(); + + for (auto usei = int_ptr->use_begin(); usei != int_ptr->use_end();) { + auto &use = *(usei++); + auto *user = use.getUser(); + + if (auto *ptr = dyn_cast(user)) { + IRBuilder<> B(ptr); + Value *new_cast = B.CreatePointerBitCastOrAddrSpaceCast( + int_ptr->getOperand(0), ptr->getDestTy(), name); + ptr->replaceAllUsesWith(new_cast); + ptr->eraseFromParent(); + } else if (auto *phi = dyn_cast(user)) { + // How we deal with PHI nodes is we create another PHI node with the + // pointer type, moving the PtrToInt to the other side of it. We also + // create IntToPtrs on the incoming side, where it does not consume + // the PtrToInt that we are currently looking at. Any new casts will + // hopefully be removed later. 
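The binary-operator case further below rewrites pointer arithmetic that had been flattened to integers back into GEP form, so pointer provenance stays visible to later passes. In miniature (an illustrative helper mirroring the `B.CreateGEP(i8_ty, ...)` call in the code):

```cpp
#include <llvm/IR/IRBuilder.h>

// (ptrtoint %ptr) + %index  ==>  ptrtoint (gep i8, %ptr, %index)
static llvm::Value *addAsByteGEP(llvm::IRBuilder<> &B, llvm::Value *ptr,
                                 llvm::Value *index, llvm::Type *intTy) {
  llvm::Value *gep = B.CreateGEP(B.getInt8Ty(), ptr, index, "remove_intptr");
  return B.CreatePtrToInt(gep, intTy, "remove_intptr");
}
```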
+ auto num_values = phi->getNumIncomingValues(); + PHINode *new_phi = PHINode::Create(int_ptr->getSrcTy(), num_values, + phi->getName() + ".intptr"); + new_phi->insertBefore(phi->getIterator()); + + Instruction *insert = phi; + while (isa(insert)) { + insert = insert->getNextNode(); + } + + // Populate the replacement PHI node + for (decltype(num_values) i = 0; i != num_values; ++i) { + Value *incoming = phi->getIncomingValue(i); + BasicBlock *inb = phi->getIncomingBlock(i); + if (incoming == int_ptr) { + incoming = int_ptr->getOperand(0); + } else { + IRBuilder<> B(inb->getTerminator()); + incoming = B.CreateIntToPtr(incoming, int_ptr->getSrcTy(), name); + } + new_phi->addIncoming(incoming, inb); + } + + // Add the cast back to Int at the other side + IRBuilder<> B(insert); + Value *new_cast = B.CreatePtrToInt(new_phi, phi->getType(), name); + phi->replaceAllUsesWith(new_cast); + phi->eraseFromParent(); + casts.push_back(cast(new_cast)); + } else if (auto *bin_op = dyn_cast(user)) { + auto *i8_ty = IntegerType::getInt8Ty(F.getContext()); + + IRBuilder<> B(bin_op); + Value *index = nullptr; + + auto opcode = bin_op->getOpcode(); + if (opcode == Instruction::Add) { + index = bin_op->getOperand(use.getOperandNo() == 0); + } else if (opcode == Instruction::Sub && use.getOperandNo() == 0) { + index = B.CreateNeg(bin_op->getOperand(1), name); + } + + if (index) { + Value *operand = int_ptr->getOperand(0); + Value *new_gep = B.CreateGEP(i8_ty, operand, index, name); + Value *new_cast = B.CreatePtrToInt(new_gep, bin_op->getType(), name); + bin_op->replaceAllUsesWith(new_cast); + bin_op->eraseFromParent(); + casts.push_back(cast(new_cast)); + } + } + } + + if (int_ptr->use_empty()) { + int_ptr->eraseFromParent(); + } + } + + auto Preserved = PreservedAnalyses::all(); + Preserved.abandon(); + return Preserved; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp new file mode 100644 index 0000000000000..fcb0dfca9e621 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarization_pass.cpp @@ -0,0 +1,284 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/scalarization_pass.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "analysis/control_flow_analysis.h" +#include "analysis/divergence_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "llvm_helpers.h" +#include "memory_operations.h" +#include "transform/scalarizer.h" +#include "vectorization_unit.h" +#include "vecz/vecz_choices.h" +#include "vecz/vecz_target_info.h" + +#define DEBUG_TYPE "vecz-scalarization" + +using namespace vecz; +using namespace llvm; + +STATISTIC(VeczScalarizeFail, + "Number of kernels that failed to scalarize [ID#S80]"); + +ScalarizationPass::ScalarizationPass() {} + +namespace { +bool needsScalarization(const Type &T) { return T.isVectorTy(); } + +bool needsScalarization(const Instruction &I) { + if (needsScalarization(*I.getType())) { + return true; + } + for (const Use &op : I.operands()) { + if (needsScalarization(*op->getType())) { + return true; + } + } + return false; +} + +bool isValidScalableShuffle(const ShuffleVectorInst &shuffle) { + // 3-element vectors are trouble, so scalarize them. + if (!isPowerOf2_32(cast(shuffle.getType()) + ->getElementCount() + .getFixedValue())) { + return false; + } + if (!isPowerOf2_32(cast(shuffle.getOperand(0)->getType()) + ->getElementCount() + .getFixedValue())) { + return false; + } + return true; +} + +bool shouldScalarize(Instruction *I, bool scalable) { + // Don't scalarize loads or stores.. + if (isa(I) || isa(I)) { + return false; + } + + // We also don't scalarize element manipulations of load instructions + if (auto *Shuffle = dyn_cast(I)) { + if (scalable && !isValidScalableShuffle(*Shuffle)) { + return true; + } + + auto *SrcA = dyn_cast(Shuffle->getOperand(0)); + if (SrcA && !shouldScalarize(SrcA, scalable)) { + return false; + } + auto *SrcB = dyn_cast(Shuffle->getOperand(1)); + if (SrcB && !shouldScalarize(SrcB, scalable)) { + return false; + } + } else if (auto *Extract = dyn_cast(I)) { + auto *SrcA = dyn_cast(Extract->getOperand(0)); + if (SrcA && !shouldScalarize(SrcA, scalable)) { + return false; + } + } + + // We also don't scalarize masked memory operations + if (auto *CI = dyn_cast(I)) { + if (auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked)) { + if (MaskedOp->isMaskedMemOp()) { + return false; + } + } + } + + // Scalarize anything else + return true; +} + +/// @brief Operand Tracer struct +/// The purpose of this helper struct is to trace through the operands of any +/// given instruction, incrementing a usage counter, which we can compare to +/// the total number of uses for an instruction. If any instruction's counter +/// is equal to its total usage count, it has no uses other than ones we have +/// marked. 
+struct OperandTracer {
+  using VisitSet = DenseSet<Instruction *>;
+
+  UniformValueResult &UVR;
+  bool scalable;
+  VisitSet visited;
+  SmallVector<Instruction *> stack;
+
+  OperandTracer(UniformValueResult &uvr, bool sc) : UVR(uvr), scalable(sc) {}
+
+  void count(Instruction *I) {
+    if (visited.insert(I).second) {
+      stack.push_back(I);
+    }
+  }
+
+  void countOperand(Value *V) {
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      countInstruction(I);
+    }
+  }
+
+  void countInstruction(Instruction *I) {
+    if (scalable) {
+      if (auto *const shuffle = dyn_cast<ShuffleVectorInst>(I)) {
+        if (!isValidScalableShuffle(*shuffle)) {
+          return;
+        }
+      }
+    }
+
+    if (I->getType()->isVectorTy() && UVR.isVarying(I)) {
+      count(I);
+    }
+  }
+
+  void countOperands(Instruction *I) {
+    if (auto *Phi = dyn_cast<PHINode>(I)) {
+      for (auto &use : Phi->incoming_values()) {
+        countOperand(use.get());
+      }
+      return;
+    }
+
+    for (auto *V : I->operand_values()) {
+      countOperand(V);
+    }
+  }
+
+  void run() {
+    while (!stack.empty()) {
+      Instruction *I = stack.back();
+      stack.pop_back();
+      countOperands(I);
+    }
+  }
+};
+
+}  // namespace
+
+PreservedAnalyses ScalarizationPass::run(llvm::Function &F,
+                                         llvm::FunctionAnalysisManager &AM) {
+  VectorizationUnit &VU = AM.getResult<VectorizationUnitAnalysis>(F).getVU();
+  auto &Ctx = AM.getResult<VectorizationContextAnalysis>(F).getContext();
+  const auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+  const auto *DI =
+      MAMProxy.getCachedResult<compiler::utils::DeviceInfoAnalysis>(
+          *F.getParent());
+  const bool DoubleSupport = DI && DI->double_capabilities != 0;
+
+  const bool FullScalarization =
+      VU.choices().isEnabled(VectorizationChoices::eFullScalarization);
+  bool NeedsScalarization = false;
+  Scalarizer SR(F, Ctx, DoubleSupport);
+
+  UniformValueResult &UVR = AM.getResult<UniformValueAnalysis>(F);
+
+  // Find vector leaves that need to be scalarized.
+  std::vector<Instruction *> Leaves;
+  UVR.findVectorLeaves(Leaves);
+
+  if (FullScalarization) {
+    // Find varying vector values that need to be scalarized.
+    for (BasicBlock *BB : depth_first(&F)) {
+      for (Instruction &I : *BB) {
+        if (needsScalarization(*I.getType()) && UVR.isVarying(&I)) {
+          SR.setNeedsScalarization(&I);
+          NeedsScalarization = true;
+        }
+      }
+    }
+
+    for (Instruction *Leaf : Leaves) {
+      if (needsScalarization(*Leaf) && getVectorType(Leaf)) {
+        SR.setNeedsScalarization(Leaf);
+        NeedsScalarization = true;
+      }
+    }
+  } else {
+    // We use the tracer to identify instructions that are only used by
+    // scalar instructions (i.e. ExtractElement instructions and reductions).
+    //
+    // Since these instructions don't necessarily use all lanes of their
+    // operands, scalarization can produce dead code, which will get removed
+    // by later cleanup optimizations. Reductions are generally much better
+    // off scalarized.
+    const bool scalable = VU.width().isScalable();
+
+    OperandTracer tracer(UVR, scalable);
+    for (Instruction *Leaf : Leaves) {
+      if (needsScalarization(*Leaf) && getVectorType(Leaf)) {
+        tracer.countOperands(Leaf);
+      }
+    }
+    // Vector-to-scalar bitcasts aren't normally counted as vector leaves, but
+    // in this case we avoid unnecessary scalarization if we do.
+ for (auto &BB : F) { + for (auto &I : BB) { + if (auto *B = dyn_cast(&I)) { + if (B->getSrcTy()->isVectorTy() && !B->getDestTy()->isVectorTy() && + UVR.isVarying(B)) { + tracer.countOperands(B); + } + } + } + } + + tracer.run(); + + for (auto &BB : F) { + for (auto &I : BB) { + if (!shouldScalarize(&I, scalable)) { + continue; + } + + if (I.getType()->isVectorTy() && UVR.isVarying(&I) && + !tracer.visited.contains(&I)) { + SR.setNeedsScalarization(&I); + NeedsScalarization = true; + } + } + } + } + + if (!NeedsScalarization) { + return PreservedAnalyses::all(); + } + + if (!SR.scalarizeAll()) { + ++VeczScalarizeFail; + return VU.setFailed("Failed to scalarize"); + } + + PreservedAnalyses Preserved; + Preserved.preserve(); + Preserved.preserve(); + Preserved.preserve(); + Preserved.preserve(); + return Preserved; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp new file mode 100644 index 0000000000000..af44c92bfd780 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/scalarizer.cpp @@ -0,0 +1,1583 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "transform/scalarizer.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "debugging.h" +#include "llvm_helpers.h" +#include "memory_operations.h" +#include "simd_packet.h" +#include "transform/printf_scalarizer.h" +#include "vectorization_context.h" +#include "vecz/vecz_target_info.h" + +#define DEBUG_TYPE "vecz-scalarization" + +namespace { +/// @brief The maximum vector width that Vecz can handle. +/// +/// The current limitation is due to the masks being used in the SimdPackets +/// being stored as uint64_t. 
+const unsigned MAX_SIMD_WIDTH = 64;
+}  // namespace
+
+using namespace vecz;
+using namespace llvm;
+
+STATISTIC(VeczScalarized, "Number of instructions scalarized [ID#S00]");
+STATISTIC(VeczScalarizeFailCall,
+          "Scalarize: missing function declarations [ID#S81]");
+STATISTIC(VeczScalarizeFailBuiltin,
+          "Scalarize: non-scalarizable builtins [ID#S82]");
+STATISTIC(VeczScalarizeFailPrintf,
+          "Scalarize: failures to scalarize printf [ID#S83]");
+STATISTIC(VeczScalarizeFailCast,
+          "Scalarize: failures to scalarize cast [ID#S84]");
+STATISTIC(VeczScalarizeFailBitcast,
+          "Scalarize: failures to scalarize bitcast [ID#S85]");
+STATISTIC(VeczScalarizeFailReduceIntrinsic,
+          "Scalarize: failures to scalarize vector.reduce intrinsic [ID#S86]");
+
+Scalarizer::Scalarizer(llvm::Function &F, VectorizationContext &ctx,
+                       bool DoubleSupport)
+    : Ctx(ctx), F(F), DoubleSupport(DoubleSupport) {}
+
+SimdPacket *Scalarizer::getPacket(const Value *V, unsigned Width,
+                                  bool Create) {
+  auto infoIt = packets.find(V);
+  if (infoIt != packets.end()) {
+    return infoIt->second.get();
+  }
+
+  if (Create) {
+    auto *P = (packets[V] = std::make_unique<SimdPacket>()).get();
+    P->resize(Width);
+    return P;
+  } else {
+    return nullptr;
+  }
+}
+
+Value *Scalarizer::getGather(Value *V) {
+  auto &Cache = Gathers[V];
+  if (Cache) {
+    return Cache;
+  }
+
+  // Build the gather directly before the original instruction.
+  // If it is not an instruction just return the original.
+  auto *insert = dyn_cast<Instruction>(V);
+  if (!insert) {
+    Cache = V;
+    return V;
+  }
+
+  auto *VecTy = cast<FixedVectorType>(V->getType());
+  const unsigned SimdWidth = VecTy->getNumElements();
+
+  SimdPacket *P = getPacket(V, SimdWidth, false);
+  assert(P);
+
+  // Have to build after any PHI nodes.
+  while (isa<PHINode>(insert)) {
+    insert = insert->getNextNode();
+  }
+  IRBuilder<> B(insert);
+
+  // If every element in the packet is the same, create a vector splat instead
+  // of individually inserting every element.
+  Value *const splat = [](SimdPacket &P) -> Value * {
+    Value *const first = P.at(0);
+    for (unsigned i = 1; i < P.size(); i++) {
+      if (P.at(i) != first) {
+        return nullptr;
+      }
+    }
+    return first;
+  }(*P);
+  if (splat) {
+    return Cache =
+               B.CreateVectorSplat(ElementCount::getFixed(P->size()), splat);
+  }
+
+  Value *Result = PoisonValue::get(V->getType());
+  for (unsigned i = 0; i < P->size(); i++) {
+    if (auto *At = P->at(i)) {
+      if (!isa<UndefValue>(At)) {
+        Result = B.CreateInsertElement(Result, At, B.getInt32(i));
+      }
+    }
+  }
+
+  Cache = Result;
+  return Result;
+}
+
+void Scalarizer::setNeedsScalarization(Value *V) {
+  // Only mark each value once, but preserve the order
+  if (ScalarizeSet.insert(V).second) {
+    ToScalarize.push_back(V);
+  }
+}
+
+bool Scalarizer::scalarizeAll() {
+  // scalar instructions that use values to be scalarized.
+  for (Value *V : ToScalarize) {
+    auto *VecTy = getVectorType(V);
+    assert(VecTy && "Trying to scalarize a non-vector");
+    const unsigned SimdWidth = VecTy->getNumElements();
+    // In the SimdPacket we use a mask that is stored as a uint64_t. Due
+    // to that, there is a limit on the vector size that Vecz can
+    // handle.
+    VECZ_ERROR_IF(SimdWidth > MAX_SIMD_WIDTH, "The SIMD width is too large");
+
+    PacketMask PM;
+    PM.enableAll(SimdWidth);
+    if (!scalarize(V, PM)) {
+      return false;
+    }
+  }
+
+  // Beware of instructions not being processed strictly in dominance order.
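+  // (In other words: collect every scalar leaf in a first walk, then fix up
+  // their operands in a second walk, so the rewrite does not rely on defs
+  // being visited before uses.)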
+  DenseSet<Instruction *> ScalarLeaves;
+  for (Value *V : ToScalarize) {
+    if (Failures.contains(V)) {
+      continue;
+    }
+
+    // Any user of a scalarized instruction that is not itself scalarized needs
+    // its operands fixing up to use the scalarized versions.
+    for (auto *U : V->users()) {
+      if (auto *I = dyn_cast<Instruction>(U)) {
+        if (!ScalarizeSet.contains(I)) {
+          ScalarLeaves.insert(I);
+        }
+      }
+    }
+  }
+
+  for (Instruction *I : ScalarLeaves) {
+    if (!scalarizeOperands(I)) {
+      emitVeczRemarkMissed(&F, I, "Could not scalarize");
+      return false;
+    }
+  }
+
+  IC.deleteInstructions();
+  return true;
+}
+
+Value *Scalarizer::scalarizeOperands(Instruction *I) {
+  // Vector extractions.
+  if (ExtractElementInst *Extract = dyn_cast<ExtractElementInst>(I)) {
+    // In the SimdPacket we use a mask that is stored as a uint64_t. Due to
+    // that, there is a limit on the vector size that Vecz can handle.
+    VECZ_ERROR_IF(multi_llvm::getVectorNumElements(
+                      Extract->getVectorOperandType()) > MAX_SIMD_WIDTH,
+                  "The SIMD width is too large");
+    return scalarizeOperandsExtractElement(Extract);
+  }
+
+  // Vector -> non-vector bitcasts.
+  if (BitCastInst *BC = dyn_cast<BitCastInst>(I)) {
+    if (BC->getSrcTy()->isVectorTy() && !BC->getDestTy()->isVectorTy()) {
+      // In the SimdPacket we use a mask that is stored as a uint64_t. Due to
+      // that, there is a limit on the vector size that Vecz can handle.
+      VECZ_ERROR_IF(multi_llvm::getVectorNumElements(BC->getSrcTy()) >
+                        MAX_SIMD_WIDTH,
+                    "The SIMD width is too large");
+      return scalarizeOperandsBitCast(BC);
+    }
+  }
+
+  // printf or reduction intrinsic calls
+  if (CallInst *CI = dyn_cast<CallInst>(I)) {
+    Function *Callee = CI->getCalledFunction();
+    VECZ_STAT_FAIL_IF(!Callee, VeczScalarizeFailCall);
+
+    // printf calls:
+    if (!Callee->isIntrinsic()) {
+      // Check if this is indeed a printf call
+      const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+      if (auto B = BI.analyzeBuiltin(*Callee)) {
+        if (B->ID == BI.getPrintfBuiltin()) {
+          return scalarizeOperandsPrintf(CI);
+        }
+      }
+    }
+
+    // reduction intrinsics:
+    if (auto *Intrin = dyn_cast<IntrinsicInst>(CI)) {
+      if (auto *reduce = scalarizeReduceIntrinsic(Intrin)) {
+        return reduce;
+      }
+    }
+  }
+
+  // No special-case handling, so just gather any scalarized operands
+  for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
+    auto *Op = I->getOperand(i);
+    if (ScalarizeSet.contains(Op)) {
+      I->setOperand(i, getGather(Op));
+    }
+  }
+
+  return I;
+}
+
+Value *Scalarizer::scalarizeOperandsPrintf(CallInst *CI) {
+  VECZ_STAT_FAIL_IF(CI->arg_empty(), VeczScalarizeFailPrintf);
+
+  // Get the format string as a string
+  GlobalVariable *FmtStringGV = GetFormatStringAsValue(CI->getArgOperand(0));
+  VECZ_STAT_FAIL_IF(!FmtStringGV, VeczScalarizeFailCall);
+  const std::string FmtString = GetFormatStringAsString(FmtStringGV);
+  VECZ_STAT_FAIL_IF(FmtString.empty(), VeczScalarizeFailCall);
+  std::string NewFmtString;
+  const EnumPrintfError err =
+      ScalarizeAndCheckFormatString(FmtString, NewFmtString);
+  // Check that the format string was scalarized successfully
+  VECZ_STAT_FAIL_IF(err != kPrintfError_success, VeczScalarizeFailCall);
+
+  // Create a new global variable out of the new format string
+  GlobalVariable *NewFmtStringGV = GetNewFormatStringAsGlobalVar(
+      *CI->getModule(), FmtStringGV, NewFmtString);
+
+  IRBuilder<> B(CI);
+  // Gather the operands for the new printf call, taking care to scalarize
+  // any vector operands.
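+  // For example (illustrative only): an OpenCL call such as
+  //   printf("%v4f\n", v);
+  // has at this point had its format string rewritten to "%f,%f,%f,%f\n",
+  // so each lane of v is passed below as a separate scalar argument.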
+ llvm::SmallVector NewOps; + for (const Use &Op : CI->args()) { + // The first operand is the new format string + if (Op == *CI->arg_begin()) { + Constant *Zero = B.getInt32(0); + NewOps.push_back(B.CreateGEP(NewFmtStringGV->getValueType(), + NewFmtStringGV, {Zero, Zero})); + continue; + } + // The rest of the operands can either be copied or scalarized + if (!Op->getType()->isVectorTy()) { + // Non-vector operand, just copy + NewOps.push_back(Op.get()); + } else { + // Vector operand, scalarize + // In the SimdPacket we use a mask that is stored as a uint64_t. Due + // to that, there is a limit on the vector size that Vecz can handle. + const uint32_t SimdWidth = + multi_llvm::getVectorNumElements(Op->getType()); + VECZ_ERROR_IF(SimdWidth > MAX_SIMD_WIDTH, "The SIMD width is too large"); + PacketMask PM; + PM.enableAll(SimdWidth); + const SimdPacket *OpPacket = scalarize(Op.get(), PM); + VECZ_STAT_FAIL_IF(!OpPacket, VeczScalarizeFailCall); + for (unsigned i = 0; i < OpPacket->size(); ++i) { + Value *Lane = OpPacket->at(i); + VECZ_STAT_FAIL_IF(!Lane, VeczScalarizeFailCall); + // We need to promote half and floats to doubles, as per 6.5.2.2/6 + // in the C99 standard, but not if the device does not have double + // support, in which case we need to promote them to floats, as per + // 6.12.13.2 in the OpenCL 1.2 standard. + Type *LaneTy = Lane->getType(); + Type *PromotionType = DoubleSupport ? B.getDoubleTy() : B.getFloatTy(); + if (LaneTy->isFloatingPointTy() && + LaneTy->getPrimitiveSizeInBits() < + PromotionType->getPrimitiveSizeInBits()) { + VECZ_ERROR_IF(!LaneTy->isHalfTy() && !LaneTy->isFloatTy(), + "Unexpected floating point type"); + Lane = B.CreateFPExt(Lane, PromotionType); + } + NewOps.push_back(Lane); + } + } + } + // Create the new printf call + Function *Callee = CI->getCalledFunction(); + CallInst *NewCI = B.CreateCall(Callee, NewOps, CI->getName()); + NewCI->setCallingConv(CI->getCallingConv()); + NewCI->setAttributes(CI->getAttributes()); + + // Replace all uses of the old one with the new one + CI->replaceAllUsesWith(NewCI); + IC.deleteInstructionLater(CI); + + return NewCI; +} + +Value *Scalarizer::scalarizeReduceIntrinsic(IntrinsicInst *Intrin) { + // Mark unhandled reduce intrinsics to fail (for now) + bool isHandled = true; + Instruction::BinaryOps BinOpcode; + switch (Intrin->getIntrinsicID()) { + default: + isHandled = false; + break; + case Intrinsic::vector_reduce_and: + BinOpcode = Instruction::And; + break; + case Intrinsic::vector_reduce_or: + BinOpcode = Instruction::Or; + break; + case Intrinsic::vector_reduce_xor: + BinOpcode = Instruction::Xor; + break; + case Intrinsic::vector_reduce_add: + // TODO: Need to handle FP reduce_add (Instruction::FAdd) + if (!Intrin->getType()->isFloatTy()) { + BinOpcode = Instruction::Add; + } else { + isHandled = false; + } + break; + case Intrinsic::vector_reduce_mul: + // TODO: Need to handle FP reduce_mul (Instruction::FMul) + if (!Intrin->getType()->isFloatTy()) { + BinOpcode = Instruction::Mul; + } else { + isHandled = false; + } + break; + case Intrinsic::vector_reduce_fadd: + // TODO: Need to handle FP reduce_add + isHandled = false; + break; + case Intrinsic::vector_reduce_fmul: + // TODO: Need to handle FP reduce_mul + isHandled = false; + break; + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_smax: + case Intrinsic::vector_reduce_umax: + // TODO: Need to handle Int (signed/unsigned) Max and FP Max + isHandled = false; + break; + case Intrinsic::vector_reduce_fmin: + case 
Intrinsic::vector_reduce_smin:
+    case Intrinsic::vector_reduce_umin:
+      // TODO: Need to handle Int (signed/unsigned) Min and FP Min
+      isHandled = false;
+      break;
+  }
+  // If it's an intrinsic we don't handle here, return nullptr and fallback
+  // to simple gathering of any scalarized operands.
+  if (!isHandled) {
+    return nullptr;
+  }
+
+  // We still need to handle reduce intrinsics with more than one operand,
+  // such as 'fadd' and 'fmul', where the first operand is a scalar and the
+  // second is the vector. However, the current scalarization analysis won't
+  // let these through and will fail, so the reduce intrinsic scalarization
+  // takes into account only the first (vector) operand, which is the only
+  // operand in the integer reduce cases.
+  Value *Vec = Intrin->getOperand(0);
+  assert(Vec && "Could not get operand 0 of Intrin");
+
+  // In the SimdPacket we use a mask that is stored as a uint64_t. Due to
+  // that, there is a limit on the vector size that Vecz can handle.
+  auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+  VECZ_FAIL_IF(!VecTy);
+  const uint32_t SimdWidth = VecTy->getNumElements();
+  VECZ_ERROR_IF(SimdWidth > MAX_SIMD_WIDTH, "The SIMD width is too large");
+
+  PacketMask PM;
+  IRBuilder<> B(Intrin);
+  PM.enableAll(SimdWidth);
+
+  const SimdPacket *Packet = scalarize(Vec, PM);
+  VECZ_STAT_FAIL_IF(!Packet, VeczScalarizeFailReduceIntrinsic);
+
+  Type *const VecEleTy = VecTy->getElementType();
+  Value *Result = ConstantInt::getNullValue(VecEleTy);
+  for (unsigned i = 0; i < Packet->size(); ++i) {
+    Value *const Lane = Packet->at(i);
+    VECZ_STAT_FAIL_IF(!Lane, VeczScalarizeFailCall);
+    Type *const LaneTy = Lane->getType();
+    VECZ_ERROR_IF(LaneTy->isFloatTy(), "Unexpected floating point type");
+    Result = B.CreateBinOp(BinOpcode, Result, Lane);
+  }
+
+  Intrin->replaceAllUsesWith(Result);
+  IC.deleteInstructionLater(Intrin);
+
+  return Result;
+}
+
+Value *Scalarizer::scalarizeOperandsExtractElement(ExtractElementInst *Extr) {
+  // Determine the extraction index.
+  Value *OrigVec = Extr->getOperand(0);
+  Value *ExtractIndex = Extr->getOperand(1);
+  assert(OrigVec && "Could not get operand 0 of Extr");
+  assert(ExtractIndex && "Could not get operand 1 of Extr");
+  ConstantInt *ConstantExtractIndex = dyn_cast<ConstantInt>(ExtractIndex);
+  PacketMask PM;
+  SimdPacket *OrigVecPacket;
+  Value *ReturnVal;
+
+  if (!ConstantExtractIndex) {
+    // Index of extractElementInst is not a constant
+    // Scalarize the original vector for all lanes.
+    auto *Vec = dyn_cast<FixedVectorType>(OrigVec->getType());
+    const unsigned VecWidth = Vec ? Vec->getNumElements() : 0;
+    PM.enableAll(VecWidth);
+    OrigVecPacket = scalarize(OrigVec, PM);
+    VECZ_FAIL_IF(!OrigVecPacket);
+
+    IRBuilder<> B(Extr);
+    Value *Select = PoisonValue::get(Extr->getType());
+    for (unsigned lane = 0; lane < VecWidth; lane++) {
+      // Check if the lane matches the extract index and select
+      // the corresponding value
+      Value *Cmp = B.CreateICmpEQ(
+          ConstantInt::get(ExtractIndex->getType(), lane), ExtractIndex);
+      Select = B.CreateSelect(Cmp, OrigVecPacket->at(lane), Select);
+    }
+    ReturnVal = Select;
+  } else {
+    // Scalarize the original vector, but only for the lane to extract.
+    const unsigned Lane = ConstantExtractIndex->getZExtValue();
+    PM.enable(Lane);
+    OrigVecPacket = scalarize(OrigVec, PM);
+    VECZ_FAIL_IF(!OrigVecPacket);
+    ReturnVal = OrigVecPacket->at(Lane);
+  }
+
+  // Replace the extraction with the extracted lane value.
+ Extr->replaceAllUsesWith(ReturnVal); + IC.deleteInstructionLater(Extr); + return ReturnVal; +} + +Value *Scalarizer::scalarizeOperandsBitCast(BitCastInst *BC) { + auto *VecSrcTy = dyn_cast(BC->getSrcTy()); + VECZ_FAIL_IF(!VecSrcTy); + const unsigned SimdWidth = VecSrcTy->getNumElements(); + PacketMask PM; + PM.enableAll(SimdWidth); + const SimdPacket *SrcPacket = scalarize(BC->getOperand(0), PM); + VECZ_FAIL_IF(!SrcPacket); + + Type *DstTy = BC->getDestTy(); + Type *DstAsIntTy = DstTy; + Type *SrcEleTy = VecSrcTy->getElementType(); + Type *SrcEleAsIntTy = SrcEleTy; + const uint64_t SrcEleBits = SrcEleTy->getScalarSizeInBits(); + const uint64_t DstBits = DstTy->getPrimitiveSizeInBits(); + if (!DstTy->isIntegerTy()) { + DstAsIntTy = IntegerType::get(BC->getContext(), DstBits); + } + if (!SrcEleTy->isIntegerTy()) { + SrcEleAsIntTy = IntegerType::get(BC->getContext(), SrcEleBits); + } + + // Successively OR each scalarized value together. + IRBuilder<> B(BC); + Value *Result = ConstantInt::getNullValue(DstAsIntTy); + for (unsigned i = 0; i < SimdWidth; i++) { + Value *Lane = SrcPacket->at(i); + if (!SrcEleTy->isIntegerTy()) { + Lane = B.CreateBitCast(Lane, SrcEleAsIntTy); + } + Lane = B.CreateZExt(Lane, DstAsIntTy); + Lane = B.CreateShl(Lane, i * SrcEleBits); + Result = B.CreateOr(Result, Lane); + } + if (!DstTy->isIntegerTy()) { + Result = B.CreateBitCast(Result, DstTy); + } + BC->replaceAllUsesWith(Result); + IC.deleteInstructionLater(BC); + return Result; +} + +SimdPacket *Scalarizer::scalarize(Value *V, PacketMask PM) { + auto *VecTy = getVectorType(V); + VECZ_ERROR_IF(!VecTy, + "We shouldn't be trying to scalarize a non-vector instruction"); + const unsigned SimdWidth = VecTy->getNumElements(); + + // Re-use cached packets, but make sure it contains all the lanes we want. + // If we have a cached packet with missing lanes, it will be fetched by + // getPacket and filled with the new lanes. + SimdPacket *CachedPacket = getPacket(V, SimdWidth, false); + if (CachedPacket && ((CachedPacket->Mask.Value & PM.Value) == PM.Value)) { + return CachedPacket; + } + + // This value hasn't been scheduled for scalarization, so extract instead + if (!V->getType()->isVoidTy() && !ScalarizeSet.contains(V)) { + return extractLanes(V, PM); + } + + // Only instructions can be scalarized at this point. + Instruction *Ins = dyn_cast(V); + if (!Ins) { + if (!V->getType()->isVoidTy()) { + return extractLanes(V, PM); + } else { + return assignScalar(nullptr, V); + } + } + + // Figure out what kind of instruction it is and try to scalarize it. 
+ SimdPacket *Result = nullptr; + switch (Ins->getOpcode()) { + default: + if (Ins->isBinaryOp()) { + Result = scalarizeBinaryOp(cast(V), PM); + } else if (Ins->isCast()) { + Result = scalarizeCast(cast(V), PM); + } else if (Ins->isUnaryOp()) { + Result = scalarizeUnaryOp(cast(V), PM); + } + break; + case Instruction::GetElementPtr: + Result = scalarizeGEP(cast(V), PM); + break; + case Instruction::Store: + Result = scalarizeStore(cast(V), PM); + break; + case Instruction::Load: + Result = scalarizeLoad(cast(V), PM); + break; + case Instruction::Call: + Result = scalarizeCall(cast(V), PM); + break; + case Instruction::ICmp: + Result = scalarizeICmp(cast(V), PM); + break; + case Instruction::FCmp: + Result = scalarizeFCmp(cast(V), PM); + break; + case Instruction::Select: + Result = scalarizeSelect(cast(V), PM); + break; + case Instruction::ShuffleVector: + Result = scalarizeShuffleVector(cast(V), PM); + break; + case Instruction::InsertElement: + Result = scalarizeInsertElement(cast(V), PM); + break; + case Instruction::PHI: + Result = scalarizePHI(cast(V), PM); + break; + // Freeze instruction is not available in LLVM versions prior 10.0 + // and not used in LLVM versions prior to 11.0 + case Instruction::Freeze: + Result = scalarizeFreeze(cast(V), PM); + break; + } + + if (Result) { + scalarizeDI(Ins, Result, SimdWidth); + return assignScalar(Result, V); + } else { + // If an instruction couldn't be scalarized, we can just extract its + // elements, but we also need to remove it from the scalarization set and + // add it to the failures set so any scalar leaves don't try to scalarize + // it again. + ScalarizeSet.erase(Ins); + Failures.insert(Ins); + return extractLanes(V, PM); + } +} + +SimdPacket *Scalarizer::extractLanes(llvm::Value *V, PacketMask PM) { + auto *VecTy = getVectorType(V); + VECZ_FAIL_IF(!VecTy); + const unsigned SimdWidth = VecTy->getNumElements(); + SimdPacket *P = getPacket(V, SimdWidth); + + if (Constant *CVec = dyn_cast(V)) { + assert(isa(CVec->getType()) && "Invalid constant type!"); + SimdPacket *P = getPacket(CVec, SimdWidth); + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + P->set(i, CVec->getAggregateElement(i)); + } + return P; + } + + Instruction *insert = nullptr; + + if (auto *Arg = dyn_cast(V)) { + BasicBlock &Entry = Arg->getParent()->getEntryBlock(); + + // Make sure we start inserting new instructions after any allocas + auto insertAfter = Entry.begin(); + + while (isa(*insertAfter)) { + insertAfter++; + } + insert = &*insertAfter; + } else if (auto *Inst = dyn_cast(V)) { + insert = Inst->getNextNode(); + while (isa(insert)) { + insert = insert->getNextNode(); + } + } else { + return nullptr; + } + + const SimplifyQuery Q(F.getParent()->getDataLayout()); + + IRBuilder<> B(insert); + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + + Value *Idx = B.getInt32(i); + Value *Extract = simplifyExtractElementInst(V, Idx, Q); + if (!Extract) { + Extract = B.CreateExtractElement(V, Idx); + } + P->set(i, Extract); + } + return P; +} + +void Scalarizer::scalarizeDI(Instruction *Original, const SimdPacket *Packet, + unsigned Width) { + // Don't support scalarizing PHI nodes + if (!Packet || !Original || isa(Original)) { + return; + } + + auto *const LAM = LocalAsMetadata::getIfExists(Original); + if (!LAM) { + return; + } + + // Contains processed SIMD values for which we create scalar debug + // instructions and is used to avoid duplicate LLVM dbg.value's. 
+ SmallPtrSet VectorElements; + + DIBuilder DIB(*Original->getModule(), false); + + for (DbgVariableRecord *const DVR : LAM->getAllDbgVariableRecordUsers()) { + DILocalVariable *DILocal = nullptr; + DebugLoc DILoc; + + switch (DVR->getType()) { + case DbgVariableRecord::LocationType::Value: + case DbgVariableRecord::LocationType::Declare: + DILocal = DVR->getVariable(); + DILoc = DVR->getDebugLoc(); + break; + default: + continue; + } + + // Create new DbgVariableRecord across enabled SIMD lanes + const auto bitSize = Original->getType()->getScalarSizeInBits(); + for (unsigned lane = 0; lane < Width; ++lane) { + Value *LaneVal = Packet->at(lane); + if (LaneVal && !isa(LaneVal)) { + // Check if the LaneVal SIMD Value is already processed + // and a Debug Value Intrinsic has been created for it. + if (VectorElements.contains(LaneVal)) { + continue; + } + // DWARF bit piece expressions are used to describe part of an + // aggregate variable, our vector, which is fragmented across multiple + // values. First argument takes the offset of the piece, and the second + // takes the piece size. + std::optional DIExpr = + DIExpression::createFragmentExpression(DIB.createExpression(), + lane * bitSize, bitSize); + if (DIExpr) { + DIB.insertDbgValueIntrinsic(LaneVal, DILocal, *DIExpr, DILoc, + Original->getIterator()); + VectorElements.insert(LaneVal); + } + } + } + } + + auto *const MDV = MetadataAsValue::getIfExists(Original->getContext(), LAM); + if (!MDV) { + return; + } +} + +SimdPacket *Scalarizer::assignScalar(SimdPacket *P, Value *V) { + if (!P) { + emitVeczRemarkMissed(&F, V, "Could not scalarize"); + } else { + ++VeczScalarized; + if (Instruction *I = dyn_cast(V)) { + IC.deleteInstructionLater(I); + } + } + return P; +} + +SimdPacket *Scalarizer::scalarizeLoad(LoadInst *Load, PacketMask PM) { + Value *PtrBase = Load->getPointerOperand(); + auto *VecDataTy = dyn_cast(Load->getType()); + VECZ_FAIL_IF(!VecDataTy); + const unsigned SimdWidth = VecDataTy->getNumElements(); + + Type *ScalarEleTy = VecDataTy->getElementType(); + + // Absorb redundant bitcasts + GetElementPtrInst *PtrGEP = dyn_cast(PtrBase); + const bool InBounds = (PtrGEP && PtrGEP->isInBounds()); + + IRBuilder<> B(Load); + + SimdPacket PtrPacket; + SimdPacket *P = getPacket(Load, SimdWidth); + PtrPacket.resize(SimdWidth); + + // Emit scalarized pointers. + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || PtrPacket.at(i)) { + continue; + } + + // Re-use GEPs if available + if (P->at(i)) { + LoadInst *LoadI = cast(P->at(i)); + Value *PtrI = LoadI->getPointerOperand(); + if (isa(PtrI)) { + PtrPacket.set(i, PtrI); + continue; + } + } + + Value *Ptr = InBounds + ? B.CreateInBoundsGEP(ScalarEleTy, PtrBase, B.getInt32(i)) + : B.CreateGEP(ScalarEleTy, PtrBase, B.getInt32(i)); + PtrPacket.set(i, Ptr); + } + + // The individual elements may need laxer alignment requirements than the + // whole vector. + const unsigned Alignment = Load->getAlign().value(); + unsigned EleAlign = ScalarEleTy->getPrimitiveSizeInBits() / 8; + EleAlign = std::min(Alignment, EleAlign); + + // Emit scalarized loads. 
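+  // Illustrative example: a load of <4 x i32> from %p becomes, per enabled
+  // lane i,
+  //   %gep.i = getelementptr i32, ptr %p, i32 i
+  //   %ld.i  = load i32, ptr %gep.i
+  // with the element alignment clamped to min(vector alignment, 4).
+  // (Names are illustrative only.)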
+ for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + LoadInst *NewLoad = B.CreateLoad(ScalarEleTy, PtrPacket.at(i), + Load->isVolatile(), Load->getName()); + + NewLoad->copyMetadata(*Load); + NewLoad->setAlignment(MaybeAlign(EleAlign).valueOrOne()); + + P->set(i, NewLoad); + } + return P; +} + +SimdPacket *Scalarizer::scalarizeStore(StoreInst *Store, PacketMask PM) { + Value *PtrBase = Store->getPointerOperand(); + assert(PtrBase && "Could not get pointer operand from Store"); + auto *VecDataTy = + dyn_cast(Store->getValueOperand()->getType()); + VECZ_FAIL_IF(!VecDataTy); + const unsigned SimdWidth = VecDataTy->getNumElements(); + Type *ScalarEleTy = VecDataTy->getElementType(); + Value *VectorData = Store->getValueOperand(); + + // Emit scalarized data values. + const SimdPacket *DataPacket = scalarize(VectorData, PM); + VECZ_FAIL_IF(!DataPacket); + + GetElementPtrInst *PtrGEP = dyn_cast(PtrBase); + const bool InBounds = (PtrGEP && PtrGEP->isInBounds()); + + IRBuilder<> B(Store); + + SimdPacket PtrPacket; + SimdPacket *P = getPacket(Store, SimdWidth); + PtrPacket.resize(SimdWidth); + + // Emit scalarized pointers. + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || PtrPacket.at(i)) { + continue; + } + + // Re-use GEPs if available + if (P->at(i)) { + StoreInst *StoreI = cast(P->at(i)); + Value *PtrI = StoreI->getPointerOperand(); + if (isa(PtrI)) { + PtrPacket.set(i, PtrI); + continue; + } + } + + Value *Ptr = InBounds + ? B.CreateInBoundsGEP(ScalarEleTy, PtrBase, B.getInt32(i)) + : B.CreateGEP(ScalarEleTy, PtrBase, B.getInt32(i)); + PtrPacket.set(i, Ptr); + } + + // See comment at equivalent part of scalarizeLoad() + const unsigned Alignment = Store->getAlign().value(); + unsigned EleAlign = ScalarEleTy->getPrimitiveSizeInBits() / 8; + EleAlign = std::min(Alignment, EleAlign); + + // Emit scalarized stores. 
+ for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + Value *Data = DataPacket->at(i); + if (isa(Data)) { + P->set(i, Data); + } else { + StoreInst *NewStore = + B.CreateStore(Data, PtrPacket.at(i), Store->isVolatile()); + + NewStore->copyMetadata(*Store); + NewStore->setAlignment(MaybeAlign(EleAlign).valueOrOne()); + + P->set(i, NewStore); + } + } + return P; +} + +SimdPacket *Scalarizer::scalarizeBinaryOp(BinaryOperator *BinOp, + PacketMask PM) { + IRBuilder<> B(BinOp); + Value *LHS = BinOp->getOperand(0); + auto *VecDataTy = dyn_cast(LHS->getType()); + VECZ_FAIL_IF(!VecDataTy); + const unsigned SimdWidth = VecDataTy->getNumElements(); + const SimdPacket *LHSPacket = scalarize(LHS, PM); + VECZ_FAIL_IF(!LHSPacket); + Value *RHS = BinOp->getOperand(1); + const SimdPacket *RHSPacket = scalarize(RHS, PM); + VECZ_FAIL_IF(!RHSPacket); + SimdPacket *P = getPacket(BinOp, SimdWidth); + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + Value *New = B.CreateBinOp(BinOp->getOpcode(), LHSPacket->at(i), + RHSPacket->at(i), BinOp->getName()); + if (BinaryOperator *NewBinOp = dyn_cast(New)) { + NewBinOp->copyIRFlags(BinOp); + } + P->set(i, New); + } + return P; +} + +// Freeze instruction is not available in LLVM versions prior 10.0 +// and not used in LLVM versions prior to 11.0 +SimdPacket *Scalarizer::scalarizeFreeze(FreezeInst *FreezeI, PacketMask PM) { + IRBuilder<> B(FreezeI); + Value *Src = FreezeI->getOperand(0); + auto *VecDataTy = dyn_cast(Src->getType()); + VECZ_FAIL_IF(!VecDataTy); + const unsigned SimdWidth = VecDataTy->getNumElements(); + const SimdPacket *SrcPacket = scalarize(Src, PM); + VECZ_FAIL_IF(!SrcPacket); + + // Create scalarized freeze. + SimdPacket *P = getPacket(FreezeI, SimdWidth); + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + Value *New = B.CreateFreeze(SrcPacket->at(i), FreezeI->getName()); + P->set(i, New); + } + return P; +} + +SimdPacket *Scalarizer::scalarizeUnaryOp(UnaryOperator *UnOp, PacketMask PM) { + IRBuilder<> B(UnOp); + Value *Src = UnOp->getOperand(0); + auto *VecDataTy = dyn_cast(Src->getType()); + VECZ_FAIL_IF(!VecDataTy); + const unsigned SimdWidth = VecDataTy->getNumElements(); + const SimdPacket *SrcPacket = scalarize(Src, PM); + VECZ_FAIL_IF(!SrcPacket); + SimdPacket *P = getPacket(UnOp, SimdWidth); + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + Value *New = + B.CreateUnOp(UnOp->getOpcode(), SrcPacket->at(i), UnOp->getName()); + if (UnaryOperator *NewUnOp = dyn_cast(New)) { + NewUnOp->copyIRFlags(UnOp); + } + P->set(i, New); + } + return P; +} + +SimdPacket *Scalarizer::scalarizeCast(CastInst *CastI, PacketMask PM) { + // Make sure we support the cast operation. + const CastInst::CastOps Opc = CastI->getOpcode(); + switch (Opc) { + default: + return nullptr; + case CastInst::BitCast: + return scalarizeBitCast(cast(CastI), PM); + case CastInst::Trunc: + case CastInst::ZExt: + case CastInst::SExt: + case CastInst::FPToUI: + case CastInst::FPToSI: + case CastInst::UIToFP: + case CastInst::SIToFP: + case CastInst::FPTrunc: + case CastInst::FPExt: + case CastInst::AddrSpaceCast: + break; + } + + // Scalarize the source value. 
+ IRBuilder<> B(CastI); + Value *Src = CastI->getOperand(0); + auto *VecSrcTy = dyn_cast(Src->getType()); + VECZ_FAIL_IF(!VecSrcTy); + const unsigned SimdWidth = VecSrcTy->getNumElements(); + auto *VecDstTy = dyn_cast(CastI->getType()); + VECZ_STAT_FAIL_IF(!VecDstTy || (VecDstTy->getNumElements() != SimdWidth), + VeczScalarizeFailCast); + const SimdPacket *SrcPacket = scalarize(Src, PM); + VECZ_FAIL_IF(!SrcPacket); + + // Create scalarized casts. + SimdPacket *P = getPacket(CastI, SimdWidth); + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + auto *const SrcPacketValue = SrcPacket->at(i); + VECZ_FAIL_IF(!SrcPacketValue); + Value *New = B.CreateCast(Opc, SrcPacketValue, VecDstTy->getElementType(), + CastI->getName()); + P->set(i, New); + } + return P; +} + +SimdPacket *Scalarizer::scalarizeBitCast(BitCastInst *BC, PacketMask PM) { + IRBuilder<> B(BC); + Type *SrcTy = BC->getSrcTy(); + Value *Src = BC->getOperand(0); + auto *VecSrcTy = dyn_cast(SrcTy); + auto *VecDstTy = dyn_cast(BC->getDestTy()); + VECZ_FAIL_IF(!VecDstTy); + const unsigned SimdWidth = VecDstTy->getNumElements(); + const bool Vec3Src = VecSrcTy && (VecSrcTy->getNumElements() == 3); + const bool Vec3Dst = (SimdWidth == 3); + VECZ_STAT_FAIL_IF(Vec3Src ^ Vec3Dst, VeczScalarizeFailBitcast); + + // Handle non-vector -> vector casts and vector casts with different widths. + if (!VecSrcTy || (VecSrcTy->getNumElements() != SimdWidth)) { + VECZ_FAIL_IF(BC->getModule()->getDataLayout().isBigEndian()); + + // Treat scalars as vectors of length 1. + SimdPacket SrcScalar{Src}; + SimdPacket &S = + VecSrcTy ? *getPacket(Src, VecSrcTy->getNumElements()) : SrcScalar; + Type *const SrcEleTy = VecSrcTy ? VecSrcTy->getElementType() : SrcTy; + // Source element need not be a primitive if it was a non-vector, but in + // that case we know the size must match the destination vector type. + const size_t SrcEleSize = VecSrcTy ? SrcEleTy->getPrimitiveSizeInBits() + : VecDstTy->getPrimitiveSizeInBits(); + Type *const SrcEleIntTy = + SrcEleTy->isIntegerTy() + ? SrcEleTy + : SrcEleTy->getIntNTy(BC->getContext(), + SrcEleTy->getPrimitiveSizeInBits()); + Type *const DstEleTy = VecDstTy->getElementType(); + const size_t DstEleSize = DstEleTy->getPrimitiveSizeInBits(); + Type *const DstEleIntTy = + DstEleTy->isIntegerTy() + ? 
DstEleTy
+            : DstEleTy->getIntNTy(BC->getContext(),
+                                  DstEleTy->getPrimitiveSizeInBits());
+    SimdPacket *P = getPacket(BC, SimdWidth);
+    PacketMask SPM;
+    for (unsigned i = 0; i < SimdWidth; i++) {
+      if (!PM.isEnabled(i) || P->at(i)) {
+        continue;
+      }
+      if (VecSrcTy) {
+        for (unsigned j = i * DstEleSize / SrcEleSize;
+             j * SrcEleSize < (i + 1) * DstEleSize; ++j) {
+          SPM.enable(j);
+        }
+        const SimdPacket *SrcPacket = scalarize(Src, SPM);
+        VECZ_FAIL_IF(!SrcPacket);
+        assert(SrcPacket == &S &&
+               "Scalarization of Src should update existing packet");
+      }
+      Value *Lane = nullptr;
+      for (unsigned j = i * DstEleSize / SrcEleSize;
+           j * SrcEleSize < (i + 1) * DstEleSize; ++j) {
+        Value *SrcPart = S[j];
+        assert(
+            SrcPart &&
+            "Scalarization of Src failure should have been detected earlier");
+        if (SrcEleIntTy != SrcEleTy) {
+          SrcPart = B.CreateBitCast(SrcPart, SrcEleIntTy);
+        }
+        if (SrcEleIntTy->getIntegerBitWidth() <
+            DstEleIntTy->getIntegerBitWidth()) {
+          SrcPart = B.CreateZExt(SrcPart, DstEleIntTy);
+        }
+        if (i * DstEleSize > j * SrcEleSize) {
+          SrcPart = B.CreateLShr(SrcPart, (i * DstEleSize) - (j * SrcEleSize));
+        } else if (j * SrcEleSize > i * DstEleSize) {
+          SrcPart = B.CreateShl(SrcPart, (j * SrcEleSize) - (i * DstEleSize));
+        }
+        if (SrcEleIntTy->getIntegerBitWidth() >
+            DstEleIntTy->getIntegerBitWidth()) {
+          SrcPart = B.CreateTrunc(SrcPart, DstEleIntTy);
+        }
+        Lane = Lane ? B.CreateOr(Lane, SrcPart) : SrcPart;
+      }
+      assert(Lane && "No bits found for lane");
+      if (DstEleTy != DstEleIntTy) {
+        Lane = B.CreateBitCast(Lane, DstEleTy);
+      }
+      P->set(i, Lane);
+    }
+    return P;
+  }
+
+  // Handle same-width vector -> vector casts, a rather more straightforward
+  // affair.
+  const SimdPacket *SrcPacket = scalarize(Src, PM);
+  VECZ_FAIL_IF(!SrcPacket);
+  Type *DstEleTy = VecDstTy->getElementType();
+  SimdPacket *P = getPacket(BC, SimdWidth);
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || P->at(i)) {
+      continue;
+    }
+    Value *NewVal = B.CreateBitCast(SrcPacket->at(i), DstEleTy);
+    P->set(i, NewVal);
+  }
+  return P;
+}
+
+SimdPacket *Scalarizer::scalarizeICmp(ICmpInst *ICmp, PacketMask PM) {
+  IRBuilder<> B(ICmp);
+  Value *LHS = ICmp->getOperand(0);
+  auto *VecDataTy = dyn_cast<FixedVectorType>(ICmp->getType());
+  VECZ_FAIL_IF(!VecDataTy);
+  const unsigned SimdWidth = VecDataTy->getNumElements();
+  const SimdPacket *LHSPacket = scalarize(LHS, PM);
+  VECZ_FAIL_IF(!LHSPacket);
+  Value *RHS = ICmp->getOperand(1);
+  const SimdPacket *RHSPacket = scalarize(RHS, PM);
+  VECZ_FAIL_IF(!RHSPacket);
+  SimdPacket *P = getPacket(ICmp, SimdWidth);
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || P->at(i)) {
+      continue;
+    }
+    Value *New = B.CreateICmp(ICmp->getPredicate(), LHSPacket->at(i),
+                              RHSPacket->at(i), ICmp->getName());
+    P->set(i, New);
+  }
+  return P;
+}
+
+SimdPacket *Scalarizer::scalarizeFCmp(FCmpInst *FCmp, PacketMask PM) {
+  IRBuilder<> B(FCmp);
+  Value *LHS = FCmp->getOperand(0);
+  auto *VecDataTy = dyn_cast<FixedVectorType>(FCmp->getType());
+  VECZ_FAIL_IF(!VecDataTy);
+  const unsigned SimdWidth = VecDataTy->getNumElements();
+  const SimdPacket *LHSPacket = scalarize(LHS, PM);
+  VECZ_FAIL_IF(!LHSPacket);
+  Value *RHS = FCmp->getOperand(1);
+  const SimdPacket *RHSPacket = scalarize(RHS, PM);
+  VECZ_FAIL_IF(!RHSPacket);
+  SimdPacket *P = getPacket(FCmp, SimdWidth);
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || P->at(i)) {
+      continue;
+    }
+    Value *New = B.CreateFCmp(FCmp->getPredicate(), LHSPacket->at(i),
+                              RHSPacket->at(i), FCmp->getName());
+    P->set(i, New);
+  }
+ return P; +} + +SimdPacket *Scalarizer::scalarizeSelect(SelectInst *Select, PacketMask PM) { + IRBuilder<> B(Select); + Value *Cond = Select->getCondition(); + const SimdPacket *CondPacket = nullptr; + if (Cond->getType()->isVectorTy()) { + CondPacket = scalarize(Cond, PM); + VECZ_FAIL_IF(!CondPacket); + } + Value *TrueVal = Select->getTrueValue(); + auto *VecDataTy = dyn_cast(Select->getType()); + VECZ_FAIL_IF(!VecDataTy); + const unsigned SimdWidth = VecDataTy->getNumElements(); + const SimdPacket *TruePacket = scalarize(TrueVal, PM); + VECZ_FAIL_IF(!TruePacket); + Value *FalseVal = Select->getFalseValue(); + const SimdPacket *FalsePacket = scalarize(FalseVal, PM); + VECZ_FAIL_IF(!FalsePacket); + SimdPacket *P = getPacket(Select, SimdWidth); + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + Value *CondLane = CondPacket ? CondPacket->at(i) : Cond; + Value *New = B.CreateSelect(CondLane, TruePacket->at(i), FalsePacket->at(i), + Select->getName()); + P->set(i, New); + } + return P; +} + +SimdPacket *Scalarizer::scalarizeMaskedMemOp(CallInst *CI, PacketMask PM, + MemOp &MaskedOp) { + Function *Callee = CI->getCalledFunction(); + VECZ_STAT_FAIL_IF(!Callee, VeczScalarizeFailCall); + auto *VecDataTy = getVectorType(CI); + VECZ_FAIL_IF(!VecDataTy); + const unsigned SimdWidth = VecDataTy->getNumElements(); + assert((MaskedOp.isLoad() || MaskedOp.isStore()) && + "Masked op is not a store or load!"); + + // Scalarize mask + Value *MaskOperand = MaskedOp.getMaskOperand(); + VECZ_FAIL_IF(!MaskOperand); + const SimdPacket *MaskPacket = scalarize(MaskedOp.getMaskOperand(), PM); + VECZ_FAIL_IF(!MaskPacket); + + Value *PtrBase = MaskedOp.getPointerOperand(); + VECZ_FAIL_IF(!PtrBase); + + // Scalarize data packet if this is a store + const SimdPacket *DataPacket = nullptr; + if (MaskedOp.isStore()) { + DataPacket = scalarize(MaskedOp.getDataOperand(), PM); + VECZ_FAIL_IF(!DataPacket); + } + + Type *ScalarEleTy = VecDataTy->getElementType(); + + GetElementPtrInst *PtrGEP = dyn_cast(PtrBase); + const bool InBounds = (PtrGEP && PtrGEP->isInBounds()); + + IRBuilder<> B(CI); + + SimdPacket PtrPacket; + SimdPacket *P = getPacket(CI, SimdWidth); + PtrPacket.resize(SimdWidth); + + // Create scalar pointers + for (unsigned i = 0; i < SimdWidth; i++) { + if (!PM.isEnabled(i) || PtrPacket.at(i)) { + continue; + } + + Value *Ptr = InBounds + ? 
B.CreateInBoundsGEP(ScalarEleTy, PtrBase, B.getInt32(i))
+                     : B.CreateGEP(ScalarEleTy, PtrBase, B.getInt32(i));
+    PtrPacket.set(i, Ptr);
+  }
+
+  const unsigned Alignment = MaskedOp.getAlignment();
+  unsigned EleAlign = ScalarEleTy->getPrimitiveSizeInBits() / 8;
+  EleAlign = std::min(Alignment, EleAlign);
+
+  for (unsigned i = 0; i < SimdWidth; i++) {
+    if (!PM.isEnabled(i) || P->at(i)) {
+      continue;
+    }
+    Instruction *ScalarMemOp = nullptr;
+    if (MaskedOp.isLoad()) {
+      ScalarMemOp =
+          createMaskedLoad(Ctx, ScalarEleTy, PtrPacket.at(i), MaskPacket->at(i),
+                           /*EVL*/ nullptr, EleAlign);
+    } else {
+      ScalarMemOp = createMaskedStore(Ctx, DataPacket->at(i), PtrPacket.at(i),
+                                      MaskPacket->at(i),
+                                      /*EVL*/ nullptr, EleAlign);
+    }
+    VECZ_FAIL_IF(!ScalarMemOp);
+    B.Insert(ScalarMemOp);
+    P->set(i, ScalarMemOp);
+  }
+
+  return P;
+}
+
+SimdPacket *Scalarizer::scalarizeCall(CallInst *CI, PacketMask PM) {
+  compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+  Function *Callee = CI->getCalledFunction();
+  VECZ_STAT_FAIL_IF(!Callee, VeczScalarizeFailCall);
+  auto *VecDataTy = getVectorType(CI);
+  VECZ_FAIL_IF(!VecDataTy);
+  const unsigned SimdWidth = VecDataTy->getNumElements();
+
+  if (auto MaskedOp = MemOp::get(CI, MemOpAccessKind::Masked)) {
+    if (MaskedOp->isMaskedMemOp()) {
+      return scalarizeMaskedMemOp(CI, PM, *MaskedOp);
+    }
+  }
+
+  Value *VectorCallMask = nullptr;
+  if (Ctx.isMaskedFunction(Callee)) {
+    // We have a masked call to a function.
+    // Extract the mask from the call, we need to re-apply it later
+    VectorCallMask = CI->getArgOperand(CI->arg_size() - 1);
+
+    // Get the original function call from the masked wrapper function
+    Function *originalFunc = Ctx.getOriginalMaskedFunction(Callee);
+    Callee = originalFunc;
+  }
+
+  const auto Builtin = BI.analyzeBuiltin(*Callee);
+  VECZ_FAIL_IF(!Builtin);
+  Function *ScalarEquiv = BI.getScalarEquivalent(*Builtin, F.getParent());
+  VECZ_STAT_FAIL_IF(!ScalarEquiv, VeczScalarizeFailBuiltin);
+
+  IRBuilder<> B(CI);
+  const auto Props = Builtin->properties;
+  // Ignore the mask if present
+  const unsigned NumArgs = VectorCallMask ? CI->arg_size() - 1 : CI->arg_size();
+  SmallVector<SimdPacket *> OpPackets(NumArgs);
+  SmallVector<Value *> OpScalars(NumArgs);
+  for (unsigned i = 0; i < NumArgs; i++) {
+    Value *OrigOp = CI->getArgOperand(i);
+    Type *OldTy = OrigOp->getType();
+    if (OldTy->isVectorTy()) {
+      SimdPacket *OpPacket = scalarize(OrigOp, PM);
+      VECZ_FAIL_IF(!OpPacket);
+      OpPackets[i] = OpPacket;
+    } else if (PointerType *OldPtrTy = dyn_cast<PointerType>(OldTy)) {
+      auto *const PtrRetPointeeTy =
+          compiler::utils::getPointerReturnPointeeTy(*Callee, Props);
+      if (PtrRetPointeeTy && PtrRetPointeeTy->isVectorTy()) {
+        // Handle 'pointer return' arguments. The old type was Vector*, the new
+        // type is Scalar*. To accommodate the difference we need to have
+        // individual offsets, one for each 'element pointer'.
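+        // Illustrative example: for a builtin such as frexp(float4, int4 *e),
+        // lane j of the scalarized call receives
+        //   %e.j = getelementptr i32, ptr %e, i32 j
+        // so every scalar call writes through its own element pointer.
+        // (Builtin and names are illustrative only.)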
+ auto *OldVecTy = cast(PtrRetPointeeTy); + VECZ_STAT_FAIL_IF(OldVecTy->getNumElements() != SimdWidth, + VeczScalarizeFailBuiltin); + Type *NewTy = OldPtrTy; + Value *ScalarAddrBase = B.CreateBitCast(OrigOp, NewTy); + SimdPacket *OpPacket = getPacket(ScalarAddrBase, SimdWidth); + for (unsigned j = 0; j < SimdWidth; j++) { + if (!PM.isEnabled(j) || OpPacket->at(j)) { + continue; + } + Value *ScalarAddr = B.CreateGEP(OldVecTy->getElementType(), + ScalarAddrBase, B.getInt32(j)); + OpPacket->set(j, ScalarAddr); + OpPackets[i] = OpPacket; + } + } else { + OpScalars[i] = OrigOp; + } + } else { + OpScalars[i] = OrigOp; + } + } + + SimdPacket *P = getPacket(CI, SimdWidth); + for (unsigned j = 0; j < SimdWidth; j++) { + if (!PM.isEnabled(j) || P->at(j)) { + continue; + } + SmallVector Ops; + for (unsigned i = 0; i < NumArgs; i++) { + const SimdPacket *OpPacket = OpPackets[i]; + if (OpPacket) { + Ops.push_back(OpPacket->at(j)); + } else { + Value *OrigOp = OpScalars[i]; + VECZ_FAIL_IF(!OrigOp); + Ops.push_back(OrigOp); + } + } + + CallInst *NewCI = B.CreateCall(ScalarEquiv, Ops, CI->getName()); + NewCI->setCallingConv(CI->getCallingConv()); + NewCI->setAttributes(CI->getAttributes()); + // Re-apply mask. The new CI already has to exist to create the masked + // function which is why it gets updated here. We then need to add the + // mask argument back to the call, but LLVM won't let us update the existing + // one, so recreate the CallInst one last time + if (VectorCallMask) { + Function *MaskedScalarEquiv = Ctx.getOrCreateMaskedFunction(NewCI); + VECZ_FAIL_IF(!MaskedScalarEquiv); + Ops.push_back(VectorCallMask); + CallInst *NewCIMasked = + B.CreateCall(MaskedScalarEquiv, Ops, CI->getName()); + NewCIMasked->setCallingConv(CI->getCallingConv()); + NewCIMasked->setAttributes(CI->getAttributes()); + P->set(j, NewCIMasked); + NewCI->eraseFromParent(); + } else { + P->set(j, NewCI); + } + } + return P; +} + +SimdPacket *Scalarizer::scalarizeShuffleVector(ShuffleVectorInst *Shuffle, + PacketMask PM) { + auto *VecTy = dyn_cast(Shuffle->getType()); + VECZ_FAIL_IF(!VecTy); + Value *LHS = Shuffle->getOperand(0); + Value *RHS = Shuffle->getOperand(1); + assert(LHS && "Could not get operand 0"); + assert(RHS && "Could not get operand 1"); + auto *LHSVecTy = dyn_cast(LHS->getType()); + VECZ_FAIL_IF(!LHSVecTy); + const unsigned SrcWidth = LHSVecTy->getNumElements(); + const unsigned DstWidth = VecTy->getNumElements(); + + // Determine which lanes we need from both vector operands. + PacketMask LHSMask; + PacketMask RHSMask; + for (unsigned i = 0; i < DstWidth; i++) { + if (!PM.isEnabled(i)) { + continue; + } + int MaskLane = Shuffle->getMaskValue(i); + if (MaskLane >= static_cast(SrcWidth)) { + MaskLane -= static_cast(SrcWidth); + RHSMask.enable(static_cast(MaskLane)); + } else if (MaskLane >= 0) { + LHSMask.enable(static_cast(MaskLane)); + } + } + + // Scalarize each vector operand as needed. + const SimdPacket *LHSPacket = nullptr; + if (LHSMask.Value != 0) { + LHSPacket = scalarize(LHS, LHSMask); + VECZ_FAIL_IF(!LHSPacket); + } + const SimdPacket *RHSPacket = nullptr; + if (RHSMask.Value != 0) { + RHSPacket = scalarize(RHS, RHSMask); + VECZ_FAIL_IF(!RHSPacket); + } + + // Copy the scalarized values to the result packet. 
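+  // Illustrative example: for
+  //   shufflevector <4 x i32> %a, <4 x i32> %b, <i32 2, i32 5, i32 -1, i32 0>
+  // lane 0 takes %a lane 2, lane 1 takes %b lane 5 - 4 = 1, lane 2 is poison
+  // (a negative mask value denotes an undefined lane), and lane 3 takes
+  // %a lane 0.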
+ SimdPacket *P = getPacket(Shuffle, DstWidth); + for (unsigned i = 0; i < DstWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + Value *Extracted = nullptr; + int MaskLane = Shuffle->getMaskValue(i); + if (MaskLane < 0) { + Extracted = PoisonValue::get(VecTy->getElementType()); + } else if (MaskLane >= (int)SrcWidth) { + MaskLane -= (int)SrcWidth; + if (RHSPacket) { + Extracted = RHSPacket->at(MaskLane); + } + } else if (MaskLane >= 0) { + if (LHSPacket) { + Extracted = LHSPacket->at(MaskLane); + } + } + P->set(i, Extracted); + } + return P; +} + +SimdPacket *Scalarizer::scalarizeInsertElement(InsertElementInst *Insert, + PacketMask PM) { + Value *Vec = Insert->getOperand(0); + VECZ_FAIL_IF(!Vec); + Value *Ele = Insert->getOperand(1); + assert(Ele && "Could not get operand 1 of Insert"); + Value *Index = Insert->getOperand(2); + assert(Index && "Could not get operand 2 of Insert"); + const ConstantInt *CIndex = dyn_cast(Index); + const auto *VecTy = cast(Vec->getType()); + const unsigned IndexInt = CIndex ? CIndex->getZExtValue() : 0; + const unsigned SimdWidth = VecTy->getNumElements(); + + SimdPacket *P = getPacket(Insert, SimdWidth); + + // Scalarize the vector operand + PacketMask OpMask; + OpMask.enableAll(SimdWidth); + // If we have a constant mask, we can skip the lane we are not going to use + if (CIndex) { + OpMask.disable(IndexInt); + } + const SimdPacket *VecP = scalarize(Vec, OpMask); + VECZ_FAIL_IF(!VecP); + + // For each lane, we need to select either the original vector element (from + // VecP) or the new value Ele. The selection is done based on the Index. + IRBuilder<> B(Insert); + for (unsigned lane = 0; lane < SimdWidth; ++lane) { + if (!PM.isEnabled(lane) || P->at(lane)) { + continue; + } + Value *LaneValue = nullptr; + if (CIndex) { + // If the Index is a Constant, then we can do the selection at compile + // time + LaneValue = (IndexInt == lane) ? Ele : VecP->at(lane); + } else { + // If the Index is a runtime value, then we have to emit select + // instructions to do selection at runtime + Constant *LaneC = ConstantInt::get(Index->getType(), lane); + LaneValue = + B.CreateSelect(B.CreateICmpEQ(Index, LaneC), Ele, VecP->at(lane)); + } + P->set(lane, LaneValue); + } + + return P; +} + +SimdPacket *Scalarizer::scalarizeGEP(GetElementPtrInst *GEP, PacketMask PM) { + auto *const vecDataTy = dyn_cast(GEP->getType()); + VECZ_FAIL_IF(!vecDataTy); + const unsigned simdWidth = vecDataTy->getNumElements(); + + Value *const ptr = GEP->getPointerOperand(); + const SimdPacket *ptrPacket = nullptr; + if (ptr->getType()->isVectorTy()) { + ptrPacket = scalarize(ptr, PM); + VECZ_FAIL_IF(!ptrPacket); + } + + // Scalarize any vector GEP indices. 
+ SmallVector indexPackets; + for (unsigned i = 0, n = GEP->getNumIndices(); i < n; ++i) { + Value *const idx = GEP->getOperand(1 + i); + if (idx->getType()->isVectorTy()) { + SimdPacket *idxP = scalarize(idx, PM); + VECZ_FAIL_IF(!idxP); + indexPackets.push_back(idxP); + } else { + indexPackets.push_back(nullptr); + } + } + + IRBuilder<> B(GEP); + const bool inBounds = GEP->isInBounds(); + const auto name = GEP->getName(); + SimdPacket *const P = getPacket(GEP, simdWidth); + for (unsigned i = 0; i < simdWidth; i++) { + if (!PM.isEnabled(i) || P->at(i)) { + continue; + } + + // Get the GEP indices per lane, scalarized or otherwise + SmallVector scalarIndices; + unsigned indexN = 1U; + for (auto *idx : indexPackets) { + if (idx) { + scalarIndices.push_back(idx->at(i)); + } else { + scalarIndices.push_back(GEP->getOperand(indexN)); + } + ++indexN; + } + + auto *const scalarPointer = ptrPacket ? ptrPacket->at(i) : ptr; + Value *const newGEP = + inBounds ? B.CreateInBoundsGEP(GEP->getSourceElementType(), + scalarPointer, scalarIndices, name) + : B.CreateGEP(GEP->getSourceElementType(), scalarPointer, + scalarIndices, name); + + P->set(i, newGEP); + } + return P; +} + +SimdPacket *Scalarizer::scalarizePHI(PHINode *Phi, PacketMask PM) { + auto *PhiTy = cast(Phi->getType()); + const unsigned Width = PhiTy->getNumElements(); + const unsigned NumIncoming = Phi->getNumIncomingValues(); + SmallVector Incoming; + + SimdPacket *P = getPacket(Phi, Width); + IRBuilder<> B(Phi); + + SmallVector ActiveLanes; + + // Start by creating the Phi nodes. This is done before everything else + // because the IR might contain cycles which will cause the scalarization to + // loop back to this Phi node when scalarizing the incoming values. + for (unsigned lane = 0; lane < Width; ++lane) { + if (!PM.isEnabled(lane) || P->at(lane)) { + continue; + } + PHINode *SPhi = + B.CreatePHI(PhiTy->getElementType(), NumIncoming, Phi->getName()); + P->set(lane, SPhi); + ActiveLanes.push_back(lane); + } + + // Scalarize the incoming values + for (auto &In : Phi->incoming_values()) { + SimdPacket *SIn = scalarize(In, PM); + VECZ_FAIL_IF(!SIn); + Incoming.push_back(SIn); + } + + // Assign the scalarized incoming values to the scalarized Phi nodes + for (const unsigned lane : ActiveLanes) { + VECZ_ERROR_IF(!PM.isEnabled(lane), "Active lane should be enabled."); + PHINode *SPhi = cast(P->at(lane)); + for (unsigned i = 0; i < NumIncoming; ++i) { + SPhi->addIncoming(Incoming[i]->at(lane), Phi->getIncomingBlock(i)); + } + } + + return P; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp new file mode 100644 index 0000000000000..d73bdfb33df85 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/simplify_infinite_loop_pass.cpp @@ -0,0 +1,114 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include + +#include + +#include "debugging.h" +#include "transform/passes.h" + +using namespace llvm; + +PreservedAnalyses +vecz::SimplifyInfiniteLoopPass::run(Loop &L, LoopAnalysisManager &, + LoopStandardAnalysisResults &AR, + LPMUpdater &) { + bool modified = false; + + SmallVector loopExitBlocks; + L.getExitBlocks(loopExitBlocks); + + // If we have an infinite loop, create a virtual exit block that will target + // the unique exit block of the function. + if (loopExitBlocks.empty()) { + BasicBlock *latch = L.getLoopLatch(); + assert(latch && "Loop should have a unique latch."); + + Function *F = L.getHeader()->getParent(); + + // Get the return block of the function. + std::vector returnBlocks; + for (BasicBlock &BB : *F) { + if (isa(BB.getTerminator())) { + returnBlocks.push_back(&BB); + } + } + + if (returnBlocks.empty() || returnBlocks.size() > 1) { + assert(false && "Function should have only one exit."); + return PreservedAnalyses::all(); + } + + // The target of the virtual exit block of the infinite loop. + BasicBlock *target = returnBlocks[0]; + + // Replace the terminator of the latch with a fake conditional branch that + // will actually always target the header to maintain the semantic of the + // program. + latch->getTerminator()->eraseFromParent(); + AR.DT.deleteEdge(latch, L.getHeader()); + BasicBlock *virtualExit = + BasicBlock::Create(F->getContext(), L.getName() + ".virtual_exit", F); + AR.DT.addNewBlock(virtualExit, latch); + BranchInst::Create(L.getHeader(), virtualExit, + ConstantInt::getTrue(F->getContext()), latch); + AR.DT.insertEdge(latch, L.getHeader()); + AR.DT.insertEdge(latch, virtualExit); + BranchInst::Create(target, virtualExit); + AR.DT.insertEdge(virtualExit, target); + + assert(AR.DT.verify() && + "SimplifyInfiniteLoopPass: Dominator Tree failed verification"); + + // Update the phi nodes in the return block because we added a new + // predecessor to it. + for (Instruction &I : *target) { + if (auto *PHI = dyn_cast(&I)) { + PHI->addIncoming(PoisonValue::get(PHI->getType()), virtualExit); + } + } + + modified = true; + } else if (loopExitBlocks.size() == 1) { + // Canonicalize any other infinite loops so that the loop header is the + // true condition successor. + auto *const latch = L.getLoopLatch(); + auto *const header = L.getHeader(); + auto *const T = latch->getTerminator(); + if (auto *const branch = dyn_cast(T)) { + if (branch->isConditional()) { + if (auto *const cond = dyn_cast(branch->getCondition())) { + if (branch->getSuccessor(1) == header) { + modified = true; + auto &ctx = latch->getParent()->getContext(); + branch->setCondition(cond->isOneValue() + ? 
ConstantInt::getFalse(ctx) + : ConstantInt::getTrue(ctx)); + branch->swapSuccessors(); + } + } + } + } + } + + if (!modified) { + return PreservedAnalyses::all(); + } + + return getLoopPassPreservedAnalyses(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp new file mode 100644 index 0000000000000..4b09013f07756 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/squash_small_vectors_pass.cpp @@ -0,0 +1,277 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include + +#include "analysis/stride_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "debugging.h" +#include "transform/packetization_helpers.h" +#include "transform/passes.h" + +#define DEBUG_TYPE "vecz" + +using namespace llvm; +using namespace vecz; + +/// @brief replace loads of vectors of small vector loads and stores with scalar +/// loads and stores, where the entire vector fits into a legal integer. +/// +/// The rationale here is that if we end up generating a scatter/gather, or +/// interleaved memop, it would be more efficient with the wider type than with +/// the vector of the narrower type. Although it's not trivial to know in +/// advance if we will get a scatter/gather or interleaved or contiguous load, +/// so we just do all of them and not worry too much about doing it when we +/// didn't really need to. +/// +/// Be careful not to run Instruction Combine Pass between this pass and +/// packetization, because it is likely to undo it. +PreservedAnalyses SquashSmallVectorsPass::run(Function &F, + FunctionAnalysisManager &AM) { + bool changed = false; + + const auto &UVR = AM.getResult(F); + const auto &SAR = AM.getResult(F); + auto &DL = F.getParent()->getDataLayout(); + auto &context = F.getContext(); + + // Keep a cache of the bitcasts so we don't create multiple bitcasts for the + // same value in each BasicBlock. + DenseMap squashCasts; + auto getSquashed = [&](Value *vector, Type *intTy, + IRBuilder<> &B) -> Value * { + auto *&bitCast = squashCasts[vector]; + Value *element = bitCast; + if (!element) { + if (auto *const bcast = dyn_cast(vector)) { + // "See through" existing bitcasts. + element = bcast->getOperand(0); + } else { + element = vector; + } + + if (element->getType() != intTy) { + // Note we have to freeze the vector value first, because individual + // elements can be `poison`, which would result in the entire value + // becoming `poison`, which is not a valid transform (it is not valid to + // increase the amount of `poison` in the IR). 
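+        // e.g. (illustrative IR):
+        //   %f = freeze <4 x i8> %v
+        //   %v.squash = bitcast <4 x i8> %f to i32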
+ element = B.CreateBitCast(B.CreateFreeze(element), intTy, + Twine(vector->getName(), ".squash")); + bitCast = dyn_cast(element); + } + } + return element; + }; + + SmallVector toErase; + for (auto &BB : F) { + for (auto &I : BB) { + if (auto *load = dyn_cast(&I)) { + if (!UVR.isVarying(load)) { + continue; + } + + auto *const ty = load->getType(); + auto *const scalarTy = ty->getScalarType(); + const unsigned numBits = ty->getPrimitiveSizeInBits(); + if (isPowerOf2_32(numBits) && scalarTy != ty && + DL.fitsInLegalInteger(numBits)) { + const auto align = load->getAlign(); + auto *const intTy = IntegerType::get(context, numBits); + if (DL.getABITypeAlign(intTy) > align) { + // The alignment of this type is too strict to convert + continue; + } + + auto *const ptr = load->getPointerOperand(); + const auto *const info = SAR.getInfo(ptr); + if (info && info->hasStride() && + info->getConstantMemoryStride(ty, &DL) == 1) { + // No need to perform this transform on contiguous loads + continue; + } + + IRBuilder<> B(load); + const auto name = load->getName(); + auto *newLoad = cast( + B.CreateLoad(intTy, ptr, Twine(name, ".squashed"))); + newLoad->setAlignment(align); + newLoad->copyMetadata(*load); + + auto *const newVec = + B.CreateBitCast(newLoad, ty, Twine(name, ".unsquash")); + + load->replaceAllUsesWith(newVec); + toErase.push_back(load); + changed = true; + } + } else if (auto *store = dyn_cast(&I)) { + if (!UVR.isVarying(store)) { + continue; + } + + auto *const data = store->getValueOperand(); + auto *const ty = data->getType(); + auto *const scalarTy = ty->getScalarType(); + const unsigned numBits = ty->getPrimitiveSizeInBits(); + if (isPowerOf2_32(numBits) && scalarTy != ty && + DL.fitsInLegalInteger(numBits)) { + const auto align = store->getAlign(); + auto *const intTy = IntegerType::get(context, numBits); + if (DL.getABITypeAlign(intTy) > align) { + // The alignment of this type is too strict to convert + continue; + } + + auto *const ptr = store->getPointerOperand(); + const auto *const info = SAR.getInfo(ptr); + if (info && info->hasStride() && + info->getConstantMemoryStride(ty, &DL) == 1) { + // No need to perform this transform on contiguous stores + continue; + } + + IRBuilder<> B(store); + auto *const newData = getSquashed(data, intTy, B); + auto *newStore = cast(B.CreateStore(newData, ptr)); + newStore->setAlignment(align); + newStore->copyMetadata(*store); + + toErase.push_back(store); + changed = true; + } + } else if (auto *zext = dyn_cast(&I)) { + if (!UVR.isVarying(zext)) { + continue; + } + // A zero-extend of an extract element can be squashed, if the source + // vector size is the same as the extended integer size. That is (for + // little-endian systems): + // + // zext i32(extract <4 x i8> data, i32 3) + // + // becomes: + // + // and(lshr(bitcast i32 data), i32 24), 0xFF) + // + // this avoids creating shufflevectors during packetization. + // + // We limit this optimization to vectors no larger than 64 bits in + // size. This is primarily because this optimization focuses on 'small' + // vectors but also, because LLVM's constants are limited to 64-bit + // integers, the masking logic would need to be done with extra + // instructions. 
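+        // For instance: a <16 x i8> source is 128 bits, which exceeds the
+        // 64-bit limit above, so such a zext is left alone.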
+ auto *const srcOp = zext->getOperand(0); + if (auto *const extract = dyn_cast(srcOp)) { + auto *const vector = extract->getVectorOperand(); + auto *const indexOp = extract->getIndexOperand(); + auto *const intTy = zext->getType(); + auto *const vecTy = vector->getType(); + if (vecTy->getPrimitiveSizeInBits() == + intTy->getPrimitiveSizeInBits() && + zext->getSrcTy()->getPrimitiveSizeInBits() <= 32 && + intTy->getScalarSizeInBits() <= 64 && isa(indexOp)) { + IRBuilder<> B(zext); + Value *element = getSquashed(vector, intTy, B); + + const auto bits = zext->getSrcTy()->getScalarSizeInBits(); + const auto scaled = + cast(indexOp)->getZExtValue() * bits; + + // Note on Little Endian systems, element 0 occupies the least + // significant bits of the vector. On Big Endian systems it occupies + // the most significant bits. Thus, we shift by "maximum element + // number minus current element number" times by "number of bits + // per element". + const auto shift = + DL.isBigEndian() + ? intTy->getPrimitiveSizeInBits() - bits - scaled + : scaled; + + if (shift != 0) { + element = + B.CreateLShr(element, ConstantInt::get(intTy, shift), + Twine(extract->getName(), ".squashExtract")); + } + element = B.CreateAnd( + element, + ConstantInt::get(intTy, maskTrailingOnes(bits)), + Twine(zext->getName(), ".squashZExt")); + + zext->replaceAllUsesWith(element); + toErase.push_back(zext); + changed = true; + } + } + } else if (auto *sext = dyn_cast(&I)) { + if (!UVR.isVarying(sext)) { + continue; + } + // We can squash sign extends in-place as well. + // We do this by shifting the required element into most-significant + // position, and then arithmetic-shifting it back down to the least- + // significant position. + auto *const srcOp = sext->getOperand(0); + if (auto *const extract = dyn_cast(srcOp)) { + auto *const vector = extract->getVectorOperand(); + auto *const indexOp = extract->getIndexOperand(); + auto *const intTy = sext->getType(); + auto *const vecTy = vector->getType(); + if (vecTy->getPrimitiveSizeInBits() == + intTy->getPrimitiveSizeInBits() && + isa(indexOp)) { + IRBuilder<> B(sext); + Value *element = getSquashed(vector, intTy, B); + + const auto bits = sext->getSrcTy()->getScalarSizeInBits(); + const auto shiftr = intTy->getPrimitiveSizeInBits() - bits; + const auto scaled = + cast(indexOp)->getZExtValue() * bits; + const auto shiftl = DL.isBigEndian() ? 
scaled : shiftr - scaled;
+
+            if (shiftl != 0) {
+              element =
+                  B.CreateShl(element, ConstantInt::get(intTy, shiftl),
+                              Twine(extract->getName(), ".squashExtract"));
+            }
+            element = B.CreateAShr(element, ConstantInt::get(intTy, shiftr),
+                                   Twine(extract->getName(), ".squashSExt"));
+
+            sext->replaceAllUsesWith(element);
+            toErase.push_back(sext);
+            changed = true;
+          }
+        }
+      }
+    }
+
+    // Only re-use casts within a basic block.
+    squashCasts.clear();
+  }
+
+  for (auto *I : toErase) {
+    I->eraseFromParent();
+  }
+
+  auto preserved = PreservedAnalyses::all();
+  if (changed) {
+    // The IR changed, so the stride and uniformity results are stale.
+    preserved.abandon<UniformValueAnalysis>();
+    preserved.abandon<StrideAnalysis>();
+  }
+  return preserved;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
new file mode 100644
index 0000000000000..b4ceb56dc2cd3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/ternary_transform_pass.cpp
@@ -0,0 +1,239 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "transform/ternary_transform_pass.h"
+
+#include
+#include
+#include
+#include
+
+#include "analysis/stride_analysis.h"
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "ir_cleanup.h"
+#include "memory_operations.h"
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+/// @brief Determine whether the select can and should be transformed. This is
+/// the case when the select has at most one GEP user, that GEP is only
+/// followed by load/store memory ops, and the GEP has no other users.
+/// Additionally, we reject various cases where the transform would not result
+/// in better code.
+bool shouldTransform(SelectInst *Select, const StrideAnalysisResult &SAR) {
+  // The transform only applies to pointer selects.
+  if (!Select->getType()->isPointerTy()) {
+    return false;
+  }
+
+  // There is absolutely no need to transform a uniform select.
+  if (!SAR.UVR.isVarying(Select)) {
+    return false;
+  }
+
+  {
+    // If the select itself is a strided pointer, we don't gain anything by
+    // transforming it into a pair of masked memops.
+    const auto *info = SAR.getInfo(Select);
+    if (info && info->hasStride()) {
+      return false;
+    }
+  }
+
+  // Validate Select operands
+  Value *VecTrue = Select->getOperand(1);
+  Value *VecFalse = Select->getOperand(2);
+
+  assert(VecTrue && VecFalse);
+
+  // If both pointers are uniform, it's worth doing the transform, since we get
+  // only scalar Mask Varying memops, instead of vector memops.
+  if (SAR.UVR.isVarying(VecTrue) || SAR.UVR.isVarying(VecFalse)) {
+    // Both pointers must be either strided or uniform (i.e. not divergent).
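+    // e.g. "cond ? &a[gid] : &b[gid]" with a linear gid is acceptable, but a
+    // pointer loaded per work-item may diverge arbitrarily and is rejected.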
+    const auto *infoT = SAR.getInfo(VecTrue);
+    const auto *infoF = SAR.getInfo(VecFalse);
+    if (!infoT || !infoF || infoT->mayDiverge() || infoF->mayDiverge()) {
+      return false;
+    }
+  }
+
+  // Validate Select users
+  GetElementPtrInst *TheGEP = nullptr;
+  SmallVector<Value *> SelectsUsers;
+  for (User *U : Select->users()) {
+    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+      // There can be at most one GEP
+      if (TheGEP) {
+        return false;
+      }
+      TheGEP = GEP;
+      SelectsUsers.push_back(GEP);
+    } else {
+      return false;
+    }
+  }
+
+  // Validate GEP users
+  while (!SelectsUsers.empty()) {
+    VECZ_FAIL_IF(!isa<GetElementPtrInst>(SelectsUsers.back()));
+    GetElementPtrInst *GEP =
+        cast<GetElementPtrInst>(SelectsUsers.pop_back_val());
+
+    // Validate the GEP indices
+    for (Value *idx : GEP->indices()) {
+      const auto *info = SAR.getInfo(idx);
+      if (!info || info->mayDiverge()) {
+        return false;
+      }
+    }
+    // We only transform selects used by GEPs that are exclusively used by
+    // scalar loads and stores. Performing this transform on vectors was
+    // historically banned due to internal limitations, but these days we
+    // *should* be able to. It's just that we don't know whether it's
+    // beneficial.
+    for (User *U : GEP->users()) {
+      if (auto *const LI = dyn_cast<LoadInst>(U)) {
+        if (LI->getType()->isVectorTy()) {
+          return false;
+        }
+      } else if (auto *const SI = dyn_cast<StoreInst>(U)) {
+        if (SI->getValueOperand()->getType()->isVectorTy()) {
+          return false;
+        }
+      } else {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+/// @brief Try to transform the select: remove the GEP & memory op and
+/// replace them with transformed GEPs and masked memory ops.
+void Transform(SelectInst *Select, VectorizationContext &Ctx) {
+  SmallVector<Instruction *> ToDelete;
+
+  auto transformSelect = [&](GetElementPtrInst *GEP, Instruction *Memop,
+                             Value *StoredValue, ArrayRef<Value *> Indices) {
+    // Non-obviously, we need to insert the new instructions at the GEP. The
+    // GEP is a user of the select, so the select is guaranteed to dominate
+    // the GEP; inserting at the GEP also ensures that the GEP's indices
+    // dominate the new instructions that use them.
+    IRBuilder<> B(GEP);
+
+    Value *Condition = Select->getCondition();
+    Value *InvCondition = B.CreateXor(Condition, 1);
+    Value *True = Select->getTrueValue();
+    Value *False = Select->getFalseValue();
+    Value *GepTrue = B.CreateGEP(GEP->getSourceElementType(), True, Indices);
+    Value *GepFalse = B.CreateGEP(GEP->getSourceElementType(), False, Indices);
+    auto MaskedOp = MemOp::get(Memop);
+    assert(MaskedOp);
+    const MemOpDesc Mem = MaskedOp->getDesc();
+
+    // We should have filtered out all vector memory operations earlier.
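+    // The rewrite below produces, roughly:
+    //   %t = masked.load(%gep_true,  mask = %cond)
+    //   %f = masked.load(%gep_false, mask = !%cond)
+    //   %r = select i1 %cond, %t, %f
+    // (and the analogous pair of masked stores for a store).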
+ assert(!Mem.getDataType()->isVectorTy()); + + auto Alignment = Mem.getAlignment(); + if (isa(Memop)) { + // Transform load + auto *LoadTrue = + createMaskedLoad(Ctx, Mem.getDataType(), GepTrue, Condition, + /*VL*/ nullptr, Alignment); + LoadTrue->insertBefore(Memop->getIterator()); + auto *LoadFalse = + createMaskedLoad(Ctx, Mem.getDataType(), GepFalse, InvCondition, + /*VL*/ nullptr, Alignment); + LoadFalse->insertBefore(Memop->getIterator()); + B.SetInsertPoint(Memop); + Value *LoadResult = B.CreateSelect(Condition, LoadTrue, LoadFalse); + + // Replace all uses with new value + Memop->replaceAllUsesWith(LoadResult); + } else if (isa(Memop)) { + // Transform store + createMaskedStore(Ctx, StoredValue, GepTrue, Condition, /*VL*/ nullptr, + Alignment) + ->insertBefore(Memop->getIterator()); + createMaskedStore(Ctx, StoredValue, GepFalse, InvCondition, + /*VL*/ nullptr, Alignment) + ->insertBefore(Memop->getIterator()); + } + }; + + for (User *U : Select->users()) { + if (GetElementPtrInst *GEP = dyn_cast(U)) { + ToDelete.push_back(GEP); + + const SmallVector Indices(GEP->idx_begin(), GEP->idx_end()); + + for (User *G : GEP->users()) { + if (LoadInst *Load = dyn_cast(G)) { + ToDelete.push_back(Load); + transformSelect(GEP, Load, nullptr, Indices); + } else if (StoreInst *Store = dyn_cast(G)) { + ToDelete.push_back(Store); + transformSelect(GEP, Store, Store->getValueOperand(), Indices); + } + } + } + } + + // Clean up instructions bottom-up (users first). + while (!ToDelete.empty()) { + Instruction *I = ToDelete.pop_back_val(); + if (I->use_empty()) { + IRCleanup::deleteInstructionNow(I); + } + } + + IRCleanup::deleteInstructionNow(Select); +} +} // namespace + +PreservedAnalyses TernaryTransformPass::run(llvm::Function &F, + llvm::FunctionAnalysisManager &AM) { + const auto &SAR = AM.getResult(F); + + // Find selects that can be transformed + SmallVector Selects; + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + if (SelectInst *Select = dyn_cast(&I)) { + if (shouldTransform(Select, SAR)) { + Selects.push_back(Select); + } + } + } + } + + if (Selects.empty()) { + return PreservedAnalyses::all(); + } + + auto &Ctx = AM.getResult(F).getContext(); + + // Transform them. + for (SelectInst *Select : Selects) { + Transform(Select, Ctx); + } + + return PreservedAnalyses::none(); +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp new file mode 100644 index 0000000000000..753ec2176b38f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/transform/uniform_reassociation_pass.cpp @@ -0,0 +1,358 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+#include
+
+#include "analysis/uniform_value_analysis.h"
+#include "analysis/vectorization_unit_analysis.h"
+#include "debugging.h"
+#include "transform/passes.h"
+
+#define DEBUG_TYPE "vecz"
+
+// WHAT THIS DOES
+//
+// Where we have some expression involving binary operators over uniform and
+// varying values, it can sometimes be advantageous to re-arrange the terms
+// to reduce the vectorization overhead. For example, we might have:
+//
+//   (Varying + Uniform) + Uniform
+//
+// The above expression requires TWO vector broadcasts of the uniform values,
+// and TWO vector additions. However, we can re-associate the operators to
+// get:
+//
+//   Varying + (Uniform + Uniform)
+//
+// In this new form, we only need a scalar addition and a single broadcast,
+// followed by a single vector addition.
+//
+// We also make the following transformations:
+//
+//   (Varying + Uniform) + Varying  ->  (Varying + Varying) + Uniform
+//   Varying + (Varying + Uniform)  ->  (Varying + Varying) + Uniform
+//
+// Although these transformations don't reduce the number of vector
+// instructions, they may reduce the vector register pressure somewhat. But
+// more importantly, they may enable further transforms on the CFG.
+//
+// A common pattern is a conditional statement like this:
+//
+//   if (uniform_condition && varying_condition) { ... }
+//
+// Control flow conversion quite often replaces the && with an & in order to
+// reduce the number of branches/basic blocks. In this case, however, that is
+// counter-productive for us, since we wish to retain the uniform branch and
+// linearize the varying one. This pass also splits up such branch conditions.
+//
+// POTENTIAL FURTHER WORK
+//
+// Currently, this pass only works on expressions involving a single kind of
+// associative and commutative operator. However, similar transformations
+// are possible with subtracts and mixtures of subtracts and additions.
+
+using namespace llvm;
+
+namespace {
+
+/// @brief Goes through all the PHI nodes in BB and duplicates the incoming
+/// values from the existing incoming block "original" to the new incoming
+/// block "extra".
+void updatePHIs(BasicBlock &BB, BasicBlock *original, BasicBlock *extra) {
+  for (auto &I : BB) {
+    auto *const PHI = dyn_cast<PHINode>(&I);
+    if (!PHI) {
+      break;
+    }
+    PHI->addIncoming(PHI->getIncomingValueForBlock(original), extra);
+  }
+}
+
+} // namespace
+
+namespace vecz {
+class Reassociator {
+public:
+  Reassociator() {}
+
+  /// @brief perform the Branch Split transformation
+  ///
+  /// @param[in] F Function to transform.
+  /// @param[in] AM FunctionAnalysisManager providing analyses.
+  /// @returns true iff any branches were split
+  bool run(llvm::Function &F, llvm::FunctionAnalysisManager &AM);
+
+private:
+  /// @brief classification of a binary operator according to whether its
+  /// operands are Uniform, Varying, both (Varying Op Uniform), or non-
+  /// canonically both (i.e. Uniform Op Varying).
+  enum class OpForm { Uniform, Varying, Mixed, NonCanonical };
+
+  /// @brief tries to transform a Binary Operator into a canonical form, such
+  /// that if only one operand is Uniform, it is the second operand.
+  ///
+  /// @param[in] Op the Binary Operator to transform
+  /// @returns the form of the canonicalized operator
+  OpForm canonicalizeBinOp(llvm::BinaryOperator &Op);
+
+  /// @brief tries to rearrange a binary operator expression to reduce vector
+  /// broadcasts, or to facilitate branch splitting.
+ /// + /// @param[in] Op the Binary Operator to transform + /// @returns true iff the expression was transformed + bool reassociate(llvm::BinaryOperator &Op); + + /// @brief canonicalizes a branch into a form that can be split + /// + /// @param[in] Branch the branch instruction to canonicalize + /// @returns true iff the branch condition is mixed (Varying Op Uniform) + /// and can be split into two separate branches. + bool canSplitBranch(llvm::BranchInst &Branch); + + UniformValueResult *UVR = nullptr; +}; + +Reassociator::OpForm Reassociator::canonicalizeBinOp(llvm::BinaryOperator &Op) { + if (!UVR->isVarying(&Op)) { + // Both operands are uniform + return OpForm::Uniform; + } + + if (!UVR->isVarying(Op.getOperand(0))) { + if (Op.isCommutative()) { + // canonicalize the operator so that operand 1 is uniform + Op.swapOperands(); + return OpForm::Mixed; + } + return OpForm::NonCanonical; + } + + if (!UVR->isVarying(Op.getOperand(1))) { + return OpForm::Mixed; + } + + // Both operands are varying + return OpForm::Varying; +} + +bool Reassociator::reassociate(llvm::BinaryOperator &Op) { + if (!Op.isAssociative() || !Op.isCommutative()) { + return false; + } + + const auto Opcode = Op.getOpcode(); + auto *const LHS = Op.getOperand(0); + auto *const RHS = Op.getOperand(1); + + auto *const A = dyn_cast(LHS); + if (A && A->getOpcode() == Opcode && A->hasNUses(1) && + canonicalizeBinOp(*A) == OpForm::Mixed) { + if (UVR->isVarying(RHS)) { + // Transform (Varying Op Uniform) Op Varying + // into (Varying Op Varying) Op Uniform + auto *const P = BinaryOperator::Create(Opcode, A->getOperand(0), RHS, + "varying.reassoc"); + P->insertBefore(Op.getIterator()); + UVR->setVarying(P); + Op.setOperand(0, P); + Op.setOperand(1, A->getOperand(1)); + UVR->remove(A); + A->eraseFromParent(); + return true; + } else { + // Transform (Varying Op Uniform) Op Uniform + // into Varying Op (Uniform Op Uniform) + auto *const P = BinaryOperator::Create(Opcode, A->getOperand(1), RHS, + "uniform.reassoc"); + P->insertBefore(Op.getIterator()); + Op.setOperand(0, A->getOperand(0)); + Op.setOperand(1, P); + UVR->remove(A); + A->eraseFromParent(); + return true; + } + } + + auto *const B = dyn_cast(RHS); + if (B && B->getOpcode() == Opcode && B->hasNUses(1) && + canonicalizeBinOp(*B) == OpForm::Mixed) { + // Transform Varying Op (Varying Op Uniform) + // into (Varying Op Varying) Op Uniform + auto *const P = BinaryOperator::Create(Opcode, B->getOperand(0), LHS, + "varying.reassoc"); + P->insertBefore(Op.getIterator()); + Op.setOperand(0, P); + Op.setOperand(1, B->getOperand(1)); + UVR->setVarying(P); + UVR->remove(B); + B->eraseFromParent(); + return true; + } + + return false; +} + +bool Reassociator::canSplitBranch(BranchInst &Branch) { + if (auto *Op = dyn_cast(Branch.getCondition())) { + auto Opcode = Op->getOpcode(); + if (Opcode == Instruction::Or || Opcode == Instruction::And) { + auto Form = canonicalizeBinOp(*Op); + if (Form == OpForm::Mixed) { + return true; + } + } + } + return false; +} + +bool Reassociator::run(llvm::Function &F, llvm::FunctionAnalysisManager &AM) { + auto *DT = &AM.getResult(F); + LoopInfo *LI = nullptr; + UVR = &AM.getResult(F); + + // Iterate over all instructions in dominance order, so that we always + // transform an expression before any of its uses. 
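+  // e.g. ((V + U1) + U2) is rewritten to V + (U1 + U2) when first visited;
+  // a dominated use such as (that + U3) then sees a canonical Mixed operand
+  // and can fold its own uniform term the same way in this single walk.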
+ SmallVector Blocks; + DT->getDescendants(&F.getEntryBlock(), Blocks); + + SmallVector SplitBranches; + for (auto *const BB : Blocks) { + for (auto Iit = BB->begin(); Iit != BB->end();) { + auto &I = *(Iit++); + if (auto *BinOp = dyn_cast(&I)) { + const auto form = canonicalizeBinOp(*BinOp); + if (form == OpForm::Varying || form == OpForm::Mixed) { + reassociate(*BinOp); + } + } else if (auto *Branch = dyn_cast(&I)) { + if (Branch->isConditional() && Branch->getNumSuccessors() == 2 && + canSplitBranch(*Branch)) { + // Lazily obtain the Loop Info + if (!LI) { + LI = &AM.getResult(F); + } + + if (auto *const L = LI->getLoopFor(BB)) { + if (L->isLoopExiting(BB)) { + // No need to do this transform on loop exits (?) + continue; + } + } + + SplitBranches.push_back(Branch); + } + } + } + } + + if (SplitBranches.empty()) { + return false; + } + + auto *PDT = &AM.getResult(F); + + do { + auto *Branch = SplitBranches.back(); + SplitBranches.pop_back(); + BasicBlock *BB = Branch->getParent(); + + BasicBlock *newBB = SplitBlock(BB, Branch, DT, LI); + newBB->setName(Twine(BB->getName(), ".cond_split")); + + // update the PostDominatorTree manually.. + PDT->addNewBlock(newBB, PDT->getNode(BB)->getIDom()->getBlock()); + + // Remove the unconditional branch created by splitting.. + BB->getTerminator()->eraseFromParent(); + + auto *Cond = cast(Branch->getCondition()); + auto *varyingCond = Cond->getOperand(0); + auto *uniformCond = Cond->getOperand(1); + + // Create a new Uniform branch condition to the Return block.. + // Note that a conditional branch's successors are returned in reverse + // order, relative to how they appear in the IR, with the "true" target + // last. However, "getSuccessor(n)" also indexes backwards, from the end. + auto Opcode = Cond->getOpcode(); + + if (Opcode == Instruction::Or) { + BasicBlock *SuccT = Branch->getSuccessor(0); + + BranchInst::Create(SuccT, newBB, uniformCond, BB); + Branch->setCondition(varyingCond); + + // If the branch target has PHI nodes, they need to get an extra target + updatePHIs(*SuccT, newBB, BB); + + // Update Dominator and PostDominator trees.. + DT->insertEdge(BB, SuccT); + PDT->insertEdge(BB, SuccT); + } else { + BasicBlock *SuccF = Branch->getSuccessor(1); + + BranchInst::Create(newBB, SuccF, uniformCond, BB); + Branch->setCondition(varyingCond); + + // If the branch target has PHI nodes, they need to get an extra target + updatePHIs(*SuccF, newBB, BB); + + // Update Dominator and PostDominator trees.. + DT->insertEdge(BB, SuccF); + PDT->insertEdge(BB, SuccF); + } + + // If we made the condition dead, we can delete it + if (Cond->use_empty()) { + Cond->eraseFromParent(); + } + + // The branch may still have a mixed condition after splitting.. 
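+    // e.g. br ((V & U1) & U2): the first split peels off the uniform U2,
+    // leaving br (V & U1), which is still Mixed and so is re-queued.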
+    if (canSplitBranch(*Branch)) {
+      SplitBranches.push_back(Branch);
+    }
+  } while (!SplitBranches.empty());
+
+  assert(DT->verify() && "Reassociator: Dominator Tree failed verification");
+
+  assert(PDT->verify() &&
+         "Reassociator: Post-Dominator Tree failed verification");
+
+  if (LI) {
+    // Unlike the dominator trees, LoopInfo::verify() returns void and asserts
+    // internally on failure.
+    LI->verify(*DT);
+  }
+
+  return true;
+}
+
+/// @brief reassociate uniform binary operators and split branches
+PreservedAnalyses UniformReassociationPass::run(Function &F,
+                                                FunctionAnalysisManager &AM) {
+  Reassociator reassociator;
+  const bool changed = reassociator.run(F, AM);
+  (void)changed;
+
+  // DT, PDT and LI are maintained manually during splitting, and the uniform
+  // value results are kept up to date as operators are reassociated.
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<PostDominatorTreeAnalysis>();
+  PA.preserve<LoopAnalysis>();
+  PA.preserve<UniformValueAnalysis>();
+  return PA;
+}
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
new file mode 100644
index 0000000000000..b22b7f1816f30
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info.cpp
@@ -0,0 +1,1340 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "debugging.h"
+#include "memory_operations.h"
+#include "transform/packetization_helpers.h"
+#include "vecz/vecz_target_info.h"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+/// @brief Applies @a EVL to @a Mask, clearing those bits in a position greater
+/// than @a EVL.
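+/// e.g. (illustrative) with EVL = 3 and a 4-wide mask, <1,1,1,1> becomes
+/// <1,1,1,0>: a step vector is compared ULT against the splatted EVL and
+/// ANDed into the mask.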
+Value *applyEVLToMask(IRBuilder<> &B, Value *EVL, Value *Mask) { + if (EVL) { + auto *const IndexVector = B.CreateStepVector(VectorType::get( + EVL->getType(), multi_llvm::getVectorElementCount(Mask->getType()))); + auto *const Splat = B.CreateVectorSplat( + multi_llvm::getVectorElementCount(Mask->getType()), EVL); + auto *const M = B.CreateICmpULT(IndexVector, Splat); + Mask = B.CreateLogicalAnd(Mask, M); + } + return Mask; +} + +bool isLegalMaskedLoad(const TargetTransformInfo &TTI, Type *Ty, + unsigned Alignment, unsigned AddrSpace) { + return multi_llvm::isLegalMaskedLoad(TTI, Ty, Align(Alignment), AddrSpace); +} + +bool isLegalMaskedStore(const TargetTransformInfo &TTI, Type *Ty, + unsigned Alignment, unsigned AddrSpace) { + return multi_llvm::isLegalMaskedStore(TTI, Ty, Align(Alignment), AddrSpace); +} + +bool isLegalMaskedGather(const TargetTransformInfo &TTI, Type *Ty, + unsigned Alignment, unsigned) { + return TTI.isLegalMaskedGather(Ty, Align(Alignment)); +} + +bool isLegalMaskedScatter(const TargetTransformInfo &TTI, Type *Ty, + unsigned Alignment, unsigned) { + return TTI.isLegalMaskedScatter(Ty, Align(Alignment)); +} +} // namespace + +// NOTE the TargetMachine is allowed to be null here; it isn't used in the +// implementation at present, but if it gets used in future it needs to be +// guarded. +TargetInfo::TargetInfo(TargetMachine *tm) : TM_(tm) {} + +Value *TargetInfo::createLoad(IRBuilder<> &B, Type *Ty, Value *Ptr, + Value *Stride, unsigned Alignment, + Value *EVL) const { + if (!Ptr || !Stride || !Ty->isVectorTy()) { + return nullptr; + } + + // Validate the pointer type. + PointerType *PtrTy = dyn_cast(Ptr->getType()); + if (!PtrTy) { + return nullptr; + } + Type *EleTy = Ty->getScalarType(); + + // Trivial case: contiguous load. + ConstantInt *CIntStride = dyn_cast(Stride); + if (CIntStride && CIntStride->getSExtValue() == 1) { + if (EVL) { + const Function *F = B.GetInsertBlock()->getParent(); + const auto Legality = + isVPLoadLegal(F, Ty, Alignment, PtrTy->getAddressSpace()); + if (!Legality.isVPLegal()) { + emitVeczRemarkMissed(F, "Could not create a VP load as the target " + "reported it would be illegal"); + VECZ_FAIL(); + } + auto *Mask = createAllTrueMask(B, multi_llvm::getVectorElementCount(Ty)); + const SmallVector Args = {Ptr, Mask, EVL}; + const SmallVector Tys = {Ty, Ptr->getType()}; + return B.CreateIntrinsic(llvm::Intrinsic::vp_load, Tys, Args); + } + return B.CreateAlignedLoad(Ty, Ptr, MaybeAlign(Alignment)); + } + + if (EVL) { + emitVeczRemarkMissed( + B.GetInsertBlock()->getParent(), Ptr, + "Could not create vector-length-predicated interleaved load"); + return nullptr; + } + + auto Elts = multi_llvm::getVectorElementCount(Ty); + if (Elts.isScalable()) { + emitVeczRemarkMissed(B.GetInsertBlock()->getParent(), Ptr, + "Could not create a scalable-vector interleaved load"); + VECZ_FAIL(); + } + const unsigned SimdWidth = Elts.getFixedValue(); + // Load individual values. + SmallVector Values; + Value *Index = B.getInt64(0); + for (unsigned i = 0; i < SimdWidth; i++) { + Value *GEP = B.CreateGEP(EleTy, Ptr, Index); + Values.push_back(B.CreateLoad(EleTy, GEP, false, "interleaved.load")); + Index = B.CreateAdd(Index, Stride); + } + + // Create a vector out of these values. 
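+  // i.e. a chain of insertelement instructions: lane i of the result is
+  // Values[i], starting from a poison vector.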
+ Value *Result = PoisonValue::get(Ty); + for (unsigned i = 0; i < SimdWidth; i++) { + Result = B.CreateInsertElement(Result, Values[i], B.getInt32(i)); + } + return Result; +} + +Value *TargetInfo::createStore(IRBuilder<> &B, Value *Data, Value *Ptr, + Value *Stride, unsigned Alignment, + Value *EVL) const { + if (!Ptr || !Data || !Stride) { + return nullptr; + } + + // Validate the pointer type. + PointerType *PtrTy = dyn_cast(Ptr->getType()); + if (!PtrTy) { + return nullptr; + } + Type *VecTy = Data->getType(); + Type *EleTy = VecTy->getScalarType(); + + // Trivial case: contiguous store. + ConstantInt *CIntStride = dyn_cast(Stride); + if (CIntStride && CIntStride->getSExtValue() == 1) { + if (EVL) { + const Function *F = B.GetInsertBlock()->getParent(); + const auto Legality = + isVPStoreLegal(F, VecTy, Alignment, PtrTy->getAddressSpace()); + if (!Legality.isVPLegal()) { + emitVeczRemarkMissed(F, "Could not create a VP store as the target " + "reported it would be illegal"); + VECZ_FAIL(); + } + auto *Mask = + createAllTrueMask(B, multi_llvm::getVectorElementCount(VecTy)); + const SmallVector Args = {Data, Ptr, Mask, EVL}; + const SmallVector Tys = {Data->getType(), + Ptr->getType()}; + return B.CreateIntrinsic(llvm::Intrinsic::vp_store, Tys, Args); + } + return B.CreateAlignedStore(Data, Ptr, MaybeAlign(Alignment)); + } + + if (EVL) { + emitVeczRemarkMissed( + B.GetInsertBlock()->getParent(), Ptr, + "Could not create vector-length-predicated interleaved store"); + return nullptr; + } + + auto Elts = multi_llvm::getVectorElementCount(VecTy); + if (Elts.isScalable()) { + emitVeczRemarkMissed( + B.GetInsertBlock()->getParent(), Ptr, + "Could not create a scalable-vector interleaved store"); + VECZ_FAIL(); + } + const unsigned SimdWidth = Elts.getFixedValue(); + // Extract values from the vector. + SmallVector Values; + for (unsigned i = 0; i < SimdWidth; i++) { + Values.push_back(B.CreateExtractElement(Data, B.getInt32(i))); + } + + // Store individual values. + Value *Ret = nullptr; + Value *Index = B.getInt64(0); + for (unsigned i = 0; i < SimdWidth; i++) { + Value *GEP = B.CreateGEP(EleTy, Ptr, Index); + Ret = B.CreateStore(Values[i], GEP); + cast(Ret)->setAlignment(MaybeAlign(Alignment).valueOrOne()); + + Index = B.CreateAdd(Index, Stride); + } + return Ret; +} + +Value *TargetInfo::createMaskedLoad(IRBuilder<> &B, Type *Ty, Value *Ptr, + Value *Mask, Value *EVL, + unsigned Alignment) const { + VECZ_FAIL_IF(!Ptr || !Mask); + PointerType *PtrTy = dyn_cast(Ptr->getType()); + VECZ_FAIL_IF(!PtrTy); + Type *EleTy = Ty->getScalarType(); + + // Validate the pointer and mask types. + auto *DataVecTy = dyn_cast(Ty); + auto *MaskVecTy = dyn_cast(Mask->getType()); + if (DataVecTy && MaskVecTy) { + VECZ_ERROR_IF(multi_llvm::getVectorElementCount(DataVecTy) != + multi_llvm::getVectorElementCount(MaskVecTy), + "The mask and the data need to have the same width"); + } + + // Use LLVM intrinsics for masked vector loads. 
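+  // Preference order: llvm.vp.load when an EVL is supplied and the target
+  // reports it legal, else llvm.masked.load with the EVL folded into the
+  // mask, else fail; scalar (non-vector) types use the CFG expansion below.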
+ if (Ty->isVectorTy()) { + const Function *F = B.GetInsertBlock()->getParent(); + const auto Legality = + isVPLoadLegal(F, Ty, Alignment, PtrTy->getAddressSpace()); + if (EVL && Legality.isVPLegal()) { + const SmallVector Args = {Ptr, Mask, EVL}; + const SmallVector Tys = {Ty, PtrTy}; + return B.CreateIntrinsic(llvm::Intrinsic::vp_load, Tys, Args); + } else if (Legality.isMaskLegal()) { + Mask = applyEVLToMask(B, EVL, Mask); + VECZ_FAIL_IF(!Mask); + return B.CreateMaskedLoad(Ty, Ptr, Align(Alignment), Mask); + } else { + emitVeczRemarkMissed(F, "Could not create a masked load as the target " + "reported it would be illegal"); + VECZ_FAIL(); + } + } + + const unsigned Width = 1; + + LLVMContext &Ctx = B.getContext(); + BasicBlock *Entry = B.GetInsertBlock(); + BasicBlock *Exit = nullptr; + Function *F = Entry->getParent(); + VECZ_FAIL_IF(!F || !Ptr || !Mask || EVL); + + // Create all the required blocks. + SmallVector TestBlocks; + SmallVector LoadBlocks; + TestBlocks.push_back(Entry); + LoadBlocks.push_back(BasicBlock::Create(Ctx, "masked_load", F)); + for (unsigned i = 1; i < Width; i++) { + TestBlocks.push_back(BasicBlock::Create(Ctx, "test_mask", F)); + LoadBlocks.push_back(BasicBlock::Create(Ctx, "masked_load", F)); + } + Exit = BasicBlock::Create(Ctx, "masked_load_exit", F); + + Constant *const DefaultEleData = PoisonValue::get(EleTy); + SmallVector LoadedLanes; + SmallVector LanePhis; + for (unsigned i = 0; i < Width; i++) { + BasicBlock *Next = ((i + 1) < Width) ? TestBlocks[i + 1] : Exit; + + // Extract the mask elements and branch. + B.SetInsertPoint(TestBlocks[i]); + if (i > 0) { + PHINode *LanePhi = B.CreatePHI(EleTy, 2, "result_lane"); + LanePhi->addIncoming(LoadedLanes[i - 1], LoadBlocks[i - 1]); + LanePhi->addIncoming(DefaultEleData, TestBlocks[i - 1]); + LanePhis.push_back(LanePhi); + } + + Value *MaskLane = + (Width == 1) ? Mask + : B.CreateExtractElement(Mask, B.getInt32(i), "mask_lane"); + B.CreateCondBr(MaskLane, LoadBlocks[i], Next); + + // Load the element and branch. + B.SetInsertPoint(LoadBlocks[i]); + Value *LanePtr = + i > 0 ? B.CreateGEP(EleTy, Ptr, B.getInt32(i), "lane_ptr") : Ptr; + LoadInst *Load = B.CreateLoad(EleTy, LanePtr, false, "masked_load"); + Load->setAlignment(MaybeAlign(Alignment).valueOrOne()); + LoadedLanes.push_back(Load); + B.CreateBr(Next); + } + + // Aggregate the loaded lanes. + B.SetInsertPoint(Exit); + PHINode *LastLanePhi = B.CreatePHI(EleTy, 2, "result_lane"); + LastLanePhi->addIncoming(LoadedLanes[Width - 1], LoadBlocks[Width - 1]); + LastLanePhi->addIncoming(DefaultEleData, TestBlocks[Width - 1]); + LanePhis.push_back(LastLanePhi); + + Value *Result = nullptr; + if (Width > 1) { + Result = PoisonValue::get(Ty); + for (unsigned i = 0; i < Width; i++) { + Result = B.CreateInsertElement(Result, LanePhis[i], B.getInt32(i)); + } + } else { + Result = LanePhis[Width - 1]; + } + + return Result; +} + +Value *TargetInfo::createMaskedStore(IRBuilder<> &B, Value *Data, Value *Ptr, + Value *Mask, Value *EVL, + unsigned Alignment) const { + PointerType *PtrTy = dyn_cast(Ptr->getType()); + VECZ_FAIL_IF(!PtrTy); + Type *DataTy = Data->getType(); + Type *EleTy = DataTy->getScalarType(); + + auto *DataVecTy = dyn_cast(DataTy); + auto *MaskVecTy = dyn_cast(Mask->getType()); + if (DataVecTy && MaskVecTy) { + VECZ_ERROR_IF(multi_llvm::getVectorElementCount(DataVecTy) != + multi_llvm::getVectorElementCount(MaskVecTy), + "The mask and the data need to have the same width"); + } + + // Use LLVM intrinsics for masked vector Stores. 
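+  // Mirrors createMaskedLoad: prefer llvm.vp.store with an EVL, then
+  // llvm.masked.store with the EVL folded into the mask, else fail; scalar
+  // types use the branchy expansion below.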
+ if (DataTy->isVectorTy()) { + const Function *F = B.GetInsertBlock()->getParent(); + const auto Legality = + isVPStoreLegal(F, DataTy, Alignment, PtrTy->getAddressSpace()); + if (EVL && Legality.isVPLegal()) { + const SmallVector Args = {Data, Ptr, Mask, EVL}; + const SmallVector Tys = {Data->getType(), PtrTy}; + return B.CreateIntrinsic(llvm::Intrinsic::vp_store, Tys, Args); + } else if (Legality.isMaskLegal()) { + Mask = applyEVLToMask(B, EVL, Mask); + VECZ_FAIL_IF(!Mask); + return B.CreateMaskedStore(Data, Ptr, Align(Alignment), Mask); + } else { + emitVeczRemarkMissed(F, "Could not create a masked store as the target " + "reported it would be illegal"); + VECZ_FAIL(); + } + } + + const unsigned Width = 1; + + LLVMContext &Ctx = B.getContext(); + BasicBlock *Entry = B.GetInsertBlock(); + BasicBlock *Exit = nullptr; + StoreInst *FirstStore = nullptr; + Function *F = Entry->getParent(); + VECZ_FAIL_IF(!F || EVL); + + // Create all the required blocks. + SmallVector TestBlocks; + SmallVector StoreBlocks; + TestBlocks.push_back(Entry); + StoreBlocks.push_back(BasicBlock::Create(Ctx, "masked_store", F)); + for (unsigned i = 1; i < Width; i++) { + TestBlocks.push_back(BasicBlock::Create(Ctx, "test_mask", F)); + StoreBlocks.push_back(BasicBlock::Create(Ctx, "masked_store", F)); + } + Exit = BasicBlock::Create(Ctx, "masked_store_exit", F); + + for (unsigned i = 0; i < Width; i++) { + BasicBlock *Next = ((i + 1) < Width) ? TestBlocks[i + 1] : Exit; + + // Extract the mask elements and branch. + B.SetInsertPoint(TestBlocks[i]); + Value *MaskLane = + (Width == 1) ? Mask + : B.CreateExtractElement(Mask, B.getInt32(i), "mask_lane"); + B.CreateCondBr(MaskLane, StoreBlocks[i], Next); + + // Extract the data elements and store. + B.SetInsertPoint(StoreBlocks[i]); + Value *DataLane = + (Width == 1) ? 
Data + : B.CreateExtractElement(Data, B.getInt32(i), "data_lane"); + Value *LanePtr = Ptr; + if (i > 0) { + LanePtr = B.CreateGEP(EleTy, LanePtr, B.getInt32(i), "lane_ptr"); + } + StoreInst *Store = B.CreateStore(DataLane, LanePtr); + if (i == 0) { + FirstStore = Store; + } + Store->setAlignment(MaybeAlign(Alignment).valueOrOne()); + B.CreateBr(Next); + } + + B.SetInsertPoint(Exit); + return FirstStore; +} + +Value *TargetInfo::createInterleavedLoad(IRBuilder<> &B, Type *Ty, Value *Ptr, + Value *Stride, Value *EVL, + unsigned Alignment) const { + auto EC = multi_llvm::getVectorElementCount(Ty); + auto *const Mask = B.CreateVectorSplat(EC, B.getTrue()); + return createMaskedInterleavedLoad(B, Ty, Ptr, Mask, Stride, EVL, Alignment); +} + +Value *TargetInfo::createInterleavedStore(IRBuilder<> &B, Value *Data, + Value *Ptr, Value *Stride, Value *EVL, + unsigned Alignment) const { + auto EC = multi_llvm::getVectorElementCount(Data->getType()); + auto *const Mask = B.CreateVectorSplat(EC, B.getTrue()); + return createMaskedInterleavedStore(B, Data, Ptr, Mask, Stride, EVL, + Alignment); +} + +Value *TargetInfo::createMaskedInterleavedLoad(IRBuilder<> &B, Type *Ty, + Value *Ptr, Value *Mask, + Value *Stride, Value *EVL, + unsigned Alignment) const { + // We only support scalar pointer types + assert(!Ptr->getType()->isVectorTy() && "Unsupported interleaved load"); + + auto EC = multi_llvm::getVectorElementCount(Ty); + Value *BroadcastAddr = B.CreateVectorSplat(EC, Ptr, "BroadcastAddr"); + Value *StrideSplat = B.CreateVectorSplat(EC, Stride); + + Value *IndicesVector = + createIndexSequence(B, cast(StrideSplat->getType())); + VECZ_FAIL_IF(!IndicesVector); + IndicesVector = B.CreateMul(StrideSplat, IndicesVector); + + Value *Address = + B.CreateGEP(Ty->getScalarType(), BroadcastAddr, IndicesVector); + + return createMaskedGatherLoad(B, Ty, Address, Mask, EVL, Alignment); +} + +Value *TargetInfo::createMaskedInterleavedStore(IRBuilder<> &B, Value *Data, + Value *Ptr, Value *Mask, + Value *Stride, Value *EVL, + unsigned Alignment) const { + // We only support scalar pointer types + assert(!Ptr->getType()->isVectorTy() && "Unsupported interleaved store"); + auto EC = multi_llvm::getVectorElementCount(Data->getType()); + Value *BroadcastAddr = B.CreateVectorSplat(EC, Ptr, "BroadcastAddr"); + Value *StrideSplat = B.CreateVectorSplat(EC, Stride); + + Value *IndicesVector = + createIndexSequence(B, cast(StrideSplat->getType())); + VECZ_FAIL_IF(!IndicesVector); + IndicesVector = B.CreateMul(StrideSplat, IndicesVector); + + Value *Address = B.CreateGEP(Data->getType()->getScalarType(), BroadcastAddr, + IndicesVector); + + return createMaskedScatterStore(B, Data, Address, Mask, EVL, Alignment); +} + +Value *TargetInfo::createGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr, + Value *EVL, unsigned Alignment) const { + auto EC = multi_llvm::getVectorElementCount(Ty); + auto *const Mask = B.CreateVectorSplat(EC, B.getTrue()); + return createMaskedGatherLoad(B, Ty, Ptr, Mask, EVL, Alignment); +} + +Value *TargetInfo::createScatterStore(IRBuilder<> &B, Value *Data, Value *Ptr, + Value *EVL, unsigned Alignment) const { + auto EC = multi_llvm::getVectorElementCount(Data->getType()); + auto *const Mask = B.CreateVectorSplat(EC, B.getTrue()); + return createMaskedScatterStore(B, Data, Ptr, Mask, EVL, Alignment); +} + +Value *TargetInfo::createMaskedGatherLoad(IRBuilder<> &B, Type *Ty, Value *Ptr, + Value *Mask, Value *EVL, + unsigned Alignment) const { + LLVMContext &Ctx = B.getContext(); + BasicBlock *Entry = 
B.GetInsertBlock(); + BasicBlock *Exit = nullptr; + Function *F = Entry->getParent(); + VECZ_FAIL_IF(!F || !Ptr || !Mask); + + auto *VecPtrTy = dyn_cast(Ptr->getType()); + VECZ_FAIL_IF(!VecPtrTy); + PointerType *PtrTy = dyn_cast(VecPtrTy->getElementType()); + VECZ_FAIL_IF(!PtrTy); + Type *EleTy = Ty->getScalarType(); + Constant *DefaultEleData = PoisonValue::get(EleTy); + + if (Ty->isVectorTy()) { + const auto Legality = + isVPGatherLegal(F, Ty, Alignment, PtrTy->getAddressSpace()); + if (EVL && Legality.isVPLegal()) { + const SmallVector Args = {Ptr, Mask, EVL}; + const SmallVector Tys = {Ty, VecPtrTy}; + return B.CreateIntrinsic(llvm::Intrinsic::vp_gather, Tys, Args); + } else if (Legality.isMaskLegal()) { + Function *MaskedGather = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::masked_gather, {Ty, VecPtrTy}); + + if (MaskedGather) { + Mask = applyEVLToMask(B, EVL, Mask); + VECZ_FAIL_IF(!Mask); + // Create the call to the function + Value *Args[] = {Ptr, B.getInt32(Alignment), Mask, + PoisonValue::get(Ty)}; + CallInst *CI = B.CreateCall(MaskedGather, Args); + if (CI) { + CI->setCallingConv(MaskedGather->getCallingConv()); + CI->setAttributes(MaskedGather->getAttributes()); + return CI; + } + } + } else { + emitVeczRemarkMissed(F, "Could not create a masked gather as the target " + "reported it would be illegal"); + VECZ_FAIL(); + } + } + + VECZ_FAIL_IF(EVL); + auto VecWidth = multi_llvm::getVectorElementCount(Ty); + const unsigned Width = VecWidth.getFixedValue(); + + // Fallback scalar function generator + // Create all the required blocks. + SmallVector TestBlocks; + SmallVector LoadBlocks; + TestBlocks.push_back(Entry); + LoadBlocks.push_back(BasicBlock::Create(Ctx, "masked_load", F)); + for (unsigned i = 1; i < Width; i++) { + TestBlocks.push_back(BasicBlock::Create(Ctx, "test_mask", F)); + LoadBlocks.push_back(BasicBlock::Create(Ctx, "masked_load", F)); + } + Exit = BasicBlock::Create(Ctx, "masked_load_exit", F); + + SmallVector LoadedLanes; + SmallVector LanePhis; + for (unsigned i = 0; i < Width; i++) { + BasicBlock *Next = ((i + 1) < Width) ? TestBlocks[i + 1] : Exit; + + // Extract the mask elements and branch. + B.SetInsertPoint(TestBlocks[i]); + if (i > 0) { + PHINode *LanePhi = B.CreatePHI(EleTy, 2, "result_lane"); + LanePhi->addIncoming(LoadedLanes[i - 1], LoadBlocks[i - 1]); + LanePhi->addIncoming(DefaultEleData, TestBlocks[i - 1]); + LanePhis.push_back(LanePhi); + } + + Value *MaskLane = B.CreateExtractElement(Mask, B.getInt32(i), "mask_lane"); + B.CreateCondBr(MaskLane, LoadBlocks[i], Next); + + // Load the element and branch. + B.SetInsertPoint(LoadBlocks[i]); + Value *PtrLane = B.CreateExtractElement(Ptr, B.getInt32(i), "ptr_lane"); + LoadInst *Load = B.CreateLoad(EleTy, PtrLane, false, "masked_load"); + Load->setAlignment(MaybeAlign(Alignment).valueOrOne()); + LoadedLanes.push_back(Load); + B.CreateBr(Next); + } + + // Aggregate the loaded lanes. 
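+  // Each lane's PHI yields either the loaded element or poison (when the
+  // lane was masked off); the lanes are then reassembled via insertelement.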
+ B.SetInsertPoint(Exit); + PHINode *LastLanePhi = B.CreatePHI(EleTy, 2, "result_lane"); + LastLanePhi->addIncoming(LoadedLanes[Width - 1], LoadBlocks[Width - 1]); + LastLanePhi->addIncoming(DefaultEleData, TestBlocks[Width - 1]); + LanePhis.push_back(LastLanePhi); + Value *Result = PoisonValue::get(Ty); + for (unsigned i = 0; i < Width; i++) { + Result = B.CreateInsertElement(Result, LanePhis[i], B.getInt32(i)); + } + return Result; +} + +Value *TargetInfo::createMaskedScatterStore(IRBuilder<> &B, Value *Data, + Value *Ptr, Value *Mask, Value *EVL, + unsigned Alignment) const { + LLVMContext &Ctx = B.getContext(); + BasicBlock *Entry = B.GetInsertBlock(); + BasicBlock *Exit = nullptr; + StoreInst *FirstStore = nullptr; + Function *F = Entry->getParent(); + VECZ_FAIL_IF(!F || !Ptr || !Mask); + auto *DataTy = Data->getType(); + + if (DataTy->isVectorTy()) { + auto *VecPtrTy = dyn_cast(Ptr->getType()); + VECZ_FAIL_IF(!VecPtrTy); + auto *PtrTy = dyn_cast(VecPtrTy->getElementType()); + VECZ_FAIL_IF(!PtrTy); + const auto Legality = + isVPScatterLegal(F, DataTy, Alignment, PtrTy->getAddressSpace()); + if (EVL && Legality.isVPLegal()) { + const SmallVector Args = {Data, Ptr, Mask, EVL}; + const SmallVector Tys = {Data->getType(), VecPtrTy}; + return B.CreateIntrinsic(llvm::Intrinsic::vp_scatter, Tys, Args); + } else if (Legality.isMaskLegal()) { + Function *MaskedScatter = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::masked_scatter, {DataTy, VecPtrTy}); + + if (MaskedScatter) { + Mask = applyEVLToMask(B, EVL, Mask); + VECZ_FAIL_IF(!Mask); + // Create the call to the function + Value *Args[] = {Data, Ptr, B.getInt32(Alignment), Mask}; + CallInst *CI = B.CreateCall(MaskedScatter, Args); + if (CI) { + CI->setCallingConv(MaskedScatter->getCallingConv()); + CI->setAttributes(MaskedScatter->getAttributes()); + return CI; + } + } + } else { + emitVeczRemarkMissed(F, "Could not create a masked scatter as the target " + "reported it would be illegal"); + VECZ_FAIL(); + } + } + + VECZ_FAIL_IF(EVL); + auto VecWidth = multi_llvm::getVectorElementCount(DataTy); + const unsigned Width = VecWidth.getFixedValue(); + + // Fallback scalar function generator + // Create all the required blocks. + SmallVector TestBlocks; + SmallVector StoreBlocks; + TestBlocks.push_back(Entry); + StoreBlocks.push_back(BasicBlock::Create(Ctx, "masked_store", F)); + for (unsigned i = 1; i < Width; i++) { + TestBlocks.push_back(BasicBlock::Create(Ctx, "test_mask", F)); + StoreBlocks.push_back(BasicBlock::Create(Ctx, "masked_store", F)); + } + Exit = BasicBlock::Create(Ctx, "masked_store_exit", F); + + for (unsigned i = 0; i < Width; i++) { + BasicBlock *Next = ((i + 1) < Width) ? TestBlocks[i + 1] : Exit; + + // Extract the mask elements and branch. + B.SetInsertPoint(TestBlocks[i]); + Value *MaskLane = B.CreateExtractElement(Mask, B.getInt32(i), "mask_lane"); + B.CreateCondBr(MaskLane, StoreBlocks[i], Next); + + // Extract the data elements and store. 
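+    // Per lane this is, in effect: if (mask[i]) store data[i] to ptr[i];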
+ B.SetInsertPoint(StoreBlocks[i]); + Value *PtrLane = B.CreateExtractElement(Ptr, B.getInt32(i), "ptr_lane"); + Value *DataLane = B.CreateExtractElement(Data, B.getInt32(i), "data_lane"); + StoreInst *Store = B.CreateStore(DataLane, PtrLane); + if (i == 0) { + FirstStore = Store; + } + Store->setAlignment(MaybeAlign(Alignment).valueOrOne()); + B.CreateBr(Next); + } + + B.SetInsertPoint(Exit); + return FirstStore; +} + +Value *TargetInfo::createScalableExtractElement(IRBuilder<> &B, + VectorizationContext &Ctx, + Instruction *extract, + Type *narrowTy, Value *src, + Value *index, Value *VL) const { + (void)VL; + const auto *origSrc = extract->getOperand(0); + auto *eltTy = src->getType()->getScalarType(); + + auto *wideTy = src->getType(); + + auto it = B.GetInsertPoint(); + + // Insert alloca at the beginning of the function. + auto allocaIt = + B.GetInsertBlock()->getParent()->getEntryBlock().getFirstInsertionPt(); + B.SetInsertPoint(&*allocaIt); + auto *const alloc = B.CreateAlloca(wideTy, nullptr, "fixlen.alloc"); + + // Reset the insertion point to wherever we must insert instructions + B.SetInsertPoint(&*it); + + // Store the packetized vector to the allocation + B.CreateStore(src, alloc); + + const unsigned fixedVecElts = + multi_llvm::getVectorNumElements(origSrc->getType()); + + Instruction *load = nullptr; + if (!index->getType()->isVectorTy()) { + // If the index remains a scalar (is uniform) then we can use a strided load + // starting from the address '&alloc[index]', strided by the original vector + // width: &alloc[index], &alloc[index+N], &alloc[index+2N], ... + auto *const stride = getSizeInt(B, fixedVecElts); + auto alignment = MaybeAlign(eltTy->getScalarSizeInBits() / 8).valueOrOne(); + // Index into the allocation, coming back with the starting offset from + // which to begin our loads. This is either a scalar pointer, or a vector of + // pointers. + auto *const gep = B.CreateInBoundsGEP(eltTy, alloc, index, "vec.alloc"); + + load = ::createInterleavedLoad(Ctx, narrowTy, gep, stride, /*Mask*/ nullptr, + /*EVL*/ nullptr, alignment.value()); + } else { + // Else if we've got a varying, vector index, then we must use a gather. + // Take our indices, and add them to a step multiplied by the original + // vecor width. Use that to create a vector of pointers. + auto alignment = MaybeAlign(eltTy->getScalarSizeInBits() / 8).valueOrOne(); + + index = getGatherIndicesVector( + B, index, index->getType(), + multi_llvm::getVectorNumElements(origSrc->getType()), "idx"); + + // Index into the allocation, coming back with the starting offset from + // which to begin our striding load. 
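+    // e.g. (illustrative): with a fixed source width of N elements, gather
+    // lane j reads alloc[j * N + index[j]].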
+ auto *const gep = B.CreateInBoundsGEP(eltTy, alloc, index, "vec.alloc"); + + load = ::createGather(Ctx, narrowTy, gep, /*Mask*/ nullptr, /*EVL*/ nullptr, + alignment.value()); + } + load->insertBefore(B.GetInsertPoint()); + + return load; +} + +Value *TargetInfo::createOuterScalableBroadcast(IRBuilder<> &builder, + Value *vector, Value *VL, + ElementCount factor) const { + return createScalableBroadcast(builder, vector, VL, factor, + /* URem */ true); +} + +Value *TargetInfo::createInnerScalableBroadcast(IRBuilder<> &builder, + Value *vector, Value *VL, + ElementCount factor) const { + return createScalableBroadcast(builder, vector, VL, factor, + /* URem */ false); +} + +Value *TargetInfo::createScalableBroadcast(IRBuilder<> &B, Value *vector, + Value *VL, ElementCount factor, + bool URem) const { + (void)VL; + auto *const ty = vector->getType(); + auto *const wideTy = ScalableVectorType::get( + multi_llvm::getVectorElementType(ty), + factor.getKnownMinValue() * + multi_llvm::getVectorElementCount(ty).getKnownMinValue()); + auto wideEltCount = multi_llvm::getVectorElementCount(wideTy); + + // The splats must be inserted after any Allocas + auto it = B.GetInsertBlock()->getParent()->getEntryBlock().begin(); + while (isa(*it)) { + ++it; + } + IRBuilder<> AllocaB(&*it); + + auto *const alloc = AllocaB.CreateAlloca(ty, nullptr, "fixlen.alloc"); + + // Store the vector to the allocation. + B.CreateStore(vector, alloc); + + auto *const eltTy = cast(ty)->getElementType(); + + auto *const stepsRem = TargetInfo::createBroadcastIndexVector( + B, + ScalableVectorType::get(B.getInt32Ty(), cast(wideTy)), + factor, URem, "idx1"); + auto *const gep = B.CreateInBoundsGEP(eltTy, alloc, stepsRem, "vec.alloc"); + auto *const boolTrue = ConstantInt::getTrue(B.getContext()); + auto *const mask = B.CreateVectorSplat(wideEltCount, boolTrue, "truemask"); + // Set the alignment to that of vector element type. + auto alignment = MaybeAlign(eltTy->getScalarSizeInBits() / 8).valueOrOne(); + return B.CreateMaskedGather(wideTy, gep, alignment, mask, + PoisonValue::get(wideTy)); +} + +Value *TargetInfo::createBroadcastIndexVector(IRBuilder<> &B, Type *ty, + ElementCount factor, bool URem, + const llvm::Twine &N) { + auto *const steps = B.CreateStepVector(ty, "idx0"); + const auto tyEC = multi_llvm::getVectorElementCount(ty); + const unsigned factorMinVal = factor.getKnownMinValue(); + + unsigned fixedAmt; + Instruction::BinaryOps Opc; + if (URem) { + fixedAmt = tyEC.getKnownMinValue() / factorMinVal; + Opc = BinaryOperator::URem; + } else { + fixedAmt = factorMinVal; + Opc = BinaryOperator::UDiv; + } + auto *const vectorEltsSplat = B.CreateVectorSplat( + tyEC, ConstantInt::get(multi_llvm::getVectorElementType(ty), fixedAmt)); + return B.CreateBinOp(Opc, steps, vectorEltsSplat, N); +} + +Value *TargetInfo::createScalableInsertElement(IRBuilder<> &B, + VectorizationContext &Ctx, + Instruction *insert, Value *elt, + Value *into, Value *index, + Value *VL) const { + (void)VL; + auto *eltTy = elt->getType(); + auto *intoTy = into->getType(); + auto *scalarTy = elt->getType()->getScalarType(); + + // The alloca must be inserted at the beginning of the function. 
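+  // (An entry-block alloca is a static alloca, which later stack
+  // optimizations can promote or fold.)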
+  auto allocaIt =
+      B.GetInsertBlock()->getParent()->getEntryBlock().getFirstInsertionPt();
+  auto it = B.GetInsertPoint();
+
+  B.SetInsertPoint(&*allocaIt);
+  auto *const alloc = B.CreateAlloca(intoTy, nullptr);
+
+  // Reset the insertion point to wherever we must insert instructions.
+  B.SetInsertPoint(&*it);
+
+  // Store the wide vector to the allocation.
+  B.CreateStore(into, alloc);
+
+  const unsigned fixedVecElts =
+      multi_llvm::getVectorNumElements(insert->getOperand(0)->getType());
+
+  // Construct the index, either by packetizing it (if varying) or by
+  // splatting it and combining it with a step vector.
+  Instruction *store;
+  if (!index->getType()->isVectorTy()) {
+    // If the index remains a scalar (is uniform) then we can use a strided
+    // store starting from the address '&alloc[index]', strided by the original
+    // vector width: &alloc[index], &alloc[index+N], &alloc[index+2N], ...
+    auto *const stride = getSizeInt(B, fixedVecElts);
+    auto alignment =
+        MaybeAlign(scalarTy->getScalarSizeInBits() / 8).valueOrOne();
+    // Index into the allocation, coming back with the starting offset from
+    // which to begin our loads. This is either a scalar pointer, or a vector of
+    // pointers.
+    auto *const gep = B.CreateInBoundsGEP(scalarTy, alloc, index, "vec.alloc");
+
+    store = ::createInterleavedStore(Ctx, elt, gep, stride, /*Mask*/ nullptr,
+                                     /*EVL*/ nullptr, alignment.value());
+  } else {
+    // Else if we've got a varying, vector index, then we must use a scatter.
+    // Take our indices, and add them to a step multiplied by the original
+    // vector width. Use that to create a vector of pointers.
+    auto alignment =
+        MaybeAlign(scalarTy->getScalarSizeInBits() / 8).valueOrOne();
+
+    auto narrowEltCount = multi_llvm::getVectorElementCount(eltTy);
+
+    auto *steps = B.CreateStepVector(index->getType(), "idx0");
+    auto *const fixedVecEltsSplat = B.CreateVectorSplat(
+        narrowEltCount,
+        ConstantInt::get(index->getType()->getScalarType(), fixedVecElts));
+    auto *const stepsMul = B.CreateMul(steps, fixedVecEltsSplat, "idx.scale");
+    index = B.CreateAdd(stepsMul, index, "idx");
+
+    // Index into the allocation, coming back with the starting offset from
+    // which to begin our striding load.
+    auto *const gep = B.CreateInBoundsGEP(scalarTy, alloc, index, "vec.alloc");
+
+    store = ::createScatter(Ctx, elt, gep, /*Mask*/ nullptr,
+                            /*EVL*/ nullptr, alignment.value());
+  }
+  VECZ_FAIL_IF(!store);
+  store->insertBefore(B.GetInsertPoint());
+
+  // Load the vector back from the stack.
+  return B.CreateLoad(intoTy, alloc);
+}
+
+bool TargetInfo::isVPVectorLegal(const Function &F, Type *Ty) const {
+  return !TM_ ||
+         TM_->getTargetTransformInfo(F).isElementTypeLegalForScalableVector(
+             multi_llvm::getVectorElementType(Ty));
+}
+
+TargetInfo::VPMemOpLegality TargetInfo::checkMemOpLegality(
+    const Function *F,
+    function_ref<bool(const TargetTransformInfo &, Type *, unsigned, unsigned)>
+        Checker,
+    Type *Ty, unsigned Alignment, unsigned AddrSpace) const {
+  assert(Ty->isVectorTy() && "Expected a vector type");
+  const bool isMaskLegal =
+      !(isa<ScalableVectorType>(Ty) && TM_) ||
+      Checker(TM_->getTargetTransformInfo(*F), Ty, Alignment, AddrSpace);
+  // Assuming a pointer bit width of 64.
+  bool isVPLegal = isMaskLegal && isVPVectorLegal(*F, Ty);
+  if (isVPLegal) {
+    const unsigned PtrBitWidth =
+        TM_ ? TM_->createDataLayout().getPointerSizeInBits(AddrSpace) : 64;
+    auto &Ctx = Ty->getContext();
+    auto *const IntTy = IntegerType::get(Ctx, PtrBitWidth);
+    auto *const IntVecTy =
+        VectorType::get(IntTy, multi_llvm::getVectorElementCount(Ty));
+    isVPLegal = isVPVectorLegal(*F, IntVecTy);
+  }
+  return {isVPLegal, isMaskLegal};
+}
+
+TargetInfo::VPMemOpLegality
+TargetInfo::isVPLoadLegal(const Function *F, Type *Ty, unsigned Alignment,
+                          unsigned AddrSpace) const {
+  return checkMemOpLegality(F, isLegalMaskedLoad, Ty, Alignment, AddrSpace);
+}
+
+TargetInfo::VPMemOpLegality
+TargetInfo::isVPStoreLegal(const Function *F, Type *Ty, unsigned Alignment,
+                           unsigned AddrSpace) const {
+  return checkMemOpLegality(F, isLegalMaskedStore, Ty, Alignment, AddrSpace);
+}
+
+TargetInfo::VPMemOpLegality
+TargetInfo::isVPGatherLegal(const Function *F, Type *Ty, unsigned Alignment,
+                            unsigned AddrSpace) const {
+  return checkMemOpLegality(F, isLegalMaskedGather, Ty, Alignment, AddrSpace);
+}
+
+TargetInfo::VPMemOpLegality
+TargetInfo::isVPScatterLegal(const Function *F, Type *Ty, unsigned Alignment,
+                             unsigned AddrSpace) const {
+  return checkMemOpLegality(F, isLegalMaskedScatter, Ty, Alignment, AddrSpace);
+}
+
+bool TargetInfo::isLegalVPElementType(Type *) const { return true; }
+
+llvm::Value *TargetInfo::createVectorShuffle(llvm::IRBuilder<> &B,
+                                             llvm::Value *src,
+                                             llvm::Value *mask,
+                                             llvm::Value *evl) const {
+  auto *const srcTy = dyn_cast<VectorType>(src->getType());
+  auto *const maskTy = dyn_cast<VectorType>(mask->getType());
+  assert(
+      srcTy && maskTy &&
+      "TargetInfo::createVectorShuffle: source and mask must have vector type");
+
+  if (isa<Constant>(mask)) {
+    // Special case if the mask happens to be a constant.
+    return B.CreateShuffleVector(src, PoisonValue::get(srcTy), mask);
+  }
+
+  // The alloca must be inserted at the beginning of the function.
+  auto *const curBlock = B.GetInsertBlock();
+  auto &entryBlock = curBlock->getParent()->getEntryBlock();
+  const auto allocaIt = entryBlock.getFirstInsertionPt();
+  const auto it = B.GetInsertPoint();
+
+  B.SetInsertPoint(&entryBlock, allocaIt);
+  auto *const alloc = B.CreateAlloca(srcTy, nullptr);
+
+  // Reset the insertion point to wherever we must insert instructions.
+  B.SetInsertPoint(curBlock, it);
+
+  // Store the wide vector to the allocation.
+  B.CreateStore(src, alloc);
+
+  auto *const eltTy = srcTy->getElementType();
+
+  // Index into the allocation.
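+  // A small worked example (illustrative): for src = <a, b, c, d> and
+  // mask = <2, 0, 3, 1>, the gather below reads &alloc[2], &alloc[0],
+  // &alloc[3] and &alloc[1], yielding <c, a, d, b> - in effect a
+  // shufflevector with a run-time mask.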
+  auto *const gep = B.CreateInBoundsGEP(eltTy, alloc, mask, "vec.alloc");
+
+  const auto eltCount = maskTy->getElementCount();
+  auto *const dstTy = VectorType::get(eltTy, eltCount);
+  const auto alignment =
+      MaybeAlign(eltTy->getScalarSizeInBits() / 8).valueOrOne();
+
+  Value *gatherMask = nullptr;
+  if (evl) {
+    const auto EC = srcTy->getElementCount();
+    auto *const IndexTy = VectorType::get(evl->getType(), EC);
+    auto *const step = B.CreateStepVector(IndexTy);
+    gatherMask = B.CreateICmpULT(step, B.CreateVectorSplat(EC, evl));
+  } else {
+    gatherMask = B.CreateVectorSplat(eltCount, B.getTrue());
+  }
+
+  return B.CreateMaskedGather(dstTy, gep, alignment, gatherMask,
+                              PoisonValue::get(dstTy));
+}
+
+llvm::Value *TargetInfo::createVectorSlideUp(llvm::IRBuilder<> &B,
+                                             llvm::Value *src,
+                                             llvm::Value *insert,
+                                             llvm::Value *) const {
+  auto *const srcTy = dyn_cast<VectorType>(src->getType());
+  assert(srcTy &&
+         "TargetInfo::createVectorSlideUp: source must have vector type");
+
+  auto *const poison = PoisonValue::get(srcTy);
+  const auto EC = srcTy->getElementCount();
+  if (!EC.isScalable()) {
+    // Special case for fixed-width vectors.
+    const auto width = EC.getFixedValue();
+    SmallVector<int, 16> mask(width);
+    auto it = mask.begin();
+    *it++ = 0;
+    for (size_t i = 1; i < width; ++i) {
+      *it++ = i - 1;
+    }
+
+    auto *const rotate =
+        createOptimalShuffle(B, src, poison, mask, Twine("slide_up"));
+    return B.CreateInsertElement(rotate, insert, B.getInt64(0), "slide_in");
+  }
+
+  auto *const rotate = B.CreateVectorSplice(poison, src, -1, "slide_up");
+  return B.CreateInsertElement(rotate, insert, B.getInt64(0), "slide_in");
+}
+
+bool TargetInfo::canOptimizeInterleavedGroup(const Instruction &val,
+                                             InterleavedOperation Kind,
+                                             int Stride,
+                                             unsigned GroupSize) const {
+  if ((Stride == 2) || (Stride == 4)) {
+    VECZ_FAIL_IF((int)GroupSize != abs(Stride));
+    VECZ_FAIL_IF((Kind != eInterleavedLoad) && (Kind != eInterleavedStore) &&
+                 (Kind != eMaskedInterleavedLoad) &&
+                 (Kind != eMaskedInterleavedStore));
+    Type *DataType = nullptr;
+    if (Kind == eInterleavedStore || Kind == eMaskedInterleavedStore) {
+      DataType = val.getOperand(0)->getType();
+    } else {
+      DataType = val.getType();
+    }
+    VECZ_FAIL_IF(!DataType);
+    VECZ_FAIL_IF(!isa<FixedVectorType>(DataType));
+    return true;
+  }
+  return false;
+}
+
+bool TargetInfo::optimizeInterleavedGroup(IRBuilder<> &B,
+                                          InterleavedOperation Kind,
+                                          ArrayRef<Value *> Group,
+                                          ArrayRef<Value *> Masks,
+                                          Value *Address, int Stride) const {
+  VECZ_FAIL_IF(Stride < 0);
+
+  // Validate the operations in the group.
+  SmallVector<CallInst *, 4> Calls;
+  for (unsigned i = 0; i < Group.size(); i++) {
+    CallInst *Op = dyn_cast<CallInst>(Group[i]);
+    VECZ_FAIL_IF(!Op);
+    Calls.push_back(Op);
+  }
+  PointerType *PtrTy = dyn_cast<PointerType>(Address->getType());
+  VECZ_FAIL_IF(!PtrTy);
+  CallInst *Op0 = Calls[0];
+  VECZ_FAIL_IF(!canOptimizeInterleavedGroup(*Op0, Kind, Stride, Group.size()));
+
+  // canOptimizeInterleavedGroup() performs several checks, including valid
+  // Kind and Op0 types. Thus, these casts are safe.
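+  // Illustratively (hypothetical IR), a stride-2 load group
+  //   %evens = call <4 x i32> @interleaved_load(ptr %p)   ; p[0],p[2],p[4],p[6]
+  //   %odds  = call <4 x i32> @interleaved_load(ptr %p1)  ; %p1 = %p+1
+  // becomes two contiguous <4 x i32> loads from %p and %p+4, followed by
+  // de-interleaving shuffles built in interleaveVectors() below.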
+  FixedVectorType *VecTy = nullptr;
+  if (Kind == eInterleavedStore || Kind == eMaskedInterleavedStore) {
+    VecTy = cast<FixedVectorType>(Op0->getOperand(0)->getType());
+  } else { // eInterleavedLoad || eMaskedInterleavedLoad
+    VecTy = cast<FixedVectorType>(Op0->getType());
+  }
+
+  auto VecWidth = multi_llvm::getVectorElementCount(VecTy);
+  const unsigned SimdWidth = VecWidth.getFixedValue();
+
+  Type *EleTy = VecTy->getElementType();
+  const unsigned Align = EleTy->getScalarSizeInBits() / 8;
+
+  const bool HasMask =
+      (Kind == eMaskedInterleavedLoad) || (Kind == eMaskedInterleavedStore);
+  SmallVector<Value *, 4> Vectors;
+  SmallVector<Value *, 4> VecMasks(Masks.begin(), Masks.end());
+  if (Kind == eInterleavedLoad || Kind == eMaskedInterleavedLoad) {
+    // Create one regular vector load per interleaved load in the group.
+    if (HasMask) {
+      VECZ_FAIL_IF(!interleaveVectors(B, VecMasks, true));
+    }
+
+    for (unsigned i = 0; i < Group.size(); i++) {
+      Value *AddressN = Address;
+      if (i > 0) {
+        const unsigned Offset = i * SimdWidth;
+        AddressN = B.CreateGEP(EleTy, Address, B.getInt32(Offset));
+      }
+      Value *Load = nullptr;
+      if (!HasMask) {
+        Load = createLoad(B, VecTy, AddressN, getSizeInt(B, 1), Align);
+      } else {
+        Value *Mask = VecMasks[i];
+        Load =
+            createMaskedLoad(B, VecTy, AddressN, Mask, /*EVL*/ nullptr, Align);
+      }
+      VECZ_FAIL_IF(!Load);
+      Vectors.push_back(Load);
+    }
+    // Transpose the loaded vectors and replace the original loads.
+    VECZ_FAIL_IF(!interleaveVectors(B, Vectors, false));
+    for (unsigned i = 0; i < Group.size(); i++) {
+      Value *Vector = Vectors[i];
+      Value *OrigLoad = Group[i];
+      OrigLoad->replaceAllUsesWith(Vector);
+    }
+  } else if (Kind == eInterleavedStore || Kind == eMaskedInterleavedStore) {
+    // Transpose the vectors to store with interleave.
+    for (unsigned i = 0; i < Group.size(); i++) {
+      CallInst *OrigStore = cast<CallInst>(Group[i]);
+      Vectors.push_back(OrigStore->getOperand(0));
+    }
+    VECZ_FAIL_IF(!interleaveVectors(B, Vectors, true));
+    if (HasMask) {
+      VECZ_FAIL_IF(!interleaveVectors(B, VecMasks, true));
+    }
+    // Create one regular vector store per interleaved store in the group.
+    for (unsigned i = 0; i < Group.size(); i++) {
+      Value *Vector = Vectors[i];
+      Value *AddressN = Address;
+      if (i > 0) {
+        const unsigned Offset = i * SimdWidth;
+        AddressN = B.CreateGEP(EleTy, Address, B.getInt32(Offset));
+      }
+      Value *Store = nullptr;
+      if (!HasMask) {
+        Store = createStore(B, Vector, AddressN, getSizeInt(B, 1), Align);
+      } else {
+        Value *Mask = VecMasks[i];
+        Store = createMaskedStore(B, Vector, AddressN, Mask, /*EVL*/ nullptr,
+                                  Align);
+      }
+      VECZ_FAIL_IF(!Store);
+    }
+  }
+
+  return true;
+}
+
+bool TargetInfo::interleaveVectors(IRBuilder<> &B,
+                                   MutableArrayRef<Value *> Vectors,
+                                   bool Forward) const {
+  const unsigned Stride = Vectors.size();
+  if (Stride == 0) {
+    return true;
+  }
+  auto *VecTy = dyn_cast<FixedVectorType>(Vectors[0]->getType());
+  VECZ_FAIL_IF(!VecTy);
+  if (Stride == 1) {
+    return true;
+  }
+  const unsigned Width = VecTy->getNumElements();
+  VECZ_FAIL_IF(Width < Stride);
+  VECZ_FAIL_IF((Width % Stride) != 0);
+  for (unsigned i = 1; i < Stride; i++) {
+    auto *VecTyN = dyn_cast<FixedVectorType>(Vectors[i]->getType());
+    VECZ_FAIL_IF(!VecTyN || (VecTyN != VecTy));
+  }
+
+  // Prepare the masks.
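+  // Illustrative masks for Width = 4, as computed by the loops below:
+  //   forward (interleave):     MaskLow2 = <0,4,1,5>, MaskHigh2 = <2,6,3,7>,
+  //     so <a0,a1,a2,a3> and <b0,b1,b2,b3> shuffle to <a0,b0,a1,b1> and
+  //     <a2,b2,a3,b3>;
+  //   backward (de-interleave): MaskLow2 = <0,2,4,6>, MaskHigh2 = <1,3,5,7>.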
+  SmallVector<uint32_t, 16> MaskLow2;
+  SmallVector<uint32_t, 16> MaskHigh2;
+
+  StringRef Name;
+  if (Forward) {
+    Name = "interleave";
+    const unsigned Width2 = Width >> 1;
+    const unsigned Width3 = Width2 + Width;
+    for (unsigned i = 0; i < Width2; ++i) {
+      MaskLow2.push_back(i);
+      MaskHigh2.push_back(i + Width2);
+      MaskLow2.push_back(i + Width);
+      MaskHigh2.push_back(i + Width3);
+    }
+  } else {
+    Name = "deinterleave";
+    const unsigned Width2 = Width << 1;
+    for (unsigned i = 0; i < Width2; i += 2) {
+      MaskLow2.push_back(i);
+      MaskHigh2.push_back(i + 1);
+    }
+  }
+  Constant *CMaskLow2 = ConstantDataVector::get(B.getContext(), MaskLow2);
+  Constant *CMaskHigh2 = ConstantDataVector::get(B.getContext(), MaskHigh2);
+
+  if (Stride == 2) {
+    Value *Src0 = Vectors[0];
+    Value *Src1 = Vectors[1];
+    Vectors[0] = B.CreateShuffleVector(Src0, Src1, CMaskLow2, Name);
+    Vectors[1] = B.CreateShuffleVector(Src0, Src1, CMaskHigh2, Name);
+
+    return true;
+  } else if (Stride == 4) {
+    // For a 4-way interleave, we need two layers of shuffles.
+    // Starting with vectors a..A : b..B : c..C : d..D
+    // first shuffle layer  -> ab.. : ..AB : cd.. : ..CD
+    // second shuffle layer -> abcd : .... : .... : ABCD
+    Value *Src0 = Vectors[0];
+    Value *Src1 = Vectors[1];
+    Value *Src2 = Vectors[2];
+    Value *Src3 = Vectors[3];
+
+    Constant *CMaskLow4 = nullptr;
+    Constant *CMaskHigh4 = nullptr;
+    if (Forward) {
+      SmallVector<uint32_t, 16> MaskLow4;
+      SmallVector<uint32_t, 16> MaskHigh4;
+      const unsigned Width2 = Width >> 1;
+      const unsigned Width3 = Width2 + Width;
+      for (unsigned i = 0; i < Width2; i += 2) {
+        MaskLow4.push_back(i);
+        MaskLow4.push_back(i + 1);
+        MaskLow4.push_back(i + Width);
+        MaskLow4.push_back(i + 1 + Width);
+        MaskHigh4.push_back(Width2 + i);
+        MaskHigh4.push_back(Width2 + i + 1);
+        MaskHigh4.push_back(Width3 + i);
+        MaskHigh4.push_back(Width3 + i + 1);
+      }
+      CMaskLow4 = ConstantDataVector::get(B.getContext(), MaskLow4);
+      CMaskHigh4 = ConstantDataVector::get(B.getContext(), MaskHigh4);
+    } else {
+      SmallVector<uint32_t, 16> MaskLow4;
+      SmallVector<uint32_t, 16> MaskHigh4;
+      const unsigned Width2 = Width << 1;
+      for (unsigned i = 0; i < Width2; i += 4) {
+        MaskLow4.push_back(i);
+        MaskLow4.push_back(i + 1);
+        MaskHigh4.push_back(i + 2);
+        MaskHigh4.push_back(i + 3);
+      }
+
+      // To perform the de-interleave we reverse the functions of the masks.
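+      // Illustrative, for Width = 4: after the swap the first shuffle layer
+      // uses <0,1,4,5> and <2,3,6,7>, and the second layer uses <0,2,4,6>
+      // and <1,3,5,7>, exactly undoing the two forward shuffle layers.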
+      CMaskLow4 = CMaskLow2;
+      CMaskHigh4 = CMaskHigh2;
+      CMaskLow2 = ConstantDataVector::get(B.getContext(), MaskLow4);
+      CMaskHigh2 = ConstantDataVector::get(B.getContext(), MaskHigh4);
+    }
+
+    Value *Tmp0 = B.CreateShuffleVector(Src0, Src1, CMaskLow2, Name);
+    Value *Tmp1 = B.CreateShuffleVector(Src0, Src1, CMaskHigh2, Name);
+    Value *Tmp2 = B.CreateShuffleVector(Src2, Src3, CMaskLow2, Name);
+    Value *Tmp3 = B.CreateShuffleVector(Src2, Src3, CMaskHigh2, Name);
+    Vectors[0] = B.CreateShuffleVector(Tmp0, Tmp2, CMaskLow4, Name);
+    Vectors[1] = B.CreateShuffleVector(Tmp0, Tmp2, CMaskHigh4, Name);
+    Vectors[2] = B.CreateShuffleVector(Tmp1, Tmp3, CMaskLow4, Name);
+    Vectors[3] = B.CreateShuffleVector(Tmp1, Tmp3, CMaskHigh4, Name);
+
+    return true;
+  }
+  return false;
+}
+
+unsigned TargetInfo::estimateSimdWidth(const TargetTransformInfo &TTI,
+                                       const ArrayRef<const Value *> vals,
+                                       unsigned width) const {
+  const unsigned MaxVecRegBitWidth =
+      TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector)
+          .getFixedValue();
+
+  const unsigned NumVecRegs =
+      TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true));
+
+  unsigned VaryingUsage = 0;
+  for (const auto *VI : vals) {
+    const auto *Ty = VI->getType();
+    VaryingUsage +=
+        Ty->isPointerTy()
+            ? TM_->getPointerSizeInBits(Ty->getPointerAddressSpace())
+            : VI->getType()->getPrimitiveSizeInBits();
+  }
+  const unsigned MaxBits = MaxVecRegBitWidth * NumVecRegs;
+  while (VaryingUsage * width > MaxBits) {
+    width >>= 1;
+  }
+
+  return width;
+}
+
+unsigned TargetInfo::getVectorWidthForType(const llvm::TargetTransformInfo &TTI,
+                                           const llvm::Type &Ty) const {
+  const unsigned MaxVecRegBitWidth =
+      TTI.getRegisterBitWidth(llvm::TargetTransformInfo::RGK_FixedWidthVector)
+          .getFixedValue();
+
+  if (MaxVecRegBitWidth == 0) {
+    return 0;
+  }
+
+  unsigned BitWidth = 0;
+  if (!Ty.isPtrOrPtrVectorTy()) {
+    BitWidth = Ty.getScalarSizeInBits();
+  } else if (TM_) {
+    BitWidth = TM_->getPointerSizeInBits(Ty.getPointerAddressSpace());
+  }
+
+  if (BitWidth == 0) {
+    // Couldn't work out the vector width.
+    return 0;
+  }
+
+  // The floor of 8 prevents poor double-precision performance; the exact
+  // reason for this is not understood.
+  return std::max(MaxVecRegBitWidth / BitWidth, 8u);
+}
+
+bool TargetInfo::canPacketize(const llvm::Value *, ElementCount) const {
+  return true;
+}
+
+std::unique_ptr<TargetInfo>
+vecz::createTargetInfoFromTargetMachine(TargetMachine *tm) {
+  // The TargetMachine is allowed to be null.
+  if (tm) {
+    const Triple &TT(tm->getTargetTriple());
+    switch (TT.getArch()) {
+      case Triple::arm:
+        return createTargetInfoArm(tm);
+      case Triple::aarch64:
+        return createTargetInfoAArch64(tm);
+      case Triple::riscv32:
+      case Triple::riscv64:
+        return createTargetInfoRISCV(tm);
+      default:
+        // Just use the generic TargetInfo unless we know better.
+        break;
+    }
+  }
+  return std::make_unique<TargetInfo>(tm);
+}
+
+AnalysisKey TargetInfoAnalysis::Key;
+
+TargetInfoAnalysis::TargetInfoAnalysis()
+    : TICallback([](const Module &) {
+        return std::make_unique<TargetInfo>(/*TM*/ nullptr);
+      }) {}
+
+TargetInfoAnalysis::TargetInfoAnalysis(TargetMachine *TM)
+    : TICallback([TM](const Module &) {
+        return vecz::createTargetInfoFromTargetMachine(TM);
+      }) {}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
new file mode 100644
index 0000000000000..bae66eb789260
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_arm.cpp
@@ -0,0 +1,407 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/IntrinsicsAArch64.h>
+#include <llvm/IR/IntrinsicsARM.h>
+#include <llvm/Target/TargetMachine.h>
+
+#include "debugging.h"
+#include "vecz/vecz_target_info.h"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace vecz {
+
+class TargetInfoArm final : public TargetInfo {
+public:
+  TargetInfoArm(TargetMachine *tm) : TargetInfo(tm) {}
+
+  ~TargetInfoArm() = default;
+
+  bool canOptimizeInterleavedGroup(const Instruction &val,
+                                   InterleavedOperation kind, int stride,
+                                   unsigned groupSize) const override;
+
+  bool optimizeInterleavedGroup(IRBuilder<> &builder, InterleavedOperation kind,
+                                ArrayRef<Value *> group,
+                                ArrayRef<Value *> masks, Value *baseAddress,
+                                int stride) const override;
+
+private:
+  bool canOptimizeInterleavedGroupImpl(const Instruction &val,
+                                       InterleavedOperation kind, int stride,
+                                       unsigned groupSize,
+                                       unsigned &intrinsicID) const;
+};
+
+class TargetInfoAArch64 final : public TargetInfo {
+public:
+  TargetInfoAArch64(TargetMachine *tm) : TargetInfo(tm) {}
+
+  ~TargetInfoAArch64() = default;
+
+  bool canOptimizeInterleavedGroup(const Instruction &val,
+                                   InterleavedOperation kind, int stride,
+                                   unsigned groupSize) const override;
+
+  bool optimizeInterleavedGroup(IRBuilder<> &builder, InterleavedOperation kind,
+                                ArrayRef<Value *> group,
+                                ArrayRef<Value *> masks, Value *baseAddress,
+                                int stride) const override;
+
+private:
+  bool canOptimizeInterleavedGroupImpl(const Instruction &val,
+                                       InterleavedOperation kind, int stride,
+                                       unsigned groupSize,
+                                       unsigned &intrinsicID) const;
+};
+
+std::unique_ptr<TargetInfo> createTargetInfoArm(TargetMachine *tm) {
+  return std::make_unique<TargetInfoArm>(tm);
+}
+
+std::unique_ptr<TargetInfo> createTargetInfoAArch64(TargetMachine *tm) {
+  return std::make_unique<TargetInfoAArch64>(tm);
+}
+
+} // namespace vecz
+
+bool TargetInfoArm::canOptimizeInterleavedGroup(const Instruction &val,
+                                                InterleavedOperation kind,
+                                                int stride,
+                                                unsigned groupSize) const {
+  unsigned IntrID;
+  return canOptimizeInterleavedGroupImpl(val, kind, stride, groupSize, IntrID);
+}
+
+bool TargetInfoArm::canOptimizeInterleavedGroupImpl(const Instruction &val,
+                                                    InterleavedOperation kind,
+                                                    int stride,
+                                                    unsigned groupSize,
+                                                    unsigned &IntrID) const {
+  IntrID = Intrinsic::not_intrinsic;
+  Type *dataType = nullptr;
+  if (kind == eInterleavedStore) {
+    switch (stride) {
+      default:
+        break;
+      case 2:
+        IntrID = Intrinsic::arm_neon_vst2;
+        break;
+      case 3:
+        IntrID = Intrinsic::arm_neon_vst3;
+        break;
+      case 4:
+        IntrID = Intrinsic::arm_neon_vst4;
+        break;
+    }
+    dataType = val.getOperand(0)->getType();
+  } else if (kind == eInterleavedLoad) {
+    switch (stride) {
+      default:
+        break;
+      case 2:
+        IntrID = Intrinsic::arm_neon_vld2;
+        break;
+      case 3:
+        IntrID = Intrinsic::arm_neon_vld3;
+        break;
+      case 4:
+        IntrID = Intrinsic::arm_neon_vld4;
+        break;
+    }
+    dataType = val.getType();
+  } else {
+    return false;
+  }
+
+  if (IntrID == Intrinsic::not_intrinsic || ((groupSize % stride) != 0)) {
+    return false;
+  }
+
+  if (!dataType) {
+    return false;
+  }
+
+  auto *VecTy = dyn_cast<FixedVectorType>(dataType);
+  if (!VecTy) {
+    return false;
+  }
+
+  const unsigned VecBits = VecTy->getPrimitiveSizeInBits();
+  if ((VecBits != 128) && (VecBits != 64)) {
+    return false;
+  }
+
+  // NEON interleave instructions only allow 8-, 16- and 32-bit elements.
+  const unsigned ElementSize = VecTy->getScalarSizeInBits();
+  if ((ElementSize != 32) && (ElementSize != 16) && (ElementSize != 8)) {
+    return false;
+  }
+
+  return true;
+}
+
+bool TargetInfoArm::optimizeInterleavedGroup(IRBuilder<> &B,
+                                             InterleavedOperation kind,
+                                             ArrayRef<Value *> group,
+                                             ArrayRef<Value *>, Value *address,
+                                             int stride) const {
+  const bool HasMask =
+      (kind == eMaskedInterleavedLoad) || (kind == eMaskedInterleavedStore);
+  // canOptimizeInterleavedGroup() should have returned false in this case.
+  // ARM does not have masked vector load or store instructions.
+  VECZ_FAIL_IF(HasMask);
+  VECZ_FAIL_IF(stride < 0);
+
+  // TODO: fetch this information from SubTargetInfo. Load instructions seem
+  // to be split easily in the backend, whereas stores generate a backend
+  // error because of invalid data types on vector operands. Vector operands
+  // are enabled in the backend only when SubTargetInfo ensures NEON
+  // instructions are supported.
+  const bool subTargetHasNeon = false;
+  if (!subTargetHasNeon && kind == eInterleavedStore) {
+    return false;
+  }
+
+  // Validate the operations in the group.
+  SmallVector<CallInst *, 4> Calls;
+  for (unsigned i = 0; i < group.size(); i++) {
+    CallInst *Op = dyn_cast<CallInst>(group[i]);
+    if (!Op) {
+      return false;
+    }
+    Calls.push_back(Op);
+  }
+
+  PointerType *PtrTy = dyn_cast<PointerType>(address->getType());
+  if (!PtrTy) {
+    return false;
+  }
+
+  CallInst *Op0 = Calls[0];
+  // Determine the intrinsic to emit for this group.
+  unsigned IntrID = Intrinsic::not_intrinsic;
+  if (!canOptimizeInterleavedGroupImpl(*Op0, kind, stride, group.size(),
+                                       IntrID)) {
+    return false;
+  }
+
+  // canOptimizeInterleavedGroup() performs several checks, including valid
+  // Kind and Op0 types. Thus, these casts are safe.
+  FixedVectorType *VecTy = nullptr;
+  if (kind == eInterleavedStore) {
+    VecTy = cast<FixedVectorType>(Op0->getOperand(0)->getType());
+  } else { // eInterleavedLoad
+    VecTy = cast<FixedVectorType>(Op0->getType());
+  }
+
+  Type *EleTy = VecTy->getElementType();
+  const unsigned Alignment = (EleTy->getPrimitiveSizeInBits() / 8);
+
+  // Declare the intrinsic if needed.
+  SmallVector<Type *, 2> Tys;
+  if (kind == eInterleavedStore) {
+    Tys = {PtrTy, VecTy};
+  } else if (kind == eInterleavedLoad) {
+    Tys = {VecTy, PtrTy};
+  }
+
+  Function *IntrFn = Intrinsic::getOrInsertDeclaration(
+      Op0->getModule(), (Intrinsic::ID)IntrID, Tys);
+  if (!IntrFn) {
+    return false;
+  }
+
+  // Create a NEON load or store to replace the interleaved calls.
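+  // For a stride-2 store of <4 x i32> vectors, the group collapses to a
+  // single intrinsic call shaped roughly like this (illustrative values):
+  //   call void @llvm.arm.neon.vst2.p0.v4i32(ptr %addr, <4 x i32> %v0,
+  //                                          <4 x i32> %v1, i32 4)
+  // where the trailing i32 is the element alignment appended below.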
+  SmallVector<Value *, 8> Ops;
+  Ops.push_back(address);
+  if (kind == eInterleavedStore) {
+    for (unsigned i = 0; i < group.size(); i++) {
+      CallInst *Op = Calls[i];
+      Ops.push_back(Op->getOperand(0));
+    }
+  }
+  Ops.push_back(B.getInt32(Alignment));
+  CallInst *CI = B.CreateCall(IntrFn, Ops, Op0->getName());
+  CI->setCallingConv(IntrFn->getCallingConv());
+  if (kind == eInterleavedLoad) {
+    for (unsigned i = 0; i < Calls.size(); i++) {
+      CallInst *Op = Calls[i];
+      const ArrayRef<unsigned> Indices(&i, 1);
+      Value *Extract = B.CreateExtractValue(CI, Indices);
+      Op->replaceAllUsesWith(Extract);
+    }
+  }
+  return true;
+}
+
+bool TargetInfoAArch64::canOptimizeInterleavedGroup(const Instruction &val,
+                                                    InterleavedOperation kind,
+                                                    int stride,
+                                                    unsigned groupSize) const {
+  unsigned IntrID;
+  return canOptimizeInterleavedGroupImpl(val, kind, stride, groupSize, IntrID);
+}
+
+bool TargetInfoAArch64::canOptimizeInterleavedGroupImpl(
+    const Instruction &val, InterleavedOperation kind, int stride,
+    unsigned groupSize, unsigned &IntrID) const {
+  IntrID = Intrinsic::not_intrinsic;
+  Type *dataType = nullptr;
+  if (kind == eInterleavedStore) {
+    switch (stride) {
+      default:
+        break;
+      case 2:
+        IntrID = Intrinsic::aarch64_neon_st2;
+        break;
+      case 3:
+        IntrID = Intrinsic::aarch64_neon_st3;
+        break;
+      case 4:
+        IntrID = Intrinsic::aarch64_neon_st4;
+        break;
+    }
+    dataType = val.getOperand(0)->getType();
+  } else if (kind == eInterleavedLoad) {
+    switch (stride) {
+      default:
+        break;
+      case 2:
+        IntrID = Intrinsic::aarch64_neon_ld2;
+        break;
+      case 3:
+        IntrID = Intrinsic::aarch64_neon_ld3;
+        break;
+      case 4:
+        IntrID = Intrinsic::aarch64_neon_ld4;
+        break;
+    }
+    dataType = val.getType();
+  } else {
+    return false;
+  }
+
+  if (IntrID == Intrinsic::not_intrinsic || ((groupSize % stride) != 0)) {
+    return false;
+  }
+
+  if (!dataType) {
+    return false;
+  }
+
+  auto *VecTy = dyn_cast<FixedVectorType>(dataType);
+  if (!VecTy) {
+    return false;
+  }
+
+  const unsigned VecBits = VecTy->getPrimitiveSizeInBits();
+  if ((VecBits != 128) && (VecBits != 64)) {
+    return false;
+  }
+
+  // NEON interleave instructions only allow 8-, 16- and 32-bit elements.
+  const unsigned ElementSize = VecTy->getScalarSizeInBits();
+  if ((ElementSize != 32) && (ElementSize != 16) && (ElementSize != 8)) {
+    return false;
+  }
+
+  return true;
+}
+
+bool TargetInfoAArch64::optimizeInterleavedGroup(
+    IRBuilder<> &B, InterleavedOperation kind, ArrayRef<Value *> group,
+    ArrayRef<Value *>, Value *address, int stride) const {
+  const bool HasMask =
+      (kind == eMaskedInterleavedLoad) || (kind == eMaskedInterleavedStore);
+  // canOptimizeInterleavedGroup() should have returned false in this case.
+  // AArch64 does not have masked vector load or store instructions.
+  VECZ_FAIL_IF(HasMask);
+  VECZ_FAIL_IF(stride < 0);
+
+  // TODO: fetch this information from SubTargetInfo. Load instructions seem
+  // to be split easily in the backend, whereas stores generate a backend
+  // error because of invalid data types on vector operands. Vector operands
+  // are enabled in the backend only when SubTargetInfo ensures NEON
+  // instructions are supported.
+  const bool subTargetHasNeon = false;
+  if (!subTargetHasNeon && kind == eInterleavedStore) {
+    return false;
+  }
+
+  // Validate the operations in the group.
+  SmallVector<CallInst *, 4> Calls;
+  for (unsigned i = 0; i < group.size(); i++) {
+    CallInst *Op = dyn_cast<CallInst>(group[i]);
+    if (!Op) {
+      return false;
+    }
+    Calls.push_back(Op);
+  }
+
+  PointerType *PtrTy = dyn_cast<PointerType>(address->getType());
+  if (!PtrTy) {
+    return false;
+  }
+
+  CallInst *Op0 = Calls[0];
+  // Determine the intrinsic to emit for this group.
+  unsigned IntrID = Intrinsic::not_intrinsic;
+  if (!canOptimizeInterleavedGroupImpl(*Op0, kind, stride, group.size(),
+                                       IntrID)) {
+    return false;
+  }
+
+  // canOptimizeInterleavedGroup() performs several checks, including valid
+  // Kind and Op0 types. Thus, these casts are safe.
+  FixedVectorType *VecTy = nullptr;
+  if (kind == eInterleavedStore) {
+    VecTy = cast<FixedVectorType>(Op0->getOperand(0)->getType());
+  } else { // eInterleavedLoad
+    VecTy = cast<FixedVectorType>(Op0->getType());
+  }
+
+  Function *IntrFn = Intrinsic::getOrInsertDeclaration(
+      Op0->getModule(), (Intrinsic::ID)IntrID, {VecTy, PtrTy});
+  if (!IntrFn) {
+    return false;
+  }
+
+  // Create a NEON load or store to replace the interleaved calls.
+  SmallVector<Value *, 8> Ops;
+  if (kind == eInterleavedStore) {
+    for (unsigned i = 0; i < group.size(); i++) {
+      CallInst *Op = Calls[i];
+      Ops.push_back(Op->getOperand(0));
+    }
+  }
+  Ops.push_back(address);
+  CallInst *CI = B.CreateCall(IntrFn, Ops, Op0->getName());
+  CI->setCallingConv(IntrFn->getCallingConv());
+  if (kind == eInterleavedLoad) {
+    for (unsigned i = 0; i < Calls.size(); i++) {
+      CallInst *Op = Calls[i];
+      const ArrayRef<unsigned> Indices(&i, 1);
+      Value *Extract = B.CreateExtractValue(CI, Indices);
+      Op->replaceAllUsesWith(Extract);
+    }
+  }
+  return true;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
new file mode 100644
index 0000000000000..8c320bd324ffa
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vector_target_info_riscv.cpp
@@ -0,0 +1,753 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/IR/DerivedTypes.h>
+#include <llvm/IR/IRBuilder.h>
+#include <llvm/IR/IntrinsicsRISCV.h>
+#include <llvm/Support/MathExtras.h>
+#include <llvm/Target/TargetMachine.h>
+
+#include "transform/packetization_helpers.h"
+#include "vecz/vecz_target_info.h"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace vecz {
+
+class TargetInfoRISCV final : public TargetInfo {
+public:
+  TargetInfoRISCV(TargetMachine *tm) : TargetInfo(tm) {}
+
+  ~TargetInfoRISCV() = default;
+
+  bool canPacketize(const llvm::Value *Val, ElementCount Width) const override;
+
+  // These functions should only be overridden in LLVM >= 13.
+  llvm::Value *createScalableExtractElement(
+      llvm::IRBuilder<> &B, vecz::VectorizationContext &Ctx,
+      llvm::Instruction *extract, llvm::Type *narrowTy, llvm::Value *src,
+      llvm::Value *index, llvm::Value *evl) const override;
+
+  llvm::Value *
+  createOuterScalableBroadcast(llvm::IRBuilder<> &builder, llvm::Value *vector,
+                               llvm::Value *VL,
+                               ElementCount factor) const override {
+    return createScalableBroadcast(builder, vector, VL, factor,
+                                   /* URem */ true);
+  }
+
+  llvm::Value *
+  createInnerScalableBroadcast(llvm::IRBuilder<> &builder, llvm::Value *vector,
+                               llvm::Value *VL,
+                               ElementCount factor) const override {
+    return createScalableBroadcast(builder, vector, VL, factor,
+                                   /* URem */ false);
+  }
+
+  llvm::Value *createScalableInsertElement(llvm::IRBuilder<> &builder,
+                                           vecz::VectorizationContext &Ctx,
+                                           llvm::Instruction *insert,
+                                           llvm::Value *elt, llvm::Value *into,
+                                           llvm::Value *index,
+                                           llvm::Value *evl) const override;
+
+  bool isVPVectorLegal(const llvm::Function &F, llvm::Type *Ty) const override;
+
+  llvm::Value *createVectorShuffle(llvm::IRBuilder<> &builder, llvm::Value *src,
+                                   llvm::Value *mask,
+                                   llvm::Value *evl) const override;
+
+  llvm::Value *createVectorSlideUp(llvm::IRBuilder<> &builder, llvm::Value *src,
+                                   llvm::Value *insert,
+                                   llvm::Value *evl) const override;
+
+private:
+  bool isOperationLegal(llvm::Intrinsic::ID ID,
+                        llvm::ArrayRef<llvm::Type *> Tys) const;
+
+  /// @brief Maximum vector type size in bits for VP intrinsics.
+  static constexpr unsigned MaxLegalVectorTypeBits = 8 * 64;
+
+  /// @return Whether the minimum size of a given vector type is less than 64
+  /// bytes and the length is a power of 2.
+  bool isVectorTypeLegal(llvm::Type *Ty) const;
+
+  llvm::Value *createScalableBroadcast(llvm::IRBuilder<> &builder,
+                                       llvm::Value *vector, llvm::Value *VL,
+                                       ElementCount factor, bool URem) const;
+
+  Value *createVPKernelWidth(IRBuilder<> &, Value *, unsigned,
+                             ElementCount) const override;
+};
+
+// LLVM 14 introduced vp intrinsics legalization.
+bool TargetInfoRISCV::isVPVectorLegal(const llvm::Function &F,
+                                      llvm::Type *Ty) const {
+  (void)F;
+  return isVectorTypeLegal(Ty);
+}
+
+// Should be target-dependent. Take RISCV legal types for now.
+// FIXME: LLVM 14 adds better support for legalization of vp intrinsics, but
+// not RISCV ones like vrgather_vv.
+bool TargetInfoRISCV::isVectorTypeLegal(Type *Ty) const {
+  assert(Ty->isVectorTy() && "Expecting a vector type.");
+  // FIXME: VP boolean logical operators (and,or,xor) are not matched in the
+  // LLVM 13 RVV backend: we must backport https://reviews.llvm.org/D115546
+  // before we can enable this for Int1Ty as well.
+  bool isLegal = isLegalVPElementType(multi_llvm::getVectorElementType(Ty));
+  if (isLegal) {
+    const uint32_t MinSize =
+        multi_llvm::getVectorElementCount(Ty).getKnownMinValue();
+    isLegal = isPowerOf2_32(MinSize) &&
+              MinSize * Ty->getScalarSizeInBits() <= MaxLegalVectorTypeBits;
+  }
+  return isLegal;
+}
+
+std::unique_ptr<TargetInfo> createTargetInfoRISCV(TargetMachine *tm) {
+  return std::make_unique<TargetInfoRISCV>(tm);
+}
+
+} // namespace vecz
+
+bool TargetInfoRISCV::canPacketize(const llvm::Value *Val,
+                                   ElementCount Width) const {
+  // If we're not scalable, assume the backend will sort everything out.
+  if (!Width.isScalable()) {
+    return true;
+  }
+  // Do a relatively simple check that instructions aren't defining any types
+  // that can't be legalized when turned into scalable vectors.
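+  // For example (illustrative): an i128 operand would packetize to a
+  // <vscale x N x i128> value, which the RVV backend cannot legalize, so
+  // any integer wider than 64 bits is rejected below.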
+  if (!llvm::isa<llvm::Instruction>(Val)) {
+    return true;
+  }
+  const auto *I = llvm::cast<llvm::Instruction>(Val);
+
+  const auto IsIllegalIntBitwidth = [](const llvm::Type *Ty) {
+    if (!Ty->isIntOrIntVectorTy()) {
+      return false;
+    }
+    auto ScalarBitWidth =
+        llvm::cast<llvm::IntegerType>(Ty->getScalarType())->getBitWidth();
+    return ScalarBitWidth > 64;
+  };
+
+  if (IsIllegalIntBitwidth(I->getType())) {
+    return false;
+  }
+  for (auto *O : I->operand_values()) {
+    if (IsIllegalIntBitwidth(O->getType())) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/// @return Whether RISCV intrinsic @a ID is legal for types @a Tys.
+///
+/// This function does not check whether the intrinsic is being called
+/// with the right argument types, it just tests that all the types
+/// used to call the intrinsic (and its return type) are
+/// isVectorTypeLegal().
+///
+/// @param[in] ID The intrinsic ID
+/// @param[in] Tys A subset of the overloaded types of the intrinsic required to
+/// check whether it's legal.
+bool TargetInfoRISCV::isOperationLegal(llvm::Intrinsic::ID ID,
+                                       llvm::ArrayRef<llvm::Type *> Tys) const {
+  switch (ID) {
+    case Intrinsic::RISCVIntrinsics::riscv_vrgather_vv:
+    case Intrinsic::RISCVIntrinsics::riscv_vrgather_vv_mask:
+      // riscv_vrgather_vv[_mask](RetTy, _IdxTy)
+      // We only need to check the return type here, as it should be greater or
+      // equal to the index type.
+      assert(Tys.size() == 1 &&
+             "Only the return type is needed to check vrgather_vv intrinsics");
+      return isVectorTypeLegal(Tys.front());
+    case Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv:
+    case Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv_mask: {
+      constexpr unsigned MaxVectorSize = MaxLegalVectorTypeBits / 16;
+      // riscv_vrgatherei16_vv[_mask](RetTy, _IdxTy)
+      // Case similar to that of riscv_vrgather_vv[_mask], but we also need to
+      // check that the vector size is no greater than MaxLegalVectorTypeBits /
+      // 16, as the index type will always be i16.
+      assert(
+          Tys.size() == 1 &&
+          "Only the return type is needed to check vrgatherei16_vv intrinsics");
+      auto *const RetTy = Tys.front();
+      return isVectorTypeLegal(RetTy) &&
+             multi_llvm::getVectorElementCount(RetTy).getKnownMinValue() <=
+                 MaxVectorSize;
+    }
+    default:
+      break;
+  }
+  llvm_unreachable("Don't know how to check whether this intrinsic is legal.");
+}
+
+namespace {
+static unsigned getRISCVBits(const TargetMachine *TM) {
+  const auto &Triple = TM->getTargetTriple();
+  return Triple.isArch32Bit() ? 32 : 64;
+}
+
+/// @brief Get the VL to be used as a parameter of a RISCV intrinsic.
+///
+/// The type of this value will depend on the architecture (RISCV32 or
+/// RISCV64).
+///
+/// @return The VL value, zero-extended to the XLen type if required.
+///
+/// @param[in] B Builder to use when creating the VL value.
+/// @param[in] VL Original VL. If non-nullptr, this value (zero-extended for
+/// RISCV64) will be returned.
+/// @param[in] wideTy Type of the vectors which will be used in the intrinsics.
+/// If no VL is provided and `<vscale x N x T>` is used here, `vscale * N` will
+/// be returned.
+/// @param[in] TM Target machine.
+/// @param[in] N Name of the instruction to generate. "xlen" by default.
+llvm::Value *getIntrinsicVL(llvm::IRBuilderBase &B, llvm::Value *VL,
+                            llvm::Type *wideTy, llvm::TargetMachine *TM,
+                            const Twine &N = "xlen") {
+  const unsigned XLenTyWidth = getRISCVBits(TM);
+  Type *XLen = B.getIntNTy(XLenTyWidth);
+
+  if (VL) {
+    // Our incoming VP VL type is always i32, so zero-extend to 64 bits if
+    // required.
+    return XLenTyWidth == 32 ? VL : B.CreateZExt(VL, XLen, N);
+  }
+
+  // Else create a 'default' VL which covers the entire scalable vector.
+  return B.CreateElementCount(
+      XLen, cast<ScalableVectorType>(wideTy)->getElementCount());
+}
+
+/// @brief Returns a pair with the `vrgather` intrinsic variation to use and the
+/// bitwidth of the `vs1` parameter to this intrinsic.
+///
+/// @param[in] vs2Ty Type of the source vector.
+/// @param[in] isMasked Whether the intrinsic should be masked.
+std::pair<Intrinsic::RISCVIntrinsics, unsigned>
+getGatherIntrinsic(llvm::Type *vs2Ty, bool isMasked = false) {
+  assert(!vs2Ty->isPtrOrPtrVectorTy() &&
+         "Cannot get gather intrinsic for a vector of pointers");
+
+  Intrinsic::RISCVIntrinsics Opc;
+  auto *vecTy = multi_llvm::getVectorElementType(vs2Ty);
+  unsigned vs1Width;
+  if (vecTy->isIntegerTy() && vecTy->getIntegerBitWidth() == 8) {
+    Opc = isMasked ? Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv_mask
+                   : Intrinsic::RISCVIntrinsics::riscv_vrgatherei16_vv;
+
+    vs1Width = 16;
+  } else {
+    Opc = isMasked ? Intrinsic::RISCVIntrinsics::riscv_vrgather_vv_mask
+                   : Intrinsic::RISCVIntrinsics::riscv_vrgather_vv;
+
+    vs1Width = vecTy->getScalarSizeInBits();
+  }
+  return std::make_pair(Opc, vs1Width);
+}
+
+/// @brief Returns the `v?slide1up.v?` intrinsic variation to use.
+///
+/// @param[in] vs2Ty Type of the source vector.
+llvm::Intrinsic::RISCVIntrinsics getSlideUpIntrinsic(llvm::Type *vs2Ty) {
+  assert(!vs2Ty->isPtrOrPtrVectorTy() &&
+         "Cannot get slide1up intrinsic for a vector of pointers");
+
+  Intrinsic::RISCVIntrinsics Opc;
+  auto *vecTy = multi_llvm::getVectorElementType(vs2Ty);
+  if (vecTy->isFloatingPointTy()) {
+    Opc = Intrinsic::RISCVIntrinsics::riscv_vfslide1up;
+  } else {
+    Opc = Intrinsic::RISCVIntrinsics::riscv_vslide1up;
+  }
+  return Opc;
+}
+
+} // namespace
+
+llvm::Value *TargetInfoRISCV::createScalableExtractElement(
+    llvm::IRBuilder<> &B, vecz::VectorizationContext &Ctx,
+    llvm::Instruction *origExtract, llvm::Type *narrowTy, llvm::Value *src,
+    llvm::Value *index, llvm::Value *VL) const {
+  // In RISCV, we can use vrgather_vv and vrgatherei16_vv to avoid going through
+  // memory when creating this operation.
+  // vrgather: vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]];
+  //     or,
+  // vrgather: res[i] = (idxs[i] >= VLMAX) ? 0 : src[idxs[i]];
+  // An example: extractelement <4 x i32>, I - vectorized by <vscale x 1 x i32>
+  // - we receive here as packetized arguments:
+  //   src:  <vscale x 4 x i32> (the packetized <4 x i32> source vectors)
+  //   idxs: <vscale x 1 x i32> (the packetized indices <I, J, K, ...>)
+  // We want to construct operands such that we have:
+  //   srcs: as before
+  //   idxs: <I, J+4, K+8, ...> (<vscale x 1 x i32>)
+  // So that vrgather extracts the Ith element from the first 4 elements, the
+  // Jth element from the second 4, etc.
+  auto *srcTy = cast<VectorType>(src->getType());
+
+  Intrinsic::ID intrinsicID;
+  unsigned intrIdxBitWidth;
+  std::tie(intrinsicID, intrIdxBitWidth) = getGatherIntrinsic(srcTy);
+
+  const auto srcEC = multi_llvm::getVectorElementCount(srcTy);
+  const auto resEC = multi_llvm::getVectorElementCount(narrowTy);
+
+  auto *const indexEltTy = B.getIntNTy(intrIdxBitWidth);
+  Type *const indexVecTy = VectorType::get(indexEltTy, resEC);
+
+  // We cannot use this optimization if the types are not legal in the target
+  // machine.
+  if (!isOperationLegal(intrinsicID, {srcTy})) {
+    return TargetInfo::createScalableExtractElement(B, Ctx, origExtract,
+                                                    narrowTy, src, index, VL);
+  }
+
+  auto *const avl = getIntrinsicVL(B, VL, narrowTy, getTargetMachine());
+
+  auto *indexTy = index->getType();
+  const bool isIdxVector = indexTy->isVectorTy();
+  const unsigned idxBitWidth = indexTy->getScalarSizeInBits();
+
+  // The intrinsic may demand a larger index type than we currently have;
+  // extend up to the right type.
+  if (idxBitWidth != intrIdxBitWidth) {
+    index = B.CreateZExtOrTrunc(index, isIdxVector ? indexVecTy : indexEltTy);
+  }
+
+  // If the index is uniform, it may not be a vector. We need one for the
+  // intrinsic, so splat it here.
+  if (!isIdxVector) {
+    index = B.CreateVectorSplat(resEC, index);
+  }
+
+  // Construct the indices such that each packetized index (still indexing into
+  // the original vector of 4 elements) is spread out such that each index
+  // indexes into its own 4-element slice: e.g., <I, J+4, K+8, ...>.
+  auto *indices = getGatherIndicesVector(
+      B, index, indexVecTy,
+      multi_llvm::getVectorNumElements(origExtract->getOperand(0)->getType()),
+      "vs1");
+
+  auto *const zero = B.getInt64(0);
+
+  // Our indices are still in the narrower vectorized type (e.g.,
+  // <vscale x 1 x i16>), but the vrgather intrinsics need equally-sized vector
+  // types. So insert the indices into a wide dummy vector (e.g.,
+  // <vscale x 4 x i16>), perform the vrgather, and extract the subvector back
+  // out again.
+  auto *const intrIndexTy = VectorType::get(indexEltTy, srcEC);
+  indices = B.CreateInsertVector(intrIndexTy, PoisonValue::get(intrIndexTy),
+                                 indices, zero);
+
+  SmallVector<Value *, 4> ops;
+  // Add the pass-through operand - we set it to poison.
+  ops.push_back(PoisonValue::get(srcTy));
+  ops.push_back(src);
+  ops.push_back(indices);
+  ops.push_back(avl);
+
+  auto *const gather =
+      B.CreateIntrinsic(intrinsicID, {srcTy, avl->getType()}, ops);
+
+  return B.CreateExtractVector(narrowTy, gather, zero);
+}
+
+llvm::Value *TargetInfoRISCV::createScalableBroadcast(llvm::IRBuilder<> &B,
+                                                      llvm::Value *vector,
+                                                      llvm::Value *VL,
+                                                      ElementCount factor,
+                                                      bool URem) const {
+  // Using the RVV instruction:
+  //   vrgather.vv vd, vs2, vs1, vm s.t.
+  //   vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]
+
+  auto *vectorTy = vector->getType();
+  auto *const origElTy = multi_llvm::getVectorElementType(vectorTy);
+
+  // We first check we are not broadcasting a vector of pointers, which is
+  // unsupported by the intrinsic.
+  const bool isVectorOfPointers = origElTy->isPtrOrPtrVectorTy();
+  if (isVectorOfPointers) {
+    vectorTy = VectorType::get(B.getIntNTy(getRISCVBits(getTargetMachine())),
+                               multi_llvm::getVectorElementCount(vectorTy));
+  }
+
+  auto *const wideTy = ScalableVectorType::get(
+      multi_llvm::getVectorElementType(vectorTy),
+      factor.getKnownMinValue() *
+          multi_llvm::getVectorElementCount(vectorTy).getKnownMinValue());
+
+  Intrinsic::RISCVIntrinsics intrinsicID;
+  unsigned vs1Width;
+  std::tie(intrinsicID, vs1Width) = getGatherIntrinsic(wideTy);
+  auto *const vs1ElTy = B.getIntNTy(vs1Width);
+
+  // We cannot use this optimization if the types are not legal in the target
+  // machine.
+  if (!isOperationLegal(intrinsicID, {wideTy})) {
+    return URem
+               ? TargetInfo::createOuterScalableBroadcast(B, vector, VL, factor)
+               : TargetInfo::createInnerScalableBroadcast(B, vector, VL,
+                                                          factor);
+  }
+
+  // Cast the vector of pointers to a vector of integers if needed.
+  if (isVectorOfPointers) {
+    vector = B.CreatePtrToInt(vector, vectorTy);
+  }
+
+  // We grow the fixed vector to consume an entire RVV register.
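+  // Illustrative: broadcasting a fixed <4 x i32> by a factor of vscale x 2
+  // gives wideTy = <vscale x 8 x i32>; the llvm.vector.insert underlying
+  // CreateInsertVector places the fixed vector in lanes [0, 4), leaving the
+  // remaining lanes poison (only the first VL lanes are read by vrgather).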
+  auto *const vs2 = B.CreateInsertVector(wideTy, PoisonValue::get(wideTy),
+                                         vector, B.getInt64(0), "vs2");
+
+  auto *const vs1 = createBroadcastIndexVector(
+      B, VectorType::get(vs1ElTy, wideTy), factor, URem, "vs1");
+
+  auto *const avl = getIntrinsicVL(B, VL, wideTy, getTargetMachine());
+
+  SmallVector<Value *, 4> ops;
+  // Add the pass-through operand - we set it to poison.
+  ops.push_back(PoisonValue::get(vs2->getType()));
+  ops.push_back(vs2);
+  ops.push_back(vs1);
+  ops.push_back(avl);
+
+  Value *gather =
+      B.CreateIntrinsic(intrinsicID, {vs2->getType(), avl->getType()}, ops);
+
+  // If we had to cast the vector before, we do the reverse operation
+  // on the result.
+  if (isVectorOfPointers) {
+    gather = B.CreateIntToPtr(gather, VectorType::get(origElTy, wideTy));
+  }
+
+  return gather;
+}
+
+static CallInst *createRISCVMaskedIntrinsic(IRBuilder<> &B, Intrinsic::ID ID,
+                                            ArrayRef<Type *> Types,
+                                            ArrayRef<Value *> Args,
+                                            unsigned TailPolicy,
+                                            Instruction *FMFSource = nullptr,
+                                            const Twine &Name = "") {
+  SmallVector<Value *, 8> InArgs(Args.begin(), Args.end());
+  InArgs.push_back(
+      B.getIntN(Args.back()->getType()->getIntegerBitWidth(), TailPolicy));
+  return B.CreateIntrinsic(ID, Types, InArgs, FMFSource, Name);
+}
+
+llvm::Value *TargetInfoRISCV::createScalableInsertElement(
+    llvm::IRBuilder<> &B, vecz::VectorizationContext &Ctx,
+    llvm::Instruction *origInsert, llvm::Value *elt, llvm::Value *into,
+    llvm::Value *index, llvm::Value *VL) const {
+  // In RISCV, we can use vrgather_vv and vrgatherei16_vv to avoid going through
+  // memory when creating this operation.
+  // vrgather: vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]];
+  //     or,
+  // vrgather: res[i] = (idxs[i] >= VLMAX) ? 0 : src[idxs[i]];
+  // An example: insertelement <4 x i8>, X, I - vectorized by <vscale x 1 x i8>
+  // - we receive here as packetized arguments:
+  //   into: <vscale x 4 x i8>  (the packetized <4 x i8> vectors)
+  //   elt:  <vscale x 1 x i8>  (the packetized values <X, Y, Z, ...>)
+  //   idxs: <vscale x 1 x i32> (the packetized indices <I, J, K, ...>)
+  // We want to construct operands such that we have:
+  //   into: as before
+  //   elt:  <X, Y, Z, ...> widened to <vscale x 4 x i8>
+  //   mask: true where the elts indices are to be inserted according to the
+  //         indices, e.g.,
+  //         <0,1,0,0, 0,0,0,1, 1,0,0,0, ...>    (<vscale x 4 x i1>)
+  //   idxs: <0,I,0,0, 0,0,0,J+4, K+8,...>       (<vscale x 4 x i32>)
+  // So that vrgather inserts X into the Ith element of the first 4 elements, Y
+  // into the Jth element of the second 4, etc:
+  //   res:  <?,X,?,?, ?,?,?,Y, Z,?,?,?, ...>
+  // If instead we use a masked vrgather with the same mask as before and with
+  // a merge operand of 'into', we expect the blended operation to be correct:
+  //   res:  <a,X,c,d, e,f,g,Y, Z,j,k,l, ...>
+  auto *const eltTy = elt->getType();
+  auto *const intoTy = into->getType();
+
+  Intrinsic::ID intrinsicID;
+  unsigned intrIdxBitWidth;
+  std::tie(intrinsicID, intrIdxBitWidth) =
+      getGatherIntrinsic(intoTy, /*isMasked*/ true);
+
+  const auto eltEC = multi_llvm::getVectorElementCount(eltTy);
+  const auto intoEC = multi_llvm::getVectorElementCount(intoTy);
+  const auto fixedAmt =
+      multi_llvm::getVectorElementCount(origInsert->getType());
+  assert(!fixedAmt.isScalable() && "Scalable pre-packetized value?");
+
+  auto *indexEltTy = B.getIntNTy(intrIdxBitWidth);
+  Type *const indexVecTy = VectorType::get(indexEltTy, eltEC);
+
+  // We cannot use this optimization if the types are not legal in the target
+  // machine.
+  if (!isOperationLegal(intrinsicID, {intoTy})) {
+    return TargetInfo::createScalableInsertElement(B, Ctx, origInsert, elt,
+                                                   into, index, VL);
+  }
+
+  auto *const avl = getIntrinsicVL(B, VL, intoTy, getTargetMachine());
+
+  auto *const indexTy = index->getType();
+  const unsigned idxBitWidth = indexTy->getScalarSizeInBits();
+  const bool indexIsVector = indexTy->isVectorTy();
+
+  // The intrinsic may demand a larger index type than we currently have;
+  // extend up to the right type.
+  if (idxBitWidth != intrIdxBitWidth) {
+    index = B.CreateZExtOrTrunc(index, indexIsVector ? indexVecTy : indexEltTy);
+  }
+
+  // If the index is uniform, it may not be a vector. We need one for the
+  // intrinsic, so splat it here.
+  if (!indexIsVector) {
+    index = B.CreateVectorSplat(intoEC, index);
+  } else {
+    index = createInnerScalableBroadcast(B, index, VL, fixedAmt);
+  }
+
+  auto *const zero = B.getInt64(0);
+
+  auto *const intrEltTy =
+      VectorType::get(multi_llvm::getVectorElementType(elt->getType()), intoEC);
+  elt = B.CreateInsertVector(intrEltTy, PoisonValue::get(intrEltTy), elt, zero,
+                             "vs2");
+
+  auto *steps = B.CreateStepVector(VectorType::get(indexEltTy, intoEC));
+
+  // Create our inner indices, e.g.: <0,1,2,3, 0,1,2,3, 0,1,2,3, ...>
+  auto *const innerIndices = B.CreateURem(
+      steps,
+      ConstantVector::getSplat(
+          intoEC, ConstantInt::get(indexEltTy, fixedAmt.getFixedValue())));
+
+  // Create our outer indices, e.g., <0,0,0,0, 1,1,1,1, 2,2,2,2, ...>
+  auto *const outerIndices = B.CreateUDiv(
+      steps,
+      ConstantVector::getSplat(
+          intoEC, ConstantInt::get(indexEltTy, fixedAmt.getFixedValue())));
+
+  // Now compare the insert indices with the inner index vector: only one per
+  // N-element slice will be 'on', depending on the exact indices, e.g., if we
+  // originally have:
+  //   <1,3,0, ...>
+  // we have prepared it when constructing the indices:
+  //      <1,1,1,1, 3,3,3,3, 0,0,0,0, ...>
+  //   == <0,1,2,3, 0,1,2,3, 0,1,2,3, ...>
+  //   -> <0,1,0,0, 0,0,0,1, 1,0,0,0, ...>
+  auto *const mask = B.CreateICmpEQ(index, innerIndices, "vm");
+
+  return createRISCVMaskedIntrinsic(B, intrinsicID, {intoTy, avl->getType()},
+                                    {into, elt, outerIndices, mask, avl},
+                                    /*TailUndisturbed*/ 1);
+}
+
+llvm::Value *TargetInfoRISCV::createVectorShuffle(llvm::IRBuilder<> &B,
+                                                  llvm::Value *src,
+                                                  llvm::Value *mask,
+                                                  llvm::Value *VL) const {
+  // In RISCV, we can use vrgather_vv and vrgatherei16_vv to avoid going through
+  // memory when creating this operation.
+  assert(isa<VectorType>(src->getType()) &&
+         "TargetInfoRISCV::createVectorShuffle: source must have vector type");
+  assert(isa<VectorType>(mask->getType()) &&
+         "TargetInfoRISCV::createVectorShuffle: mask must have vector type");
+
+  auto *const srcTy = cast<VectorType>(src->getType());
+  if (isa<Constant>(mask)) {
+    // Special case if the mask happens to be a constant.
+    return B.CreateShuffleVector(src, PoisonValue::get(srcTy), mask);
+  }
+
+  if (isa<FixedVectorType>(srcTy)) {
+    // The gather intrinsics don't work with fixed vectors.
+    return TargetInfo::createVectorShuffle(B, src, mask, VL);
+  }
+
+  auto *const maskTy = cast<VectorType>(mask->getType());
+  const auto srcEC = multi_llvm::getVectorElementCount(srcTy);
+  const auto resEC = multi_llvm::getVectorElementCount(maskTy);
+
+  auto *const resTy = VectorType::get(srcTy->getElementType(), resEC);
+
+  // We can't create the intrinsics with a scalar size smaller than 8 bits, so
+  // extend it to i8, perform the shuffle, and truncate the result back.
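+  // e.g. (illustrative): an i1 predicate vector is widened to
+  // <vscale x N x i8>, shuffled via vrgather, and truncated back to i1;
+  // the zext/trunc round trip preserves the 0/1 lane values exactly.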
+  if (srcTy->getScalarSizeInBits() < 8) {
+    auto *const fix = B.CreateZExt(src, VectorType::get(B.getInt8Ty(), srcEC));
+    auto *const res = createVectorShuffle(B, fix, mask, VL);
+    return B.CreateTrunc(res, resTy);
+  }
+
+  Intrinsic::ID intrinsicID;
+  unsigned intrIdxBitWidth;
+  std::tie(intrinsicID, intrIdxBitWidth) = getGatherIntrinsic(srcTy);
+
+  auto *const indexEltTy = B.getIntNTy(intrIdxBitWidth);
+  auto *const indexVecTy = VectorType::get(indexEltTy, resEC);
+
+  // We cannot use this optimization if the types are not legal in the target
+  // machine.
+  if (!isOperationLegal(intrinsicID, {srcTy})) {
+    return TargetInfo::createVectorShuffle(B, src, mask, VL);
+  }
+
+  // The intrinsic may demand a larger index type than we currently have;
+  // extend up to the right type.
+  if (indexVecTy != maskTy) {
+    mask = B.CreateZExtOrTrunc(mask, indexVecTy);
+  }
+
+  auto *const zero = B.getInt64(0);
+
+  const bool same = (resEC == srcEC);
+  const bool narrow = !same && (srcEC.isScalable() || !resEC.isScalable()) &&
+                      resEC.getKnownMinValue() <= srcEC.getKnownMinValue();
+  const bool widen = !same && (resEC.isScalable() || !srcEC.isScalable()) &&
+                     srcEC.getKnownMinValue() <= resEC.getKnownMinValue();
+
+  assert((srcTy == resTy || narrow || widen) &&
+         "TargetInfoRISCV::createVectorShuffle: "
+         "unexpected combination of source and mask vector types");
+
+  auto *gatherTy = resTy;
+  if (narrow) {
+    // The vrgather intrinsics need equally-sized vector types. So insert the
+    // indices into a wide dummy vector (e.g., <vscale x 4 x i16>), perform the
+    // vrgather, and extract the subvector back out again.
+    auto *const wideMaskTy = VectorType::get(indexEltTy, srcEC);
+    mask = B.CreateInsertVector(wideMaskTy, PoisonValue::get(wideMaskTy), mask,
+                                zero);
+    gatherTy = srcTy;
+  } else if (widen) {
+    // The result is wider than the source, so insert the source vector into a
+    // wider vector first.
+    src = B.CreateInsertVector(resTy, PoisonValue::get(resTy), src, zero);
+  }
+
+  auto *const avl = getIntrinsicVL(B, VL, gatherTy, getTargetMachine());
+
+  SmallVector<Value *, 4> ops;
+  // Add the pass-through operand - we set it to poison.
+  ops.push_back(PoisonValue::get(gatherTy));
+  ops.push_back(src);
+  ops.push_back(mask);
+  ops.push_back(avl);
+
+  auto *const gather =
+      B.CreateIntrinsic(intrinsicID, {gatherTy, avl->getType()}, ops);
+
+  if (narrow) {
+    return B.CreateExtractVector(resTy, gather, zero);
+  }
+  return gather;
+}
+
+llvm::Value *TargetInfoRISCV::createVectorSlideUp(llvm::IRBuilder<> &B,
+                                                  llvm::Value *src,
+                                                  llvm::Value *insert,
+                                                  llvm::Value *VL) const {
+  auto *const srcTy = dyn_cast<VectorType>(src->getType());
+  assert(srcTy &&
+         "TargetInfoRISCV::createVectorSlideUp: source must have vector type");
+
+  if (isa<FixedVectorType>(srcTy)) {
+    // The slide1up intrinsics don't work with fixed vectors.
+    return TargetInfo::createVectorSlideUp(B, src, insert, VL);
+  }
+
+  const auto intrinsicID = getSlideUpIntrinsic(srcTy);
+
+  auto *const avl = getIntrinsicVL(B, VL, srcTy, getTargetMachine());
+
+  SmallVector<Value *, 4> ops;
+  // Add the pass-through operand - we set it to poison.
+  ops.push_back(PoisonValue::get(srcTy));
+  ops.push_back(src);
+  ops.push_back(insert);
+  ops.push_back(avl);
+
+  return B.CreateIntrinsic(intrinsicID,
+                           {srcTy, insert->getType(), avl->getType()}, ops);
+}
+
+// This enum was copied from the RISCV backend.
+enum VLMUL : uint8_t {
+  LMUL_1 = 0,
+  LMUL_2,
+  LMUL_4,
+  LMUL_8,
+  LMUL_RESERVED,
+  LMUL_F8,
+  LMUL_F4,
+  LMUL_F2
+};
+
+Value *TargetInfoRISCV::createVPKernelWidth(IRBuilder<> &B,
+                                            Value *RemainingIters,
+                                            unsigned WidestEltTy,
+                                            ElementCount VF) const {
+  // The widest element type can only be one of the supported legal RVV vector
+  // element types.
+  if (WidestEltTy < 8 || WidestEltTy > 64 || !isPowerOf2_32(WidestEltTy)) {
+    return nullptr;
+  }
+  const auto KnownMin = VF.getKnownMinValue();
+  // The vectorization factor must be scalable and a legal vsetvli amount: no
+  // greater than the maximum vector length for each element width:
+  // nxv64i8, nxv32i16, nxv16i32, nxv8i64.
+  if (!VF.isScalable() || !isPowerOf2_32(KnownMin) ||
+      KnownMin > MaxLegalVectorTypeBits / WidestEltTy) {
+    return nullptr;
+  }
+
+  unsigned LMUL = 0;
+  const unsigned MaxLegalElementWidth = 64;
+
+  if ((WidestEltTy * KnownMin) / MaxLegalElementWidth) {
+    // Non-fractional LMULs.
+    LMUL = Log2_64((WidestEltTy * KnownMin) / MaxLegalElementWidth);
+  } else {
+    // Fractional LMULs.
+    const auto Fraction = MaxLegalElementWidth / (WidestEltTy * KnownMin);
+    if (Fraction == 2) {
+      LMUL = LMUL_F2;
+    } else if (Fraction == 4) {
+      LMUL = LMUL_F4;
+    } else if (Fraction == 8) {
+      LMUL = LMUL_F8;
+    } else {
+      return nullptr;
+    }
+  }
+
+  auto *const VLMul = B.getInt64(LMUL);
+  auto *const VSEW = B.getInt64(Log2_64(WidestEltTy) - 3);
+
+  auto *const I32Ty = Type::getInt32Ty(B.getContext());
+  auto *const I64Ty = Type::getInt64Ty(B.getContext());
+
+  auto *const VL = B.CreateIntrinsic(Intrinsic::RISCVIntrinsics::riscv_vsetvli,
+                                     {I64Ty}, {RemainingIters, VSEW, VLMul});
+
+  return B.CreateTrunc(VL, I32Ty);
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
new file mode 100644
index 0000000000000..0c16da1c10106
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_choices.cpp
@@ -0,0 +1,172 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include <compiler/utils/mangling.h>
+#include <llvm/ADT/ArrayRef.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/StringSwitch.h>
+#include <llvm/Support/raw_ostream.h>
+
+#include "vecz/vecz_choices.h"
+
+using namespace llvm;
+
+namespace {
+using namespace vecz;
+static const VectorizationChoices::ChoiceInfo choicesArray[] = {
+    {"PacketizeUniform", VectorizationChoices::eOptimizationPacketizeUniform,
+     "Packetizes all packetizable instructions whether they are varying or "
+     "not."},
+
+    {"PacketizeUniformInLoops",
+     VectorizationChoices::eOptimizationPacketizeUniformInLoops,
+     "Packetizes all packetizable instructions in loops, whether they are "
+     "varying or not."},
+
+    {"InstantiateCallsInLoops",
+     VectorizationChoices::eOptimizationInstantiateCallsInLoops,
+     "Uses loops to instantiate call instructions, instead of duplication."},
+
+    {"LinearizeBOSCC", VectorizationChoices::eLinearizeBOSCC,
+     "Control Flow Conversion uses Branch On Superword Condition Code."},
+
+    {"FullScalarization", VectorizationChoices::eFullScalarization,
+     "The scalarization pass scalarizes everything it can, regardless of any "
+     "performance benefit."},
+
+    {"DivisionExceptions", VectorizationChoices::eDivisionExceptions,
+     "Specify this when the target throws hardware exceptions on integer "
+     "division by zero."},
+
+    {"VectorPredication", VectorizationChoices::eVectorPredication,
+     "Generate a vector-predicated kernel safe to run on any workgroup size, "
+     "even those smaller than the vectorization width."},
+
+    {"TargetIndependentPacketization",
+     VectorizationChoices::eTargetIndependentPacketization,
+     "Force target-independent packetization choices (e.g., for testing "
+     "purposes)."},
+};
+
+} // namespace
+
+namespace vecz {
+
+VectorizationChoices::VectorizationChoices() {}
+
+bool VectorizationChoices::parseChoicesString(StringRef Str) {
+  // If the string is empty, our work here is done.
+  if (Str.empty()) {
+    return true;
+  }
+
+  // first = Choice, second = enable
+  using ChoiceValuePair = std::pair<Choice, bool>;
+  // The lexer implementation from the name mangling module is fairly generic,
+  // so we will use it here.
+  compiler::utils::Lexer L(Str);
+  // We support multiple separators in case of platform-dependent issues.
+  const StringRef Separators = ":;,";
+  // All the parsed choices will be stored in a set and will only be
+  // enabled/disabled after the parsing has been completed successfully.
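+  // An example input (illustrative):
+  //   CODEPLAY_VECZ_CHOICES="LinearizeBOSCC;noFullScalarization"
+  // enables LinearizeBOSCC and explicitly disables FullScalarization.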
+  SmallVector<ChoiceValuePair> ParsedChoices;
+
+  // Start by lexing and parsing the Choices string
+
+  bool read_separator = false;
+  do {
+    StringRef ParsedChoice;
+    // Strip any leading whitespace
+    L.ConsumeWhitespace();
+    // If we have reached the end of the string, we are done
+    if (L.Left() == 0) {
+      break;
+    }
+    // Consume the optional "no" prefix, which disables the given Choice
+    const bool disable = L.Consume("no");
+    // Consume the Choice name
+    if (L.ConsumeAlphanumeric(ParsedChoice)) {
+      // Convert the string to a Choice value
+      const Choice C = fromString(ParsedChoice);
+      if (C == eInvalid) {
+        printChoicesParseError(Str, L.CurrentPos() - ParsedChoice.size(),
+                               "Invalid Choice \"" + ParsedChoice + "\"");
+        return false;
+      }
+      ParsedChoices.push_back(std::make_pair(C, !disable));
+    } else {
+      printChoicesParseError(Str, L.CurrentPos(), "Expected Choice");
+      return false;
+    }
+    // Strip any trailing whitespace
+    L.ConsumeWhitespace();
+    // Consume the separator (if any)
+    read_separator = false;
+    auto Current = L.Current();
+    if (Current != -1 && Separators.contains(char(Current))) {
+      L.Consume(1);
+      read_separator = true;
+    }
+  } while (read_separator && L.Left() > 0);
+
+  // If there is any string left, there must be some kind of mistake
+  if (L.Left() != 0) {
+    printChoicesParseError(Str, L.CurrentPos(), "Expected ';'");
+    return false;
+  }
+
+  // Set all the choices parsed in the loop
+
+  for (auto C : ParsedChoices) {
+    if (C.second) {
+      enable(C.first);
+    } else {
+      disable(C.first);
+    }
+  }
+
+  // We have finished successfully
+
+  return true;
+}
+
+VectorizationChoices::Choice VectorizationChoices::fromString(StringRef Str) {
+  auto Choose = StringSwitch<Choice>(Str);
+  for (const auto &info : ArrayRef(choicesArray)) {
+    Choose.Case(info.name, info.number);
+  }
+  return Choose.Default(eInvalid);
+}
+
+ArrayRef<VectorizationChoices::ChoiceInfo>
+VectorizationChoices::queryAvailableChoices() {
+  return ArrayRef(choicesArray);
+}
+
+void VectorizationChoices::printChoicesParseError(StringRef Input,
+                                                  unsigned Position,
+                                                  Twine Msg) {
+  errs() << "CODEPLAY_VECZ_CHOICES parsing error: " << Msg << " at position "
+         << Position << "\n";
+  errs() << " " << Input << "\n ";
+  // We use the range [1, Position) instead of [0, Position - 1) to avoid
+  // an underflow in the case of Position = 0
+  for (unsigned i = 1; i < Position; ++i) {
+    errs() << ' ';
+  }
+  errs() << "^\n";
+}
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
new file mode 100644
index 0000000000000..a90ce8d767048
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_context.cpp
@@ -0,0 +1,1283 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "vectorization_context.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "llvm_helpers.h" +#include "memory_operations.h" +#include "transform/packetization_helpers.h" +#include "vectorization_helpers.h" +#include "vectorization_unit.h" +#include "vecz/vecz_choices.h" +#include "vecz/vecz_target_info.h" + +#define DEBUG_TYPE "vecz" + +using namespace llvm; +using namespace vecz; + +STATISTIC(VeczContextFailBuiltin, + "Context: builtins with no vector equivalent [ID#V84]"); +STATISTIC(VeczContextFailScalarizeCall, + "Context: non-scalarizable vector builtin [ID#V86]"); + +/// @brief Prefix used to distinguish internal vecz builtins from OpenCL +/// builtins and user functions. +const char *VectorizationContext::InternalBuiltinPrefix = "__vecz_b_"; + +VectorizationContext::VectorizationContext(llvm::Module &target, + TargetInfo &vti, + compiler::utils::BuiltinInfo &bi) + : VTI(vti), Module(target), BI(bi), DL(&Module.getDataLayout()) {} + +TargetTransformInfo +VectorizationContext::getTargetTransformInfo(Function &F) const { + auto *const TM = targetInfo().getTargetMachine(); + if (TM) { + return TM->getTargetTransformInfo(F); + } else { + return TargetTransformInfo(F.getParent()->getDataLayout()); + } +} + +VectorizationUnit *VectorizationContext::getActiveVU(const Function *F) const { + const auto I = ActiveVUs.find(F); + if (I == ActiveVUs.end()) { + return nullptr; + } + VectorizationUnit *VU = I->second; + assert(VU->vectorizedFunction() == F); + return VU; +} + +compiler::utils::BuiltinInfo &VectorizationContext::builtins() { return BI; } + +const compiler::utils::BuiltinInfo &VectorizationContext::builtins() const { + return BI; +} + +VectorizationUnit *VectorizationContext::createVectorizationUnit( + llvm::Function &F, ElementCount VF, unsigned Dimension, + const VectorizationChoices &Ch) { + KernelUnits.push_back( + std::make_unique(F, VF, Dimension, *this, Ch)); + return KernelUnits.back().get(); +} + +bool VectorizationContext::isVector(const Instruction &I) { + if (I.getType()->isVectorTy()) { + return true; + } + for (const Use &op : I.operands()) { + if (op->getType()->isVectorTy()) { + return true; + } + } + return false; +} + +bool VectorizationContext::canExpandBuiltin(const Function *ScalarFn) const { + // Builtins that return no value must have side-effects. + if (ScalarFn->getReturnType()->isVoidTy()) { + return false; + } + for (const Argument &Arg : ScalarFn->args()) { + // Most builtins that take pointers have side-effects. Be conservative. + if (Arg.getType()->isPointerTy()) { + return false; + } + } + return true; +} + +VectorizationResult & +VectorizationContext::getOrCreateBuiltin(llvm::Function &F, + unsigned SimdWidth) { + compiler::utils::BuiltinInfo &BI = builtins(); + const auto Cached = VectorizedBuiltins.find(&F); + if (Cached != VectorizedBuiltins.end()) { + const auto Found = Cached->second.find(SimdWidth); + if (Found != Cached->second.end()) { + return Found->second; + } + } + + auto &result = VectorizedBuiltins[&F][SimdWidth]; + + const auto Builtin = BI.analyzeBuiltin(F); + if (!Builtin) { + ++VeczContextFailBuiltin; + return result; + } + + // Try to find a vector equivalent for the builtin. + Function *const VectorCallee = + isInternalBuiltin(&F) + ? 
getInternalVectorEquivalent(&F, SimdWidth)
+          : BI.getVectorEquivalent(*Builtin, SimdWidth, &Module);
+
+  if (!VectorCallee) {
+    ++VeczContextFailBuiltin;
+    return result;
+  }
+
+  result.func = VectorCallee;
+
+  // Gather information about the function's arguments.
+  const auto Props = Builtin->properties;
+  unsigned i = 0;
+  for (const Argument &Arg : F.args()) {
+    Type *pointerRetPointeeTy = nullptr;
+    VectorizationResult::Arg::Kind kind = VectorizationResult::Arg::SCALAR;
+
+    if (Arg.getType()->isPointerTy()) {
+      pointerRetPointeeTy =
+          compiler::utils::getPointerReturnPointeeTy(F, Props);
+      kind = VectorizationResult::Arg::POINTER_RETURN;
+    } else {
+      kind = VectorizationResult::Arg::VECTORIZED;
+    }
+    result.args.emplace_back(kind, VectorCallee->getArg(i)->getType(),
+                             pointerRetPointeeTy);
+    i++;
+  }
+  return result;
+}
+
+VectorizationResult
+VectorizationContext::getVectorizedFunction(Function &callee,
+                                            ElementCount factor) {
+  VectorizationResult result;
+  if (factor.isScalable()) {
+    // We can't vectorize builtins by a scalable factor yet.
+    return result;
+  }
+
+  auto simdWidth = factor.getFixedValue();
+  if (auto *vecTy = dyn_cast<FixedVectorType>(callee.getReturnType())) {
+    Function *scalarEquiv = nullptr;
+    if (const auto Builtin = BI.analyzeBuiltin(callee)) {
+      scalarEquiv = builtins().getScalarEquivalent(*Builtin, &Module);
+    }
+    if (!scalarEquiv) {
+      ++VeczContextFailScalarizeCall;
+      return VectorizationResult();
+    }
+
+    auto scalarWidth = vecTy->getNumElements();
+
+    result = getOrCreateBuiltin(*scalarEquiv, simdWidth * scalarWidth);
+  } else {
+    result = getOrCreateBuiltin(callee, simdWidth);
+  }
+  return result;
+}
+
+bool VectorizationContext::isInternalBuiltin(const Function *F) {
+  return F->getName().starts_with(VectorizationContext::InternalBuiltinPrefix);
+}
+
+Function *VectorizationContext::getOrCreateInternalBuiltin(StringRef Name,
+                                                           FunctionType *FT) {
+  Function *F = Module.getFunction(Name);
+  if (!F && FT) {
+    F = dyn_cast_or_null<Function>(
+        Module.getOrInsertFunction(Name, FT).getCallee());
+    if (F) {
+      // Set some default attributes on the function.
+      // We never use exceptions
+      F->addFnAttr(Attribute::NoUnwind);
+      // Recursion is not supported in ComputeMux
+      F->addFnAttr(Attribute::NoRecurse);
+    }
+  }
+
+  return F;
+}
+
+Function *VectorizationContext::getOrCreateMaskedFunction(CallInst *CI) {
+  Function *F = CI->getCalledFunction();
+  if (!F) {
+    F = dyn_cast<Function>(CI->getCalledOperand()->stripPointerCasts());
+  }
+  VECZ_FAIL_IF(!F);  // TODO: Support indirect function calls.
+  LLVMContext &ctx = F->getContext();
+
+  // We will handle printf statements, but handling every possible vararg
+  // function can become too complex, among other things because name mangling
+  // with arbitrary types is non-trivial. printf is the only vararg OpenCL
+  // builtin, so only user functions are affected by this.
+  const bool isVarArg = F->isVarArg();
+  VECZ_FAIL_IF(isVarArg && F->getName() != "printf");
+  // Copy the argument types. This is done from the CallInst instead of the
+  // called Function because the called Function might be a VarArg function, in
+  // which case we need to create the wrapper with the expanded argument list.
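+  // The masked wrapper takes the original call's arguments plus a trailing
+  // i1 mask; when the mask is false, the wrapped call is skipped and a
+  // default value is returned instead (see the blocks created below).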
+  SmallVector<Type *> argTys;
+  for (const auto &U : CI->args()) {
+    argTys.push_back(U->getType());
+  }
+  AttributeList fnAttrs = F->getAttributes();
+  unsigned firstImmArg;
+  const bool hasImmArg =
+      F->isIntrinsic() &&
+      fnAttrs.hasAttrSomewhere(Attribute::ImmArg, &firstImmArg);
+  if (hasImmArg) {
+    firstImmArg -= AttributeList::FirstArgIndex;
+    // We can only handle a single `i1` `ImmArg` parameter. If we outgrow this
+    // limitation we need a different approach to the single inner branch.
+    int count = 0;
+    for (unsigned i = firstImmArg, n = argTys.size(); i < n; ++i) {
+      if (!fnAttrs.hasAttributeAtIndex(AttributeList::FirstArgIndex + i,
+                                       Attribute::ImmArg)) {
+        continue;
+      }
+      // We only support a single ImmArg, and it must be of i1 type
+      if (count++ || argTys[i] != Type::getInt1Ty(ctx)) {
+        return nullptr;
+      }
+      fnAttrs = fnAttrs.removeAttributeAtIndex(
+          ctx, AttributeList::FirstArgIndex + i, Attribute::ImmArg);
+    }
+  }
+  // Add one extra argument for the mask
+  argTys.push_back(Type::getInt1Ty(ctx));
+  // Generate the function name
+  compiler::utils::NameMangler mangler(&ctx);
+  const SmallVector<compiler::utils::TypeQualifiers> quals(
+      argTys.size(), compiler::utils::TypeQualifiers());
+  std::string newFName;
+  raw_string_ostream O(newFName);
+  O << VectorizationContext::InternalBuiltinPrefix << "masked_" << F->getName();
+  // We need to mangle the names of the vararg masked functions, since we will
+  // generate different masked functions for invocations with different
+  // argument types. For non-vararg functions, we don't need the mangling so we
+  // skip it.
+  if (isVarArg) {
+    O << "_";
+    for (auto T : argTys) {
+      VECZ_FAIL_IF(!mangler.mangleType(
+          O, T,
+          compiler::utils::TypeQualifiers(compiler::utils::eTypeQualNone)));
+    }
+  }
+  O.flush();
+  // Check if we have a masked version already
+  auto maskedVersion = MaskedVersions.find(newFName);
+  if (maskedVersion != MaskedVersions.end()) {
+    LLVM_DEBUG(dbgs() << "vecz: Found existing masked function " << newFName
+                      << "\n");
+    return maskedVersion->second;
+  }
+  // Create the function type
+  FunctionType *newFunctionTy =
+      FunctionType::get(F->getReturnType(), argTys, false);
+  Function *newFunction = Function::Create(
+      newFunctionTy, GlobalValue::PrivateLinkage, newFName, F->getParent());
+  const CallingConv::ID cc = CI->getCallingConv();
+  LLVM_DEBUG(dbgs() << "vecz: Created masked function " << newFName << "\n");
+
+  // Create the function's basic blocks
+  BasicBlock *entryBlock = BasicBlock::Create(ctx, "entry", newFunction);
+  BasicBlock *activeBlock = BasicBlock::Create(ctx, "active", newFunction);
+  BasicBlock *mergeBlock = BasicBlock::Create(ctx, "exit", newFunction);
+
+  // Collect the arguments for the call to the original (wrapped) function
+  SmallVector<Value *> CIArgs;
+  for (Value &arg : newFunction->args()) {
+    CIArgs.push_back(&arg);
+  }
+  // Remove the mask argument
+  CIArgs.pop_back();
+
+  FunctionType *FTy = CI->getFunctionType();
+  const AttributeList callAttrs = CI->getAttributes();
+  SmallVector<std::pair<Value *, BasicBlock *>, 4> PhiOperands;
+  if (hasImmArg) {
+    Value *immArg = newFunction->getArg(firstImmArg);
+    BasicBlock *const immTrueBB =
+        BasicBlock::Create(ctx, "active.imm.1", newFunction, mergeBlock);
+    CIArgs[firstImmArg] = ConstantInt::getTrue(ctx);
+    CallInst *c0 =
+        CallInst::Create(FTy, CI->getCalledOperand(), CIArgs, "", immTrueBB);
+    c0->setCallingConv(cc);
+    c0->setAttributes(callAttrs);
+    BranchInst::Create(mergeBlock, immTrueBB);
+
+    CIArgs[firstImmArg] = ConstantInt::getFalse(ctx);
+    // Now the false half
+    BasicBlock *const immFalseBB =
+        BasicBlock::Create(ctx, "active.imm.0", newFunction, mergeBlock);
+
+    CallInst *c1 =
+        CallInst::Create(FTy, CI->getCalledOperand(), CIArgs, "", immFalseBB);
+    c1->setCallingConv(cc);
+    c1->setAttributes(callAttrs);
+    BranchInst::Create(mergeBlock, immFalseBB);
+    BranchInst::Create(immTrueBB, immFalseBB, immArg, activeBlock);
+    PhiOperands.push_back({c0, immTrueBB});
+    PhiOperands.push_back({c1, immFalseBB});
+
+    // Now fix up the new function's signature. It can't be inheriting illegal
+    // attributes; only intrinsics may have the `ImmArg` Attribute. The
+    // verifier complains loudly otherwise, and then comes into our houses at
+    // night, and wrecks up the place...
+    for (unsigned i = 0, n = fnAttrs.getNumAttrSets(); i < n; ++i) {
+      fnAttrs = fnAttrs.removeAttributeAtIndex(ctx, i, Attribute::ImmArg);
+    }
+  } else {
+    // We are using the called Value instead of F because it might contain
+    // a bitcast or something, which makes the function types different.
+    CallInst *c =
+        CallInst::Create(FTy, CI->getCalledOperand(), CIArgs, "", activeBlock);
+    c->setCallingConv(cc);
+    c->setAttributes(callAttrs);
+    PhiOperands.push_back({c, activeBlock});
+    BranchInst::Create(mergeBlock, activeBlock);
+  }
+  newFunction->setCallingConv(cc);
+  newFunction->setAttributes(fnAttrs);
+
+  // Get the last argument (the mask) and use it as the branch predicate that
+  // selects between the live block and a no-op.
+  Value *mask = newFunction->arg_end() - 1;
+  BranchInst::Create(activeBlock, mergeBlock, mask, entryBlock);
+
+  Type *returnTy = F->getReturnType();
+  if (returnTy != Type::getVoidTy(ctx)) {
+    PHINode *result = PHINode::Create(returnTy, 2, "", mergeBlock);
+    for (auto &phiOp : PhiOperands) {
+      result->addIncoming(phiOp.first, phiOp.second);
+    }
+    result->addIncoming(getDefaultValue(returnTy), entryBlock);
+    ReturnInst::Create(ctx, result, mergeBlock);
+  } else {
+    ReturnInst::Create(ctx, mergeBlock);
+  }
+
+  MaskedVersions.insert(std::make_pair(newFName, newFunction));
+  insertMaskedFunction(newFunction, F);
+  return newFunction;
+}
+
+std::optional<VectorizationContext::MaskedAtomic>
+VectorizationContext::isMaskedAtomicFunction(const Function &F) const {
+  auto VFInfo = decodeVectorizedFunctionName(F.getName());
+  if (!VFInfo) {
+    return std::nullopt;
+  }
+  auto [FnNameStr, VF, Choices] = *VFInfo;
+
+  llvm::StringRef FnName = FnNameStr;
+  if (!FnName.consume_front("masked_")) {
+    return std::nullopt;
+  }
+  const bool IsCmpXchg = FnName.consume_front("cmpxchg_");
+  if (!IsCmpXchg && !FnName.consume_front("atomicrmw_")) {
+    return std::nullopt;
+  }
+  VectorizationContext::MaskedAtomic AtomicInfo;
+
+  AtomicInfo.VF = VF;
+  AtomicInfo.IsVectorPredicated = Choices.vectorPredication();
+
+  if (IsCmpXchg) {
+    AtomicInfo.IsWeak = FnName.consume_front("weak_");
+  }
+  AtomicInfo.IsVolatile = FnName.consume_front("volatile_");
+
+  AtomicInfo.BinOp = AtomicRMWInst::BinOp::BAD_BINOP;
+
+  if (!IsCmpXchg) {
+    if (auto BinOp = multi_llvm::consume_binop_with_underscore(FnName)) {
+      AtomicInfo.BinOp = *BinOp;
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  if (!FnName.consume_front("align")) {
+    return std::nullopt;
+  }
+
+  uint64_t Alignment = 0;
+  if (FnName.consumeInteger(/*Radix=*/10, Alignment)) {
+    return std::nullopt;
+  }
+
+  AtomicInfo.Align = Align(Alignment);
+
+  if (!FnName.consume_front("_")) {
+    return std::nullopt;
+  }
+
+  auto demangleOrdering = [&FnName]() -> std::optional<AtomicOrdering> {
+    if (FnName.consume_front("acquire_")) {
+      return AtomicOrdering::Acquire;
+    } else if (FnName.consume_front("acqrel_")) {
+      return AtomicOrdering::AcquireRelease;
+    } else if (FnName.consume_front("monotonic_")) {
+      return AtomicOrdering::Monotonic;
+    } else if (FnName.consume_front("notatomic_")) {
+      return AtomicOrdering::NotAtomic;
+    } else if (FnName.consume_front("release_")) {
+      return AtomicOrdering::Release;
+    } else if (FnName.consume_front("seqcst_")) {
+      return AtomicOrdering::SequentiallyConsistent;
+    } else if (FnName.consume_front("unordered_")) {
+      return AtomicOrdering::Unordered;
+    } else {
+      return std::nullopt;
+    }
+  };
+
+  if (auto Ordering = demangleOrdering()) {
+    AtomicInfo.Ordering = *Ordering;
+  } else {
+    return std::nullopt;
+  }
+
+  if (IsCmpXchg) {
+    if (auto Ordering = demangleOrdering()) {
+      AtomicInfo.CmpXchgFailureOrdering = Ordering;
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  unsigned SyncScopeID = 0;
+  if (FnName.consumeInteger(/*Radix=*/10, SyncScopeID)) {
+    return std::nullopt;
+  }
+
+  AtomicInfo.SyncScope = static_cast<SyncScope::ID>(SyncScopeID);
+
+  if (!FnName.consume_front("_")) {
+    return std::nullopt;
+  }
+
+  // Note - we just assume the rest of the builtin name is okay, here. It
+  // should be mangled types, but vecz builtins use a strange mangling system,
+  // purely for uniqueness and not to infer types. Types are always assumed to
+  // be inferrable from the function parameters.
+  AtomicInfo.PointerTy = F.getFunctionType()->getParamType(0);
+  AtomicInfo.ValTy = F.getFunctionType()->getParamType(1);
+
+  return AtomicInfo;
+}
+
+Function *VectorizationContext::getOrCreateMaskedAtomicFunction(
+    MaskedAtomic &I, const VectorizationChoices &Choices, ElementCount VF) {
+  const bool isCmpXchg = I.isCmpXchg();
+  LLVMContext &ctx = I.ValTy->getContext();
+
+  SmallVector<Type *> argTys;
+
+  argTys.push_back(I.PointerTy);
+  argTys.push_back(I.ValTy);
+  if (isCmpXchg) {
+    argTys.push_back(I.ValTy);
+  }
+  // Add one extra argument for the mask, which is always the same length
+  // (scalar or vector) as the value type.
+  auto *i1Ty = Type::getInt1Ty(ctx);
+  auto *maskTy =
+      !I.ValTy->isVectorTy()
+          ? cast<Type>(i1Ty)
+          : VectorType::get(i1Ty,
+                            cast<VectorType>(I.ValTy)->getElementCount());
+  argTys.push_back(maskTy);
+  if (Choices.vectorPredication()) {
+    argTys.push_back(Type::getInt32Ty(ctx));
+  }
+
+  std::string maskedFnName;
+  raw_string_ostream O(maskedFnName);
"masked_cmpxchg_" : "masked_atomicrmw_"); + + if (I.IsWeak) { + assert(isCmpXchg && "Bad MaskedAtomic state"); + O << "weak_"; + } + + if (I.IsVolatile) { + O << "volatile_"; + } + + if (!isCmpXchg) { + O << multi_llvm::to_string(I.BinOp) << "_"; + } + + O << "align" << I.Align.value() << "_"; + + // Mangle ordering + auto mangleOrdering = [&O](AtomicOrdering Ordering) { + switch (Ordering) { + case AtomicOrdering::Acquire: + O << "acquire"; + return; + case AtomicOrdering::AcquireRelease: + O << "acqrel"; + return; + case AtomicOrdering::Monotonic: + O << "monotonic"; + return; + case AtomicOrdering::NotAtomic: + O << "notatomic"; + return; + case AtomicOrdering::Release: + O << "release"; + return; + case AtomicOrdering::SequentiallyConsistent: + O << "seqcst"; + return; + case AtomicOrdering::Unordered: + O << "unordered"; + return; + } + + O << static_cast(Ordering); + }; + + mangleOrdering(I.Ordering); + // Failure Ordering + if (I.CmpXchgFailureOrdering) { + O << "_"; + mangleOrdering(*I.CmpXchgFailureOrdering); + } + + // Syncscope + O << "_" << static_cast(I.SyncScope) << "_"; + + // Mangle types + compiler::utils::NameMangler mangler(&ctx); + for (auto *ty : argTys) { + VECZ_FAIL_IF(!mangler.mangleType( + O, ty, + compiler::utils::TypeQualifiers(compiler::utils::eTypeQualNone))); + } + + maskedFnName = + getVectorizedFunctionName(maskedFnName, VF, Choices, /*IsBuiltin=*/true); + + Type *maskedFnRetTy = isCmpXchg ? StructType::get(I.ValTy, maskTy) : I.ValTy; + + // Create the function type + FunctionType *maskedFnTy = + FunctionType::get(maskedFnRetTy, argTys, /*isVarArg=*/false); + + return getOrCreateInternalBuiltin(maskedFnName, maskedFnTy); +} + +namespace { +std::optional> +isSubgroupScan(StringRef fnName, Type *const ty) { + compiler::utils::Lexer L(fnName); + if (!L.Consume(VectorizationContext::InternalBuiltinPrefix)) { + return std::nullopt; + } + if (!L.Consume("sub_group_scan_")) { + return std::nullopt; + } + const bool isInt = ty->isIntOrIntVectorTy(); + const bool isInclusive = L.Consume("inclusive_"); + if (isInclusive || L.Consume("exclusive_")) { + StringRef OpKind; + if (L.ConsumeAlpha(OpKind)) { + RecurKind opKind; + if (OpKind == "add") { + opKind = isInt ? RecurKind::Add : RecurKind::FAdd; + } else if (OpKind == "min") { + assert(!isInt && "unexpected internal scan builtin"); + opKind = RecurKind::FMin; + } else if (OpKind == "max") { + assert(!isInt && "unexpected internal scan builtin"); + opKind = RecurKind::FMax; + } else if (OpKind == "smin") { + opKind = RecurKind::SMin; + } else if (OpKind == "smax") { + opKind = RecurKind::SMax; + } else if (OpKind == "umin") { + opKind = RecurKind::UMin; + } else if (OpKind == "umax") { + opKind = RecurKind::UMax; + } else if (OpKind == "mul") { + opKind = isInt ? RecurKind::Mul : RecurKind::FMul; + } else if (OpKind == "and") { + opKind = RecurKind::And; + assert(isInt && "unexpected internal scan builtin"); + } else if (OpKind == "or") { + opKind = RecurKind::Or; + assert(isInt && "unexpected internal scan builtin"); + } else if (OpKind == "xor") { + opKind = RecurKind::Xor; + assert(isInt && "unexpected internal scan builtin"); + } else { + return std::nullopt; + } + const bool isVP = L.Consume("_vp"); + return std::make_tuple(isInclusive, opKind, isVP); + } + } + return std::nullopt; +} +} // namespace + +bool VectorizationContext::defineInternalBuiltin(Function *F) { + assert(F->isDeclaration() && "builtin is already defined"); + + // Handle masked memory loads and stores. 
+  if (std::optional<MemOpDesc> Desc = MemOpDesc::analyzeMemOpFunction(*F)) {
+    if (Desc->isMaskedMemOp()) {
+      return emitMaskedMemOpBody(*F, *Desc);
+    }
+
+    // Handle interleaved memory loads and stores.
+    if (Desc->isInterleavedMemOp()) {
+      return emitInterleavedMemOpBody(*F, *Desc);
+    }
+
+    // Handle masked interleaved memory loads and stores
+    if (Desc->isMaskedInterleavedMemOp()) {
+      return emitMaskedInterleavedMemOpBody(*F, *Desc);
+    }
+
+    // Handle scatter stores and gather loads.
+    if (Desc->isScatterGatherMemOp()) {
+      return emitScatterGatherMemOpBody(*F, *Desc);
+    }
+
+    // Handle masked scatter stores and gather loads.
+    if (Desc->isMaskedScatterGatherMemOp()) {
+      return emitMaskedScatterGatherMemOpBody(*F, *Desc);
+    }
+  }
+
+  // Handle subgroup scan operations.
+  if (auto scanInfo = isSubgroupScan(F->getName(), F->getReturnType())) {
+    const bool isInclusive = std::get<0>(*scanInfo);
+    const RecurKind opKind = std::get<1>(*scanInfo);
+    const bool isVP = std::get<2>(*scanInfo);
+    return emitSubgroupScanBody(*F, isInclusive, opKind, isVP);
+  }
+
+  if (auto AtomicInfo = isMaskedAtomicFunction(*F)) {
+    return emitMaskedAtomicBody(*F, *AtomicInfo);
+  }
+
+  return false;
+}
+
+bool VectorizationContext::emitMaskedMemOpBody(Function &F,
+                                               const MemOpDesc &Desc) const {
+  Value *Data = Desc.getDataOperand(&F);
+  Value *Ptr = Desc.getPointerOperand(&F);
+  Value *Mask = Desc.getMaskOperand(&F);
+  Value *VL = Desc.isVLOp() ? Desc.getVLOperand(&F) : nullptr;
+  Type *DataTy = Desc.isLoad() ? F.getReturnType() : Data->getType();
+
+  BasicBlock *Entry = BasicBlock::Create(F.getContext(), "entry", &F);
+  IRBuilder<> B(Entry);
+  Value *Result = nullptr;
+  if (Desc.isLoad()) {
+    Result =
+        VTI.createMaskedLoad(B, DataTy, Ptr, Mask, VL, Desc.getAlignment());
+    B.CreateRet(Result);
+  } else {
+    Result = VTI.createMaskedStore(B, Data, Ptr, Mask, VL, Desc.getAlignment());
+    B.CreateRetVoid();
+  }
+  VECZ_FAIL_IF(!Result);
+  return true;
+}
+
+bool VectorizationContext::emitInterleavedMemOpBody(
+    Function &F, const MemOpDesc &Desc) const {
+  return emitMaskedInterleavedMemOpBody(F, Desc);
+}
+
+bool VectorizationContext::emitMaskedInterleavedMemOpBody(
+    Function &F, const MemOpDesc &Desc) const {
+  Value *Data = Desc.getDataOperand(&F);
+  auto *const Ptr = Desc.getPointerOperand(&F);
+  VECZ_FAIL_IF(!isa<VectorType>(Desc.getDataType()) || !Ptr);
+
+  auto *const Mask = Desc.getMaskOperand(&F);
+  auto *const VL = Desc.isVLOp() ? Desc.getVLOperand(&F) : nullptr;
+  const auto Align = Desc.getAlignment();
+  const auto Stride = Desc.getStride();
+
+  BasicBlock *Entry = BasicBlock::Create(F.getContext(), "entry", &F);
+  IRBuilder<> B(Entry);
+
+  // If the mask is missing, assume that this is a normal interleaved memop
+  // that we want to emit as an unmasked interleaved memop
+  if (Desc.isLoad()) {
+    auto *const Result =
+        Mask ? VTI.createMaskedInterleavedLoad(B, F.getReturnType(), Ptr, Mask,
+                                               Stride, VL, Align)
+             : VTI.createInterleavedLoad(B, F.getReturnType(), Ptr, Stride, VL,
+                                         Align);
+    VECZ_FAIL_IF(!Result);
+    B.CreateRet(Result);
+  } else {
+    auto *const Result =
+        Mask ? VTI.createMaskedInterleavedStore(B, Data, Ptr, Mask, Stride, VL,
+                                                Align)
+             : VTI.createInterleavedStore(B, Data, Ptr, Stride, VL, Align);
+    VECZ_FAIL_IF(!Result);
+    B.CreateRetVoid();
+  }
+  return true;
+}
+
+bool VectorizationContext::emitScatterGatherMemOpBody(
+    Function &F, const MemOpDesc &Desc) const {
+  return emitMaskedScatterGatherMemOpBody(F, Desc);
+}
+
+bool VectorizationContext::emitMaskedScatterGatherMemOpBody(
+    Function &F, const MemOpDesc &Desc) const {
+  Value *Data = Desc.getDataOperand(&F);
+  auto *const VecDataTy = dyn_cast<VectorType>(Desc.getDataType());
+  auto *const Ptr = Desc.getPointerOperand(&F);
+  VECZ_FAIL_IF(!VecDataTy || !Ptr);
+
+  auto *const Mask = Desc.getMaskOperand(&F);
+  auto *const VL = Desc.isVLOp() ? Desc.getVLOperand(&F) : nullptr;
+  const auto Align = Desc.getAlignment();
+
+  BasicBlock *Entry = BasicBlock::Create(F.getContext(), "entry", &F);
+  IRBuilder<> B(Entry);
+
+  // If the mask is missing, assume that this is a normal scatter/gather memop
+  // that we want to emit as an unmasked scatter/gather memop
+  if (Desc.isLoad()) {
+    auto *const Result =
+        Mask ? VTI.createMaskedGatherLoad(B, VecDataTy, Ptr, Mask, VL, Align)
+             : VTI.createGatherLoad(B, VecDataTy, Ptr, VL, Align);
+    VECZ_FAIL_IF(!Result);
+    B.CreateRet(Result);
+  } else {
+    auto *const Result =
+        Mask ? VTI.createMaskedScatterStore(B, Data, Ptr, Mask, VL, Align)
+             : VTI.createScatterStore(B, Data, Ptr, VL, Align);
+    VECZ_FAIL_IF(!Result);
+    B.CreateRetVoid();
+  }
+  return true;
+}
+
+// Emit a subgroup scan operation.
+// If the vectorization factor is fixed, we can do a scan in log2(N) steps,
+// by noting that an inclusive scan can be split into two, and recombined into
+// a single result by adding the last element of the first half onto every
+// element of the second half. To deal with exclusive scans, we rotate the
+// result by one element and insert the neutral element at the beginning.
+//
+// For now, when using a scalable vectorization factor, this takes the form of
+// a simple loop that accumulates the scan operation in scalar form, extracting
+// and inserting elements of the resulting vector on each iteration:
+//   %v = <A, B, C, D>
+// Iteration 0:
+//   %e.0 = extractelement %v, 0            (A)
+//   %s.0 = add N, %e.0                     (A)
+//   %v.0 = insertelement poison, %s.0, 0   (<A, _, _, _>)
+// Iteration 1:
+//   %e.1 = extractelement %v, 1            (B)
+//   %s.1 = add %s.0, %e.1                  (A+B)
+//   %v.1 = insertelement %v.0, %s.1, 1     (<A, A+B, _, _>)
+// Iteration 2:
+//   %e.2 = extractelement %v, 2            (C)
+//   %s.2 = add %s.1, %e.2                  (A+B+C)
+//   %v.2 = insertelement %v.1, %s.2, 2     (<A, A+B, A+B+C, _>)
+// Iteration 3:
+//   %e.3 = extractelement %v, 3            (D)
+//   %s.3 = add %s.2, %e.3                  (A+B+C+D)
+//   %v.3 = insertelement %v.2, %s.3, 3     (<A, A+B, A+B+C, A+B+C+D>)
+// Result:
+//   %v.3 = <A, A+B, A+B+C, A+B+C+D>
+//
+// Exclusive scans operate by pre-filling the vector with the neutral value,
+// looping from 1 onwards, and extracting from one less than the current
+// iteration:
+//   %z = insertelement poison, N, 0
+// Iteration 0:
+//   %e.0 = extractelement %v, 0            (A)
+//   %s.0 = add N, %e.0                     (A)
+//   %v.0 = insertelement %z, %s.0, 1       (<N, A, _, _>)
+// This loop operates up to the VL input, if it is a vector-predicated scan.
+// Elements past the vector length will receive a default zero value.
+// Note: This method is not optimal for fixed-length code, but serves as a way
+// of producing scalable- and fixed-length vector code equivalently.
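+// As an illustration of the fixed-width fast path below: given input
+// <A, B, C, D> and neutral value x, step 1 shuffles in <x, A, x, C> and adds,
+// giving <A, A+B, C, C+D>; step 2 shuffles in <x, x, A+B, A+B> and adds,
+// giving the full inclusive scan <A, A+B, A+B+C, A+B+C+D>.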
+bool VectorizationContext::emitSubgroupScanBody(Function &F, bool IsInclusive,
+                                                RecurKind OpKind,
+                                                bool IsVP) const {
+  LLVMContext &Ctx = F.getContext();
+
+  auto *const Entry = BasicBlock::Create(Ctx, "entry", &F);
+  IRBuilder<> B(Entry);
+
+  Type *const VecTy = F.getReturnType();
+  Type *const EltTy = multi_llvm::getVectorElementType(VecTy);
+  const ElementCount EC = multi_llvm::getVectorElementCount(VecTy);
+
+  Function::arg_iterator Arg = F.arg_begin();
+
+  Value *const Vec = Arg;
+  Value *const VL = IsVP ? ++Arg : nullptr;
+
+  // If it's not a scalable vector, we can do it the fast way.
+  if (!EC.isScalable() && !IsVP) {
+    auto *const NeutralVal = compiler::utils::getNeutralVal(OpKind, EltTy);
+    const auto Width = EC.getFixedValue();
+    auto *const UndefVal = PoisonValue::get(VecTy);
+
+    // Put the Neutral element in a vector so we can shuffle it in.
+    auto *const NeutralVec =
+        B.CreateInsertElement(UndefVal, NeutralVal, B.getInt64(0));
+
+    auto *Result = Vec;
+    unsigned N = 1u;
+
+    SmallVector<int, 16> mask(Width);
+    while (N < Width) {
+      // Build shuffle mask.
+      // The sequence of masks will be, for a width of 16
+      // (in hexadecimal for concision, where x represents the neutral value
+      // element):
+      //
+      //   x0x2x4x6x8xAxCxE
+      //   xx11xx55xx99xxDD
+      //   xxxx3333xxxxBBBB
+      //   xxxxxxxx77777777
+      //
+      const auto N2 = N << 1u;
+      auto MaskIt = mask.begin();
+      for (size_t i = 0; i < Width; i += N2) {
+        for (size_t j = 0; j < N; ++j) {
+          *MaskIt++ = Width;
+        }
+
+        const auto k = i + N - 1;
+        for (size_t j = 0; j < N; ++j) {
+          *MaskIt++ = k;
+        }
+      }
+      N = N2;
+      auto *const Shuffle =
+          createOptimalShuffle(B, Result, NeutralVec, mask, Twine("scan_impl"));
+      Result =
+          compiler::utils::createBinOpForRecurKind(B, Result, Shuffle, OpKind);
+    }
+
+    if (!IsInclusive) {
+      // If it is an exclusive scan, rotate the result.
+      auto *const IdentityVal = compiler::utils::getIdentityVal(OpKind, EltTy);
+      VECZ_FAIL_IF(!IdentityVal);
+      Result = VTI.createVectorSlideUp(B, Result, IdentityVal, VL);
+    }
+
+    B.CreateRet(Result);
+    return true;
+  }
+
+  // If the vector is scalable, we don't know the number of iterations
+  // required, so we have to use a loop and shuffle masks generated from the
+  // step vector.
+
+  auto *const IVTy = B.getInt32Ty();
+  auto *const IndexTy = VectorType::get(IVTy, EC);
+  auto *const Step = B.CreateStepVector(IndexTy, "step");
+  auto *const VZero = Constant::getNullValue(IndexTy);
+
+  auto *const Loop = BasicBlock::Create(Ctx, "loop", &F);
+  auto *const Exit = BasicBlock::Create(Ctx, "exit", &F);
+
+  // The length of the vector.
+  Value *Width = nullptr;
+  if (IsVP) {
+    Width = VL;
+  } else {
+    Width = B.CreateElementCount(IVTy, EC);
+  }
+
+  B.CreateBr(Loop);
+
+  // Loop induction starts at 1 and doubles each time.
+  auto *const IVStart = ConstantInt::get(IVTy, 1);
+
+  // Create the loop instructions
+  B.SetInsertPoint(Loop);
+
+  // The induction variable (IV) which determines both our loop bounds and our
+  // vector indices.
+  auto *N = B.CreatePHI(IVTy, 2, "iv");
+  N->addIncoming(IVStart, Entry);
+
+  // A vector phi representing the vectorized value we're building up.
+  auto *VecPhi = B.CreatePHI(VecTy, 2, "vec");
+  VecPhi->addIncoming(Vec, Entry);
+
+  // A vector phi representing the shuffle mask indices we're building up.
+ auto *MaskPhi = B.CreatePHI(IndexTy, 2, "mask.phi"); + MaskPhi->addIncoming(Step, Entry); + + // This will create shuffle masks like the following sequence: + // + // 1032547698BADCFE = (0123456789ABCDEF ^ splat(1)) + // 33117755BB99FFDD = (1032547698BADCFE ^ splat(2)) | splat(1) + // 77773333FFFFBBBB = (33117755BB99FFDD ^ splat(4)) | splat(2) + // FFFFFFFF77777777 = (77773333FFFFBBBB ^ splat(8)) | splat(4) + // + // We don't mix the neutral element into the vector in this case, but use a + // Select instruction to choose between the updated or original value, so that + // backends can lower it as a masked binary operation. The select condition + // therefore needs to be like the following sequence: + // + // 0101010101010101 + // 0011001100110011 + // 0000111100001111 + // 0000000011111111 + + auto *const SplatN = B.CreateVectorSplat(EC, N, "splatN"); + auto *const Mask = B.CreateXor(MaskPhi, SplatN, "mask"); + auto *const Shuffle = VTI.createVectorShuffle(B, VecPhi, Mask, VL); + auto *const Accum = + compiler::utils::createBinOpForRecurKind(B, VecPhi, Shuffle, OpKind); + + auto *const NBit = B.CreateAnd(MaskPhi, SplatN, "isolate"); + auto *const Which = B.CreateICmpNE(NBit, VZero, "which"); + auto *const NewVec = B.CreateSelect(Which, Accum, VecPhi, "newvec"); + + auto *const NewMask = B.CreateOr(Mask, SplatN, "newmask"); + auto *const N2 = B.CreateShl(N, ConstantInt::get(IVTy, 1), "N2", + /*HasNUW*/ true, /*HasNSW*/ true); + + VecPhi->addIncoming(NewVec, Loop); + MaskPhi->addIncoming(NewMask, Loop); + N->addIncoming(N2, Loop); + + // Loop exit condition + auto *const Cond = B.CreateICmpULT(N2, Width, "iv.cmp"); + B.CreateCondBr(Cond, Loop, Exit); + + // Function exit instructions: + B.SetInsertPoint(Exit); + + // Create an LCSSA PHI node. + auto *const ResultPhi = B.CreatePHI(VecTy, 1, "res.phi"); + ResultPhi->addIncoming(NewVec, Loop); + + Value *Result = ResultPhi; + if (!IsInclusive) { + // If it is an exclusive scan, rotate the result. + auto *const IdentityVal = compiler::utils::getIdentityVal(OpKind, EltTy); + VECZ_FAIL_IF(!IdentityVal); + Result = VTI.createVectorSlideUp(B, Result, IdentityVal, VL); + } + + B.CreateRet(Result); + return true; +} + +bool VectorizationContext::emitMaskedAtomicBody( + Function &F, const VectorizationContext::MaskedAtomic &MA) const { + LLVMContext &Ctx = F.getContext(); + const bool IsCmpXchg = MA.isCmpXchg(); + + auto *const EntryBB = BasicBlock::Create(Ctx, "entry", &F); + + IRBuilder<> B(EntryBB); + + BasicBlock *LoopEntryBB = EntryBB; + if (MA.IsVectorPredicated) { + auto *const VL = F.getArg(3 + IsCmpXchg); + // Early exit if the vector length is zero. We're going to unconditionally + // jump into the loop after this. 
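+    // (A vector-predicated masked atomic must not touch memory when VL is
+    // zero; the early-exit path simply returns poison.)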
+    auto *const EarlyExitBB = BasicBlock::Create(Ctx, "earlyexit", &F);
+    auto *const CmpZero =
+        B.CreateICmpEQ(VL, ConstantInt::get(VL->getType(), 0));
+
+    LoopEntryBB = BasicBlock::Create(Ctx, "loopentry", &F);
+
+    B.CreateCondBr(CmpZero, EarlyExitBB, LoopEntryBB);
+
+    B.SetInsertPoint(EarlyExitBB);
+    B.CreateRet(PoisonValue::get(F.getReturnType()));
+  }
+
+  B.SetInsertPoint(LoopEntryBB);
+
+  auto *const ExitBB = BasicBlock::Create(Ctx, "exit", &F);
+
+  auto *const PtrArg = F.getArg(0);
+  auto *const ValArg = F.getArg(1);
+  Value *MaskArg = F.getArg(2 + IsCmpXchg);
+
+  const bool IsVector = ValArg->getType()->isVectorTy();
+
+  Value *const IdxStart = B.getInt32(0);
+  Value *IdxEnd;
+  if (MA.IsVectorPredicated) {
+    IdxEnd = F.getArg(3 + IsCmpXchg);
+  } else {
+    IdxEnd = B.CreateElementCount(B.getInt32Ty(), MA.VF);
+  }
+
+  Value *RetVal = nullptr;
+  Value *RetSuccessVal = nullptr;
+
+  auto CreateLoopBody =
+      [&MA, &F, &ExitBB, PtrArg, ValArg, MaskArg, &RetVal, &RetSuccessVal,
+       IsVector, IsCmpXchg](BasicBlock *BB, Value *Idx, ArrayRef<Value *> IVs,
+                            MutableArrayRef<Value *> IVsNext) -> BasicBlock * {
+    IRBuilder<> IRB(BB);
+
+    Value *MaskElt = MaskArg;
+    if (IsVector) {
+      MaskElt = IRB.CreateExtractElement(MaskArg, Idx, "mask");
+    }
+    auto *const MaskCmp =
+        IRB.CreateICmpNE(MaskElt, IRB.getInt1(false), "mask.cmp");
+
+    auto *const IfBB = BasicBlock::Create(F.getContext(), "if.then", &F);
+    auto *const ElseBB = BasicBlock::Create(F.getContext(), "if.else", &F);
+
+    IRB.CreateCondBr(MaskCmp, IfBB, ElseBB);
+
+    {
+      IRB.SetInsertPoint(IfBB);
+      Value *Ptr = PtrArg;
+      Value *Val = ValArg;
+      if (IsVector) {
+        Ptr = IRB.CreateExtractElement(PtrArg, Idx, "ptr");
+        Val = IRB.CreateExtractElement(ValArg, Idx, "val");
+      }
+
+      if (IsCmpXchg) {
+        Value *NewValArg = F.getArg(2);
+        Value *NewVal = NewValArg;
+        if (IsVector) {
+          NewVal = IRB.CreateExtractElement(NewValArg, Idx, "newval");
+        }
+        auto *const CmpXchg =
+            IRB.CreateAtomicCmpXchg(Ptr, Val, NewVal, MA.Align, MA.Ordering,
+                                    *MA.CmpXchgFailureOrdering, MA.SyncScope);
+        CmpXchg->setWeak(MA.IsWeak);
+        CmpXchg->setVolatile(MA.IsVolatile);
+
+        if (IsVector) {
+          RetVal = IRB.CreateInsertElement(
+              IVs[0], IRB.CreateExtractValue(CmpXchg, 0), Idx, "retvec");
+          RetSuccessVal = IRB.CreateInsertElement(
+              IVs[1], IRB.CreateExtractValue(CmpXchg, 1), Idx, "retsuccess");
+        } else {
+          RetVal = IRB.CreateExtractValue(CmpXchg, 0);
+          RetSuccessVal = IRB.CreateExtractValue(CmpXchg, 1);
+        }
+
+      } else {
+        auto *const AtomicRMW = IRB.CreateAtomicRMW(
+            MA.BinOp, Ptr, Val, MA.Align, MA.Ordering, MA.SyncScope);
+        AtomicRMW->setVolatile(MA.IsVolatile);
+
+        if (IsVector) {
+          RetVal = IRB.CreateInsertElement(IVs[0], AtomicRMW, Idx, "retvec");
+        } else {
+          RetVal = AtomicRMW;
+        }
+      }
+
+      IRB.CreateBr(ElseBB);
+    }
+
+    {
+      IRB.SetInsertPoint(ElseBB);
+
+      auto *MergePhi = IRB.CreatePHI(RetVal->getType(), 2, "merge");
+      MergePhi->addIncoming(IVs[0], BB);
+      MergePhi->addIncoming(RetVal, IfBB);
+      RetVal = MergePhi;
+    }
+    IVsNext[0] = RetVal;
+
+    if (IsCmpXchg) {
+      auto *MergePhi =
+          IRB.CreatePHI(RetSuccessVal->getType(), 2, "mergesuccess");
+      MergePhi->addIncoming(IVs[1], BB);
+      MergePhi->addIncoming(RetSuccessVal, IfBB);
+      RetSuccessVal = MergePhi;
+      IVsNext[1] = RetSuccessVal;
+    }
+
+    // Move the exit block right to the end of the function.
+    ExitBB->moveAfter(ElseBB);
+
+    return ElseBB;
+  };
+
+  compiler::utils::CreateLoopOpts Opts;
+  {
+    Opts.IVs.push_back(PoisonValue::get(MA.ValTy));
+    Opts.loopIVNames.push_back("retvec.prev");
+  }
+  if (IsCmpXchg) {
+    Opts.IVs.push_back(PoisonValue::get(MaskArg->getType()));
+    Opts.loopIVNames.push_back("retsuccess.prev");
+  }
+  compiler::utils::createLoop(LoopEntryBB, ExitBB, IdxStart, IdxEnd, Opts,
+                              CreateLoopBody);
+
+  B.SetInsertPoint(ExitBB);
+  if (IsCmpXchg) {
+    Value *RetStruct = PoisonValue::get(F.getReturnType());
+    RetStruct = B.CreateInsertValue(RetStruct, RetVal, 0);
+    RetStruct = B.CreateInsertValue(RetStruct, RetSuccessVal, 1);
+    B.CreateRet(RetStruct);
+  } else {
+    B.CreateRet(RetVal);
+  }
+  return true;
+}
+
+Function *
+VectorizationContext::getInternalVectorEquivalent(Function *ScalarFn,
+                                                  unsigned SimdWidth) {
+  // Handle masked memory loads and stores.
+  if (!ScalarFn) {
+    return nullptr;
+  }
+  if (auto Desc = MemOpDesc::analyzeMaskedMemOp(*ScalarFn)) {
+    auto *NewDataTy = FixedVectorType::get(Desc->getDataType(), SimdWidth);
+    return getOrCreateMaskedMemOpFn(
+        *this, NewDataTy, cast<PointerType>(Desc->getPointerType()),
+        Desc->getAlignment(), Desc->isLoad(), Desc->isVLOp());
+  }
+
+  return nullptr;
+}
+
+bool VectorizationContext::isMaskedFunction(const llvm::Function *F) const {
+  return MaskedFunctionsMap.count(F) > 0;
+}
+
+bool VectorizationContext::insertMaskedFunction(llvm::Function *F,
+                                                llvm::Function *WrappedF) {
+  auto result = MaskedFunctionsMap.insert({F, WrappedF});
+  return result.second;
+}
+
+llvm::Function *
+VectorizationContext::getOriginalMaskedFunction(llvm::Function *F) {
+  auto Iter = MaskedFunctionsMap.find(F);
+  if (Iter != MaskedFunctionsMap.end()) {
+    return dyn_cast_or_null<Function>(Iter->second);
+  }
+
+  return nullptr;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+char DefineInternalBuiltinsPass::PassID = 0;
+
+PreservedAnalyses DefineInternalBuiltinsPass::run(Module &M,
+                                                  ModuleAnalysisManager &AM) {
+  llvm::FunctionAnalysisManager &FAM =
+      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+  // Remove internal builtins that may not be needed any more.
+  SmallVector<Function *> ToRemove;
+
+  bool NonePreserved = false;
+  // Implement internal builtins that we now know are needed.
+  // We find all declarations that should be builtins, and then define them if
+  // they have users that have associated vectorization units.
+  // On failure to define, we notify those vectorization units of failure
+  // and remove any partially defined body.
+  // Unused declarations are removed.
+  for (Function &F : M.functions()) {
+    if (!F.isDeclaration() || !VectorizationContext::isInternalBuiltin(&F)) {
+      continue;
+    }
+    if (F.use_empty()) {
+      ToRemove.push_back(&F);
+      NonePreserved = true;
+      continue;
+    }
+    llvm::SmallPtrSet<VectorizationUnit *, 4> UserVUs;
+    for (const Use &U : F.uses()) {
+      if (CallInst *CI = dyn_cast<CallInst>(U.getUser())) {
+        auto R = FAM.getResult<VectorizationUnitAnalysis>(*CI->getFunction());
+        if (R.hasResult()) {
+          UserVUs.insert(&R.getVU());
+        }
+      }
+    }
+    if (std::all_of(UserVUs.begin(), UserVUs.end(),
+                    [](VectorizationUnit *VU) { return VU->failed(); })) {
+      // If the vectorization has failed, we do not want to define the internal
+      // builtins, both because it's a waste of time and because we might try
+      // to instantiate some invalid builtin that would have been replaced by
+      // the packetization process.
+      continue;
+    }
+
+    VectorizationContext &Ctx = (*UserVUs.begin())->context();
+    const bool DefinedBuiltin = Ctx.defineInternalBuiltin(&F);
+    if (!DefinedBuiltin) {
+      // If we've failed to define this builtin, ensure we clean up the
+      // half-complete body. We can't simply delete it because it will have
+      // uses in the vector kernel. This will revert it to a declaration, which
+      // will be cleaned up later by the global optimizer.
+      if (!F.isDeclaration()) {
+        // defineInternalBuiltin may have partially defined the function body.
+        // Clean it up. FIXME defineInternalBuiltin should probably clean up
+        // after itself if there is a failure condition
+        F.deleteBody();
+      }
+      for (VectorizationUnit *VU : UserVUs) {
+        VU->setFailed("failed to define an internal builtin");
+      }
+      continue;
+    }
+    NonePreserved = true;
+  }
+
+  for (Function *F : ToRemove) {
+    F->eraseFromParent();
+  }
+
+  return NonePreserved ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
new file mode 100644
index 0000000000000..29d505d28369b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_helpers.cpp
@@ -0,0 +1,343 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "vectorization_helpers.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include "debugging.h"
+#include "vectorization_context.h"
+#include "vectorization_unit.h"
+#include "vecz/vecz_choices.h"
+
+using namespace llvm;
+using namespace vecz;
+
+namespace {
+
+Function *declareFunction(const VectorizationUnit &VU) {
+  Module &Module = VU.context().module();
+  const Function *const ScalarFn = VU.scalarFunction();
+  const ElementCount SimdWidth = VU.width();
+
+  // For kernels, the vectorized function type is the same as the original
+  // scalar function type, since function arguments are uniform. We no longer
+  // use Vectorization Units for builtins.
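+  // e.g. a kernel "foo" vectorized by a fixed factor of 4 is declared as
+  // "__vecz_v4_foo" (see getVectorizedFunctionName below).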
+  FunctionType *VectorizedFnType = VU.scalarFunction()->getFunctionType();
+  VECZ_FAIL_IF(!VectorizedFnType);
+  const std::string VectorizedName =
+      getVectorizedFunctionName(ScalarFn->getName(), SimdWidth, VU.choices());
+  Module.getOrInsertFunction(VectorizedName, VectorizedFnType);
+  auto *const VectorizedFn = Module.getFunction(VectorizedName);
+  if (VectorizedFn) {
+    VectorizedFn->setCallingConv(ScalarFn->getCallingConv());
+  }
+  return VectorizedFn;
+}
+
+/// @brief Clone the OpenCL named metadata node with name NodeName
+/// @param[in] NodeName The name of the node to clone
+///
+/// This function works with nodes that follow a specific pattern,
+/// specifically nodes that have as their operands other metadata nodes, which
+/// in turn have their first operand set to the OpenCL kernel Function. It
+/// searches for the node that contains the scalar kernel, and copies all its
+/// metadata, with the exception of the Function itself, which is replaced by
+/// the vectorized kernel.
+void cloneOpenCLNamedMetadataHelper(const VectorizationUnit &VU,
+                                    const std::string &NodeName) {
+  const Module &M = VU.context().module();
+
+  // Try to get the OpenCL metadata
+  NamedMDNode *KernelsMD = M.getNamedMetadata(NodeName);
+  if (!KernelsMD) {
+    return;
+  }
+
+  // Find which metadata node contains the metadata for the scalar function
+  MDNode *ScalarKernelMD = nullptr;
+  for (auto *KernelMD : KernelsMD->operands()) {
+    // The function name is the first operand
+    if (KernelMD->getNumOperands() > 0) {
+      // Get the Constant containing the function
+      ConstantAsMetadata *KernelNameMD =
+          dyn_cast_or_null<ConstantAsMetadata>(KernelMD->getOperand(0));
+      if (KernelNameMD) {
+        // Check if the function in the metadata is the original OpenCL kernel
+        if (KernelNameMD->getValue() == VU.scalarFunction()) {
+          ScalarKernelMD = KernelMD;
+          break;
+        }
+      }
+    }
+  }
+
+  // Did we find the correct metadata?
+  if (!ScalarKernelMD) {
+    return;
+  }
+
+  // Replace the kernel name and clone the rest of the metadata
+  SmallVector<Metadata *> KernelMDArgs;
+  KernelMDArgs.push_back(
+      llvm::ConstantAsMetadata::get(VU.vectorizedFunction()));
+  auto MDIt = ScalarKernelMD->op_begin() + 1;
+  auto MDEnd = ScalarKernelMD->op_end();
+  for (; MDIt != MDEnd; ++MDIt) {
+    KernelMDArgs.push_back(*MDIt);
+  }
+
+  // Create a new metadata node and add it to the opencl.kernels node
+  llvm::MDNode *KernelMDNode =
+      llvm::MDNode::get(VU.context().module().getContext(), KernelMDArgs);
+  KernelsMD->addOperand(KernelMDNode);
+}
+
+/// @brief Create placeholder instructions for arguments that will be
+/// vectorized. This is necessary to clone the original function's scalar code
+/// into the vectorized function.
+///
+/// @param[in,out] ValueMap Map to update with the arguments.
+SmallVector<Instruction *>
+createArgumentPlaceholders(const VectorizationUnit &VU, Function *VecFunc,
+                           ValueToValueMapTy &ValueMap) {
+  SmallVector<Instruction *> Placeholders;
+  const auto &Arguments = VU.arguments();
+  unsigned i = 0u;
+  for (Argument &DstArg : VecFunc->args()) {
+    Argument *SrcArg = Arguments[i++].OldArg;
+    DstArg.setName(SrcArg->getName());
+    if (DstArg.getType() != SrcArg->getType()) {
+      // Map old argument to a temporary placeholder to work around the
+      // difference in argument types. This usually happens when vectorizing
+      // builtin functions.
+      Type *IndexTy = Type::getInt32Ty(VecFunc->getParent()->getContext());
+      Constant *Index = Constant::getNullValue(IndexTy);
+      auto *const Placeholder = ExtractElementInst::Create(&DstArg, Index);
+      ValueMap[SrcArg] = Placeholder;
+      Placeholders.push_back(Placeholder);
+    } else {
+      ValueMap[SrcArg] = &DstArg;
+    }
+  }
+  return Placeholders;
+}
+
+} // namespace
+
+namespace vecz {
+std::string getVectorizedFunctionName(StringRef ScalarName, ElementCount VF,
+                                      VectorizationChoices Choices,
+                                      bool IsBuiltin) {
+  const Twine Prefix = Twine(VF.isScalable() ? "nxv" : "v");
+  const Twine IsVP = Twine(Choices.vectorPredication() ? "_vp_" : "_");
+  return ((IsBuiltin ? VectorizationContext::InternalBuiltinPrefix
+                     : Twine("__vecz_")) +
+          Prefix + Twine(VF.getKnownMinValue()) + IsVP + ScalarName)
+      .str();
+}
+
+std::optional<std::tuple<std::string, ElementCount, VectorizationChoices>>
+decodeVectorizedFunctionName(StringRef Name) {
+  if (!Name.consume_front(VectorizationContext::InternalBuiltinPrefix)) {
+    if (!Name.consume_front("__vecz_")) {
+      return std::nullopt;
+    }
+  }
+
+  ElementCount VF;
+  bool Scalable = false;
+  if (Name.consume_front("nxv")) {
+    Scalable = true;
+  } else if (!Name.consume_front("v")) {
+    return std::nullopt;
+  }
+
+  unsigned KnownMin = 0;
+  if (Name.consumeInteger(10, KnownMin)) {
+    return std::nullopt;
+  }
+
+  VF = ElementCount::get(KnownMin, Scalable);
+
+  VectorizationChoices Choices;
+  if (Name.consume_front("_vp_")) {
+    Choices.enableVectorPredication();
+  } else if (!Name.consume_front("_")) {
+    return std::nullopt;
+  }
+
+  return std::make_tuple(Name.str(), VF, Choices);
+}
+
+Function *cloneFunctionToVector(const VectorizationUnit &VU) {
+  auto *const VectorizedFn = declareFunction(VU);
+  VECZ_ERROR_IF(!VectorizedFn, "declareFunction failed to initialize");
+
+  auto *const ScalarFn = VU.scalarFunction();
+
+  // Map the old arguments to the new ones.
+  ValueToValueMapTy ValueMap;
+  auto Placeholders = createArgumentPlaceholders(VU, VectorizedFn, ValueMap);
+
+  // Clone the function to preserve instructions that do not need
+  // vectorization.
+  SmallVector<ReturnInst *, 4> Returns;
+
+  // Setting `moduleChanges` to true allows `llvm::CloneFunctionInto()` to do
+  // the work of cloning debug info across translation unit boundaries.
+  // However, there can be issues with inlined kernels if the inlined kernel
+  // still exists in the module and also has a vectorized variant. This value
+  // was set to true in this code since LLVM_VERSION_MAJOR > 4, but as of
+  // LLVM > 12 we need to be more careful with it, as commit 22a52dfddc
+  // introduced more nuance, with requisite assertions.
+  const bool moduleChanges = VectorizedFn->getParent() != ScalarFn->getParent();
+  auto cloneMode = moduleChanges ? CloneFunctionChangeType::DifferentModule
+                                 : CloneFunctionChangeType::LocalChangesOnly;
+  CloneFunctionInto(VectorizedFn, ScalarFn, ValueMap, cloneMode, Returns);
+
+  // Remove unwanted return value attributes.
+  if (VectorizedFn->getReturnType()->isVectorTy()) {
+    LLVMContext &Ctx = VectorizedFn->getContext();
+    AttributeList PAL = VectorizedFn->getAttributes();
+    bool RemovedAttribute = false;
+    for (const Attribute::AttrKind Kind : {Attribute::ZExt, Attribute::SExt}) {
+      if (PAL.hasRetAttr(Kind)) {
+        PAL = PAL.removeRetAttribute(Ctx, Kind);
+        RemovedAttribute = true;
+      }
+    }
+    if (RemovedAttribute) {
+      VectorizedFn->setAttributes(PAL);
+    }
+  }
+
+  // Override the base function name component for the vectorized function.
+  compiler::utils::setBaseFnName(*VectorizedFn, VectorizedFn->getName());
+
+  // Drop any metadata where the scalar kernel serves as the base or result of
+  // vectorization: this vectorized function does not serve as such (not
+  // directly, in the case of 'derived' metadata; that relationship is
+  // transitive).
+  compiler::utils::dropVeczOrigMetadata(*VectorizedFn);
+  compiler::utils::dropVeczDerivedMetadata(*VectorizedFn);
+
+  // Add any 'argument placeholder' instructions to the entry block.
+  // Skip over Alloca instructions if there are any.
+  BasicBlock &BB = VectorizedFn->getEntryBlock();
+  auto InsertPt = BB.getFirstInsertionPt();
+  while (isa<AllocaInst>(*InsertPt)) {
+    ++InsertPt;
+  }
+
+  for (auto *Placeholder : Placeholders) {
+    Placeholder->insertBefore(InsertPt);
+  }
+
+  return VectorizedFn;
+}
+
+static DILocation *getDILocation(unsigned Line, unsigned Column, MDNode *Scope,
+                                 MDNode *InlinedAt = nullptr) {
+  // If no scope is available, this is an unknown location.
+  if (!Scope)
+    return DebugLoc();
+  return DILocation::get(Scope->getContext(), Line, Column, Scope, InlinedAt,
+                         /*ImplicitCode*/ false);
+}
+
+void cloneDebugInfo(const VectorizationUnit &VU) {
+  DISubprogram *const ScalarDI = VU.scalarFunction()->getSubprogram();
+  // We don't have debug info
+  if (!ScalarDI) {
+    return;
+  }
+
+  // Create a DISubprogram entry for the vectorized kernel
+  DIBuilder DIB(*VU.scalarFunction()->getParent(), false);
+  DICompileUnit *CU =
+      DIB.createCompileUnit(dwarf::DW_LANG_OpenCL, ScalarDI->getFile(), "",
+                            ScalarDI->isOptimized(), "", 0);
+  DISubprogram *const VectorDI = DIB.createFunction(
+      CU->getFile(), ScalarDI->getName(),
+      StringRef(), /* Don't need a linkage name */
+      CU->getFile(), ScalarDI->getLine(), ScalarDI->getType(),
+      ScalarDI->getScopeLine(), ScalarDI->getFlags(), ScalarDI->getSPFlags());
+
+  // Point kernel function to a parent compile unit
+  VectorDI->replaceUnit(ScalarDI->getUnit());
+
+  VU.vectorizedFunction()->setSubprogram(VectorDI);
+
+  DIB.finalize();
+
+  // Iterate over all the instructions in the kernel looking for intrinsics
+  // containing debug info metadata that must be updated, changing the scope
+  // to point to the new vectorized function rather than the scalar function.
+
+  for (auto &BBItr : *VU.vectorizedFunction()) {
+    for (auto &InstItr : BBItr) {
+      if (InstItr.getDebugLoc()) {
+        // Update debug info line numbers to have vectorized kernel scope,
+        // taking care to preserve inlined locations.
+        const DebugLoc &ScalarLoc = InstItr.getDebugLoc();
+        DebugLoc VectorLoc;
+        if (DILocation *const InlinedLoc = ScalarLoc.getInlinedAt()) {
+          // Don't support nested inlined locations for now
+          if (!InlinedLoc->getInlinedAt()) {
+            const DebugLoc VectorKernel = getDILocation(
+                InlinedLoc->getLine(), InlinedLoc->getColumn(), VectorDI);
+            VectorLoc = getDILocation(ScalarLoc.getLine(), ScalarLoc.getCol(),
+                                      ScalarLoc.getScope(), VectorKernel);
+          }
+        } else {
+          VectorLoc =
+              getDILocation(ScalarLoc.getLine(), ScalarLoc.getCol(), VectorDI);
+        }
+        InstItr.setDebugLoc(VectorLoc);
+      }
+    }
+  }
+
+  // Replace temporary MDNode with the list of vectorized DILocals we have
+  // created. In LLVM 7.0 the variables attribute of DISubprogram was changed
+  // to retainedNodes.
+  auto *VectorizedKernelVariables = VectorDI->getRetainedNodes().get();
+  assert(VectorizedKernelVariables && "Could not get retained nodes");
+  if (VectorizedKernelVariables->isTemporary()) {
+    auto NewLocals =
+        MDTuple::getTemporary(VectorizedKernelVariables->getContext(), {});
+    VectorizedKernelVariables->replaceAllUsesWith(NewLocals.get());
+  }
+
+  return;
+}
+
+void cloneOpenCLMetadata(const VectorizationUnit &VU) {
+  cloneOpenCLNamedMetadataHelper(VU, "opencl.kernels");
+  cloneOpenCLNamedMetadataHelper(VU, "opencl.kernel_wg_size_info");
+}
+
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
new file mode 100644
index 0000000000000..afa45e4c325c0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_heuristics.cpp
@@ -0,0 +1,394 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "vectorization_heuristics.h"
+
+#include
+#include
+#include
+
+#include
+
+#include "vectorization_context.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+class Heuristics {
+  enum class BrClauseKind { None = 0, True, False };
+
+public:
+  Heuristics(llvm::Function &F, VectorizationContext &Ctx, ElementCount VF,
+             unsigned SimdDimIdx)
+      : F(F), Ctx(Ctx), SimdWidth(VF), SimdDimIdx(SimdDimIdx) {}
+
+  /// @brief Look through the scalar code to find patterns that indicate
+  /// we should not vectorize the kernel; e.g.:
+  /// __kernel Type FuncName(Params) {
+  ///   if (get_global_id(0) == 0) {
+  ///     // Do something.
+  ///   }
+  ///   // Do nothing.
+  /// }
+  /// @return Whether we should vectorize the function or not.
+  bool shouldVectorize();
+
+private:
+  /// @brief Visit a branch condition, forwarding it to the Cmp visitor when
+  /// it is a comparison instruction.
+  ///
+  /// @param[in] Comp The instruction to inspect.
+  ///
+  /// @return The branch's path not to vectorize, if any.
+  BrClauseKind shouldVectorizeVisitBr(const llvm::Value *Comp) const;
+  /// @brief Visit a Cmp to check if it involves a call to an opencl builtin.
+  ///
+  /// @param[in] Cmp The comparison instruction to inspect.
+  ///
+  /// @return The branch's path not to vectorize, if any.
+  BrClauseKind shouldVectorizeVisitCmp(const llvm::CmpInst *Cmp) const;
+  /// @brief Visit the operand of a Cmp to strip it down to a
+  ///        CallInst or ConstantInt, if possible.
+  ///
+  /// @param[in] Val The instruction to inspect.
+  /// @param[in] Cmp The comparison instruction Val belongs to.
+  /// @param[in] Cache A map containing previously generated results.
+  ///
+  /// @return A CallInst or ConstantInt, nullptr otherwise.
+  const llvm::Value *shouldVectorizeVisitCmpOperand(
+      const llvm::Value *Val, const llvm::CmpInst *Cmp,
+      llvm::DenseMap<const llvm::Value *, const llvm::Value *> &Cache) const;
+  /// @brief Inspect the predicate and the operand that is compared against an
+  ///        opencl builtin to determine if it's better not to vectorize the
+  ///        kernel.
+  ///
+  /// @param[in] RHS The operand compared against an opencl builtin.
+  /// @param[in] Pred The kind of comparison.
+  ///
+  /// @return The branch's path not to vectorize, if any.
+  BrClauseKind
+  shouldVectorizeVisitCmpOperands(const llvm::Value *RHS,
+                                  llvm::CmpInst::Predicate Pred) const;
+
+  /// @brief The function to analyze.
+  llvm::Function &F;
+
+  /// @brief The vectorization context.
+  VectorizationContext &Ctx;
+
+  /// @brief Vectorization factor to use.
+  ElementCount SimdWidth;
+
+  /// @brief Vectorization dimension to use.
+  unsigned SimdDimIdx;
+};
+
+Heuristics::BrClauseKind
+Heuristics::shouldVectorizeVisitCmpOperands(const Value *RHS,
+                                            CmpInst::Predicate Pred) const {
+  // If we have an `EQ` comparison, the single lane computation happens on
+  // the true successor.
+  if (Pred == CmpInst::Predicate::ICMP_EQ) {
+    return BrClauseKind::True;
+  }
+
+  // If we have an `NE` comparison, the single lane computation happens on
+  // the false successor.
+  if (Pred == CmpInst::Predicate::ICMP_NE) {
+    return BrClauseKind::False;
+  }
+
+  if (!RHS) {
+    return BrClauseKind::None;
+  }
+
+  // If the value we compare against the opencl builtin call is a constant,
+  // determine whether vectorizing is worthwhile based on the chances of
+  // hitting a branch.
+  if (const ConstantInt *Val = dyn_cast<ConstantInt>(RHS)) {
+    // If we have a branch whose condition only applies for at most half of the
+    // simd width, it is not worth vectorizing it.
+    switch (Pred) {
+      default:
+        break;
+      // If we have a `GT` or `GE` comparison, and the constant we compare the
+      // opencl builtin against is greater than half of the simd width, we will
+      // not take the true branch as often as the false branch.
+      case CmpInst::Predicate::ICMP_UGT:
+      case CmpInst::Predicate::ICMP_UGE:
+      case CmpInst::Predicate::ICMP_SGT:
+      case CmpInst::Predicate::ICMP_SGE:
+        if (SimdWidth.isScalable()) {
+          return BrClauseKind::True;
+        } else if (Val->getValue().sgt(SimdWidth.getFixedValue() / 2)) {
+          return BrClauseKind::True;
+        } else if (Val->getValue().slt(SimdWidth.getFixedValue() / 2)) {
+          return BrClauseKind::False;
+        }
+        break;
+      // If we have an `LT` or `LE` comparison, and the constant we compare the
+      // opencl builtin against is smaller than half of the simd width, we will
+      // not take the true branch as often as the false branch.
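+      // For example (illustrative), with a fixed SIMD width of 8, a guard
+      // such as `if (get_global_id(0) < 2)` holds for only a quarter of the
+      // lanes, so the true successor is the rarely-taken path.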
+      case CmpInst::Predicate::ICMP_ULT:
+      case CmpInst::Predicate::ICMP_ULE:
+      case CmpInst::Predicate::ICMP_SLT:
+      case CmpInst::Predicate::ICMP_SLE:
+        if (SimdWidth.isScalable()) {
+          return BrClauseKind::False;
+        } else if (Val->getValue().slt(SimdWidth.getFixedValue() / 2)) {
+          return BrClauseKind::True;
+        } else if (Val->getValue().sgt(SimdWidth.getFixedValue() / 2)) {
+          return BrClauseKind::False;
+        }
+        break;
+    }
+  }
+
+  return BrClauseKind::None;
+}
+
+const Value *Heuristics::shouldVectorizeVisitCmpOperand(
+    const Value *Val, const CmpInst *Cmp,
+    DenseMap<const Value *, const Value *> &Cache) const {
+  const auto It = Cache.find(Val);
+  if (It != Cache.end()) {
+    return It->second;
+  }
+
+  // If we are visiting a binary operator, inspect both its operands.
+  if (const BinaryOperator *BO = dyn_cast<BinaryOperator>(Val)) {
+    const Value *LHS =
+        shouldVectorizeVisitCmpOperand(BO->getOperand(0), Cmp, Cache);
+    const Value *RHS =
+        shouldVectorizeVisitCmpOperand(BO->getOperand(1), Cmp, Cache);
+
+    auto &Result = Cache[Val];
+
+    // If either of LHS and RHS is null and the comparison instruction is not
+    // an equality, Val is not constant and is used in a relational comparison.
+    // We don't want to work with that.
+    if ((!LHS || !RHS) && !Cmp->isEquality()) {
+      return (Result = nullptr);
+    }
+
+    // If the operands of the BinaryOperator are a CallInst and anything else,
+    // we do not want to keep going. We wish to avoid such comparisons:
+    // if ((get_local_id(0) & Constant) == Constant) {}
+    if (dyn_cast_or_null<CallInst>(LHS)) {
+      return (Result = nullptr);
+    }
+    if (dyn_cast_or_null<CallInst>(RHS)) {
+      return (Result = nullptr);
+    }
+
+    // Up to this point, LHS and RHS are either ConstantInt or null.
+    if (LHS) {
+      return (Result = LHS);
+    }
+    return (Result = RHS);
+  }
+
+  // If we are visiting a unary operator, inspect its operand.
+  if (const UnaryInstruction *UI = dyn_cast<UnaryInstruction>(Val)) {
+    return shouldVectorizeVisitCmpOperand(UI->getOperand(0), Cmp, Cache);
+  }
+
+  if (const CallInst *CI = dyn_cast<CallInst>(Val)) {
+    // We only care if the CallInst does involve a call to a work-item builtin.
+    const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+    if (auto B = BI.analyzeBuiltinCall(*CI, SimdDimIdx)) {
+      const auto Uniformity = B->uniformity;
+      if (Uniformity == compiler::utils::eBuiltinUniformityInstanceID ||
+          Uniformity == compiler::utils::eBuiltinUniformityMaybeInstanceID) {
+        return (Cache[Val] = CI);
+      }
+    }
+  }
+
+  if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+    return (Cache[Val] = CI);
+  }
+
+  return (Cache[Val] = nullptr);
+}
+
+Heuristics::BrClauseKind
+Heuristics::shouldVectorizeVisitCmp(const CmpInst *Cmp) const {
+  // The following two calls return either a CallInst, a ConstantInt, or
+  // nullptr otherwise. If a call returns a CallInst, it necessarily is a call
+  // to get_{global|local}_id, because otherwise we don't care.
+  DenseMap<const Value *, const Value *> Cache;
+  const Value *LHS =
+      shouldVectorizeVisitCmpOperand(Cmp->getOperand(0), Cmp, Cache);
+  const Value *RHS =
+      shouldVectorizeVisitCmpOperand(Cmp->getOperand(1), Cmp, Cache);
+
+  const CmpInst::Predicate pred = Cmp->getPredicate();
+
+  BrClauseKind vectorize = BrClauseKind::None;
+  // The CmpInst may involve two CallInsts, or it may involve only one, but
+  // we don't know on which side it may be.
+  if (llvm::isa_and_nonnull<CallInst>(LHS)) {
+    vectorize = shouldVectorizeVisitCmpOperands(RHS, pred);
+  }
+  if (llvm::isa_and_nonnull<CallInst>(RHS)) {
+    const BrClauseKind RHSStatus = shouldVectorizeVisitCmpOperands(LHS, pred);
+    // This should never happen but in case it does, we want to "void" the
+    // result and vectorize!
+    if (vectorize != BrClauseKind::None && vectorize != RHSStatus) {
+      return BrClauseKind::None;
+    }
+    vectorize = RHSStatus;
+  }
+  return vectorize;
+}
+
+Heuristics::BrClauseKind
+Heuristics::shouldVectorizeVisitBr(const Value *Comp) const {
+  // If we are visiting a binary operator, inspect both its operands to
+  // perhaps find CmpInsts.
+  // E.g.: %and = and ...
+  //       br i1 %and, ...
+  if (const BinaryOperator *BO = dyn_cast<BinaryOperator>(Comp)) {
+    return (static_cast<BrClauseKind>(
+        static_cast<bool>(shouldVectorizeVisitBr(BO->getOperand(0))) &&
+        static_cast<bool>(shouldVectorizeVisitBr(BO->getOperand(1)))));
+  }
+
+  if (const CmpInst *CI = dyn_cast<CmpInst>(Comp)) {
+    return shouldVectorizeVisitCmp(CI);
+  }
+
+  return BrClauseKind::None;
+}
+
+bool Heuristics::shouldVectorize() {
+  BasicBlock &BB = F.getEntryBlock();
+
+  // Weights computed by the kind of instructions.
+  // For the moment, we only consider stores/loads and function calls as being
+  // expensive, without looking at which function is being called
+  // (except for work-item calls).
+  //
+  // Ultimately, it feels like this check should be done at some point during
+  // the vectorization process, so that we have a better overview of how bad
+  // the vectorized kernel is compared to the scalar one.
+  //
+  // We should most likely check only for instructions that have varying
+  // operands.
+  auto getWeight = [this](BasicBlock &B) {
+    unsigned weight = 0;
+    for (Instruction &I : B) {
+      if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
+        weight++;
+      } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+        const compiler::utils::BuiltinInfo &BI = Ctx.builtins();
+        if (Function *Callee = CI->getCalledFunction()) {
+          const auto builtin = BI.analyzeBuiltin(*Callee);
+          if (!builtin || !(builtin->properties &
+                            compiler::utils::eBuiltinPropertyWorkItem)) {
+            weight++;
+          }
+        }
+      }
+    }
+    return weight;
+  };
+
+  // If the program is laid out such that it may not be worth vectorizing
+  // based only on the comparison in the entry block, we also have to make
+  // sure that the entry block does not do as much expensive work as its
+  // successors, in which case it might still be worth vectorizing.
+  // We want to check whether the entry block does some computation and stores
+  // the results. Basically, if the kernel looks like:
+  //
+  // __kernel void FuncName(Params) {
+  //   // (1) Do something.
+  //   // (2) Store that something.
+  //   if (get_global_id(0) == 0) {
+  //     // (3) Do something.
+  //   }
+  //   // (4) Do nothing.
+  // }
+  //
+  // then we might still want to vectorize it because (1) might be eligible for
+  // great vectorization improvements.
+  // If (2) is not present in the kernel, then we will probably not want to
+  // vectorize the kernel as (1) will then either be useless or only be used
+  // in (3). The former implies that it will never be used and the latter
+  // implies that it will be used only once per lane, so not worth vectorizing!
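+  //
+  // Concretely, the code below only refuses to vectorize when the
+  // commonly-taken successor is a bare return and the entry block does less
+  // weighted work than the blocks guarded by the rarely-taken branch.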
+  const unsigned entryBlockWeight = getWeight(BB);
+
+  Instruction *TI = BB.getTerminator();
+  if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+    if (BI->isConditional()) {
+      const BrClauseKind clause = shouldVectorizeVisitBr(BI->getCondition());
+      unsigned succWeight = 0;
+      if (clause != BrClauseKind::None) {
+        BasicBlock *start = nullptr;
+        BasicBlock *terminatingBlock = nullptr;
+        if (clause == BrClauseKind::True) {
+          start = BI->getSuccessor(0);
+          terminatingBlock = BI->getSuccessor(1);
+        } else {
+          start = BI->getSuccessor(1);
+          terminatingBlock = BI->getSuccessor(0);
+        }
+        assert(terminatingBlock &&
+               "Failed to get terminating block of branch inst");
+
+        std::unordered_set<BasicBlock *> visited;
+        std::vector<BasicBlock *> worklist{start};
+        visited.insert(start);
+        while (!worklist.empty()) {
+          BasicBlock *cur = worklist.back();
+          worklist.pop_back();
+          succWeight += getWeight(*cur);
+          for (BasicBlock *succ : successors(cur)) {
+            if (succ == terminatingBlock) {
+              continue;
+            }
+            if (visited.insert(succ).second) {
+              worklist.push_back(succ);
+            }
+          }
+        }
+
+        // We don't want to vectorize if the path that will be taken the most
+        // is the exit block of the function and does nothing else but return.
+        if (isa<ReturnInst>(terminatingBlock->getTerminator()) &&
+            (terminatingBlock->size() == 1) &&
+            // Arbitrary limit.
+            (entryBlockWeight < succWeight)) {
+          return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+} // namespace
+
+namespace vecz {
+bool shouldVectorize(llvm::Function &F, VectorizationContext &Ctx,
+                     ElementCount VF, unsigned SimdDimIdx) {
+  Heuristics VH(F, Ctx, VF, SimdDimIdx);
+  return VH.shouldVectorize();
+}
+} // namespace vecz
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
new file mode 100644
index 0000000000000..6516d2f593982
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorization_unit.cpp
@@ -0,0 +1,170 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "vectorization_unit.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include "debugging.h"
+#include "vectorization_context.h"
+#include "vectorization_helpers.h"
+#include "vecz/vecz_choices.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace vecz;
+using namespace llvm;
+
+VectorizationUnit::VectorizationUnit(Function &F, ElementCount Width,
+                                     unsigned Dimension,
+                                     VectorizationContext &Ctx,
+                                     const VectorizationChoices &Ch)
+    : Ctx(Ctx), Choices(Ch), ScalarFn(&F), VectorizedFn(nullptr),
+      SimdWidth(ElementCount()), LocalSize(0), AutoSimdWidth(false),
+      SimdDimIdx(Dimension), FnFlags(eFunctionNoFlag) {
+  // Gather information about the function's arguments.
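+  // Each argument starts out unvectorized here; setVectorizedFunction()
+  // fills in NewArg and any placeholder once a vectorized function exists.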
+  for (Argument &Arg : F.args()) {
+    VectorizerTargetArgument TargetArg;
+    TargetArg.OldArg = &Arg;
+    TargetArg.NewArg = nullptr;
+    TargetArg.IsVectorized = false;
+    TargetArg.PointerRetPointeeTy = nullptr;
+    TargetArg.Placeholder = nullptr;
+    Arguments.push_back(TargetArg);
+  }
+
+  // Set the desired SIMD width and try to look up the vectorized function.
+  setWidth(Width);
+}
+
+VectorizationUnit::~VectorizationUnit() {}
+
+Function &VectorizationUnit::function() {
+  if (VectorizedFn) {
+    return *VectorizedFn;
+  } else {
+    return *ScalarFn;
+  }
+}
+
+const Function &VectorizationUnit::function() const {
+  if (VectorizedFn) {
+    return *VectorizedFn;
+  } else {
+    return *ScalarFn;
+  }
+}
+
+void VectorizationUnit::setWidth(ElementCount NewWidth) {
+  if (NewWidth == SimdWidth) {
+    return;
+  }
+  SimdWidth = NewWidth;
+
+  // Determine the vectorized function's name and try to look it up.
+  const std::string VectorizedName =
+      getVectorizedFunctionName(ScalarFn->getName(), SimdWidth, Choices);
+  if (VectorizedFn) {
+    VectorizedFn->setName(VectorizedName);
+  } else {
+    setVectorizedFunction(Ctx.module().getFunction(VectorizedName));
+  }
+}
+
+void VectorizationUnit::setScalarFunction(llvm::Function *NewFunction) {
+  if (!NewFunction) {
+    return;
+  }
+  ScalarFn = NewFunction;
+  unsigned i = 0;
+  for (Argument &Arg : NewFunction->args()) {
+    VectorizerTargetArgument &TargetArg = Arguments[i];
+    TargetArg.OldArg = &Arg;
+    i++;
+  }
+}
+
+void VectorizationUnit::setVectorizedFunction(llvm::Function *NewFunction) {
+  VectorizedFn = NewFunction;
+  ArgumentPlaceholders.clear();
+  if (!NewFunction) {
+    for (unsigned i = 0; i < Arguments.size(); i++) {
+      VectorizerTargetArgument &TargetArg = Arguments[i];
+      TargetArg.NewArg = nullptr;
+      TargetArg.Placeholder = nullptr;
+    }
+  } else {
+    unsigned i = 0;
+    for (Argument &Arg : NewFunction->args()) {
+      VectorizerTargetArgument &TargetArg = Arguments[i];
+      TargetArg.NewArg = &Arg;
+
+      Instruction *Placeholder = nullptr;
+      if (TargetArg.IsVectorized && !TargetArg.PointerRetPointeeTy &&
+          !Arg.user_empty()) {
+        // A vectorized argument will be used only by its placeholder
+        // extractelement instruction.
+        Placeholder = cast<Instruction>(*Arg.user_begin());
+      }
+
+      TargetArg.Placeholder = Placeholder;
+      if (Placeholder) {
+        // Mark the extracts to distinguish them from other instructions.
+        ArgumentPlaceholders.insert(Placeholder);
+      }
+      i++;
+    }
+  }
+}
+
+vecz::internal::AnalysisFailResult
+VectorizationUnit::setFailed(const char *remark, const llvm::Function *F,
+                             const llvm::Value *V) {
+  setFlag(eFunctionVectorizationFailed);
+  emitVeczRemarkMissed(F ? F : &function(), V, remark);
+  return vecz::internal::AnalysisFailResult();
+}
+
+VectorizationResult VectorizationUnit::getResult() const {
+  VectorizationResult res;
+  res.func = VectorizedFn;
+
+  for (const VectorizerTargetArgument &TargetArg : Arguments) {
+    Type *pointerRetPointeeTy = nullptr;
+    VectorizationResult::Arg::Kind kind = VectorizationResult::Arg::SCALAR;
+    if (auto *ty = TargetArg.PointerRetPointeeTy) {
+      pointerRetPointeeTy = ty;
+      kind = VectorizationResult::Arg::POINTER_RETURN;
+    } else if (TargetArg.IsVectorized) {
+      kind = VectorizationResult::Arg::VECTORIZED;
+    }
+    res.args.emplace_back(kind, TargetArg.NewArg->getType(),
+                          pointerRetPointeeTy);
+  }
+  return res;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
new file mode 100644
index 0000000000000..a9c44e44b2cd4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vectorizer.cpp
@@ -0,0 +1,363 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "vectorizer.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "analysis/vectorizable_function_analysis.h"
+#include "debugging.h"
+#include "memory_operations.h"
+#include "vectorization_context.h"
+#include "vectorization_helpers.h"
+#include "vectorization_heuristics.h"
+#include "vectorization_unit.h"
+#include "vecz/pass.h"
+#include "vecz/vecz_choices.h"
+
+#define DEBUG_TYPE "vecz"
+
+using namespace vecz;
+using namespace llvm;
+
+namespace {
+static cl::opt<bool>
+    VeczDumpReport("vecz-dump-report",
+                   cl::desc("report the post-vectorization status"));
+// static cl options allow us to access these options from other cpp files,
+// such as vectorization_unit.cpp
+
+} // namespace
+
+// Statistics
+STATISTIC(VeczSuccess, "Number of kernels successfully vectorized [ID#V80]");
+STATISTIC(VeczFail, "Number of kernels that failed to vectorize [ID#V81]");
+STATISTIC(VeczBail,
+          "Number of kernels where vectorization was not attempted [ID#V82]");
+
+STATISTIC(ScalarInstructions,
+          "Number of instructions in the scalar kernel [ID#V00]");
+STATISTIC(ScalarLoadStores,
+          "Number of loads and stores in the scalar kernel [ID#V01]");
+STATISTIC(ScalarVectorInsts,
+          "Number of vector instructions in the scalar kernel [ID#V02]");
+STATISTIC(ScalarMaxVectorWidth,
+          "The width of the widest vector instruction found in the scalar "
+          "kernel [ID#V13]");
+STATISTIC(VeczInstructions,
+          "Number of instructions in the vectorized kernel [ID#V03]");
+STATISTIC(VeczScalarInstructions,
+          "Number of scalar instructions in the vectorized kernel [ID#V04]");
+STATISTIC(VeczVectorInstructions,
+          "Number of vector instructions in the vectorized kernel [ID#V05]");
+STATISTIC(VeczInsertExtract,
+          "Number of insert/extractelement instructions in the vectorized "
+          "kernel [ID#V06]");
+STATISTIC(VeczSplats,
+          "Number of vector splats in the vectorized kernel [ID#V07]");
+STATISTIC(
+    VeczScalarMemOp,
+    "Number of scalar loads and stores in the vectorized kernel [ID#V0A]");
+STATISTIC(
+    VeczVectorMemOp,
+    "Number of vector loads and stores in the vectorized kernel [ID#V0B]");
+STATISTIC(
+    VeczMaskedMemOps,
+    "Number of masked memory operations in the vectorized kernel [ID#V0C]");
+STATISTIC(VeczInterleavedMemOps,
+          "Number of interleaved memory operations in the vectorized kernel "
+          "[ID#V0D]");
+STATISTIC(VeczMaskedInterleavedMemOps,
+          "Number of masked interleaved memory operations in the vectorized "
+          "kernel [ID#V0E]");
+STATISTIC(VeczScatterGatherMemOps,
+          "Number of scatter/gather memory operations in the vectorized kernel "
+          "[ID#V10]");
+STATISTIC(VeczMaskedScatterGatherMemOps,
+          "Number of masked scatter/gather operations in the vectorized "
+          "kernel [ID#V11]");
+STATISTIC(VeczVectorWidth, "Vector width of the vectorized kernel [ID#V12]");
+STATISTIC(Ratio, "Normalized ratio of theoretical speedup [ID#V13]");
+
+namespace {
+/// @brief Calculate vectorization related statistics from the kernels
+///
+/// @param[in] VU The Vectorization Unit we are working on
+/// @param[in] Scalar The scalar function that we have vectorized
+/// @param[in] Vectorized The vectorized version of the scalar function
+void collectStatistics(VectorizationUnit &VU, Function *Scalar,
+                       Function *Vectorized) {
+  // Do not gather statistics if we failed to vectorize, if we're doing
+  // scalable vectorization, or if statistics aren't enabled in the first
+  // place.
+  if (!Scalar || !Vectorized || !AreStatisticsEnabled() ||
+      VU.width().isScalable()) {
+    return;
+  }
+
+  VeczVectorWidth = VU.width().getFixedValue();
+
+  // Function to check if an instruction is a vector instruction or not
+  auto isVectorInst = [](Instruction &I) -> bool {
+    Type *Ty = I.getType();
+
+    // Insert/extractelement are not really vector instructions
+    if (isa<InsertElementInst>(I) || isa<ExtractElementInst>(I)) {
+      return false;
+    }
+    // Instructions that return a vector
+    if (isa<VectorType>(Ty)) {
+      return true;
+    }
+    // Store instructions that store a vector value
+    if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+      auto *ValOp = SI->getValueOperand();
+      assert(ValOp && "Could not get value operand");
+      return isa<VectorType>(ValOp->getType());
+    }
+    // Internal builtins that work on vectors. This is relevant for stores
+    // only, as loads return a vector type and will be caught earlier on.
+    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+      if (auto Op = MemOp::get(CI)) {
+        // With the exception of masked loads and stores, every other internal
+        // builtin works with vectors
+        if (!Op->isMaskedMemOp()) {
+          return true;
+        }
+        // Masked loads are handled earlier on as they return a vector type.
+        // We need to check if masked stores are storing vectors or not.
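+        // For example, a masked store of a <4 x i32> value counts as a
+        // vector instruction, while a masked store of a plain i32 does not.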
+        if (Op->isStore() && isa<VectorType>(Op->getDataType())) {
+          return true;
+        }
+      }
+    }
+
+    return false;
+  };
+
+  unsigned MaxScalarVectorWidth = 0;
+  // Collect the scalar kernel's statistics
+  for (auto &BB : *Scalar) {
+    for (auto &I : BB) {
+      ++ScalarInstructions;
+      ScalarLoadStores += (isa<LoadInst>(I) || isa<StoreInst>(I));
+      ScalarVectorInsts += isVectorInst(I);
+      // Find out how wide the widest vector used in the scalar kernel is
+      if (auto *VecTy = dyn_cast<FixedVectorType>(I.getType())) {
+        MaxScalarVectorWidth =
+            std::max(VecTy->getNumElements(), MaxScalarVectorWidth);
+      }
+    }
+  }
+  ScalarMaxVectorWidth = MaxScalarVectorWidth;
+
+  // Collect the vectorized kernel's statistics
+  for (auto &BB : *Vectorized) {
+    for (auto &I : BB) {
+      // Count instructions
+      ++VeczInstructions;
+
+      // Detect vector splats
+      // Count insert/extractelement instructions
+      if (isa<InsertElementInst>(I) || isa<ExtractElementInst>(I)) {
+        if (I.getName().starts_with(".splatinsert")) {
+          ++VeczSplats;
+        }
+        ++VeczInsertExtract;
+      }
+
+      // Count vector and scalar instructions
+      if (isVectorInst(I)) {
+        ++VeczVectorInstructions;
+      } else {
+        ++VeczScalarInstructions;
+      }
+
+      // Count memory operation types
+      if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+        // Normal scalar/vector loads and stores
+        if (isVectorInst(I)) {
+          ++VeczVectorMemOp;
+        } else {
+          ++VeczScalarMemOp;
+        }
+      } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+        Function *F = CI->getCalledFunction();
+        if (!F) {
+          continue;
+        }
+        // Subtract 1 for the call instruction, since we are inlining
+        --VeczInstructions;
+
+        for (auto &BB : *F) {
+          for (auto &Inst : BB) {
+            VeczInstructions += !isa<ReturnInst>(&Inst);
+          }
+        }
+        // Internal builtin memory operations
+        if (auto Op = MemOp::get(&I)) {
+          VeczMaskedMemOps += Op->isMaskedMemOp();
+          VeczInterleavedMemOps += Op->getDesc().isInterleavedMemOp();
+          VeczMaskedInterleavedMemOps += Op->isMaskedInterleavedMemOp();
+          VeczScatterGatherMemOps += Op->getDesc().isScatterGatherMemOp();
+          VeczMaskedScatterGatherMemOps += Op->isMaskedScatterGatherMemOp();
+        }
+      }
+    }
+  }
+
+  // Ratio = Normalized Scalar Insts / Vector Insts
+  // Normalized Scalar Insts = Simd Width * Scalar Insts
+  // IK - Input Kernel
+  // Scalar Insts = IK's Scalar Insts + IK's Vec Insts * IK's VecWidth
+  const unsigned SimdWidth = VU.width().getFixedValue();
+  Ratio = (SimdWidth * (ScalarInstructions - ScalarVectorInsts +
+                        ScalarVectorInsts * MaxScalarVectorWidth)) /
+          VeczInstructions;
+}
+} // namespace
+
+VectorizationUnit *vecz::createVectorizationUnit(VectorizationContext &Ctx,
+                                                 Function *Kernel,
+                                                 const VeczPassOptions &Opts,
+                                                 FunctionAnalysisManager &FAM,
+                                                 bool Check) {
+  const unsigned SimdDimIdx = Opts.vec_dim_idx;
+  const unsigned LocalSize = Opts.local_size;
+  const bool Auto = Opts.vecz_auto;
+  auto VF = Opts.factor;
+
+  if (!Kernel || VF.isScalar()) {
+    ++VeczBail;
+    VECZ_FAIL();
+  }
+
+  // Up to MAX_SIMD_DIM supported dimensions
+  VECZ_ERROR_IF(SimdDimIdx >= MAX_SIMD_DIM,
+                "Specified vectorization dimension is invalid");
+
+  VECZ_ERROR_IF(VF.getKnownMinValue() == 0, "Vectorization factor of zero");
+
+  // Adjust VF if the local size is known, to vectorize more often.
+  if (LocalSize && !VF.isScalable()) {
+    // If we know the vectorized loop will never be entered, because the
+    // vectorization factor is too large, then vectorizing is a waste of time.
+    // It is better instead to vectorize by a smaller factor. Keep on halving
+    // the vector width until a usable value is found (worst case this value
+    // will be 1, because that evenly divides everything).
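+    // For example (illustrative), a requested width of 8 with a known local
+    // size of 6 halves to 4 and we vectorize by 4; with a local size of 1
+    // the width halves all the way down to 1 and we bail out below.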
+    unsigned FixedSimdWidth = VF.getFixedValue();
+    // Note FixedSimdWidth is either a power of two or 3. If FixedSimdWidth
+    // was 1 then we would not enter the body of the loop (as X%1 is 0 for all
+    // X), if FixedSimdWidth is a greater power of two then dividing it by 2
+    // gives another power of two, 3 divided by 2 gives 1, a power of two. Thus
+    // if this loop runs at least once then FixedSimdWidth will be a power of
+    // 2.
+    assert(FixedSimdWidth == 3 || llvm::isPowerOf2_32(FixedSimdWidth));
+    while (FixedSimdWidth != 1 && FixedSimdWidth > LocalSize) {
+      FixedSimdWidth /= 2;
+      assert(FixedSimdWidth > 0 && "Cannot vectorize (or modulo) by 0.");
+    }
+    if (FixedSimdWidth == 1) {
+      ++VeczBail;
+      emitVeczRemarkMissed(Kernel, nullptr,
+                           "requested Vectorization factor of 1");
+      return nullptr;
+    }
+    VF = ElementCount::get(FixedSimdWidth, false);
+  }
+
+  bool canVectorize = true;
+  if (Check) {
+    auto Res = FAM.getResult<VectorizableFunctionAnalysis>(*Kernel);
+    canVectorize = Res.canVectorize;
+  }
+
+  if (canVectorize &&
+      (!Auto || shouldVectorize(*Kernel, Ctx, VF, SimdDimIdx))) {
+    auto VU =
+        Ctx.createVectorizationUnit(*Kernel, VF, SimdDimIdx, Opts.choices);
+    VU->setAutoWidth(Auto);
+    VU->setLocalSize(Opts.local_size);
+    return VU;
+  }
+  return nullptr;
+}
+
+void vecz::trackVeczSuccessFailure(VectorizationUnit &VU) {
+  Function *Fn = VU.scalarFunction();
+  Function *vectorizedFn = VU.vectorizedFunction();
+  const bool failed = VU.failed();
+  VeczFail += failed;
+  VeczSuccess += !failed;
+  collectStatistics(VU, Fn, vectorizedFn);
+
+  if (VeczDumpReport) {
+    const auto VF = VU.width();
+    auto FnName = Fn->getName();
+    if (vectorizedFn) {
+      errs() << "vecz: Vectorization succeeded for kernel '" << FnName
+             << "' (" << (VF.isScalable() ? "scalable-vector" : "SIMD")
+             << " factor: " << VF.getKnownMinValue() << ") "
+             << *vectorizedFn->getType() << "\n";
+    } else {
+      errs() << "vecz: Vectorization failed for kernel '" << FnName << "'\n";
+    }
+  }
+}
+
+bool vecz::createVectorizedFunctionMetadata(VectorizationUnit &vu) {
+  Function *fn = vu.scalarFunction();
+  Function *vectorizedFn = vu.vectorizedFunction();
+  if (vu.failed()) {
+    vectorizedFn = nullptr;
+  } else {
+    // If vectorization succeeded, clone the OpenCL related metadata from the
+    // scalar kernel. We do not do this while cloning the kernel because if
+    // vectorization fails we will have metadata pointing to non-existing
+    // kernels.
+    cloneOpenCLMetadata(vu);
+  }
+  const auto vf = vu.width();
+  const auto dim = vu.dimension();
+
+  // emit output metadata based on vectorization result
+  auto finalVF = vf;
+
+  const compiler::utils::VectorizationInfo info{
+      finalVF, dim, vu.choices().vectorPredication()};
+
+  if (vectorizedFn && vectorizedFn != fn) { // success
+    // Link the original function to the vectorized one.
+    compiler::utils::linkOrigToVeczFnMetadata(*fn, *vectorizedFn, info);
+
+    // Link the vectorized function back to the original one.
+ compiler::utils::linkVeczToOrigFnMetadata(*vectorizedFn, *fn, info); + } else { // fail or bail + compiler::utils::encodeVectorizationFailedMetadata(*fn, info); + } + return vectorizedFn; +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp new file mode 100644 index 0000000000000..bcbeabbf9766b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/source/vecz_pass_builder.cpp @@ -0,0 +1,289 @@ +// Copyright (C) Codeplay Software Limited +// +// Licensed under the Apache License, Version 2.0 (the "License") with LLVM +// Exceptions; you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "vecz_pass_builder.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "analysis/control_flow_analysis.h" +#include "analysis/divergence_analysis.h" +#include "analysis/liveness_analysis.h" +#include "analysis/packetization_analysis.h" +#include "analysis/simd_width_analysis.h" +#include "analysis/stride_analysis.h" +#include "analysis/uniform_value_analysis.h" +#include "analysis/vectorizable_function_analysis.h" +#include "analysis/vectorization_unit_analysis.h" +#include "debugging.h" +#include "multi_llvm/llvm_version.h" +#include "transform/common_gep_elimination_pass.h" +#include "transform/control_flow_conversion_pass.h" +#include "transform/inline_post_vectorization_pass.h" +#include "transform/interleaved_group_combine_pass.h" +#include "transform/packetization_helpers.h" +#include "transform/packetization_pass.h" +#include "transform/passes.h" +#include "transform/scalarization_pass.h" +#include "transform/ternary_transform_pass.h" + +#define DEBUG_TYPE "vecz" +using namespace llvm; +using namespace vecz; + +VeczPassMachinery::VeczPassMachinery( + llvm::LLVMContext &llvmCtx, llvm::TargetMachine *TM, + VectorizationContext &Ctx, bool verifyEach, + compiler::utils::DebugLogging debugLogLevel) + : compiler::utils::PassMachinery(llvmCtx, TM, verifyEach, debugLogLevel), + Ctx(Ctx) {} + +void VeczPassMachinery::registerPasses() { + // Register standard passes + compiler::utils::PassMachinery::registerPasses(); + + FAM.registerPass([&] { return VectorizationContextAnalysis(Ctx); }); + FAM.registerPass([&] { return VectorizationUnitAnalysis(Ctx); }); + FAM.registerPass([&] { return VectorizableFunctionAnalysis(); }); + FAM.registerPass([] { return StrideAnalysis(); }); + FAM.registerPass([] { return UniformValueAnalysis(); }); + FAM.registerPass([] { return LivenessAnalysis(); }); + FAM.registerPass([] { return PacketizationAnalysis(); }); + FAM.registerPass([] { return CFGAnalysis(); }); + 
FAM.registerPass([] { return DivergenceAnalysis(); });
+
+  if (!TM) {
+    FAM.registerPass([] { return TargetIRAnalysis(); });
+  } else {
+    FAM.registerPass(
+        [this] { return TargetIRAnalysis(TM->getTargetIRAnalysis()); });
+    FAM.registerPass([] { return SimdWidthAnalysis(); });
+  }
+}
+
+void VeczPassMachinery::addClassToPassNames() {
+  {
+#define MODULE_PASS(NAME, CREATE_PASS) \
+  PIC.addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
+#define FUNCTION_PASS(NAME, CREATE_PASS) \
+  PIC.addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
+#define LOOP_PASS(NAME, CREATE_PASS) \
+  PIC.addClassToPassName(decltype(CREATE_PASS)::name(), NAME);
+#include "passes.def"
+  }
+
+  // Register a callback which skips all passes once we've failed to vectorize
+  // a function.
+  PIC.registerShouldRunOptionalPassCallback([&](StringRef, llvm::Any IR) {
+    const Function *const *FPtr = any_cast<const Function *>(&IR);
+    const Function *F = FPtr ? *FPtr : nullptr;
+    if (!F) {
+      if (const auto *const *L = any_cast<const Loop *>(&IR)) {
+        F = (*L)->getHeader()->getParent();
+      } else {
+        // Always run module passes
+        return true;
+      }
+    }
+    // FIXME: This is repeating the job of the VectorizationUnitAnalysis.
+    // We should track 'failure' more directly in the
+    // Function/VectorizationContext?
+    const auto *const VU = Ctx.getActiveVU(F);
+    if (!VU) {
+      // Don't run on anything without a VU since it's not currently being
+      // vectorized.
+      return false;
+    }
+    return !VU->failed();
+  });
+}
+
+void VeczPassMachinery::registerPassCallbacks() {
+  // Add a backwards-compatible way of supporting simplifycfg, which used
+  // to be called simplify-cfg before LLVM 12.
+  PB.registerPipelineParsingCallback(
+      [](StringRef Name, ModulePassManager &PM,
+         ArrayRef<PassBuilder::PipelineElement>) {
+#define MODULE_PASS(NAME, CREATE_PASS) \
+  if (Name == NAME) { \
+    PM.addPass(CREATE_PASS); \
+    return true; \
+  }
+#define FUNCTION_PASS(NAME, CREATE_PASS) \
+  if (Name == NAME) { \
+    PM.addPass(createModuleToFunctionPassAdaptor(CREATE_PASS)); \
+    return true; \
+  }
+#define LOOP_PASS(NAME, CREATE_PASS) \
+  if (Name == NAME) { \
+    PM.addPass(createModuleToFunctionPassAdaptor( \
+        createFunctionToLoopPassAdaptor(CREATE_PASS))); \
+    return true; \
+  }
+#include "passes.def"
+        return false;
+      });
+}
+
+bool vecz::buildPassPipeline(ModulePassManager &PM) {
+  // Preparation passes
+  PM.addPass(BuiltinInliningPass());
+  {
+    FunctionPassManager FPM;
+    // Lower switches after builtin inlining, in case the builtins had
+    // switches.
+    FPM.addPass(LowerSwitchPass());
+    FPM.addPass(FixIrreduciblePass());
+
+    // It's helpful to run SROA in case it opens up more opportunities to
+    // eliminate aggregates in (particularly SYCL) kernels. This is especially
+    // true after inlining - which we've (usually) just performed in the
+    // BuiltinInliningPass - because otherwise SROA is unable to analyze the
+    // lifetime of allocas due to them being "escaped" by the function callee.
+    FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
+    // We have to run LLVM's Mem2Reg pass in case the front end didn't. Note
+    // that SROA usually runs Mem2Reg internally (unless disabled via a
+    // command-line option) though using its own heuristic. We run it
+    // unconditionally regardless, just for good measure.
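+    // (PromotePass is the new pass manager's spelling of Mem2Reg.)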
+    FPM.addPass(PromotePass());
+    // LLVM's own Mem2Reg pass doesn't always get everything
+    FPM.addPass(BasicMem2RegPass());
+
+    FPM.addPass(InstCombinePass());
+    FPM.addPass(AggressiveInstCombinePass());
+    FPM.addPass(DCEPass());
+    FPM.addPass(PreLinearizePass());
+    // If pre-linearization created any unnecessary Hoist Guards,
+    // the Instruction Combining pass will handily clean them up.
+    FPM.addPass(InstCombinePass());
+    FPM.addPass(SimplifyCFGPass());
+    FPM.addPass(DCEPass());
+    FPM.addPass(UnifyFunctionExitNodesPass());
+    FPM.addPass(LoopSimplifyPass());
+    // Lower switches again because CFG simplification can create them.
+    FPM.addPass(LowerSwitchPass());
+    {
+      LoopPassManager LPM;
+      LPM.addPass(VeczLoopRotatePass());
+      // IndVarSimplify can create a lot of duplicate instructions when there
+      // are unrolled loops. EarlyCSE is there to clear them up. However,
+      // this can destroy LCSSA form, so we need to restore it.
+      LPM.addPass(IndVarSimplifyPass());
+      FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM)));
+    }
+
+    FPM.addPass(EarlyCSEPass());
+    // We run this last because EarlyCSE can actually create infinite loops
+    // (with a "conditional" branch on true)
+    FPM.addPass(createFunctionToLoopPassAdaptor(SimplifyInfiniteLoopPass()));
+
+    FPM.addPass(RemoveIntPtrPass());
+    FPM.addPass(SquashSmallVectorsPass());
+    FPM.addPass(UniformReassociationPass());
+    FPM.addPass(TernaryTransformPass());
+
+    FPM.addPass(BreakCriticalEdgesPass());
+    FPM.addPass(LCSSAPass());
+    FPM.addPass(ControlFlowConversionPass());
+
+    PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+  }
+
+  // Verify that the preparation passes (particularly control-flow conversion)
+  // have left the module in a correct state.
+  PM.addPass(VerifierPass());
+
+  {
+    FunctionPassManager FPM;
+
+    FPM.addPass(DivergenceCleanupPass());
+
+    FPM.addPass(CommonGEPEliminationPass());
+    FPM.addPass(ScalarizationPass());
+
+    FPM.addPass(AggressiveInstCombinePass());
+    FPM.addPass(ADCEPass());
+    FPM.addPass(SimplifyCFGPass());
+    FPM.addPass(SimplifyMaskedMemOpsPass());
+
+    // Having multiple GEP instructions that perform the same operation
+    // greatly amplifies the code generated by the packetizer, as it duplicates
+    // the number of extractelement instructions, so we want to remove what
+    // is unnecessary.
+    FPM.addPass(CommonGEPEliminationPass());
+
+    // The packetizer - the 'main' bit of the vectorization process.
+    FPM.addPass(PacketizationPass());
+
+    FPM.addPass(InlinePostVectorizationPass());
+    FPM.addPass(FlattenCFGPass());
+    FPM.addPass(GVNPass(GVNOptions().setMemDep(true)));
+    FPM.addPass(AggressiveInstCombinePass());
+    FPM.addPass(ADCEPass());
+    FPM.addPass(SinkingPass());
+    FPM.addPass(SimplifyCFGPass());
+    FPM.addPass(AggressiveInstCombinePass());
+
+    FPM.addPass(InterleavedGroupCombinePass(eInterleavedStore));
+    FPM.addPass(InterleavedGroupCombinePass(eInterleavedLoad));
+    FPM.addPass(InstCombinePass());
+    FPM.addPass(InferAlignmentPass());
+    FPM.addPass(DCEPass());
+    FPM.addPass(SimplifyMaskedMemOpsPass());
+
+    PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+  }
+
+  PM.addPass(DefineInternalBuiltinsPass());
+  PM.addPass(VerifierPass());
+
+  return true;
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/CMakeLists.txt
new file mode 100644
index 0000000000000..b47f8f35b3df2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/lit)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/CMakeLists.txt
new file mode 100644
index 0000000000000..7f67eb3a1a873
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/CMakeLists.txt
@@ -0,0 +1,26 @@
+configure_lit_site_cfg(
+  ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
+  ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
+  MAIN_CONFIG
+  ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py
+  PATHS
+  "CMAKE_OSX_SYSROOT"
+  "LLVM_SOURCE_DIR"
+  "LLVM_BINARY_DIR"
+  "LLVM_TOOLS_DIR"
+  "LLVM_LIBS_DIR"
+  "SHLIBDIR"
+  )
+
+# TODO: Consider adding to check-sycl if this is ever moved under llvm/tests.
+# Add a target to invoke tests via Ninja/make.
+add_lit_testsuite(check-sycl-vecz-tests "Running SYCL vecz lit tests"
+  "${CMAKE_CURRENT_BINARY_DIR}"
+
+  DEPENDS
+  veczc
+  FileCheck
+)
+
+add_custom_target(check-sycl-vecz)
+add_dependencies(check-sycl-vecz check-sycl-vecz-tests)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.cfg.py b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.cfg.py
new file mode 100644
index 0000000000000..0c0a2590b6274
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.cfg.py
@@ -0,0 +1,37 @@
+# Copyright (C) Codeplay Software Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+# Exceptions; you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Python configuration file for lit."""
+
+import os
+import lit.formats
+from lit.llvm import llvm_config
+
+
+# Name of the test suite.
+config.name = "LLVM"
+
+# File extensions for testing.
+config.suffixes = [".hlsl", ".ll"]
+
+# The test format used to interpret tests.
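+# ShTest with execute_external=False runs RUN lines with lit's internal
+# shell rather than an external shell.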
+config.test_format = lit.formats.ShTest(execute_external=False) + +# The root path where tests are located. +config.test_source_root = os.path.dirname(__file__) + +llvm_config.with_environment( + "PATH", os.path.abspath(config.llvm_tools_dir), append_path=True +) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.site.cfg.py.in b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.site.cfg.py.in new file mode 100644 index 0000000000000..785ee42143601 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/lit.site.cfg.py.in @@ -0,0 +1,22 @@ +"""Python configuration file for lit.""" + +@LIT_SITE_CFG_IN_HEADER@ + +import os +from lit.llvm.subst import ToolSubst +from lit.llvm import llvm_config + +config.test_exec_root = r"@CURRENT_BINARY_DIR@" + +# Paths to helper utilities +config.tools = [ ToolSubst('veczc') ] + +config.targets = frozenset('@LLVM_TARGETS_TO_BUILD@'.split(';')) + +config.llvm_tools_dir = lit_config.substitute(path(r"@LLVM_TOOLS_DIR@")) + +import lit.llvm +lit.llvm.initialize(lit_config, config) + +# Let the main config do the real work. +lit_config.load_config(config, os.path.join('@CMAKE_CURRENT_SOURCE_DIR@', "lit.cfg.py")) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/lit.local.cfg b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/lit.local.cfg new file mode 100644 index 0000000000000..13f31884ad10f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/lit.local.cfg @@ -0,0 +1,18 @@ +# Copyright (C) Codeplay Software Limited +# +# Licensed under the Apache License, Version 2.0 (the "License") with LLVM +# Exceptions; you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +if not 'AArch64' in config.root.targets: + config.unsupported = True diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll new file mode 100644 index 0000000000000..4a73b10725a00 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_1.ll @@ -0,0 +1,55 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %idx.ext = sext i32 %mul3 to i64 + %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idx.ext + %0 = load i32, i32 addrspace(1)* %add.ptr, align 4 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr, i64 1 + %1 = load i32, i32 addrspace(1)* %arrayidx4, align 4 + %add5 = add nsw i32 %1, %0 + %idxprom = sext i32 %add to i64 + %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %add5, i32 addrspace(1)* %arrayidx8, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: define {{(dso_local )?}}spir_kernel void @load16 +; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1 +; CHECK-NOT: load <4 x i32> +; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2 +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 0 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 1 +; CHECK-NOT: extractvalue +; CHECK-NOT: shufflevector +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll new file mode 100644 index 0000000000000..fc0cc97549baf --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_2.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %conv4 = sext i32 %mul3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %conv4 + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %add5 = or i64 %conv4, 1 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %add5 + %1 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %add7 = add nsw i32 %1, %0 + %idxprom = sext i32 %add to i64 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %add7, i32 addrspace(1)* %arrayidx10, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: define {{(dso_local )?}}spir_kernel void @load16 +; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1 +; CHECK-NOT: load <4 x i32> +; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2 +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 0 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 1 +; CHECK-NOT: extractvalue +; CHECK-NOT: shufflevector +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll new file mode 100644 index 0000000000000..f000efae816a6 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_3.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %idxprom = sext i32 %mul3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %add7 = or i32 %mul3, 1 + %idxprom8 = sext i32 %add7 to i64 + %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom8 + %1 = load i32, i32 addrspace(1)* %arrayidx9, align 4 + %add10 = add nsw i32 %1, %0 + %idxprom13 = sext i32 %add to i64 + %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom13 + store i32 %add10, i32 addrspace(1)* %arrayidx14, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: define {{(dso_local )?}}spir_kernel void @load16 +; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1 +; CHECK-NOT: load <4 x i32> +; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2 +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 0 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 1 +; CHECK-NOT: extractvalue +; CHECK-NOT: shufflevector +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll new file mode 100644 index 0000000000000..82c8454716a5f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_4.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %add4 = or i32 %mul3, 1 + %idxprom = sext i32 %add4 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %idxprom8 = sext i32 %mul3 to i64 + %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom8 + %1 = load i32, i32 addrspace(1)* %arrayidx9, align 4 + %sub = sub nsw i32 %0, %1 + %idxprom12 = sext i32 %add to i64 + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom12 + store i32 %sub, i32 addrspace(1)* %arrayidx13, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: define {{(dso_local )?}}spir_kernel void @load16 +; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1 +; CHECK-NOT: load <4 x i32> +; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2 +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 0 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 1 +; CHECK-NOT: extractvalue +; CHECK-NOT: shufflevector +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll new file mode 100644 index 0000000000000..cd0d380e50e54 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_5.ll @@ -0,0 +1,69 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %idxprom = sext i32 %mul3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %add7 = or i32 %mul3, 1 + %idxprom8 = sext i32 %add7 to i64 + %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom8 + %1 = load i32, i32 addrspace(1)* %arrayidx9, align 4 + %add13 = add nsw i32 %mul3, 2 + %idxprom14 = sext i32 %add13 to i64 + %arrayidx15 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom14 + %2 = load i32, i32 addrspace(1)* %arrayidx15, align 4 + %add19 = add nsw i32 %mul3, 3 + %idxprom20 = sext i32 %add19 to i64 + %arrayidx21 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom20 + %3 = load i32, i32 addrspace(1)* %arrayidx21, align 4 + %add22 = add nsw i32 %1, %0 + %add23 = add nsw i32 %add22, %2 + %add24 = add nsw i32 %add23, %3 + %idxprom27 = sext i32 %add to i64 + %arrayidx28 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom27 + store i32 %add24, i32 addrspace(1)* %arrayidx28, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: define {{(dso_local )?}}spir_kernel void @load16 +; CHECK: [[LOAD1:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD1]], 0 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD1]], 1 +; CHECK: [[LOAD2:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD2]], 0 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD2]], 1 +; CHECK-NOT: load <4 x i8> +; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2 +; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load +; CHECK-NOT: shufflevector <4 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll new file mode 100644 index 0000000000000..b6327e55775cd --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/AArch64/shuffled_load_aarch64_6.ll @@ -0,0 +1,59 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple=aarch64-unknown-unknown -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %add4 = add nsw i32 %mul3, 3 + %idxprom = sext i32 %add4 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %shl = shl i32 %0, 1 + %add8 = add nsw i32 %mul3, 2 + %idxprom9 = sext i32 %add8 to i64 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9 + %1 = load i32, i32 addrspace(1)* %arrayidx10, align 4 + %sub = sub nsw i32 %shl, %1 + %idxprom13 = sext i32 %add to i64 + %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom13 + store i32 %sub, i32 addrspace(1)* %arrayidx14, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: define {{(dso_local )?}}spir_kernel void @load16 +; CHECK: [[LOAD:%.+]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p1 +; CHECK-NOT: load <4 x i32> +; CHECK-NOT: call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2 +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i32> @__vecz_b_gather_load +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 0 +; CHECK: extractvalue { <4 x i32>, <4 x i32> } [[LOAD]], 1 +; CHECK-NOT: extractvalue +; CHECK-NOT: shufflevector +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll new file mode 100644 index 0000000000000..b6b34d30c45fc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_killer.ll @@ -0,0 +1,150 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k boscc_killer -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_local_size(i32)
+
+@boscc_killer.shared = internal unnamed_addr addrspace(3) global i32 poison, align 4
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @boscc_killer(float addrspace(1)* %A, float addrspace(1)* %B, i32 %N, i32 %lda) {
+entry:
+  %gid0 = tail call i64 @__mux_get_local_id(i32 0)
+  %cmp0 = icmp eq i64 %gid0, 0
+  br i1 %cmp0, label %if.then, label %if.end
+
+if.then:                                          ; preds = %if.end24
+  store i32 %N, i32 addrspace(3)* @boscc_killer.shared, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %for.end, %if.end24
+  %ldl.a = load i32, i32 addrspace(3)* @boscc_killer.shared, align 4
+  %ldl.b = trunc i64 %gid0 to i32
+  %ldl = add i32 %ldl.a, %ldl.b
+  %cmp1 = icmp eq i32 %ldl, 0
+  br i1 %cmp1, label %if.then2, label %if.else
+
+if.else:                                          ; preds = %if.end
+  %cmp2 = icmp slt i32 %ldl, %N
+  br i1 %cmp2, label %for.body, label %exit
+
+for.body:                                         ; preds = %for.inc, %if.end227
+  %acc = phi i32 [ %update2, %for.inc ], [ 1, %if.else ]
+  %acc_shl = shl nuw nsw i32 %acc, 2
+  %update = add i32 %ldl, %acc_shl
+  %cmp3 = icmp slt i32 %update, %ldl
+  br i1 %cmp3, label %for.if.then, label %for.inc
+
+for.if.then:                                      ; preds = %for.body
+  %mul297.us = mul nsw i32 %update, %lda
+  %add298.us = add nsw i32 %mul297.us, %ldl
+  %idxprom299.us = sext i32 %add298.us to i64
+  %arrayidx300.us = getelementptr inbounds float, float addrspace(1)* %A, i64 %idxprom299.us
+  store float zeroinitializer, float addrspace(1)* %arrayidx300.us, align 16
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.if.then, %for.body
+  %update2 = add nuw nsw i32 %acc, 1
+  %cmp4 = icmp ult i32 %acc, 4
+  br i1 %cmp4, label %for.body, label %exit
+
+if.then2:                                         ; preds = %if.end
+  %gid0_trunc = trunc i64 %gid0 to i32
+  %cmp5 = icmp sgt i32 %ldl, %gid0_trunc
+  br i1 %cmp5, label %if.then3, label %exit
+
+if.then3:                                         ; preds = %for.cond.exit, %if.then53
+  %arrayidxB = getelementptr inbounds float, float addrspace(1)* %B, i64 %gid0
+  %v23 = load float, float addrspace(1)* %arrayidxB, align 16
+  %arrayidxA = getelementptr inbounds float, float addrspace(1)* %A, i64 %gid0
+  store float %v23, float addrspace(1)* %arrayidxA, align 16
+  %call149 = tail call i64 @__mux_get_local_size(i32 0) #6
+  %conv152 = add i64 %call149, %gid0
+  %cmp71 = icmp slt i64 %conv152, 0
+  br label %exit
+
+exit:                                             ; preds = %for.inc, %if.end227, %for.cond.exit, %if.then53, %entry
+  ret void
+}
+
+; We mostly want to check that vectorization succeeded, since this CFG crashed
+; the block ordering algorithm. It does not seem easy to create a UnitCL test
+; for this, because the CFG gets changed into something that no longer causes
+; the crash; the bug was identified from an Ecosystem failure, however, so it
+; must be possible somehow.
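+; Note: with the LinearizeBOSCC choice, cfg-convert keeps a uniform clone of
+; each divergent region (the .uniform blocks below) and inserts .boscc_indir
+; blocks that branch to the predicated clone only when the lanes actually
+; diverge; the checks below pin down that dual-path CFG.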
+; +; CHECK: spir_kernel void @__vecz_v4_boscc_killer +; CHECK: entry: +; CHECK: br i1 %{{.+}}, label %if.then.uniform, label %entry.boscc_indir +; CHECK: if.then.uniform: +; CHECK: br label %if.end +; CHECK: entry.boscc_indir: +; CHECK: br i1 %{{.+}}, label %if.end, label %if.then +; CHECK: if.then: +; CHECK: br label %if.end +; CHECK: if.end: +; CHECK: br i1 %{{.+}}, label %if.then2.uniform, label %if.end.boscc_indir +; CHECK: if.else.uniform: +; CHECK: br i1 %{{.+}}, label %for.body.preheader.uniform, label %if.else.uniform.boscc_indir +; CHECK: for.body.preheader.uniform: +; CHECK: br label %for.body.uniform +; CHECK: if.else.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %exit, label %for.body.preheader +; CHECK: for.body.uniform: +; CHECK: br i1 %{{.+}}, label %for.if.then.uniform, label %for.body.uniform.boscc_indir +; CHECK: for.if.then.uniform: +; CHECK: br label %for.inc.uniform +; CHECK: for.body.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %for.inc.uniform, label %for.body.uniform.boscc_store +; CHECK: for.body.uniform.boscc_store: +; CHECK: br label %for.if.then +; CHECK: for.inc.uniform: +; CHECK: br i1 %{{.+}}, label %for.body.uniform, label %exit.loopexit.uniform +; CHECK: exit.loopexit.uniform: +; CHECK: br label %exit +; CHECK: if.then2.uniform: +; CHECK: br i1 %{{.+}}, label %if.then3.uniform, label %if.then2.uniform.boscc_indir +; CHECK: if.end.boscc_indir: +; CHECK: br i1 %{{.+}}, label %if.else.uniform, label %if.else +; CHECK: if.then3.uniform: +; CHECK: br label %exit +; CHECK: if.then2.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %exit, label %if.then3 +; CHECK: if.else: +; CHECK: br label %for.body.preheader +; CHECK: for.body.preheader: +; CHECK: br label %for.body +; CHECK: for.body: +; CHECK: br label %for.if.then +; CHECK: for.if.then: +; CHECK: br label %for.inc +; CHECK: for.inc: +; CHECK: br i1 %{{.+}}, label %for.body, label %exit.loopexit +; CHECK: if.then2: +; CHECK: br label %if.then3 +; CHECK: if.then3: +; CHECK: br label %exit +; CHECK: exit.loopexit: +; CHECK: br label %if.then2 +; CHECK: exit: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll new file mode 100644 index 0000000000000..014f19594e2b0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge.ll @@ -0,0 +1,303 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k boscc_merge -vecz-passes="function(instcombine),function(simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) #0 +declare i64 @__mux_get_local_size(i32) #0 + +define spir_kernel void @boscc_merge(i32 %n, float addrspace(1)* %out, i64 %x) { +entry: + %lid = tail call i64 @__mux_get_local_id(i32 0) + %lsize = tail call i64 @__mux_get_local_size(i32 0) + %out_ptr = getelementptr inbounds float, float addrspace(1)* %out, i64 %x + %lid_sum_lsize = add i64 %lid, %lsize + %cmp1 = icmp ult i64 %lsize, %x + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %entry + %cmp2 = icmp ult i64 %lid, %x + br i1 %cmp2, label %if.then2, label %if.else2.preheader + +if.else2.preheader: ; preds = %if.then + store float 0.000000e+00, float addrspace(1)* %out_ptr, align 4 ; just so it's non-trivial for BOSCC + br label %if.else2 + +if.then2: ; preds = %if.then + %cmp3 = icmp ugt i64 %lsize, %x + br i1 %cmp3, label %if.then3.preheader, label %if.else3.preheader + +if.else3.preheader: ; preds = %if.then2 + br label %if.else3 + +if.then3.preheader: ; preds = %if.then2 + br label %if.then3 + +if.then3: ; preds = %if.then3.preheader, %if.else5 + %cmp4 = icmp ugt i64 %lid, %x + br i1 %cmp4, label %if.then4.preheader, label %if.else4.preheader + +if.else4.preheader: ; preds = %if.then3 + br label %if.else4 + +if.then4.preheader: ; preds = %if.then3 + br label %if.then4 + +if.else4: ; preds = %if.else4.preheader, %if.else4 + %cmp5 = icmp ult i64 %lid, %x + br i1 %cmp5, label %if.else4, label %if.else5.loopexit1 + +if.else5.loopexit: ; preds = %if.then4 + br label %if.else5 + +if.else5.loopexit1: ; preds = %if.else4 + br label %if.else5 + +if.else5: ; preds = %if.else5.loopexit1, %if.else5.loopexit + %cmp6 = icmp ult i64 %lid, %x + br i1 %cmp6, label %if.then3, label %if.else.loopexit + +if.then4: ; preds = %if.then4.preheader, %if.then4 + %cmp7 = icmp ult i64 %lid_sum_lsize, %x + br i1 %cmp7, label %if.then4, label %if.else5.loopexit + +if.else3: ; preds = %if.else3.preheader, %if.else3 + %cmp8 = icmp ult i64 %lid_sum_lsize, %x + br i1 %cmp8, label %if.else3, label %if.else.loopexit2 + +if.else2: ; preds = %if.else2.preheader, %if.else2 + %cmp9 = icmp ult i64 %lid_sum_lsize, %x + br i1 %cmp9, label %if.else2, label %if.else.loopexit3 + +if.else.loopexit: ; preds = %if.else5 + br label %if.else + +if.else.loopexit2: ; preds = %if.else3 + br label %if.else + +if.else.loopexit3: ; preds = %if.else2 + br label %if.else + +if.else: ; preds = %if.else.loopexit3, %if.else.loopexit2, %if.else.loopexit, %entry + %cmp10 = icmp ult i64 %lid, %x + br i1 %cmp10, label %if.then5, label %if.else6 + +if.then5: ; preds = %if.else + %cmp11 = icmp eq i64 %x, 0 + br i1 %cmp11, label %if.then6, label %if.else7 + +if.else7: ; preds = %if.then5 + %load = load float, float addrspace(1)* %out, align 4 + br label %if.then6 + +if.then6: ; preds = %if.else7, %if.then5 + %ret = phi float [ 0.000000e+00, %if.then5 ], [ %load, %if.else7 ] + store float %ret, float addrspace(1)* %out_ptr, align 4 + br label %if.else6 + +if.else6: ; preds = %if.then6, %if.else + ret void +} + +; CHECK: spir_kernel void @__vecz_v4_boscc_merge +; CHECK: %[[CMP1:.+]] = icmp 
+; CHECK: br i1 %[[CMP1]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: %[[CMP2:.+]] = icmp +; CHECK: br i1 %{{.+}}, label %[[IFTHEN2UNIFORM:.+]], label %[[IFTHENBOSCCINDIR:.+]] + +; CHECK: [[IFELSE2PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[IFELSE2UNIFORM:.+]] + +; CHECK: [[IFELSE2UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE2UNIFORM]], label %[[IFELSE2UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFELSE2UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSELOOPEXIT3UNIFORM:.+]], label %[[IFELSE2UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSE2UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFELSE2:.+]] + +; CHECK: [[IFELSELOOPEXIT3UNIFORM]]: +; CHECK: br label %[[IFELSEUNIFORM:.+]] + +; CHECK: [[IFTHEN2UNIFORM]]: +; CHECK: %[[CMP3UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP3UNIFORM]], label %[[IFTHEN3PREHEADERUNIFORM:.+]], label %[[IFELSE3PREHEADERUNIFORM:.+]] + +; CHECK: [[IFTHENBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE2PREHEADERUNIFORM]], label %[[IFELSE2PREHEADER:.+]] + +; CHECK: [[IFELSE3PREHEADERUNIFORM]]: +; CHECK: br label %[[IFELSE3UNIFORM:.+]] + +; CHECK: [[IFELSE3UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE3UNIFORM]], label %[[IFELSE3UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFELSE3UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSELOOPEXIT2UNIFORM:.+]], label %[[IFELSE3UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSE3UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFELSE3:.+]] + +; CHECK: [[IFELSELOOPEXIT2UNIFORM]]: +; CHECK: br label %[[IFELSEUNIFORM]] + +; CHECK: [[IFTHEN3PREHEADERUNIFORM]]: +; CHECK: br label %[[IFTHEN3UNIFORM:.+]] + +; CHECK: [[IFTHEN3UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN4PREHEADERUNIFORM:.+]], label %[[IFTHEN3UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFELSE5UNIFORMBOSCCINDIR:.+]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSELOOPEXITUNIFORM:.+]], label %[[IFELSE5UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSE5UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFTHEN3:.+]] + +; CHECK: [[IFELSE4PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[IFELSE4UNIFORM:.+]] + +; CHECK: [[IFELSE4UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE4UNIFORM]], label %[[IFELSE4UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFELSE4UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE5LOOPEXIT1UNIFORM:.+]], label %[[IFELSE4UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSE4UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFELSE4:.+]] + +; CHECK: [[IFELSE5LOOPEXIT1UNIFORM]]: +; CHECK: br label %[[IFELSE5UNIFORM:.+]] + +; CHECK: [[IFTHEN4PREHEADERUNIFORM]]: +; CHECK: br label %[[IFTHEN4UNIFORM:.+]] + +; CHECK: [[IFTHEN3UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE4PREHEADERUNIFORM]], label %[[IFTHEN3UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFTHEN3UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFELSE4PREHEADER:.+]] + +; CHECK: [[IFTHEN4UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN4UNIFORM]], label %[[IFTHEN4UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFTHEN4UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE5LOOPEXITUNIFORM:.+]], label %[[IFTHEN4UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFTHEN4UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFTHEN4:.+]] + +; CHECK: [[IFELSE5LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFELSE5UNIFORM]] + +; CHECK: [[IFELSE5UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN3UNIFORM]], label %[[IFELSE5UNIFORMBOSCCINDIR]] + +; CHECK: [[IFELSELOOPEXITUNIFORM]]: +; CHECK: br label %[[IFELSE]] + +; CHECK: [[IFELSE2PREHEADER]]: +; CHECK: br label %[[IFELSE2]] + +; CHECK: [[IFTHEN2:.+]]: +; CHECK: %[[CMP3:.+]] = icmp +; FIXME: We 
shouldn't need to mask this comparison, as it's truly uniform even +; on inactive lanes. +; CHECK: %[[CMP3_ACTIVE:.+]] = select i1 %[[CMP2]], i1 %[[CMP3]], i1 false +; CHECK: %[[CMP3_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[CMP3_ACTIVE]]) +; CHECK: br i1 %[[CMP3_ACTIVE_ANY]], label %[[IFTHEN3PREHEADER:.+]], label %[[IFELSE3PREHEADER:.+]] + +; CHECK: [[IFELSE3PREHEADER]]: +; CHECK: br label %[[IFELSE3]] + +; CHECK: [[IFTHEN3PREHEADER]]: +; CHECK: br label %[[IFTHEN3]] + +; CHECK: [[IFTHEN3]]: +; CHECK: br label %[[IFELSE4PREHEADER]] + +; CHECK: [[IFELSE4PREHEADER]]: +; CHECK: br label %[[IFELSE4]] + +; CHECK: [[IFTHEN4PREHEADER:.+]]: +; CHECK: br label %[[IFTHEN4]] + +; CHECK: [[IFELSE4]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE4]], label %[[IFELSE4PUREEXIT:.+]] + +; CHECK: [[IFELSE4PUREEXIT]]: +; CHECK: br label %[[IFELSE5LOOPEXIT1:.+]] + +; CHECK: [[IFELSE5LOOPEXIT:.+]]: +; CHECK: br label %[[IFELSE5:.+]] + +; CHECK: [[IFELSE5LOOPEXIT1]]: +; CHECK: br label %[[IFTHEN4PREHEADER]] + +; CHECK: [[IFELSE5]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN3]], label %[[IFTHEN3PUREEXIT:.+]] + +; CHECK: [[IFTHEN3PUREEXIT]]: +; CHECK: br label %[[IFELSELOOPEXIT:.+]] + +; CHECK: [[IFTHEN4]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN4]], label %[[IFTHEN4PUREEXIT:.+]] + +; CHECK: [[IFTHEN4PUREEXIT]]: +; CHECK: br label %[[IFELSE5LOOPEXIT]] + +; CHECK: [[IFELSE3]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE3]], label %[[IFELSE3PUREEXIT:.+]] + +; CHECK: [[IFELSE3PUREEXIT]]: +; CHECK: br label %[[IFELSELOOPEXIT2:.+]] + +; CHECK: [[IFELSE2]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE2]], label %[[IFELSE2PUREEXIT:.+]] + +; CHECK: [[IFELSE2PUREEXIT]]: +; CHECK: br label %[[IFELSELOOPEXIT3:.+]] + +; CHECK: [[IFELSELOOPEXIT]]: +; CHECK: br label %[[IFELSE]] + +; CHECK: [[IFELSELOOPEXIT2]]: +; CHECK: br label %[[IFELSE]] + +; CHECK: [[IFELSELOOPEXIT3]]: +; CHECK: br label %[[IFTHEN2]] + +; CHECK: [[IFELSE]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE7UNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFELSE7UNIFORM]]: +; CHECK: br label %[[IFELSE6:.+]] + +; CHECK: [[IFELSEUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE6]], label %[[IFELSE7:.+]] + +; CHECK: [[IFELSE7]]: +; CHECK: br label %[[IFELSE6]] + +; CHECK: [[IFELSE6]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll new file mode 100644 index 0000000000000..bdaf96b9903fe --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge2.ll @@ -0,0 +1,173 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k boscc_merge2 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare float @llvm.fmuladd.f32(float, float, float) #2 +declare void @__mux_work_group_barrier(i32, i32, i32) #3 +declare spir_func float @_Z3maxff(float, float) #1 +declare i64 @__mux_get_local_id(i32) #1 +declare i64 @__mux_get_group_id(i32) #1 + +@fuse_conv2d_broadcast_add_relu_1_kernel0.pad_temp_shared = internal addrspace(3) global [640 x float] poison, align 4 +@fuse_conv2d_broadcast_add_relu_1_kernel0.input1_shared = internal addrspace(3) global [1152 x float] poison, align 4 + +; Function Attrs: convergent nounwind +define spir_kernel void @boscc_merge2(float addrspace(1)* noalias %input0, float addrspace(1)* noalias %input1, float addrspace(1)* noalias %tensor, float addrspace(1)* noalias %input2) #2 { +entry: + %compute = alloca [28 x float], align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 0, %entry ], [ %inc2, %for.inc ] + %cmp1 = icmp ult i32 %storemerge, 16 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %for.cond + %call1 = call i64 @__mux_get_local_id(i32 0) #5 + %call2 = call i64 @__mux_get_group_id(i32 1) #5 + %idx1 = getelementptr inbounds [640 x float], [640 x float] addrspace(3)* @fuse_conv2d_broadcast_add_relu_1_kernel0.pad_temp_shared, i64 0, i64 %call1 + store float 0.000000e+00, float addrspace(3)* %idx1, align 4 + %cmp2 = icmp sgt i64 %call2, %call1 + br i1 %cmp2, label %if.then2, label %land.lhs.true1 + +land.lhs.true1: ; preds = %if.then + %call3 = call i64 @__mux_get_group_id(i32 1) #5 + %call4 = call i64 @__mux_get_local_id(i32 0) #5 + %cmp3 = icmp slt i64 %call3, %call4 + br i1 %cmp3, label %land.lhs.true2, label %if.then2 + +land.lhs.true2: ; preds = %land.lhs.true1 + %call5 = call i64 @__mux_get_local_id(i32 0) #5 + %call6 = call i64 @__mux_get_group_id(i32 0) #5 + %cmp4 = icmp sgt i64 %call6, %call5 + br i1 %cmp4, label %if.then2, label %land.lhs.true3 + +land.lhs.true3: ; preds = %land.lhs.true2 + %call7 = call i64 @__mux_get_group_id(i32 0) #5 + %call8 = call i64 @__mux_get_local_id(i32 0) #5 + %cmp5 = icmp slt i64 %call7, %call8 + br i1 %cmp5, label %cond.true4, label %if.then2 + +cond.true4: ; preds = %land.lhs.true3 + %call9 = call i64 @__mux_get_local_id(i32 1) #5 + %idx2 = getelementptr inbounds float, float addrspace(1)* %input0, i64 %call9 + br label %if.then2 + +if.then2: ; preds = %cond.true4, %land.lhs.true3, %land.lhs.true2, %land.lhs.true1, %if.then + %call10 = call i64 @__mux_get_local_id(i32 0) #5 + %conv = trunc i64 %call10 to i32 + %idx3 = sext i32 %conv to i64 + %idx4 = getelementptr inbounds [1152 x float], [1152 x float] addrspace(3)* @fuse_conv2d_broadcast_add_relu_1_kernel0.input1_shared, i64 0, i64 %idx3 + %idx5 = getelementptr inbounds float, float addrspace(1)* %input1, i64 %idx3 + %load1 = load float, float addrspace(1)* %idx5, align 4 + store float %load1, float addrspace(3)* %idx4, align 4 + call void @__mux_work_group_barrier(i32 0, i32 1, i32 272) #4 + br label %for.cond2 + +for.cond2: ; preds = %for.body, %if.then2 + %storemerge1 = phi i32 [ 0, %if.then2 ], [ %inc1, %for.body ] + %cmp6 = icmp ult i32 %storemerge1, 4 + br i1 %cmp6, label %for.body, 
label %for.inc + +for.body: ; preds = %for.cond2 + %load2 = load float, float addrspace(3)* %idx4, align 4 + %fmul = call float @llvm.fmuladd.f32(float %load2, float %load2, float %load2) + %idx6 = getelementptr inbounds [28 x float], [28 x float]* %compute, i64 0, i64 27 + store float %fmul, float* %idx6, align 4 + %inc1 = add nuw nsw i32 %storemerge1, 1 + br label %for.cond2 + +for.inc: ; preds = %for.cond2 + %inc2 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +if.else: ; preds = %for.cond + %idx7 = getelementptr inbounds [28 x float], [28 x float]* %compute, i64 0, i64 0 + %load3 = load float, float* %idx7, align 4 + %storemerge_sext = sext i32 %storemerge to i64 + %idx8 = getelementptr inbounds float, float addrspace(1)* %tensor, i64 %storemerge_sext + store float %load3, float addrspace(1)* %idx8, align 4 + ret void +} + +attributes #0 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +; CHECK: spir_kernel void @__vecz_v4_boscc_merge2 +; CHECK: br label %[[IFTHEN:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN2:.+]], label %[[IFTHENBOSCCINDIR:.+]] + +; CHECK: [[LANDLHSTRUE1UNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[LANDLHSTRUE2UNIFORM:.+]], label %[[LANDLHSTRUE1UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[LANDLHSTRUE2UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN2]], label %[[LANDLHSTRUE2UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[LANDLHSTRUE1UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN2]], label %[[LANDLHSTRUE2:.+]] + +; CHECK: [[LANDLHSTRUE3UNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[CONDTRUE4UNIFORM:.+]], label %[[LANDLHSTRUE3UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[CONDTRUE4UNIFORM]]: +; CHECK: br label %[[IFTHEN2]] + +; CHECK: [[LANDLHSTRUE3UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN2]], label %[[CONDTRUE4:.+]] + +; CHECK: [[LANDLHSTRUE1:.+]]: +; CHECK: br label %[[LANDLHSTRUE2]] + +; CHECK: [[LANDLHSTRUE2]]: +; CHECK: br label %[[LANDLHSTRUE3:.+]] + +; CHECK: [[LANDLHSTRUE3]]: +; CHECK: br label %[[CONDTRUE4]] + +; CHECK: [[CONDTRUE4]]: +; CHECK: br label %[[IFTHEN2]] + +; CHECK: [[IFTHEN2]]: +; CHECK: br label %[[FORCOND2:.+]] + +; CHECK: [[LANDLHSTRUE2UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[LANDLHSTRUE3UNIFORM]], label %[[LANDLHSTRUE3]] + +; CHECK: [[IFTHENBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[LANDLHSTRUE1UNIFORM]], label %[[LANDLHSTRUE1]] + +; CHECK: [[FORCOND2]]: +; CHECK: %[[EXITCOND:.+]] = icmp +; CHECK: br i1 %[[EXITCOND]], label %[[FORBODY:.+]], label %[[FORINC:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND2]] + +; CHECK: [[FORINC]]: +; CHECK: %[[EXITCOND4:.+]] 
= icmp +; CHECK: br i1 %[[EXITCOND4]], label %[[IFTHEN]], label %[[IFELSE:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll new file mode 100644 index 0000000000000..c73edafd0548d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/boscc_merge3.ll @@ -0,0 +1,130 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k boscc_merge3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind readnone +declare i64 @__mux_get_global_id(i32) #0 + +; Function Attrs: nounwind readnone +declare spir_func <4 x float> @_Z6vload4mPU3AS1Kf(i64, float addrspace(1)*) + +define spir_kernel void @boscc_merge3(float addrspace(1)* %out, i64 noundef %n, float noundef %m) { +entry: + %gid0 = tail call i64 @__mux_get_global_id(i32 0) #0 + %gid1 = tail call i64 @__mux_get_global_id(i32 1) #0 + %cmp1 = icmp slt i64 %gid0, %n + br i1 %cmp1, label %if.then1, label %end + +if.then1: ; preds = %entry + %gep1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %gid1 + %cmp2 = fcmp une float %m, 0.000000e+00 + br i1 %cmp2, label %if.then2, label %if.end1 + +if.then2: ; preds = %if.then1 + %cmp3 = icmp sge i64 %gid1, %n + %gep2 = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %gid0 + br i1 %cmp3, label %if.then3, label %if.else3 + +if.then3: ; preds = %x51 + %load1 = load float, float addrspace(1)* %gep2, align 4 + %ie_load1 = insertelement <4 x float> poison, float %load1, i32 0 + br label %if.end2 + +if.else3: ; preds = %x51 + %vload1 = tail call spir_func <4 x float> @_Z6vload4mPU3AS1Kf(i64 0, float addrspace(1)* %gep2) + %cmp4 = icmp slt i64 %gid0, %n + br i1 %cmp4, label %if.then4, label %if.end2 + +if.then4: ; preds = %x175 + %vload2 = tail call spir_func <4 x float> @_Z6vload4mPU3AS1Kf(i64 4, float addrspace(1)* %gep2) + br label %if.end2 + +if.end2: ; preds = %x274, %x271, %if.then4, %x175, %x155, %x132 + %phi_gep2_load = phi <4 x float> [ %ie_load1, %if.then3 ], [ %vload2, %if.then4 ], [ %vload1, %if.else3 ] + %ie_m = insertelement <4 x float> poison, float %m, i32 0 + %shuffle_ie_m = shufflevector <4 x float> %ie_m, <4 x float> poison, <4 x i32> zeroinitializer + %fmul = fmul <4 x float> %shuffle_ie_m, %phi_gep2_load + br label %if.end1 + +if.end1: ; preds = %if.end2, %if.then1 + %phi_fmul = phi <4 x float> [ %fmul, %if.end2 ], [ zeroinitializer, %if.then1 ] + %ee0 = extractelement 
<4 x float> %phi_fmul, i32 0
+  store float %ee0, float addrspace(1)* %gep1, align 4
+  br label %end
+
+end:
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+
+; CHECK: spir_kernel void @__vecz_v4_boscc_merge3
+; CHECK: entry:
+; CHECK: %[[BOSCC:.+]] = call i1 @__vecz_b_divergence_all(i1 %cmp1)
+; CHECK: br i1 %[[BOSCC]], label %if.then1.uniform, label %entry.boscc_indir

+; CHECK: if.then1.uniform:
+; CHECK: %gep1.uniform =
+; CHECK: br i1 %cmp2.uniform, label %if.then2.uniform, label %if.end1.uniform

+; CHECK: if.else3.uniform:
+; CHECK: %[[BOSCC2:.+]] = call i1 @__vecz_b_divergence_all(i1 %{{if.then4.uniform.exit_mask|cmp4.uniform}})
+; CHECK: br i1 %[[BOSCC2]], label %if.then4.uniform, label %if.else3.uniform.boscc_indir

+; CHECK: if.else3.uniform.boscc_indir:
+; CHECK: %[[BOSCC3:.+]] = call i1 @__vecz_b_divergence_all(i1 %if.end2.uniform.exit_mask)
+; CHECK: br i1 %[[BOSCC3]], label %if.end2.uniform, label %if.then4

+; CHECK: if.then1:
+; CHECK: %gep1 =
+; CHECK: br i1 %cmp2, label %if.then2, label %if.end1

+; The expected %cmp3 value is generalized because the 'icmp' can end up one BB
+; away, or be inverted into %cmp3.not, between LLVM versions.
+; CHECK: if.then2:
+; CHECK: br i1 %cmp3{{(.+)?}}, label %if.else3, label %if.then3

+; CHECK: if.then3:
+; CHECK: br label %if.end2

+; CHECK: if.else3:
+; CHECK: br label %if.then4

+; CHECK: if.then4:
+; CHECK: %gep1.boscc_blend = phi ptr addrspace(1) [ %gep1.uniform, %if.else3.uniform.boscc_indir ], [ %gep1, %if.else3 ]
+; CHECK: br label %if.end2

+; CHECK: if.end2:

+; Check we have correctly blended the instruction during the BOSCC connection
+; rather than while repairing the SSA form.
+; CHECK-NOT: %gep1.boscc_blend.merge{{.*}} = phi
+; CHECK: %gep1.boscc_blend{{[0-9]*}} = phi ptr addrspace(1) [ %gep1.boscc_blend{{[0-9]*}}, %if.then4 ], [ %gep1, %if.then3 ]
+; CHECK: br label %if.end1

+; CHECK: if.end1:

+; Check we have correctly blended the instruction during the BOSCC connection
+; rather than while repairing the SSA form.
+; CHECK-NOT: %gep1.boscc_blend.merge{{.*}} = phi
+; CHECK: %gep1.boscc_blend{{[0-9]*}} = phi ptr addrspace(1) [ %gep1.boscc_blend{{[0-9]*}}, %if.end2 ], [ %gep1, %if.then1 ]
+; CHECK: br label %end
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
new file mode 100644
index 0000000000000..64c5d6e7cbc28
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/duplicate_preheader.ll
@@ -0,0 +1,134 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; This test checks that we create a new preheader that blends the preheader
+; of the uniform and the predicated paths for a loop that has not been
+; duplicated (because of the barrier in it).
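+; The loop cannot be duplicated because __mux_work_group_barrier is declared
+; noduplicate, so the uniform and predicated paths have to rejoin in a single
+; blended preheader before entering it.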
+ +; RUN: veczc -k duplicate_preheader -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: noduplicate +declare void @__mux_work_group_barrier(i32, i32, i32) #1 +; Function Attrs: nounwind readnone +declare i64 @__mux_get_local_id(i32) + +define spir_kernel void @duplicate_preheader(i32 addrspace(1)* %out, i32 %n) { +entry: + %id = tail call i64 @__mux_get_local_id(i32 0) + %cmp = icmp sgt i64 %id, 3 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + br label %for.cond + +for.cond: + %ret.0 = phi i64 [ 0, %if.then ], [ %inc, %for.body ] + %storemerge8 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp uge i32 %storemerge8, %mul + br i1 %cmp2, label %for.body, label %if.end + +for.body: + %inc = add nsw i64 %ret.0, 1 + %inc4 = add nsw i32 %storemerge8, 1 + br label %for.cond + +if.end: ; preds = %if.then, %entry + %idx.blend = phi i64 [ %id, %entry ], [ %ret.0, %for.cond ] + %gep_var = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idx.blend + br label %barrier + +barrier: ; preds = %latch, %if.end + call void @__mux_work_group_barrier(i32 0, i32 1, i32 272) + br i1 %cmp, label %body, label %latch + +body: ; preds = %barrier + %gep_uni = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 + %ret = load i32, i32 addrspace(1)* %gep_uni, align 16 + store i32 %ret, i32 addrspace(1)* %gep_var, align 16 + br label %latch + +latch: ; preds = %body, %barrier + %cmp3 = icmp sgt i32 %n, 10 + br i1 %cmp3, label %exit, label %barrier + +exit: ; preds = %latch + ret void +} + +attributes #1 = { noduplicate } + +; CHECK: spir_kernel void @__vecz_v4_duplicate_preheader +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[ENTRYBOSCCINDIR:.+]] + +; Make sure we have both the uniform and non-uniform versions of the for loop. +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[ENTRYBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFEND:.+]], label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%([0-9A-Za-z\.])+)|(false)}}, label %[[IFENDLOOPEXITUNIFORM:.+]], label %[[FORBODYUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[IFENDLOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%([0-9A-Za-z\.])+)|(false)}}, label %[[IFENDLOOPEXIT:.+]], label %[[FORBODY:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[IFENDLOOPEXIT]]: +; CHECK: br label %[[IFEND]] + +; Make sure we're reconverging here from the uniform and predicated paths before +; branching to the barrier. 
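+; A work-group barrier must be reached by every work-item together, so neither
+; the uniform nor the predicated path may branch to it on its own.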
+; CHECK: [[IFEND]]:{{.*}}preds +; CHECK-DAG: %[[IFENDLOOPEXIT]] +; CHECK-DAG: %[[IFENDLOOPEXITUNIFORM]] +; CHECK: br label %[[BARRIER:.+]] + +; CHECK: [[BARRIER]]: +; CHECK: br i1 %{{.+}}, label %[[BODYUNIFORM:.+]], label %[[BARRIERBOSCCINDIR:.+]] + +; CHECK: [[BODYUNIFORM]]: +; CHECK: br label %[[LATCHUNIFORM:.+]] + +; CHECK: [[BARRIERBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[LATCH:.+]], label %[[BODY:.+]] + +; CHECK: [[BODY]]: +; CHECK: br label %[[LATCH]] + +; CHECK: [[LATCH]]: +; CHECK: %[[CMP3:.+]] = icmp +; CHECK: br i1 %[[CMP3]], label %[[EXIT:.+]], label %[[BARRIER]] + +; CHECK: [[EXIT]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll new file mode 100644 index 0000000000000..33ea2580691fb --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops1.ll @@ -0,0 +1,198 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k nested_loops1 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind readnone +declare i64 @__mux_get_global_id(i32) #0 + +; Function Attrs: nounwind readnone +declare i64 @__mux_get_global_size(i32) #0 + +; Function Attrs: nounwind readnone +declare spir_func float @_Z3madfff(float, float, float) #0 + +; Function Attrs: nounwind +define spir_kernel void @nested_loops1(i32 %n, float addrspace(1)* %out) #1 { +entry: + %gid = tail call i64 @__mux_get_global_id(i32 0) #0 + %gsize = tail call i64 @__mux_get_global_size(i32 0) #0 + %trunc_gid = trunc i64 %gid to i32 + %trunc_gsize = trunc i64 %gsize to i32 + %cmp1 = icmp slt i32 %trunc_gid, %n + br i1 %cmp1, label %if.then1, label %end + +if.then1: ; preds = %16 + %cmp2 = icmp slt i32 %n, 0 + %cmp3 = icmp slt i32 %n, 0 + %cmp4 = icmp sgt i32 %n, 0 + %cmp5 = icmp slt i32 %n, 1 + br label %for.cond + +for.cond: ; preds = %if.else4, %if.then1 + %trunc_gid_phi = phi i32 [ %trunc_gid, %if.then1 ], [ %add3, %if.else4 ] + %cmp6 = icmp eq i32 %trunc_gid_phi, -2147483648 + %select1 = select i1 %cmp6, i32 1, i32 %n + %div1 = sdiv i32 %trunc_gid_phi, %select1 + br i1 %cmp2, label %if.then2, label %if.else2 + +if.else2: ; preds = %for.cond + %cmp7 = icmp eq i32 %n, 0 + %select2 = select i1 %cmp7, i32 1, i32 %n + %div2 = sdiv i32 %n, %select2 + br label %if.then2 + +if.then2: ; preds = %if.else2, %for.cond + br i1 %cmp3, label %if.then3, label %if.else3 + +if.else3: ; preds = %if.then2 + %cmp8 = icmp eq i32 %n, 0 + %select3 = select i1 %cmp8, i32 1, i32 %n + %div3 = sdiv i32 %n, %select3 
+  br label %if.then3
+
+if.then3:                                         ; preds = %if.else3, %if.then2
+  br i1 %cmp4, label %if.then4, label %if.else4
+
+if.then4:                                         ; preds = %if.then3
+  br i1 %cmp5, label %if.else4, label %if.else5
+
+if.else5:                                         ; preds = %if.then4
+  %sext_div1 = sext i32 %div1 to i64
+  %gep1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %sext_div1
+  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %sext_div1
+  br label %for.cond2
+
+for.cond2:                                        ; preds = %if.else6, %if.else5
+  %float_idx = phi float [ 0.000000e+00, %if.else5 ], [ %phi_phi_mad, %if.else6 ]
+  %phi_div1_1 = phi i32 [ %div1, %if.else5 ], [ %add2, %if.else6 ]
+  %i32_idx = phi i32 [ 0, %if.else5 ], [ %add2, %if.else6 ]
+  %cmp9 = icmp slt i32 %phi_div1_1, %n
+  br i1 %cmp9, label %if.then6, label %if.else6
+
+if.then6:                                         ; preds = %for.cond2
+  br label %for.cond3
+
+for.cond3:                                        ; preds = %if.else7, %if.then6
+  %phi_float_idx = phi float [ %float_idx, %if.then6 ], [ %phi_mad, %if.else7 ]
+  %phi_div1_2 = phi i32 [ %div1, %if.then6 ], [ %add1, %if.else7 ]
+  %phi_i32_idx = phi i32 [ %i32_idx, %if.then6 ], [ %add1, %if.else7 ]
+  %cmp10 = icmp sgt i32 %phi_div1_2, -1
+  br i1 %cmp10, label %if.then7, label %if.else7
+
+if.then7:                                         ; preds = %for.cond3
+  %sext_phi_div1_2 = sext i32 %phi_div1_2 to i64
+  %gep3 = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %sext_phi_div1_2
+  %load1 = load float, float addrspace(1)* %gep3, align 4
+  %sext_phi_i32_idx = sext i32 %phi_i32_idx to i64
+  %gep4 = getelementptr inbounds float, float addrspace(1)* %gep2, i64 %sext_phi_i32_idx
+  %load2 = load float, float addrspace(1)* %gep4, align 4
+  %mad = tail call spir_func float @_Z3madfff(float %load1, float %load2, float %phi_float_idx) #0
+  br label %if.else7
+
+if.else7:                                         ; preds = %if.then7, %for.cond3
+  %phi_mad = phi float [ %mad, %if.then7 ], [ %phi_float_idx, %for.cond3 ]
+  %add1 = add nsw i32 %phi_i32_idx, %n
+  %cmp11 = icmp slt i32 %add1, %div1
+  br i1 %cmp11, label %for.cond3, label %if.else6
+
+if.else6:                                         ; preds = %if.else7, %for.cond2
+  %phi_phi_mad = phi float [ %float_idx, %for.cond2 ], [ %phi_mad, %if.else7 ]
+  %add2 = add nsw i32 %i32_idx, %div1
+  %cmp12 = icmp slt i32 %add2, %div1
+  br i1 %cmp12, label %for.cond2, label %if.else4
+
+if.else4:                                         ; preds = %if.else8, %if.then4, %if.then3
+  %phi_phi_float_idx = phi float [ 0.000000e+00, %if.then3 ], [ 0.000000e+00, %if.then4 ], [ %phi_phi_mad, %if.else6 ]
+  %sext_trunc_gid_phi = sext i32 %trunc_gid_phi to i64
+  %gep5 = getelementptr inbounds float, float addrspace(1)* %out, i64 %sext_trunc_gid_phi
+  store float %phi_phi_float_idx, float addrspace(1)* %gep5, align 4
+  %add3 = add nsw i32 %trunc_gid_phi, %trunc_gsize
+  %cmp13 = icmp slt i32 %add3, %n
+  br i1 %cmp13, label %for.cond, label %end
+
+end:                                              ; preds = %if.else4, %16
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
+
+; The purpose of this test is to make sure we correctly blend all the values
+; that are live through the divergent loops at each of their entry points, and
+; don't create merge instructions for them.
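+; Each live-through value gets a single .boscc_blend PHI choosing between its
+; uniform and predicated definitions; the CHECK-NOTs below guard against
+; redundant .merge copies of those blends.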
+ +; CHECK: spir_kernel void @__vecz_v4_nested_loops1 +; CHECK: entry: +; CHECK: br i1 %{{.+}}, label %if.then1.uniform, label %entry.boscc_indir + +; CHECK: if.then1.uniform: +; CHECK: br label %for.cond.uniform + +; CHECK: entry.boscc_indir: +; CHECK: br i1 %{{.+}}, label %end, label %if.then1 + +; CHECK: for.cond2.uniform: +; CHECK: br i1 %{{.+}}, label %for.cond3.preheader.uniform, label %for.cond2.uniform.boscc_indir + +; CHECK: for.cond2.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %if.else6.uniform, label %for.cond2.uniform.boscc_store + +; CHECK: for.cond2.uniform.boscc_store: +; CHECK: br label %for.cond3.preheader + +; CHECK: for.cond3.uniform: +; CHECK: br i1 %{{.+}}, label %if.then7.uniform, label %for.cond3.uniform.boscc_indir + +; CHECK: for.cond3.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %if.else7.uniform, label %for.cond3.uniform.boscc_store + +; CHECK: for.cond3.uniform.boscc_store: +; CHECK: br label %if.then7 + +; CHECK: end.loopexit.uniform: +; CHECK: br label %end + +; CHECK: for.cond: +; CHECK-NOT: %{{.+}}.boscc_blend{{.+}}.merge{{.+}} = +; CHECK: br + +; CHECK: for.cond2: +; CHECK-NOT: %{{.+}}.boscc_blend{{.+}}.merge{{.+}} = +; CHECK: br + +; CHECK: for.cond3: +; CHECK-NOT: %{{.+}}.boscc_blend{{.+}}.merge{{.+}} = +; CHECK: br + +; CHECK: if.then7: +; CHECK-NOT: %{{.+}}.boscc_blend{{.+}}.merge{{.+}} = +; CHECK: br + +; CHECK: if.else4: +; CHECK-NOT: %{{.+}}.boscc_blend{{.+}}.merge{{.+}} = +; CHECK: br + +; CHECK: end.loopexit: +; CHECK: br label %end + +; CHECK: end: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll new file mode 100644 index 0000000000000..cca12985f5031 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops2.ll @@ -0,0 +1,140 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k nested_loops2 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +source_filename = "kernel.opencl" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @nested_loops2(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %conv, 16 + br i1 %cmp, label %if.then, label %if.end25 + +if.then: ; preds = %entry + %mul2 = mul nsw i32 %conv, %n + %0 = icmp eq i32 %mul2, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div3 = sdiv i32 %mul2, %5 + %add = add nsw i32 %div3, %conv + br label %for.cond + +for.cond: ; preds = %for.inc, %if.then + %ret.0 = phi i32 [ 0, %if.then ], [ %ret.2, %for.inc ] + %storemerge = phi i32 [ 0, %if.then ], [ %inc24, %for.inc ] + %cmp7 = icmp slt i32 %storemerge, %n + br i1 %cmp7, label %for.body, label %if.end25 + +for.body: ; preds = %for.cond + %cmp9 = icmp slt i32 %conv, 9 + br i1 %cmp9, label %while.body, label %for.inc + +while.body: ; preds = %while.body, %for.body + %ret.1 = phi i32 [ %ret.0, %for.body ], [ %add17, %while.body ] + %j.0 = phi i32 [ 0, %for.body ], [ %inc18, %while.body ] + %mul13 = mul nsw i32 %mul2, %mul2 + %6 = icmp eq i32 %n, 0 + %7 = select i1 %6, i32 1, i32 %n + %div14 = sdiv i32 %mul13, %7 + %reass.add = add i32 %div14, %add + %reass.mul = mul i32 %reass.add, 8 + %add6 = add i32 %mul2, 1 + %add16 = add i32 %add6, %add + %inc = add i32 %add16, %ret.1 + %add17 = add i32 %inc, %reass.mul + %inc18 = add nuw nsw i32 %j.0, 1 + %add19 = add nsw i32 %j.0, %conv + %cmp20 = icmp sgt i32 %add19, 3 + br i1 %cmp20, label %for.inc, label %while.body + +for.inc: ; preds = %for.body, %while.body + %ret.2 = phi i32 [ %ret.0, %for.body ], [ %add17, %while.body ] + %inc24 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +if.end25: ; preds = %for.cond, %entry + %ret.3 = phi i32 [ 0, %entry ], [ %ret.0, %for.cond ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind 
readonly }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+!opencl.kernels = !{!3}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, i32 2}
+!2 = !{!"clang version 8.0.0 (https://github.com/llvm-mirror/clang.git bfbe338a893dde6ba65b2bed6ffea1652a592819) (https://github.com/llvm-mirror/llvm.git a99d6d2122ca2f208e1c4bcaf02ff5930f244f34)"}
+!3 = !{void (i32 addrspace(1)*, i32)* @nested_loops2, !4, !5, !6, !7, !8, !9}
+!4 = !{!"kernel_arg_addr_space", i32 1, i32 0}
+!5 = !{!"kernel_arg_access_qual", !"none", !"none"}
+!6 = !{!"kernel_arg_type", !"int*", !"int"}
+!7 = !{!"kernel_arg_base_type", !"int*", !"int"}
+!8 = !{!"kernel_arg_type_qual", !"", !""}
+!9 = !{!"kernel_arg_name", !"out", !"n"}
+
+; The purpose of this test is to make sure we correctly add a BOSCC connection
+; at a div-causing (divergence-causing) latch from the uniform region.
+
+; CHECK: spir_kernel void @__vecz_v4_nested_loops2
+; CHECK: entry:
+; CHECK: %[[BOSCC:.+]] = call i1 @__vecz_b_divergence_all(i1 %cmp)
+; CHECK: br i1 %[[BOSCC]], label %if.then.uniform, label %entry.boscc_indir

+; CHECK: if.then.uniform:
+; CHECK: br i1 %cmp71.uniform, label %for.body.lr.ph.uniform, label %if.end25.loopexit.uniform

+; CHECK: entry.boscc_indir:
+; CHECK: %[[BOSCC2:.+]] = call i1 @__vecz_b_divergence_all(i1 %cmp.not{{.*}})
+; CHECK: br i1 %[[BOSCC2]], label %if.end25, label %if.then

+; CHECK: for.body.lr.ph.uniform:
+; CHECK: br label %for.body.uniform

+; CHECK: for.body.uniform:
+; CHECK: br i1 %[[LBLCOND:.+]], label %while.body.preheader.uniform, label %for.body.uniform.boscc_indir

+; CHECK: while.body.preheader.uniform:
+; CHECK: br label %while.body.uniform

+; CHECK: for.body.uniform.boscc_indir:
+; CHECK: %[[BOSCC3:.+]] = call i1 @__vecz_b_divergence_all(i1 %for.inc.uniform.exit_mask)
+; CHECK: br i1 %[[BOSCC3]], label %for.inc.uniform, label %for.body.uniform.boscc_store

+; CHECK: while.body.uniform:
+; CHECK: %cmp20.uniform = icmp sgt i32 %add19.uniform, 3
+; CHECK-NOT: br i1 %[[LBLCOND3:.+]], label %for.inc.loopexit.uniform, label %while.body.uniform
+; CHECK: br i1 %[[LBLCOND2:.+]], label %for.inc.loopexit.uniform, label %while.body.uniform.boscc_indir
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
new file mode 100644
index 0000000000000..e39f0e1361850
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops3.ll
@@ -0,0 +1,149 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k nested_loops3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #0 + +; Function Attrs: nounwind readnone speculatable +declare float @llvm.fmuladd.f32(float, float, float) #1 + +; Function Attrs: convergent nounwind +define spir_kernel void @nested_loops3(float addrspace(1)* %symmat, float addrspace(1)* %data, i32 %m, i32 %n) #2 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %conv = trunc i64 %call to i32 + %sub = add nsw i32 %m, -1 + %cmp = icmp sgt i32 %sub, %conv + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %mul = mul nsw i32 %conv, %m + %add = add nsw i32 %mul, %conv + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds float, float addrspace(1)* %symmat, i64 %idxprom + store float 1.000000e+00, float addrspace(1)* %arrayidx, align 4 + br label %for.cond + +for.cond: ; preds = %for.end, %if.then + %storemerge.in = phi i32 [ %conv, %if.then ], [ %storemerge, %for.end ] + %storemerge = add nsw i32 %storemerge.in, 1 + %cmp3 = icmp slt i32 %storemerge, %m + br i1 %cmp3, label %for.cond5, label %if.end + +for.cond5: ; preds = %for.body8, %for.cond + %storemerge1 = phi i32 [ %inc, %for.body8 ], [ 0, %for.cond ] + %cmp6 = icmp slt i32 %storemerge1, %n + br i1 %cmp6, label %for.body8, label %for.end + +for.body8: ; preds = %for.cond5 + %mul9 = mul nsw i32 %storemerge1, %m + %add10 = add nsw i32 %mul9, %conv + %idxprom11 = sext i32 %add10 to i64 + %arrayidx12 = getelementptr inbounds float, float addrspace(1)* %data, i64 %idxprom11 + %0 = load float, float addrspace(1)* %arrayidx12, align 4 + %mul13 = mul nsw i32 %storemerge1, %m + %add14 = add nsw i32 %mul13, %storemerge + %idxprom15 = sext i32 %add14 to i64 + %arrayidx16 = getelementptr inbounds float, float addrspace(1)* %data, i64 %idxprom15 + %1 = load float, float addrspace(1)* %arrayidx16, align 4 + %mul18 = mul nsw i32 %conv, %m + %add19 = add nsw i32 %storemerge, %mul18 + %idxprom20 = sext i32 %add19 to i64 + %arrayidx21 = getelementptr inbounds float, float addrspace(1)* %symmat, i64 %idxprom20 + %2 = load float, float addrspace(1)* %arrayidx21, align 4 + %3 = call float @llvm.fmuladd.f32(float %0, float %1, float %2) + store float %3, float addrspace(1)* %arrayidx21, align 4 + %inc = add nuw nsw i32 %storemerge1, 1 + br label %for.cond5 + +for.end: ; preds = %for.cond5 + %mul22 = mul nsw i32 %conv, %m + %add23 = add nsw i32 %storemerge, %mul22 + %idxprom24 = sext i32 %add23 to i64 + %arrayidx25 = getelementptr inbounds float, float addrspace(1)* %symmat, i64 %idxprom24 + %4 = load float, float addrspace(1)* %arrayidx25, align 4 + %mul26 = mul nsw i32 %storemerge, %m + %add27 = add nsw i32 %mul26, %conv + %idxprom28 = sext i32 %add27 to i64 + %arrayidx29 = getelementptr inbounds float, float addrspace(1)* %symmat, i64 %idxprom28 + store float %4, float addrspace(1)* %arrayidx29, align 4 + br label %for.cond + +if.end: ; preds = %for.cond, %entry + ret void +} + +; The purpose of this test is to make sure we correctly set the incoming value +; of a boscc_blend instruction (in a loop header) from the latch 
as being the
+; value defined in the latch itself.

+; CHECK: spir_kernel void @__vecz_v4_nested_loops3
+; CHECK: entry:
+; CHECK: br i1 %{{.+}}, label %if.then.uniform, label %entry.boscc_indir

+; CHECK: if.then.uniform:
+; CHECK: br i1 %{{.+}}, label %for.cond5.preheader.lr.ph.uniform, label %if.then.uniform.boscc_indir

+; CHECK: entry.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %if.end, label %if.then

+; CHECK: for.cond5.preheader.lr.ph.uniform:
+; CHECK: br label %for.cond5.preheader.uniform

+; CHECK: if.then.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %if.end.loopexit.uniform, label %for.cond5.preheader.lr.ph

+; CHECK: for.cond5.preheader.uniform:
+; CHECK: br label %for.cond5.uniform

+; CHECK: for.end.uniform.boscc_indir:
+; CHECK: br i1 %{{.+}}, label %for.cond.if.end.loopexit_crit_edge.uniform, label %for.end.uniform.boscc_store

+; CHECK: for.end.uniform.boscc_store:
+; CHECK: br label %for.cond5.preheader

+; CHECK: if.then:
+; CHECK: br label %for.cond5.preheader.lr.ph

+; CHECK: for.cond5.preheader.lr.ph:
+; CHECK: br label %for.cond5.preheader

+; CHECK: for.cond5.preheader:

+; This is the important bit of the test
+; Note that the LCSSA PHI node got cleaned up!
+; For some reason LIT needs these checks to be split across two lines
+; CHECK: %[[LATCH_VALUE1:.*\.boscc_blend[0-9]*]] = phi i{{32|64}} [ %{{.+}}, %for.end.uniform.boscc_store ],
+; CHECK-SAME: [ %[[LATCH_VALUE1]], %for.end ], [ %{{.+}}, %for.cond5.preheader.lr.ph ]

+; CHECK: %[[LATCH_VALUE2:.*\.boscc_blend[0-9]*]] = phi i{{32|64}} [ %{{.+}}, %for.end.uniform.boscc_store ],
+; CHECK-SAME: [ %[[LATCH_VALUE2]], %for.end ], [ %{{.+}}, %for.cond5.preheader.lr.ph ]

+; CHECK: %[[LATCH_VALUE3:.*\.boscc_blend[0-9]*]] = phi i{{32|64}} [ %{{.+}}, %for.end.uniform.boscc_store ],
+; CHECK-SAME: [ %[[LATCH_VALUE3]], %for.end ], [ %{{.+}}, %for.cond5.preheader.lr.ph ]

+; CHECK: %[[LATCH_VALUE4:.*\.boscc_blend[0-9]*]] = phi i{{32|64}} [ %{{.+}}, %for.end.uniform.boscc_store ],
+; CHECK-SAME: [ %[[LATCH_VALUE4]], %for.end ], [ %{{.+}}, %for.cond5.preheader.lr.ph ]

+; CHECK: %[[LATCH_VALUE5:.+\.boscc_blend[0-9]*]] = phi i1 [ true, %for.end.uniform.boscc_store ],
+; CHECK-SAME: [ %[[LATCH_VALUE5]], %for.end ], [ %{{.+}}, %for.cond5.preheader.lr.ph ]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
new file mode 100644
index 0000000000000..a967bec0aed5d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops4.ll
@@ -0,0 +1,190 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k nested_loops4 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind readnone +declare i64 @__mux_get_global_id(i32) #0 + +; Function Attrs: nounwind readnone +declare i64 @__mux_get_global_size(i32) #0 + +; Function Attrs: nounwind readnone +declare spir_func float @_Z3dotDv2_fS_(<2 x float>, <2 x float>) #0 + +declare spir_func <2 x float> @_Z6vload2mPU3AS1Kf(i64, float addrspace(1)*) + +; Function Attrs: nounwind readnone +declare spir_func i32 @_Z6mul_hijj(i32, i32) #0 + +define spir_kernel void @nested_loops4(i32 %n, float addrspace(1)* %out) { +entry: + %gid = tail call i64 @__mux_get_global_id(i32 0) #0 + %gsize = tail call i64 @__mux_get_global_size(i32 0) #0 + %trunc_gid = trunc i64 %gid to i32 + %trunc_gsize = trunc i64 %gsize to i32 + %cmp1 = icmp slt i32 %trunc_gid, %n + br i1 %cmp1, label %for.cond1, label %end + +for.cond1: ; preds = %entry, %for.cond1.end + %phi_trunc_gid = phi i32 [ %trunc_gid, %entry ], [ %add2, %for.cond1.end ] + %mul_hi = tail call spir_func i32 @_Z6mul_hijj(i32 %phi_trunc_gid, i32 %n) #0 + %wrong = sdiv i32 %mul_hi, %n + %sext_mul_hi = sext i32 %mul_hi to i64 + %gep1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %sext_mul_hi + %cmp2 = icmp slt i32 %mul_hi, %n + br i1 %cmp2, label %for.cond2, label %for.cond1.end + +for.cond2: ; preds = %for.cond1, %for.cond2.end + %phi4_fadd = phi float [ %phi3_fadd, %for.cond2.end ], [ 0.000000e+00, %for.cond1 ] + %cmp3 = icmp slt i32 %mul_hi, %n + br i1 %cmp3, label %for.cond3.preheader, label %for.cond2.end + +for.cond3.preheader: ; preds = %for.cond2 + %add1 = add nsw i32 %mul_hi, %wrong + br label %for.cond3 + +for.cond3: ; preds = %for.cond3.preheader, %for.cond3.end + %phi_wrong_correct_correct = phi i32 [ %wrong, %for.cond3.preheader ], [ %correct, %for.cond3.end ] + %phi_add1 = phi i32 [ %add1, %for.cond3.preheader ], [ %phi_add1, %for.cond3.end ] + %phi2_fadd = phi float [ %phi4_fadd, %for.cond3.preheader ], [ %phi1_fadd, %for.cond3.end ] + %cmp4 = icmp slt i32 %phi_wrong_correct_correct, %n + br i1 %cmp4, label %for.cond3.body, label %for.cond3.end + +for.cond3.body: ; preds = %for.cond3 + %sext_phi_add1 = sext i32 %phi_add1 to i64 + %gep2 = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %sext_phi_add1 + %vload = tail call spir_func <2 x float> @_Z6vload2mPU3AS1Kf(i64 0, float addrspace(1)* %gep2) + %dot = tail call spir_func float @_Z3dotDv2_fS_(<2 x float> %vload, <2 x float> %vload) #0 + %fadd = fadd float %phi2_fadd, %dot + br label %for.cond3.end + +for.cond3.end: ; preds = %for.cond3.body, %for.cond3 + %phi1_fadd = phi float [ %phi2_fadd, %for.cond3 ], [ %fadd, %for.cond3.body ] + %correct = add nsw i32 %phi_wrong_correct_correct, 1 + %cmp5 = icmp slt i32 %wrong, %n + br i1 %cmp5, label %for.cond3, label %for.cond2.end + +for.cond2.end: ; preds = %for.cond3.end, %for.cond2 + %phi3_fadd = phi float [ %phi4_fadd, %for.cond2 ], [ %phi1_fadd, %for.cond3.end ] + %cmp6 = icmp slt i32 %mul_hi, %n + br i1 %cmp6, label %for.cond2, label %for.cond1.end + +for.cond1.end: ; preds = %for.cond2.end, %for.cond1 + %ret = phi float [ 0.000000e+00, %for.cond1 ], [ %phi3_fadd, %for.cond2.end ] + %sext_phi_trunc_gid = sext i32 %phi_trunc_gid to i64 + %gep3 = getelementptr inbounds float, float addrspace(1)* 
%out, i64 %sext_phi_trunc_gid + store float %ret, float addrspace(1)* %gep3, align 4 + %add2 = add nsw i32 %phi_trunc_gid, %trunc_gsize + %cmp7 = icmp slt i32 %add2, %n + br i1 %cmp7, label %for.cond1, label %end + +end: ; preds = %for.cond1.end, %entry + ret void +} + +attributes #0 = { nounwind readnone } + +; The purpose of this test is to make sure we choose the correct incoming value +; for a boscc blend instruction. + +; CHECK: spir_kernel void @__vecz_v4_nested_loops4 +; CHECK: entry: +; CHECK: br i1 %{{.+}}, label %for.cond1.preheader.uniform, label %entry.boscc_indir + +; CHECK: for.cond1.preheader.uniform: +; CHECK: br label %for.cond1.uniform + +; CHECK: entry.boscc_indir: +; CHECK: br i1 %{{.+}}, label %end, label %for.cond1.preheader + +; CHECK: for.cond1.uniform: +; CHECK: %wrong.uniform = sdiv i32 %mul_hi.uniform, %n +; CHECK: br i1 %{{.+}}, label %for.cond2.preheader.uniform, label %for.cond1.uniform.boscc_indir + +; CHECK: for.cond1.end.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %end.loopexit.uniform, label %for.cond1.end.uniform.boscc_store + +; CHECK: for.cond1.end.uniform.boscc_store: +; CHECK: br label %for.cond1 + +; CHECK: for.cond2.preheader.uniform: +; CHECK: br label %for.cond2.uniform + +; CHECK: for.cond1.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %for.cond1.end.uniform, label %for.cond1.uniform.boscc_store + +; CHECK: for.cond1.uniform.boscc_store: +; LCSSA PHI nodes got cleaned up: +; CHECK-NOT: %{{.*\.boscc_lcssa.*}} +; CHECK: br label %for.cond2.preheader + +; CHECK: for.cond2.uniform: +; CHECK: br i1 %{{.+}}, label %for.cond3.preheader.uniform, label %for.cond2.uniform.boscc_indir + +; CHECK: for.cond2.end.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %for.cond1.end.loopexit.uniform, label %for.cond2.end.uniform.boscc_store + +; CHECK: for.cond3.preheader.uniform: +; CHECK: br label %for.cond3.uniform + +; CHECK: for.cond2.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %for.cond2.end.uniform, label %for.cond2.uniform.boscc_store + +; CHECK: for.cond3.uniform: +; CHECK: br i1 %{{.+}}, label %for.cond3.body.uniform, label %for.cond3.uniform.boscc_indir + +; CHECK: for.cond3.end.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %for.cond2.end.loopexit.uniform, label %for.cond3.end.uniform.boscc_store + +; CHECK: for.cond3.end.uniform.boscc_store: +; LCSSA PHI nodes got cleaned up: +; CHECK-NOT: %{{.*\.boscc_lcssa.*}} +; CHECK: br label %for.cond3 + +; CHECK: for.cond3.body.uniform: +; CHECK: br label %for.cond3.end.uniform + +; CHECK: for.cond3.uniform.boscc_indir: +; CHECK: %[[BOSCC:.+]] = call i1 @__vecz_b_divergence_all(i1 %for.cond3.end.uniform.exit_mask) +; CHECK: br i1 %[[BOSCC]], label %for.cond3.end.uniform, label %for.cond3.uniform.boscc_store + +; CHECK: for.cond3.end.uniform: +; CHECK: br i1 %{{.+}}, label %for.cond3.uniform, label %for.cond3.end.uniform.boscc_indir + +; CHECK: for.cond1.preheader: +; CHECK: br label %for.cond1 + +; CHECK: for.cond1: +; CHECK: br label %for.cond2.preheader + +; CHECK: for.cond2.preheader: +; CHECK: br label %for.cond2 + +; CHECK: for.cond2: +; CHECK: br label %for.cond3.preheader + +; CHECK: for.cond3.preheader: +; CHECK: br label %for.cond3 + +; CHECK: for.cond3: + +; This is the important part of the test. 
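+; Concretely: the PHI must take %correct (the value defined in the latch
+; %for.cond3.end) on its latch edge and %correct.uniform on the edge from
+; the uniform region, while only the preheader edge carries a blend of
+; %wrong; the name %phi_wrong_correct_correct encodes that expected
+; wrong/correct/correct pattern, which the CHECK below spells out in full.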
+; CHECK: %phi_wrong_correct_correct = phi i32 [ %wrong.boscc_blend{{.+}}, %for.cond3.preheader ], [ %correct, %for.cond3.end ], [ %correct.uniform, %for.cond3.end.uniform.boscc_store ] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll new file mode 100644 index 0000000000000..3f8e7f2b3a395 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/nested_loops5.ll @@ -0,0 +1,117 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k nested_loops5 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) + +declare i64 @__mux_get_local_size(i32) + +define spir_kernel void @nested_loops5(float addrspace(1)*) { +entry: + %lid = tail call i64 @__mux_get_local_id(i32 0) + %lsize = tail call i64 @__mux_get_local_size(i32 0) + %cmp1 = icmp ult i64 %lid, %lsize + br i1 %cmp1, label %loop, label %end + +loop: ; preds = %if.end, %entry + %livethrough = phi i64 [ %add2, %if.end ], [ %lsize, %entry ] + %add1 = add i64 %livethrough, %lsize + %cmp2 = icmp ult i64 %add1, %lsize + br i1 %cmp2, label %if.then, label %if.else + +if.then: ; preds = %if.then, %loop + %phi = phi i64 [ %add3, %if.then ], [ %lid, %loop ] + %add3 = add i64 %phi, %lsize + %cmp4 = icmp ult i64 %add3, %lsize + br i1 %cmp4, label %if.then, label %if.end + +if.else: ; preds = %loop + %gep = getelementptr inbounds float, float addrspace(1)* %0, i64 %add1 + store float 0.000000e+00, float addrspace(1)* %gep, align 4 + br label %if.end + +if.end: ; preds = %if.then, %if.else + %add2 = add i64 %livethrough, %lsize + %cmp3 = icmp ult i64 %add2, %lsize + br i1 %cmp3, label %loop, label %end + +end: ; preds = %if.end, %entry + ret void +} + +; The purpose of this test is to make sure we choose the correct incoming value +; for a boscc blend instruction. 
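+; In particular, the loop-carried %livethrough must reach %if.end as itself
+; along the %if.else edge. A bad blend that substituted a placeholder
+; constant there, e.g. the hypothetical
+;
+;   %bad = phi i64 [ %livethrough.boscc_blend, %if.end.loopexit ], [ 0, %if.else ]
+;
+; is ruled out by the CHECK-NOT near the end, while the final CHECK requires
+; the [ %livethrough, %if.else ] incoming value instead.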
+ +; CHECK: spir_kernel void @__vecz_v4_nested_loops5 +; CHECK: entry: +; CHECK: br i1 %{{.+}}, label %loop.preheader.uniform, label %entry.boscc_indir + +; CHECK: loop.preheader.uniform: +; CHECK: br label %loop.uniform + +; CHECK: entry.boscc_indir: +; CHECK: br i1 %{{.+}}, label %end, label %loop.preheader + +; CHECK: loop.uniform: +; CHECK: %livethrough.uniform = phi i64 [ %add2.uniform, %if.end.uniform ], [ %lsize, %loop.preheader.uniform ] +; CHECK: br i1 %{{.+}}, label %if.then.preheader.uniform, label %if.else.uniform + +; CHECK: if.then.preheader.uniform: +; CHECK: br label %if.then.uniform + +; CHECK: if.then.uniform: +; CHECK: br i1 %{{.+}}, label %if.then.uniform, label %if.then.uniform.boscc_indir + +; CHECK: if.then.uniform.boscc_indir: +; CHECK: br i1 %{{.+}}, label %if.end.loopexit.uniform, label %if.then.uniform.boscc_store + +; CHECK: if.then.uniform.boscc_store: +; LCSSA PHI nodes got cleaned up: +; CHECK-NOT: %{{.*\.boscc_lcssa.*}} +; CHECK: br label %if.then + +; CHECK: loop.preheader: +; CHECK: br label %loop + +; CHECK: loop: +; CHECK: %livethrough = phi i64 [ %add2, %if.end ], [ %lsize, %loop.preheader ] +; CHECK: br i1 %{{.+}}, label %if.then.preheader, label %if.else + +; CHECK: if.then.preheader: +; CHECK: br label %if.then + +; CHECK: if.then: +; CHECK: %livethrough.boscc_blend = phi i64 [ %livethrough.uniform, %if.then.uniform.boscc_store ], [ %livethrough.boscc_blend, %if.then ], [ %livethrough, %if.then.preheader ] +; CHECK: br i1 %{{.+}}, label %if.then, label %if.then.pure_exit + +; CHECK: if.then.pure_exit: +; CHECK: br label %if.end.loopexit + +; CHECK: if.else: +; CHECK: br label %if.end + +; CHECK: if.end.loopexit: +; CHECK: br label %if.end + +; CHECK: if.end: +; CHECK-NOT: %livethrough.boscc_blend{{.+}}.merge = phi i64 [ %livethrough.boscc_blend, %if.end.loopexit ], [ 0, %if.else ] +; CHECK: %livethrough.boscc_blend{{.+}} = phi i64 [ %livethrough.boscc_blend, %if.end.loopexit ], [ %livethrough, %if.else ] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll new file mode 100644 index 0000000000000..96261d872a3df --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization0.ll @@ -0,0 +1,436 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization0 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; \ / +; d +; | +; e +; / \ +; / \ +; f g +; / \ / \ +; h i j k +; \ / \ / +; l m +; \ / +; \ / +; n +; +; * where node e is a uniform branch, and nodes a, f and g are varying +; branches. 
+; * where nodes b, c, d, h, i, j, k, l, m are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a___ +; / \ \ +; b c c' +; \ / | +; d b' +; | | +; | d' +; | / +; \ / +; e +; / \ +; / \ +; ___f g___ +; / / \ / \ \ +; i' h i j k k' +; | \ / \ / | +; h' l m j' +; | | | | +; l' | | m' +; \ | | / +; \ / \ / +; & -> n <- & +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization0(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (id % 5 == 0) { +; for (int i = 0; i < n * 2; i++) ret++; +; } else { +; for (int i = 0; i < n / 4; i++) ret++; +; } +; +; if (n > 10) { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n + 10; i++) ret++; +; } else { // varying +; for (int i = 0; i < n + 10; i++) ret *= 2; +; } +; ret += id * 10; +; } else { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n + 8; i++) ret++; +; } else { // varying +; for (int i = 0; i < n + 8; i++) ret *= 2; +; } +; ret += id / 2; +; } +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization0(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %rem = srem i32 %conv, 5 + %cmp = icmp eq i32 %rem, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ 0, %if.then ], [ %inc, %for.body ] + %storemerge8 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge8, %mul + br i1 %cmp2, label %for.body, label %if.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.0, 1 + %inc4 = add nsw i32 %storemerge8, 1 + br label %for.cond + +if.else: ; preds = %entry + br label %for.cond6 + +for.cond6: ; preds = %for.body9, %if.else + %ret.1 = phi i32 [ 0, %if.else ], [ %inc10, %for.body9 ] + %storemerge = phi i32 [ 0, %if.else ], [ %inc12, %for.body9 ] + %div = sdiv i32 %n, 4 + %cmp7 = icmp slt i32 %storemerge, %div + br i1 %cmp7, label %for.body9, label %if.end + +for.body9: ; preds = %for.cond6 + %inc10 = add nsw i32 %ret.1, 1 + %inc12 = add nsw i32 %storemerge, 1 + br label %for.cond6 + +if.end: ; preds = %for.cond6, %for.cond + %ret.2 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond6 ] + %cmp14 = icmp sgt i32 %n, 10 + %rem175 = and i32 %conv, 1 + %cmp18 = icmp eq i32 %rem175, 0 + br i1 %cmp14, label %if.then16, label %if.else44 + +if.then16: ; preds = %if.end + br i1 %cmp18, label %if.then20, label %if.else30 + +if.then20: ; preds = %if.then16 + br label %for.cond22 + +for.cond22: ; preds = %for.body25, %if.then20 + %ret.3 = phi i32 [ %ret.2, %if.then20 ], [ %inc26, %for.body25 ] + %storemerge7 = phi i32 [ 0, %if.then20 ], [ %inc28, %for.body25 ] + %add = add nsw i32 %n, 10 + %cmp23 = icmp slt i32 %storemerge7, %add + br i1 %cmp23, label %for.body25, label %if.end41 + +for.body25: ; preds = %for.cond22 + %inc26 = add nsw i32 %ret.3, 1 + %inc28 = add nsw i32 %storemerge7, 1 + br label %for.cond22 + +if.else30: ; preds = %if.then16 + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.else30 + %ret.4 = phi i32 [ %ret.2, %if.else30 ], [ %mul37, %for.body36 ] + %storemerge6 = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ] + %add33 = add nsw i32 %n, 10 + 
%cmp34 = icmp slt i32 %storemerge6, %add33 + br i1 %cmp34, label %for.body36, label %if.end41 + +for.body36: ; preds = %for.cond32 + %mul37 = shl nsw i32 %ret.4, 1 + %inc39 = add nsw i32 %storemerge6, 1 + br label %for.cond32 + +if.end41: ; preds = %for.cond32, %for.cond22 + %ret.5 = phi i32 [ %ret.3, %for.cond22 ], [ %ret.4, %for.cond32 ] + %mul42 = mul nsw i32 %conv, 10 + %add43 = add nsw i32 %ret.5, %mul42 + br label %if.end73 + +if.else44: ; preds = %if.end + br i1 %cmp18, label %if.then48, label %if.else59 + +if.then48: ; preds = %if.else44 + br label %for.cond50 + +for.cond50: ; preds = %for.body54, %if.then48 + %ret.6 = phi i32 [ %ret.2, %if.then48 ], [ %inc55, %for.body54 ] + %storemerge4 = phi i32 [ 0, %if.then48 ], [ %inc57, %for.body54 ] + %add51 = add nsw i32 %n, 8 + %cmp52 = icmp slt i32 %storemerge4, %add51 + br i1 %cmp52, label %for.body54, label %if.end70 + +for.body54: ; preds = %for.cond50 + %inc55 = add nsw i32 %ret.6, 1 + %inc57 = add nsw i32 %storemerge4, 1 + br label %for.cond50 + +if.else59: ; preds = %if.else44 + br label %for.cond61 + +for.cond61: ; preds = %for.body65, %if.else59 + %ret.7 = phi i32 [ %ret.2, %if.else59 ], [ %mul66, %for.body65 ] + %storemerge2 = phi i32 [ 0, %if.else59 ], [ %inc68, %for.body65 ] + %add62 = add nsw i32 %n, 8 + %cmp63 = icmp slt i32 %storemerge2, %add62 + br i1 %cmp63, label %for.body65, label %if.end70 + +for.body65: ; preds = %for.cond61 + %mul66 = shl nsw i32 %ret.7, 1 + %inc68 = add nsw i32 %storemerge2, 1 + br label %for.cond61 + +if.end70: ; preds = %for.cond61, %for.cond50 + %ret.8 = phi i32 [ %ret.6, %for.cond50 ], [ %ret.7, %for.cond61 ] + %div71 = sdiv i32 %conv, 2 + %add72 = add nsw i32 %ret.8, %div71 + br label %if.end73 + +if.end73: ; preds = %if.end70, %if.end41 + %storemerge3 = phi i32 [ %add72, %if.end70 ], [ %add43, %if.end41 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization0, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void 
@__vecz_v4_partial_linearization0 +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[ENTRYBOSCCINDIR:.+]] + +; CHECK: [[FORCOND6PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND6UNIFORM:.+]] + +; CHECK: [[FORCOND6UNIFORM]]: +; CHECK: %[[CMP7UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP7UNIFORM]], label %[[FORBODY9UNIFORM:.+]], label %[[IFENDLOOPEXIT3UNIFORM:.+]] + +; CHECK: [[FORBODY9UNIFORM]]: +; CHECK: br label %[[FORCOND6UNIFORM]] + +; CHECK: [[IFENDLOOPEXIT3UNIFORM]]: +; CHECK: br label %[[IFEND:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[ENTRYBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND6PREHEADERUNIFORM]], label %[[FORCOND6PREHEADER:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[IFENDLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[IFENDLOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND]] + +; CHECK: [[FORCOND6PREHEADER]]: +; CHECK: br label %[[FORCOND6:.+]] + +; CHECK: [[FORCONDPREHEADER:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[IFENDLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND6]]: +; CHECK: %[[CMP7:.+]] = icmp +; CHECK: br i1 %[[CMP7]], label %[[FORBODY9:.+]], label %[[IFENDLOOPEXIT3:.+]] + +; CHECK: [[FORBODY9]]: +; CHECK: br label %[[FORCOND6]] + +; CHECK: [[IFENDLOOPEXIT]]: +; CHECK: br label %[[IFEND]] + +; CHECK: [[IFENDLOOPEXIT3]]: +; CHECK: br label %[[FORCONDPREHEADER]] + +; CHECK: [[IFEND]]: +; CHECK: %[[CMP14:.+]] = icmp +; CHECK: br i1 %[[CMP14]], label %[[IFTHEN16:.+]], label %[[IFELSE44:.+]] + +; CHECK: [[IFTHEN16]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND22PREHEADERUNIFORM:.+]], label %[[IFTHEN16BOSCCINDIR:.+]] + +; CHECK: [[FORCOND32PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND32UNIFORM:.+]] + +; CHECK: [[FORCOND32UNIFORM]]: +; CHECK: %[[CMP34UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP34UNIFORM]], label %[[FORBODY36UNIFORM:.+]], label %[[IFEND41LOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODY36UNIFORM]]: +; CHECK: br label %[[FORCOND32UNIFORM]] + +; CHECK: [[IFEND41LOOPEXIT1UNIFORM]]: +; CHECK: br label %[[IFEND41UNIFORM:.+]] + +; CHECK: [[FORCOND22PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND22UNIFORM:.+]] + +; CHECK: [[IFTHEN16BOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND32PREHEADERUNIFORM]], label %[[FORCOND32PREHEADER:.+]] + +; CHECK: [[FORCOND22UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY25UNIFORM:.+]], label %[[IFEND41LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY25UNIFORM]]: +; CHECK: br label %[[FORCOND22UNIFORM]] + +; CHECK: [[IFEND41LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND41:.+]] + +; CHECK: [[FORCOND32PREHEADER]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[FORCOND22PREHEADER:.+]]: +; CHECK: br label %[[FORCOND22:.+]] + +; CHECK: [[FORCOND22]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY25:.+]], label %[[IFEND41LOOPEXIT:.+]] + +; CHECK: [[FORBODY25]]: +; CHECK: br label %[[FORCOND22]] + +; CHECK: [[FORCOND32]]: +; CHECK: %[[CMP34:.+]] = icmp +; CHECK: br i1 %[[CMP34]], label %[[FORBODY36:.+]], label %[[IFEND41LOOPEXIT1:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[IFEND41LOOPEXIT]]: +; CHECK: br label %[[IFEND41]] + +; CHECK: [[IFEND41LOOPEXIT1]]: +; CHECK: br label 
%[[FORCOND22PREHEADER]] + +; CHECK: [[IFEND41]]: +; CHECK: br label %[[IFEND73:.+]] + +; CHECK: [[IFELSE44]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND50PREHEADERUNIFORM:.+]], label %[[IFELSE44BOSCCINDIR:.+]] + +; CHECK: [[FORCOND61PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND61UNIFORM:.+]] + +; CHECK: [[FORCOND61UNIFORM]]: +; CHECK: %[[CMP63UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP63UNIFORM]], label %[[FORBODY65UNIFORM:.+]], label %[[IFEND70LOOPEXIT2UNIFORM:.+]] + +; CHECK: [[FORBODY65UNIFORM]]: +; CHECK: br label %[[FORCOND61UNIFORM]] + +; CHECK: [[IFEND70LOOPEXIT2UNIFORM]]: +; CHECK: br label %[[IFEND70UNIFORM:.+]] + +; CHECK: [[FORCOND50PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND50UNIFORM:.+]] + +; CHECK: [[IFELSE44BOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND61PREHEADERUNIFORM]], label %[[FORCOND61PREHEADER:.+]] + +; CHECK: [[FORCOND50UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY54UNIFORM:.+]], label %[[IFEND70LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY54UNIFORM]]: +; CHECK: br label %[[FORCOND50UNIFORM]] + +; CHECK: [[IFEND70LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND70:.+]] + +; CHECK: [[FORCOND61PREHEADER]]: +; CHECK: br label %[[FORCOND61:.+]] + +; CHECK: [[FORCOND50PREHEADER:.+]]: +; CHECK: br label %[[FORCOND50:.+]] + +; CHECK: [[FORCOND50]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY54:.+]], label %[[IFEND70LOOPEXIT:.+]] + +; CHECK: [[FORBODY54]]: +; CHECK: br label %[[FORCOND50]] + +; CHECK: [[FORCOND61]]: +; CHECK: %[[CMP63:.+]] = icmp +; CHECK: br i1 %[[CMP63]], label %[[FORBODY65:.+]], label %[[IFEND70LOOPEXIT2:.+]] + +; CHECK: [[FORBODY65]]: +; CHECK: br label %[[FORCOND61]] + +; CHECK: [[IFEND70LOOPEXIT]]: +; CHECK: br label %[[IFEND70]] + +; CHECK: [[IFEND70LOOPEXIT2]]: +; CHECK: br label %[[FORCOND50PREHEADER]] + +; CHECK: [[IFEND70]]: +; CHECK: br label %[[IFEND73]] + +; CHECK: [[IFEND73]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll new file mode 100644 index 0000000000000..acc9bee5af397 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization1.ll @@ -0,0 +1,320 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization1 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-. +; / \ | +; c d | +; / \ / | +; e f --' +; \ | +; \ g +; \| +; h +; +; * where nodes c and f are uniform branches, and node b is a varying +; branch. +; * where nodes c, d, e, f, g and h are divergent. 
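+;
+; (Here "uniform" means every lane of the vectorized work-group takes the
+; same branch direction, "varying" means lanes may disagree, and blocks
+; reachable only under a varying condition are divergent. In the kernel
+; below, the branch on %cmp = icmp eq i32 %conv, 0 depends on the work-item
+; id and is varying, while %cmp2 = icmp sgt i32 %n, 2 tests the uniform
+; kernel argument %n.)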
+; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <-. b' <. +; / \__|_ | | +; c d | `d' | +; / \ / | | | +; e f --' c' | +; \ | | | +; \ g f' -' +; \| | +; h g' +; | | +; | e' +; | | +; | h' +; \ / +; \ / +; \ / +; & +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization1(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (id + i % 2 == 0) { +; if (n > 2) { +; goto e; +; } +; } else { +; for (int i = 0; i < n + 10; i++) ret++; +; } +; if (n <= 2) break; +; } +; +; ret += n * 2; +; for (int i = 0; i < n * 2; i++) ret -= i; +; ret /= n; +; goto early; +; +; e: +; for (int i = 0; i < n + 5; i++) ret /= 2; +; ret -= n; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization1(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end14, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %ret.2, %if.end14 ] + %cmp = icmp eq i32 %conv, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + %cmp2 = icmp sgt i32 %n, 2 + br i1 %cmp2, label %e, label %if.end10 + +if.else: ; preds = %while.body + br label %for.cond + +for.cond: ; preds = %for.body, %if.else + %ret.1 = phi i32 [ %ret.0, %if.else ], [ %inc, %for.body ] + %storemerge = phi i32 [ 0, %if.else ], [ %inc9, %for.body ] + %add6 = add nsw i32 %n, 10 + %cmp7 = icmp slt i32 %storemerge, %add6 + br i1 %cmp7, label %for.body, label %if.end10 + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.1, 1 + %inc9 = add nsw i32 %storemerge, 1 + br label %for.cond + +if.end10: ; preds = %for.cond, %if.then + %ret.2 = phi i32 [ %ret.0, %if.then ], [ %ret.1, %for.cond ] + %cmp11 = icmp slt i32 %n, 3 + br i1 %cmp11, label %while.end, label %if.end14 + +if.end14: ; preds = %if.end10 + br label %while.body + +while.end: ; preds = %if.end10 + %mul = mul i32 %n, 2 + %add15 = add nsw i32 %ret.2, %mul + br label %for.cond17 + +for.cond17: ; preds = %for.body21, %while.end + %ret.3 = phi i32 [ %add15, %while.end ], [ %sub, %for.body21 ] + %storemerge1 = phi i32 [ 0, %while.end ], [ %inc23, %for.body21 ] + %mul18 = shl nsw i32 %n, 1 + %cmp19 = icmp slt i32 %storemerge1, %mul18 + br i1 %cmp19, label %for.body21, label %for.end24 + +for.body21: ; preds = %for.cond17 + %sub = sub nsw i32 %ret.3, %storemerge1 + %inc23 = add nsw i32 %storemerge1, 1 + br label %for.cond17 + +for.end24: ; preds = %for.cond17 + %0 = icmp eq i32 %ret.3, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %ret.3, %5 + br label %early + +e: ; preds = %if.then + br label %for.cond26 + +for.cond26: ; preds = %for.body30, %e + %ret.4 = phi i32 [ %ret.0, %e ], [ %div31, %for.body30 ] + %storemerge3 = phi i32 [ 0, %e ], [ %inc33, %for.body30 ] + %add27 = add nsw i32 %n, 5 + %cmp28 = icmp slt i32 %storemerge3, %add27 + br i1 %cmp28, label %for.body30, label %for.end34 + +for.body30: ; preds = %for.cond26 + %div31 = sdiv i32 %ret.4, 2 + %inc33 = add nsw i32 %storemerge3, 1 + br label %for.cond26 + +for.end34: ; preds = %for.cond26 + %sub35 = sub nsw i32 %ret.4, %n + br label %early + +early: ; preds = 
%for.end34, %for.end24 + %storemerge2 = phi i32 [ %div, %for.end24 ], [ %sub35, %for.end34 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization1, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization1 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[WHILEBODYUNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHENUNIFORM:.+]], label %[[WHILEBODYUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[IFEND10LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[IFEND10LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND10UNIFORM:.+]] + +; CHECK: [[IFTHENUNIFORM]]: +; CHECK: %[[CMP2UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP2UNIFORM]], label %[[FORCOND26PREHEADERUNIFORM:.+]], label %[[IFEND10UNIFORM]] + +; CHECK: [[WHILEBODYUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM]], label %[[WHILEBODYUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[WHILEBODYUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[FORCONDPREHEADER]] + +; CHECK: [[IFEND10UNIFORM]]: +; CHECK: %[[CMP11UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP11UNIFORM]], label %[[WHILEENDUNIFORM:.+]], label %[[WHILEBODYUNIFORM]] + +; CHECK: [[WHILEENDUNIFORM]]: +; CHECK: br label %[[FORCOND17UNIFORM:.+]] + +; CHECK: [[FORCOND17UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY21UNIFORM:.+]], label %[[FOREND24UNIFORM:.+]] + +; CHECK: [[FORBODY21UNIFORM]]: +; CHECK: br label %[[FORCOND17UNIFORM]] + +; CHECK: [[FOREND24UNIFORM]]: +; CHECK: br label %[[EARLYUNIFORM:.+]] + +; CHECK: [[FORCOND26PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND26UNIFORM:.+]] + +; CHECK: [[FORCOND26UNIFORM]]: +; CHECK: 
%[[CMP29UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP29UNIFORM]], label %[[FORBODY30UNIFORM:.+]], label %[[FOREND34UNIFORM:.+]] + +; CHECK: [[FORBODY30UNIFORM]]: +; CHECK: br label %[[FORCOND26UNIFORM]] + +; CHECK: [[FOREND34UNIFORM]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[IFTHEN:.+]]: +; CHECK: br label %[[IFEND10:.+]] + +; CHECK: [[FORCOND26PREHEADER:.+]]: +; CHECK: br label %[[FORCOND26:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[IFEND10LOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[IFEND10LOOPEXIT]]: +; CHECK: br label %[[IFTHEN]] + +; CHECK: [[IFEND10]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[WHILEEND:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: br label %[[FORCOND17:.+]] + +; CHECK: [[WHILEENDELSE:.+]]: +; CHECK: br label %[[FORCOND26PREHEADER]] + +; CHECK: [[FORCOND17]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY21:.+]], label %[[FOREND24:.+]] + +; CHECK: [[FORBODY21]]: +; CHECK: br label %[[FORCOND17]] + +; CHECK: [[FOREND24]]: +; CHECK: br label %[[WHILEENDELSE]] + +; CHECK: [[FORCOND26]]: +; CHECK: %[[CMP29:.+]] = icmp +; CHECK: br i1 %[[CMP29]], label %[[FORBODY30:.+]], label %[[FOREND34:.+]] + +; CHECK: [[FORBODY30]]: +; CHECK: br label %[[FORCOND26]] + +; CHECK: [[FOREND34]]: +; CHECK: br label %[[EARLY]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll new file mode 100644 index 0000000000000..1a07de7f75123 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization10.ll @@ -0,0 +1,568 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization10 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; / g <---. | +; / / \ | | +; / h i | | +; f / \ / \ | | +; | j k l | | +; | /| / \ / | | +; | m | n o --' | +; | / |/ | +; |/ q ----------' +; p | +; \ r +; \ / +; s +; +; * where nodes b, c, g, h, j, k and q are uniform branches, and node i is a +; varying branch. +; * where nodes k, l, o, n, m, p, q, r and s are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <-----. b' <-----. +; / \ | / \ | +; c d | c' d' | +; / \ / | / \ / | +; / e | / e' | +; / | | / | | +; / g <---. | / g' <---. 
| +; / / \ | | f' / \ | | +; / h i___|_|_|____/__ \ | | +; f / \ / \ | | | h' \ i' | | +; | j k l | | | / \ \| | | +; | /| / \ / | | | j' | l' | | +; | m | n o --' | | | \ / | | +; | / |/ | | | k' | | +; |/ q ----------' | \ | | | +; p | | \ o' ---' | +; \ r | \ / | +; \ / | n' | +; s \ | | +; | \ q' -------' +; | \ / +; | m' +; | | +; | r' +; | | +; | p' +; | | +; `-------> & <------ s' +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization10(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0) { // b +; // c +; for (int i = 0; i < n * 2; i++) ret++; +; if (n <= 10) { +; // f +; goto f; +; } +; } else { +; // d +; for (int i = 0; i < n / 4; i++) ret++; +; } +; // e +; ret++; +; while (1) { +; if (n & 1) { // g +; // h +; if (n < 3) { +; // j +; goto j; +; } +; } else { +; // i +; if (ret + id >= n) { +; // l +; ret /= n * n + ret; +; goto o; +; } +; } +; // k +; if (n & 1) { +; // n +; ret += n * ret; +; goto n; +; } +; // o +; o: +; ret++; +; } +; j: +; if (n < 2) { +; // m +; ret += n * 2 + 20; +; goto p; +; } else { +; goto q; +; } +; n: +; ret *= 4; +; q: +; if (n & 1) { +; // r +; ret++; +; goto r; +; } +; } +; +; r: +; for (int i = 0; i < n / 4; i++) ret++; +; goto s; +; +; f: +; ret /= n; +; goto p; +; +; p: +; for (int i = 0; i < n * 2; i++) ret++; +; +; s: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization10(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end55, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %ret.5, %if.end55 ] + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.1 = phi i32 [ %ret.0, %if.then ], [ %inc, %for.body ] + %storemerge5 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge5, %mul + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.1, 1 + %inc4 = add nsw i32 %storemerge5, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp5 = icmp slt i32 %n, 11 + br i1 %cmp5, label %f, label %if.end17 + +if.else: ; preds = %while.body + br label %for.cond9 + +for.cond9: ; preds = %for.body12, %if.else + %ret.2 = phi i32 [ %ret.0, %if.else ], [ %inc13, %for.body12 ] + %storemerge = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div = sdiv i32 %n, 4 + %cmp10 = icmp slt i32 %storemerge, %div + br i1 %cmp10, label %for.body12, label %if.end17 + +for.body12: ; preds = %for.cond9 + %inc13 = add nsw i32 %ret.2, 1 + %inc15 = add nsw i32 %storemerge, 1 + br label %for.cond9 + +if.end17: ; preds = %for.cond9, %for.end + %ret.3 = phi i32 [ %ret.1, %for.end ], [ %ret.2, %for.cond9 ] + br label %while.body20 + +while.body20: ; preds = %o, %if.end17 + %storemerge1.in = phi i32 [ %ret.3, %if.end17 ], [ %ret.4, %o ] + %storemerge1 = add nsw i32 %storemerge1.in, 1 + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %if.else26, label %if.then21 + +if.then21: ; preds = %while.body20 + %cmp22 = icmp slt i32 %n, 3 + br i1 %cmp22, label %j, label %if.end34 + +if.else26: ; preds = 
%while.body20 + %add = add nsw i32 %storemerge1, %conv + %cmp27 = icmp slt i32 %add, %n + br i1 %cmp27, label %if.end34, label %if.then29 + +if.then29: ; preds = %if.else26 + %mul30 = mul nsw i32 %n, %n + %add31 = add nsw i32 %storemerge1, %mul30 + %0 = icmp eq i32 %add31, 0 + %1 = select i1 %0, i32 1, i32 %add31 + %div32 = sdiv i32 %storemerge1, %1 + br label %o + +if.end34: ; preds = %if.else26, %if.then21 + %and35 = and i32 %n, 1 + %tobool36 = icmp eq i32 %and35, 0 + br i1 %tobool36, label %o, label %if.then37 + +if.then37: ; preds = %if.end34 + %mul38 = mul nsw i32 %storemerge1, %n + %add39 = add nsw i32 %mul38, %storemerge1 + %mul50 = shl nsw i32 %add39, 2 + br label %q + +o: ; preds = %if.end34, %if.then29 + %ret.4 = phi i32 [ %div32, %if.then29 ], [ %storemerge1, %if.end34 ] + br label %while.body20 + +j: ; preds = %if.then21 + %cmp42 = icmp eq i32 %n, 2 + br i1 %cmp42, label %q, label %if.then44 + +if.then44: ; preds = %j + %mul45 = mul i32 %n, 2 + %add46 = add nsw i32 %mul45, 20 + %add47 = add nsw i32 %add46, %storemerge1 + br label %p + +q: ; preds = %j, %if.then37 + %ret.5 = phi i32 [ %mul50, %if.then37 ], [ %storemerge1, %j ] + %and51 = and i32 %n, 1 + %tobool52 = icmp eq i32 %and51, 0 + br i1 %tobool52, label %if.end55, label %if.then53 + +if.then53: ; preds = %q + br label %for.cond57 + +if.end55: ; preds = %q + br label %while.body + +for.cond57: ; preds = %for.body61, %if.then53 + %ret.6.in = phi i32 [ %ret.5, %if.then53 ], [ %ret.6, %for.body61 ] + %storemerge2 = phi i32 [ 0, %if.then53 ], [ %inc64, %for.body61 ] + %ret.6 = add nsw i32 %ret.6.in, 1 + %div58 = sdiv i32 %n, 4 + %cmp59 = icmp slt i32 %storemerge2, %div58 + br i1 %cmp59, label %for.body61, label %s + +for.body61: ; preds = %for.cond57 + %inc64 = add nsw i32 %storemerge2, 1 + br label %for.cond57 + +f: ; preds = %for.end + %2 = icmp eq i32 %ret.1, -2147483648 + %3 = icmp eq i32 %n, -1 + %4 = and i1 %3, %2 + %5 = icmp eq i32 %n, 0 + %6 = or i1 %5, %4 + %7 = select i1 %6, i32 1, i32 %n + %div66 = sdiv i32 %ret.1, %7 + br label %p + +p: ; preds = %f, %if.then44 + %storemerge3 = phi i32 [ %add47, %if.then44 ], [ %div66, %f ] + br label %for.cond68 + +for.cond68: ; preds = %for.body72, %p + %ret.7 = phi i32 [ %storemerge3, %p ], [ %inc73, %for.body72 ] + %storemerge4 = phi i32 [ 0, %p ], [ %inc75, %for.body72 ] + %mul69 = shl nsw i32 %n, 1 + %cmp70 = icmp slt i32 %storemerge4, %mul69 + br i1 %cmp70, label %for.body72, label %s + +for.body72: ; preds = %for.cond68 + %inc73 = add nsw i32 %ret.7, 1 + %inc75 = add nsw i32 %storemerge4, 1 + br label %for.cond68 + +s: ; preds = %for.cond68, %for.cond57 + %ret.8 = phi i32 [ %ret.6, %for.cond57 ], [ %ret.7, %for.cond68 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.8, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" 
"no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization10, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization10 +; CHECK: br i1 true, label %[[WHILBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]] + +; CHECK: [[FORCOND9PREHEADER]]: +; CHECK: br label %[[FORCOND9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]] + +; CHECK: [[FORCOND9]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND9]] + +; CHECK: [[IFEND17LOOPEXIT]]: +; CHECK: br label %[[IFEND17]] + +; CHECK: [[IFEND17]]: +; CHECK: br label %[[WHILEBODY20:.+]] + +; CHECK: [[WHILEBODY20]]: +; CHECK: %[[TOBOOL:.+]] = icmp +; CHECK: br i1 %[[TOBOOL]], label %[[IFELSE26:.+]], label %[[IFTHEN21:.+]] + +; CHECK: [[IFTHEN21]]: +; CHECK: %[[CMP22:.+]] = icmp +; CHECK: br i1 %[[CMP22]], label %[[J:.+]], label %[[IFEND34:.+]] + +; CHECK: [[IFELSE26]]: +; CHECK: br label %[[IFTHEN29:.+]] + +; CHECK: [[WHILEBODYUNIFORM:.+]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[FORCOND9PREHEADERUNIFORM:.+]] + +; CHECK: [[FORCOND9PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND9UNIFORM:.+]] + +; CHECK: [[FORCOND9UNIFORM]]: +; CHECK: %[[CMP10UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FORBODY12UNIFORM:.+]], label %[[IFEND17LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY12UNIFORM]]: +; CHECK: br label %[[FORCOND9UNIFORM]] + +; CHECK: [[IFEND17LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND17UNIFORM:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[FORENDUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[FORENDUNIFORM]]: +; CHECK: %[[CMP5UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP5UNIFORM]], label %[[FUNIFORM:.+]], label %[[IFEND17UNIFORM]] + +; CHECK: [[IFEND17UNIFORM]]: +; CHECK: br label %[[WHILEBODY20UNIFORM:.+]] + +; CHECK: [[WHILEBODY20UNIFORM]]: +; CHECK: %[[TOBOOLUNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOLUNIFORM]], label %[[IFELSE26UNIFORM:.+]], label %[[IFTHEN21UNIFORM:.+]] + +; CHECK: [[IFTHEN21UNIFORM]]: +; CHECK: %[[CMP22UNIFORM:.+]] = icmp +; CHECK: 
br i1 %[[CMP22UNIFORM]], label %[[JUNIFORM:.+]], label %[[IFEND34UNIFORM:.+]] + +; CHECK: [[IFELSE26UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFEND34UNIFORM]], label %[[IFELSE26UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFTHEN29UNIFORM:.+]]: +; CHECK: br label %[[OUNIFORM:.+]] + +; CHECK: [[IFEND34UNIFORM]]: +; CHECK: %[[TOBOOL36UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOL36UNIFORM]], label %[[OUNIFORM]], label %[[IFTHEN37UNIFORM:.+]] + +; CHECK: [[IFELSE26UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN29UNIFORM]], label %[[IFELSE26UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSE26UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFTHEN29]] + +; CHECK: [[OUNIFORM]]: +; CHECK: br label %[[WHILEBODY20UNIFORM]] + +; CHECK: [[JUNIFORM]]: +; CHECK: %[[CMP42UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP42UNIFORM]], label %[[QUNIFORM:.+]], label %[[IFTHEN44UNIFORM:.+]] + +; CHECK: [[IFTHEN37UNIFORM]]: +; CHECK: br label %[[QUNIFORM]] + +; CHECK: [[QUNIFORM]]: +; CHECK: %[[TOBOOL52UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOL52UNIFORM]], label %[[WHILEBODYUNIFORM]], label %[[FORCOND57PREHEADERUNIFORM:.+]] + +; CHECK: [[IFTHEN44UNIFORM]]: +; CHECK: br label %[[PUNIFORM:.+]] + +; CHECK: [[FORCOND57PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND57UNIFORM:.+]] + +; CHECK: [[FORCOND57UNIFORM]]: +; CHECK: %[[CMP59UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP59UNIFORM]], label %[[FORBODY61UNIFORM:.+]], label %[[SLOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODY61UNIFORM]]: +; CHECK: br label %[[FORCOND57UNIFORM]] + +; CHECK: [[SLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[SUNIFORM:.+]] + +; CHECK: [[FUNIFORM]]: +; CHECK: br label %[[PUNIFORM]] + +; CHECK: [[PUNIFORM]]: +; CHECK: br label %[[FORCOND68UNIFORM:.+]] + +; CHECK: [[FORCOND68UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY72UNIFORM:.+]], label %[[SLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY72UNIFORM]]: +; CHECK: br label %[[FORCOND68UNIFORM]] + +; CHECK: [[SLOOPEXITUNIFORM]]: +; CHECK: br label %[[S:.+]] + +; CHECK: [[IFTHEN29]]: +; CHECK: br label %[[IFEND34]] + +; CHECK: [[IFEND34]]: +; CHECK: br label %[[O:.+]] + +; CHECK: [[IFTHEN37:.+]]: +; CHECK: br label %[[IFTHEN37ELSE:.+]] + +; CHECK: [[IFTHEN37ELSE]]: +; CHECK: br i1 %{{.+}}, label %[[JELSE:.+]], label %[[JSPLIT:.+]] + +; CHECK: [[O]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]] + +; CHECK: [[WHILEBODY20PUREEXIT]]: +; CHECK: br label %[[IFTHEN37]] + +; CHECK: [[J]]: +; CHECK: br label %[[WHILEBODY20PUREEXIT]] + +; CHECK: [[JELSE]]: +; CHECK: br label %[[Q:.+]] + +; CHECK: [[JSPLIT]]: +; CHECK: br label %[[Q]] + +; CHECK: [[IFTHEN44:.+]]: +; CHECK: br label %[[IFTHEN44ELSE:.+]] + +; CHECK: [[IFTHEN44ELSE]]: +; CHECK: br label %[[FORCOND57PREHEADER:.+]] + +; CHECK: [[Q]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFTHEN44]] + +; CHECK: [[FORCOND57PREHEADER]]: +; CHECK: br label %[[FORCOND57:.+]] + +; CHECK: [[FORCOND57PREHEADERELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]] + +; CHECK: [[FORCOND57]]: +; CHECK: %[[CMP59:.+]] = icmp +; CHECK: br i1 %[[CMP59]], label %[[FORBODY61:.+]], label %[[SLOOPEXIT1:.+]] + +; CHECK: [[FORBODY61]]: +; CHECK: br label %[[FORCOND57]] + +; CHECK: [[F]]: +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[FELSE]]: +; CHECK: br label %[[P:.+]] + +; CHECK: [[FSPLIT]]: +; CHECK: br label %[[P]] + +; CHECK: [[P]]: +; CHECK: br label %[[FORCOND68:.+]] + +; CHECK: [[FORCOND68]]: +; 
CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY72:.+]], label %[[SLOOPEXIT:.+]] + +; CHECK: [[FORBODY72]]: +; CHECK: br label %[[FORCOND68]] + +; CHECK: [[SLOOPEXIT]]: +; CHECK: br label %[[S]] + +; CHECK: [[SLOOPEXIT1]]: +; CHECK: br label %[[FORCOND57PREHEADERELSE]] + +; CHECK: [[S]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll new file mode 100644 index 0000000000000..4b423f2d3f079 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization11.ll @@ -0,0 +1,425 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization11 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-------. +; | | +; c <---. | +; / \ | | +; d e | | +; / \ / \ | | +; i f g | | +; | / \ / \| | +; | j h --' | +; | | \ | +; | | k | +; | \ / | +; | \ / | +; | \ / | +; | \ / | +; | l -----' +; | / +; \ m +; \ / +; n +; +; * where nodes c, d, f, g, and l are uniform branches, and node e is a +; varying branch. +; * where nodes i, f, g, j, h, k, l, m and n are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <-------. b' <----. +; | | | | +; c <---. | c' <--. | +; / \ | | / \ | | +; d e___|___|_ d' e' | | +; / \ / \ | | \|__ | | | +; i f g | | | `g' | | +; | / \ / \| | \ / | | +; | j h --' | f' | | +; | | \ | | | | +; | | k | h' ---' | +; | \ / | | | +; | \ / | k' | +; | \ / | | | +; | \ / | j' | +; | l -----' | | +; | / l' -----' +; \ m | +; \ / m' +; n | +; | i' +; | | +; `-----> & <---- n' +; +; where '&' represents merge blocks of BOSCC regions. 
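+;
+; (BOSCC duplicates the divergent region into a "uniform" clone and guards
+; entry to it with a dynamic check that all active lanes agree, conceptually:
+;
+;   %all = call i1 @__vecz_b_divergence_all(i1 %mask)
+;   br i1 %all, label %region.uniform, label %region
+;
+; The boscc_indir blocks in the CHECK lines below perform this check, and
+; the merge blocks marked '&' rejoin the two versions. The guard intrinsic
+; sketched here appears explicitly in the nested_loops4 checks earlier in
+; this patch; the %mask and %region names are illustrative only.)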
+; +; __kernel void partial_linearization11(__global int *out, int n) { +; // a +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; // b +; while (1) { +; if (n < 5) { // c +; // d +; for (int i = 0; i < n * 2; i++) ret++; +; if (n <= 3) { +; // i +; goto i; +; } +; } else { +; // e +; if (ret + id >= n) { +; // g +; ret /= n * n + ret; +; if (n <= 10) { +; goto k; +; } else { +; goto h; +; } +; } +; } +; // f +; ret *= n; +; if (n & 1) { +; goto j; +; } +; +; // h +; h: +; ret++; +; } +; +; j: +; ret += n * 2 + 20; +; goto l; +; +; k: +; ret *= n; +; goto l; +; +; l: +; if (n & 1) { +; // m +; ret++; +; goto m; +; } +; } +; +; m: +; for (int i = 0; i < n / 4; i++) ret++; +; goto n; +; +; i: +; ret /= n; +; +; n: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization11(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end33, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %storemerge, %if.end33 ] + br label %while.body2 + +while.body2: ; preds = %h, %while.body + %ret.1 = phi i32 [ %ret.0, %while.body ], [ %inc24, %h ] + %cmp = icmp slt i32 %n, 5 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body2 + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.2 = phi i32 [ %ret.1, %if.then ], [ %inc, %for.body ] + %storemerge2 = phi i32 [ 0, %if.then ], [ %inc6, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp4 = icmp slt i32 %storemerge2, %mul + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.2, 1 + %inc6 = add nsw i32 %storemerge2, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp7 = icmp slt i32 %n, 4 + br i1 %cmp7, label %i44, label %if.end20 + +if.else: ; preds = %while.body2 + %add = add nsw i32 %ret.1, %conv + %cmp10 = icmp slt i32 %add, %n + br i1 %cmp10, label %if.end20, label %if.then12 + +if.then12: ; preds = %if.else + %mul13 = mul nsw i32 %n, %n + %add14 = add nsw i32 %ret.1, %mul13 + %0 = icmp eq i32 %ret.1, -2147483648 + %1 = icmp eq i32 %add14, -1 + %2 = and i1 %0, %1 + %3 = icmp eq i32 %add14, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %add14 + %div = sdiv i32 %ret.1, %5 + %cmp15 = icmp slt i32 %n, 11 + br i1 %cmp15, label %k, label %h + +if.end20: ; preds = %if.else, %for.end + %ret.3 = phi i32 [ %ret.2, %for.end ], [ %ret.1, %if.else ] + %mul21 = mul nsw i32 %ret.3, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %h, label %j + +h: ; preds = %if.end20, %if.then12 + %ret.4 = phi i32 [ %div, %if.then12 ], [ %mul21, %if.end20 ] + %inc24 = add nsw i32 %ret.4, 1 + br label %while.body2 + +j: ; preds = %if.end20 + %mul25 = mul i32 %n, 2 + %add26 = add nsw i32 %mul25, 20 + %add27 = add nsw i32 %add26, %mul21 + br label %l + +k: ; preds = %if.then12 + %mul28 = mul nsw i32 %div, %n + br label %l + +l: ; preds = %k, %j + %storemerge = phi i32 [ %add27, %j ], [ %mul28, %k ] + %and29 = and i32 %n, 1 + %tobool30 = icmp eq i32 %and29, 0 + br i1 %tobool30, label %if.end33, label %if.then31 + +if.then31: ; preds = %l + br label %for.cond35 + +if.end33: ; preds = %l + br label %while.body + +for.cond35: ; preds = %for.body39, %if.then31 + %ret.5.in = phi i32 [ %storemerge, %if.then31 ], [ 
%ret.5, %for.body39 ] + %storemerge1 = phi i32 [ 0, %if.then31 ], [ %inc42, %for.body39 ] + %ret.5 = add nsw i32 %ret.5.in, 1 + %div36 = sdiv i32 %n, 4 + %cmp37 = icmp slt i32 %storemerge1, %div36 + br i1 %cmp37, label %for.body39, label %n46 + +for.body39: ; preds = %for.cond35 + %inc42 = add nsw i32 %storemerge1, 1 + br label %for.cond35 + +i44: ; preds = %for.end + %6 = icmp eq i32 %ret.2, -2147483648 + %7 = icmp eq i32 %n, -1 + %8 = and i1 %7, %6 + %9 = icmp eq i32 %n, 0 + %10 = or i1 %9, %8 + %11 = select i1 %10, i32 1, i32 %n + %div45 = sdiv i32 %ret.2, %11 + br label %n46 + +n46: ; preds = %i44, %for.cond35 + %ret.6 = phi i32 [ %div45, %i44 ], [ %ret.5, %for.cond35 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization11, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization11 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[WHILEBODY2:.+]] + +; CHECK: [[WHILEBODY2]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[IFELSE:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: br label %[[IFEND20:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[IFTHEN12:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: br label %[[WHILEBODY2UNIFORM:.+]] + +; CHECK: [[WHILEBODY2UNIFORM]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[IFELSEUNIFORM:.+]] + +; CHECK: [[IFELSEUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFEND20UNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFTHEN12UNIFORM:.+]]: +; CHECK: %[[CMP15UNIFORM:cmp.+]] = icmp +; CHECK: br i1 %[[CMP15UNIFORM]], label %[[KUNIFORM:.+]], 
label %[[HUNIFORM:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[FORENDUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[FORENDUNIFORM]]: +; CHECK: %[[CMP7UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP7UNIFORM]], label %[[I44UNIFORM:.+]], label %[[IFEND20UNIFORM]] + +; CHECK: [[IFEND20UNIFORM]]: +; CHECK: %[[TOBOOLUNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOLUNIFORM]], label %[[HUNIFORM]], label %[[JUNIFORM:.+]] + +; CHECK: [[IFELSEUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN12UNIFORM]], label %[[IFELSEUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSEUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFTHEN12]] + +; CHECK: [[HUNIFORM]]: +; CHECK: br label %[[WHILEBODY2UNIFORM]] + +; CHECK: [[KUNIFORM]]: +; CHECK: br label %[[LUNIFORM:.+]] + +; CHECK: [[JUNIFORM]]: +; CHECK: br label %[[LUNIFORM]] + +; CHECK: [[LUNIFORM]]: +; CHECK: %[[TOBOOL30UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOL30UNIFORM]], label %[[WHILEBODYUNIFORM]], label %[[FORCOND35PREHEADERUNIFORM:.+]] + +; CHECK: [[FORCOND35PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND35UNIFORM:.+]] + +; CHECK: [[FORCOND35UNIFORM]]: +; CHECK: %[[CMP37UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP37UNIFORM]], label %[[FORBODY39UNIFORM:.+]], label %[[N46LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY39UNIFORM]]: +; CHECK: br label %[[FORCOND35UNIFORM]] + +; CHECK: [[N46LOOPEXITUNIFORM]]: +; CHECK: br label %[[N46UNIFORM:.+]] + +; CHECK: [[I44UNIFORM]]: +; CHECK: br label %[[N46:.+]] + +; CHECK: [[IFTHEN12]]: +; CHECK: br label %[[IFEND20]] + +; CHECK: [[IFEND20]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[H]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY2]], label %[[WHILEBODY2PUREEXIT:.+]] + +; CHECK: [[WHILEBODY2PUREEXIT]]: +; CHECK: br label %[[K:.+]] + +; CHECK: [[J:.+]]: +; CHECK: br label %[[L:.+]] + +; CHECK: [[K]]: +; CHECK: br label %[[KELSE:.+]] + +; CHECK: [[KELSE]]: +; CHECK: br label %[[J]] + +; CHECK: [[L]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[FORCOND35PREHEADER:.+]] + +; CHECK: [[FORCOND35PREHEADER]]: +; CHECK: br label %[[FORCOND35:.+]] + +; CHECK: [[FORCOND35PREHEADERELSE:.+]]: +; CHECK: br label %[[I44:.+]] + +; CHECK: [[FORCOND35]]: +; CHECK: %[[CMP37:.+]] = icmp +; CHECK: br i1 %[[CMP37]], label %[[FORBODY39:.+]], label %[[N46LOOPEXIT:.+]] + +; CHECK: [[FORBODY39]]: +; CHECK: br label %[[FORCOND35]] + +; CHECK: [[I44]]: +; CHECK: br label %[[N46]] + +; CHECK: [[N46LOOPEXIT]]: +; CHECK: br label %[[FORCOND35PREHEADERELSE]] + +; CHECK: [[N46]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll new file mode 100644 index 0000000000000..270774ef0c142 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization12.ll @@ -0,0 +1,782 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization12 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; / g <---. | +; f / \ | | +; | h i | | +; | / / \ | | +; | / k l | | +; | / |\ /| | | +; |/ |/ \| | | +; j m n | | +; /| / \ / | | +; / | o p --' | +; / | / / | +; | | / r | +; | | / | | +; | |/ s ------' +; | | / +; | /| t +; | / | / +; |/ | / +; q | / +; | |/ +; | u +; \ / +; v +; +; * where nodes b, c, g, j, k, l, m, p and s are uniform branches, +; and node i is a varying branch. +; * where nodes k, l, o, n, m, p, q, s, r, t and v are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <-----. b' <----. +; / \ | / \ | +; c d | c' d' | +; / \ / | / \ / | +; / e | / e' | +; / | | / | | +; / g <---. | f' g' <--. | +; f / \ | | | / \ | | +; | h i___|_|__|_ h i' | | +; | / / \ | | | \/___ | | | +; | / k l | | | / `l' | | +; | / |\ /| | | |/ | | | +; |/ |/ \| | | j' k' | | +; j m n | | |\ | | | +; /| / \ / | | | \ n' | | +; / | o p --' | | \ | | | +; / | / / | | | m' | | +; | | / r | | | | | | +; | | / | | | | p' -' | +; | |/ s ------' | | / | +; | | / | | r' | +; | /| t | | | | +; | / | / | | s' -----' +; |/ | / | |/ +; q | / | o' +; | |/ | / +; | u | t' +; \ / |/ +; v u' +; | | +; | q' +; | | +; `-------> & <------ v' +; +; where '&' represents merge blocks of BOSCC regions. 
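+;
+; Reading aid for the *BOSCCINDIR / *BOSCCSTORE labels matched below (an
+; interpretation, not itself checked): at a varying branch inside a uniform
+; clone, an indirection block seems to either keep execution on the clone or,
+; once lanes diverge, fall back through a store block into the linearized
+; path. With illustrative block and value names:
+;
+;   if.else35.uniform.boscc_indir:           ; still on the uniform clone
+;     br i1 %lanes.agree, label %if.then38.uniform, label %if.else35.uniform.boscc_store
+;   if.else35.uniform.boscc_store:           ; lanes diverged: hand off to the
+;     br label %if.then38                    ; linearized BOSCC region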
+; +; __kernel void partial_linearization12(__global int *out, int n) { +; // a +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0) { // b +; // c +; for (int i = 0; i < n * 2; i++) ret++; +; if (n < 5) { +; // f +; goto f; +; } +; } else { +; // d +; for (int i = 0; i < n / 4; i++) ret++; +; } +; // e +; ret++; +; while (1) { +; if (n <= 2) { // g +; // h +; ret -= n * ret; +; for (int i = 0; i < n * 2; i++) ret++; +; // j +; goto j; +; } else { +; // i +; if (ret + id >= n) { +; // k +; ret /= n * n + ret; +; if (n < 5) { +; // m +; ret -= n; +; goto m; +; } else { +; // n +; ret += n; +; goto n; +; } +; } else { +; // l +; if (n >= 5) { +; // m +; ret += n; +; goto m; +; } else { +; // n +; ret -= n; +; goto n; +; } +; } +; } +; // m +; m: +; if (n & 1) { +; // o +; ret *= n; +; goto q; +; } else { +; // p +; goto p; +; } +; +; // n +; n: +; ret *= ret; +; // p +; p: +; if (n > 3) { +; goto r; +; } +; ret++; +; } +; +; // r +; r: +; ret *= 4; +; for (int i = 0; i < n / 4; i++) ret++; +; +; // s +; if (n & 1) { +; goto t; +; } +; ret++; +; } +; +; f: +; ret /= n; +; goto j; +; +; j: +; if (n == 2) { +; goto q; +; } else { +; goto u; +; } +; +; t: +; for (int i = 0; i < n + 1; i++) ret++; +; goto u; +; +; q: +; for (int i = 0; i < n / 4; i++) ret++; +; goto v; +; +; u: +; for (int i = 0; i < n * 2; i++) ret++; +; +; v: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization12(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end79, %entry + %storemerge = phi i32 [ 0, %entry ], [ %inc80, %if.end79 ] + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ %storemerge, %if.then ], [ %inc, %for.body ] + %storemerge10 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge10, %mul + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.0, 1 + %inc4 = add nsw i32 %storemerge10, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp5 = icmp slt i32 %n, 5 + br i1 %cmp5, label %f, label %if.end17 + +if.else: ; preds = %while.body + br label %for.cond9 + +for.cond9: ; preds = %for.body12, %if.else + %ret.1 = phi i32 [ %storemerge, %if.else ], [ %inc13, %for.body12 ] + %storemerge1 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div = sdiv i32 %n, 4 + %cmp10 = icmp slt i32 %storemerge1, %div + br i1 %cmp10, label %for.body12, label %if.end17 + +for.body12: ; preds = %for.cond9 + %inc13 = add nsw i32 %ret.1, 1 + %inc15 = add nsw i32 %storemerge1, 1 + br label %for.cond9 + +if.end17: ; preds = %for.cond9, %for.end + %ret.2 = phi i32 [ %ret.0, %for.end ], [ %ret.1, %for.cond9 ] + br label %while.body20 + +while.body20: ; preds = %if.end63, %if.end17 + %storemerge2.in = phi i32 [ %ret.2, %if.end17 ], [ %ret.4, %if.end63 ] + %storemerge2 = add nsw i32 %storemerge2.in, 1 + %cmp21 = icmp slt i32 %n, 3 + br i1 %cmp21, label %if.then23, label %if.else35 + +if.then23: ; preds = %while.body20 + %mul24 = mul nsw i32 %storemerge2, %n + %sub = sub nsw i32 %storemerge2, %mul24 + br label 
%for.cond26 + +for.cond26: ; preds = %for.body30, %if.then23 + %ret.3 = phi i32 [ %sub, %if.then23 ], [ %inc31, %for.body30 ] + %storemerge9 = phi i32 [ 0, %if.then23 ], [ %inc33, %for.body30 ] + %mul27 = shl nsw i32 %n, 1 + %cmp28 = icmp slt i32 %storemerge9, %mul27 + br i1 %cmp28, label %for.body30, label %j + +for.body30: ; preds = %for.cond26 + %inc31 = add nsw i32 %ret.3, 1 + %inc33 = add nsw i32 %storemerge9, 1 + br label %for.cond26 + +if.else35: ; preds = %while.body20 + %add = add nsw i32 %storemerge2, %conv + %cmp36 = icmp slt i32 %add, %n + br i1 %cmp36, label %if.else48, label %if.then38 + +if.then38: ; preds = %if.else35 + %mul39 = mul nsw i32 %n, %n + %add40 = add nsw i32 %storemerge2, %mul39 + %0 = icmp eq i32 %add40, 0 + %1 = select i1 %0, i32 1, i32 %add40 + %div41 = sdiv i32 %storemerge2, %1 + %cmp42 = icmp slt i32 %n, 5 + br i1 %cmp42, label %if.then44, label %if.else46 + +if.then44: ; preds = %if.then38 + %sub45 = sub nsw i32 %div41, %n + br label %m + +if.else46: ; preds = %if.then38 + %add47 = add nsw i32 %div41, %n + br label %n58 + +if.else48: ; preds = %if.else35 + %cmp49 = icmp sgt i32 %n, 4 + br i1 %cmp49, label %if.then51, label %if.else53 + +if.then51: ; preds = %if.else48 + %add52 = add nsw i32 %storemerge2, %n + br label %m + +if.else53: ; preds = %if.else48 + %sub54 = sub nsw i32 %storemerge2, %n + br label %n58 + +m: ; preds = %if.then51, %if.then44 + %storemerge7 = phi i32 [ %add52, %if.then51 ], [ %sub45, %if.then44 ] + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %p, label %if.then55 + +if.then55: ; preds = %m + %mul56 = mul nsw i32 %storemerge7, %n + br label %q + +n58: ; preds = %if.else53, %if.else46 + %storemerge3 = phi i32 [ %sub54, %if.else53 ], [ %add47, %if.else46 ] + %mul59 = mul nsw i32 %storemerge3, %storemerge3 + br label %p + +p: ; preds = %n58, %m + %ret.4 = phi i32 [ %mul59, %n58 ], [ %storemerge7, %m ] + %cmp60 = icmp sgt i32 %n, 3 + br i1 %cmp60, label %r, label %if.end63 + +if.end63: ; preds = %p + br label %while.body20 + +r: ; preds = %p + %mul65 = shl nsw i32 %ret.4, 2 + br label %for.cond67 + +for.cond67: ; preds = %for.body71, %r + %ret.5 = phi i32 [ %mul65, %r ], [ %inc72, %for.body71 ] + %storemerge4 = phi i32 [ 0, %r ], [ %inc74, %for.body71 ] + %div68 = sdiv i32 %n, 4 + %cmp69 = icmp slt i32 %storemerge4, %div68 + br i1 %cmp69, label %for.body71, label %for.end75 + +for.body71: ; preds = %for.cond67 + %inc72 = add nsw i32 %ret.5, 1 + %inc74 = add nsw i32 %storemerge4, 1 + br label %for.cond67 + +for.end75: ; preds = %for.cond67 + %and76 = and i32 %n, 1 + %tobool77 = icmp eq i32 %and76, 0 + br i1 %tobool77, label %if.end79, label %t + +if.end79: ; preds = %for.end75 + %inc80 = add nsw i32 %ret.5, 1 + br label %while.body + +f: ; preds = %for.end + %2 = icmp eq i32 %n, 0 + %3 = select i1 %2, i32 1, i32 %n + %div81 = sdiv i32 %ret.0, %3 + br label %j + +j: ; preds = %f, %for.cond26 + %ret.6 = phi i32 [ %div81, %f ], [ %ret.3, %for.cond26 ] + %cmp82 = icmp eq i32 %n, 2 + br i1 %cmp82, label %q, label %u + +t: ; preds = %for.end75 + br label %for.cond87 + +for.cond87: ; preds = %for.body91, %t + %ret.7 = phi i32 [ %ret.5, %t ], [ %inc92, %for.body91 ] + %storemerge5 = phi i32 [ 0, %t ], [ %inc94, %for.body91 ] + %cmp89 = icmp sgt i32 %storemerge5, %n + br i1 %cmp89, label %u, label %for.body91 + +for.body91: ; preds = %for.cond87 + %inc92 = add nsw i32 %ret.7, 1 + %inc94 = add nsw i32 %storemerge5, 1 + br label %for.cond87 + +q: ; preds = %j, %if.then55 + %ret.8 = phi i32 [ %mul56, %if.then55 ], [ 
%ret.6, %j ] + br label %for.cond97 + +for.cond97: ; preds = %for.body101, %q + %ret.9 = phi i32 [ %ret.8, %q ], [ %inc102, %for.body101 ] + %storemerge8 = phi i32 [ 0, %q ], [ %inc104, %for.body101 ] + %div98 = sdiv i32 %n, 4 + %cmp99 = icmp slt i32 %storemerge8, %div98 + br i1 %cmp99, label %for.body101, label %v + +for.body101: ; preds = %for.cond97 + %inc102 = add nsw i32 %ret.9, 1 + %inc104 = add nsw i32 %storemerge8, 1 + br label %for.cond97 + +u: ; preds = %for.cond87, %j + %ret.10 = phi i32 [ %ret.6, %j ], [ %ret.7, %for.cond87 ] + br label %for.cond107 + +for.cond107: ; preds = %for.body111, %u + %ret.11 = phi i32 [ %ret.10, %u ], [ %inc112, %for.body111 ] + %storemerge6 = phi i32 [ 0, %u ], [ %inc114, %for.body111 ] + %mul108 = shl nsw i32 %n, 1 + %cmp109 = icmp slt i32 %storemerge6, %mul108 + br i1 %cmp109, label %for.body111, label %v + +for.body111: ; preds = %for.cond107 + %inc112 = add nsw i32 %ret.11, 1 + %inc114 = add nsw i32 %storemerge6, 1 + br label %for.cond107 + +v: ; preds = %for.cond107, %for.cond97 + %ret.12 = phi i32 [ %ret.9, %for.cond97 ], [ %ret.11, %for.cond107 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.12, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization12, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization12 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]] + +; CHECK: [[FORCOND9PREHEADER]]: +; CHECK: br label %[[FORCOND9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]] + +; CHECK: [[FORCOND9]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: 
br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND9]] + +; CHECK: [[IFEND17LOOPEXIT]]: +; CHECK: br label %[[IFEND17]] + +; CHECK: [[IFEND17]]: +; CHECK: br label %[[WHILEBODY20:.+]] + +; CHECK: [[WHILEBODY20]]: +; CHECK: %[[CMP21:.+]] = icmp +; CHECK: br i1 %[[CMP21]], label %[[IFTHEN23:.+]], label %[[IFELSE35:.+]] + +; CHECK: [[IFTHEN23]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[IFTHEN23ELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]] + +; CHECK: [[IFTHEN23SPLIT:.+]]: +; CHECK: br label %[[FORCOND26:.+]] + +; CHECK: [[FORCOND26]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30:.+]], label %[[JLOOPEXIT:.+]] + +; CHECK: [[FORBODY30]]: +; CHECK: br label %[[FORCOND26]] + +; CHECK: [[IFELSE35]]: +; CHECK: br label %[[IFTHEN38:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[FORCOND9PREHEADERUNIFORM:.+]] + +; CHECK: [[FORCOND9PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND9UNIFORM:.+]] + +; CHECK: [[FORCOND9UNIFORM]]: +; CHECK: %[[CMP10UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FORBODY12UNIFORM:.+]], label %[[IFEND17LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY12UNIFORM]]: +; CHECK: br label %[[FORCOND9UNIFORM]] + +; CHECK: [[IFEND17LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND17UNIFORM:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[FORENDUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[FORENDUNIFORM]]: +; CHECK: %[[CMP5UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP5UNIFORM]], label %[[FUNIFORM:.+]], label %[[IFEND17UNIFORM]] + +; CHECK: [[IFEND17UNIFORM]]: +; CHECK: br label %[[WHILEBODY20UNIFORM:.+]] + +; CHECK: [[WHILEBODY20UNIFORM]]: +; CHECK: %[[CMP21UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP21UNIFORM]], label %[[IFTHEN23UNIFORM:.+]], label %[[IFELSE35UNIFORM:.+]] + +; CHECK: [[IFELSE35UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE48UNIFORM:.+]], label %[[IFELSE35UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFTHEN38UNIFORM:.+]]: +; CHECK: %[[CMP42UNIFORM:cmp.+]] = icmp +; CHECK: br i1 %[[CMP42UNIFORM]], label %[[IFTHEN44UNIFORM:.+]], label %[[IFELSE46UNIFORM:.+]] + +; CHECK: [[IFELSE46UNIFORM]]: +; CHECK: br label %[[N58UNIFORM:.+]] + +; CHECK: [[IFTHEN44UNIFORM]]: +; CHECK: br label %[[MUNIFORM:.+]] + +; CHECK: [[IFELSE48UNIFORM]]: +; CHECK: %[[CMP49UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP49UNIFORM]], label %[[IFTHEN51UNIFORM:.+]], label %[[IFELSE53UNIFORM:.+]] + +; CHECK: [[IFELSE35UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN38UNIFORM]], label %[[IFELSE35UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSE35UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFTHEN38]] + +; CHECK: [[IFELSE53UNIFORM]]: +; CHECK: br label %[[N58UNIFORM]] + +; CHECK: [[IFTHEN51UNIFORM]]: +; CHECK: br label %[[MUNIFORM]] + +; CHECK: [[N58UNIFORM]]: +; CHECK: br label %[[PUNIFORM:.+]] + +; CHECK: [[MUNIFORM]]: +; CHECK: %[[TOBOOLUNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOLUNIFORM]], label %[[PUNIFORM]], label %[[IFTHEN55UNIFORM:.+]] + +; CHECK: [[PUNIFORM]]: +; CHECK: %[[CMP60UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP60UNIFORM]], label %[[RUNIFORM:.+]], label %[[WHILEBODY20UNIFORM]] + +; CHECK: [[RUNIFORM]]: +; CHECK: br label 
%[[FORCOND67UNIFORM:.+]] + +; CHECK: [[FORCOND67UNIFORM]]: +; CHECK: %[[CMP69UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP69UNIFORM]], label %[[FORBODY71UNIFORM:.+]], label %[[FOREND75UNIFORM:.+]] + +; CHECK: [[FORBODY71UNIFORM]]: +; CHECK: br label %[[FORCOND67UNIFORM]] + +; CHECK: [[FOREND75UNIFORM]]: +; CHECK: %[[TOBOOL77UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOL77UNIFORM]], label %[[IFEND79UNIFORM:.+]], label %[[FORCOND87PREHEADERUNIFORM:.+]] + +; CHECK: [[IFEND79UNIFORM]]: +; CHECK: br label %[[WHILEBODYUNIFORM]] + +; CHECK: [[IFEND55UNIFORM:.+]]: +; CHECK: br label %[[QUNIFORM:.+]] + +; CHECK: [[FORCOND87PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND87UNIFORM:.+]] + +; CHECK: [[FORCOND87UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[ULOOPEXITUNIFORM:.+]], label %[[FORBODY91UNIFORM:.+]] + +; CHECK: [[FORBODY91UNIFORM]]: +; CHECK: br label %[[FORCOND87UNIFORM]] + +; CHECK: [[ULOOPEXITUNIFORM]]: +; CHECK: br label %[[UUNIFORM:.+]] + +; CHECK: [[IFTHEN23UNIFORM]]: +; CHECK: br label %[[FORCOND26UNIFORM:.+]] + +; CHECK: [[FORCOND26UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30UNIFORM:.+]], label %[[JLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY30UNIFORM]]: +; CHECK: br label %[[FORCOND26UNIFORM]] + +; CHECK: [[JLOOPEXITUNIFORM]]: +; CHECK: br label %[[JUNIFORM:.+]] + +; CHECK: [[FUNIFORM]]: +; CHECK: br label %[[JUNIFORM]] + +; CHECK: [[JUNIFORM]]: +; CHECK: %[[CMP82UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP82UNIFORM]], label %[[QUNIFORM]], label %[[UUNIFORM]] + +; CHECK: [[UUNIFORM]]: +; CHECK: br label %[[FORCOND107UNIFORM:.+]] + +; CHECK: [[FORCOND107UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY111UNIFORM:.+]], label %[[ULOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODY111UNIFORM]]: +; CHECK: br label %[[FORCOND107UNIFORM]] + +; CHECK: [[VLOOPEXIT1UNIFORM:.+]]: +; CHECK: br label %[[VUNIFORM:.+]] + +; CHECK: [[QUNIFORM]]: +; CHECK: br label %[[FORCOND97UNIFORM:.+]] + +; CHECK: [[FORCOND97UNIFORM]]: +; CHECK: %[[CMP99UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP99UNIFORM]], label %[[FORBODY101UNIFORM:.+]], label %[[VLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY101UNIFORM]]: +; CHECK: br label %[[FORCOND97UNIFORM]] + +; CHECK: [[VLOOPEXITUNIFORM]]: +; CHECK: br label %[[V:.+]] + +; CHECK: [[IFTHEN38]]: +; CHECK: %[[CMP42:cmp.+]] = icmp +; CHECK: br i1 %[[CMP42]], label %[[IFTHEN44:.+]], label %[[IFELSE46:.+]] + +; CHECK: [[IFTHEN44]]: +; CHECK: br label %[[IFELSE48:.+]] + +; CHECK: [[IFELSE46]]: +; CHECK: br label %[[IFELSE48]] + +; CHECK: [[IFELSE48]]: +; CHECK: %[[CMP49:.+]] = icmp +; CHECK: br i1 %[[CMP49]], label %[[IFTHEN51:.+]], label %[[IFELSE53:.+]] + +; CHECK: [[IFTHEN51]]: +; CHECK: br label %[[N58:.+]] + +; CHECK: [[IFELSE53]]: +; CHECK: br label %[[N58]] + +; CHECK: [[M:.+]]: +; CHECK: br label %[[P:.+]] + +; CHECK: [[IFTHEN55:.+]]: +; CHECK: br label %[[IFTHEN55ELSE:.+]] + +; CHECK: [[IFTHEN55ELSE]]: +; CHECK: br label %[[FORCOND87PREHEADER:.+]] + +; CHECK: [[N58]]: +; CHECK: br label %[[M]] + +; CHECK: [[P]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]] + +; CHECK: [[WHILEBODY20PUREEXIT]]: +; CHECK: br label %[[R:.+]] + +; CHECK: [[R]]: +; CHECK: br label %[[FORCOND67:.+]] + +; CHECK: [[FORCOND67]]: +; CHECK: %[[CMP69:.+]] = icmp +; CHECK: br i1 %[[CMP69]], label %[[FORBODY71:.+]], label %[[FOREND75:.+]] + +; CHECK: [[FORBODY71]]: +; CHECK: br label %[[FORCOND67]] + +; CHECK: [[FOREND75]]: +; CHECK: br label %[[IFEND79:.+]] + +; CHECK: [[FORCOND87PREHEADER]]: +; CHECK: br 
label %[[FORCOND87:.+]] + +; CHECK: [[FORCOND87PREHEADERELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN23ELSE]], label %[[IFTHEN23SPLIT]] + +; CHECK: [[IFEND79]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFTHEN55]] + +; CHECK: [[F]]: +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[FELSE]]: +; CHECK: br label %[[U:.+]] + +; CHECK: [[FSPLIT]]: +; CHECK: br label %[[J:.+]] + +; CHECK: [[JLOOPEXIT]]: +; CHECK: br label %[[J]] + +; CHECK: [[J]]: +; CHECK: %[[CMP82:.+]] = icmp +; CHECK: br i1 %[[CMP82]], label %[[Q:.+]], label %[[U]] + +; CHECK: [[FORCOND87]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[ULOOPEXIT:.+]], label %[[FORBODY91:.+]] + +; CHECK: [[FORBODY91]]: +; CHECK: br label %[[FORCOND87]] + +; CHECK: [[Q]]: +; CHECK: br label %[[FORCOND97:.+]] + +; CHECK: [[FORCOND97]]: +; CHECK: %[[CMP99:.+]] = icmp +; CHECK: br i1 %[[CMP99]], label %[[FORBODY101:.+]], label %[[VLOOPEXIT:.+]] + +; CHECK: [[FORBODY101]]: +; CHECK: br label %[[FORCOND97]] + +; CHECK: [[ULOOPEXIT]]: +; CHECK: br label %[[FORCOND87PREHEADERELSE]] + +; CHECK: [[U]]: +; CHECK: br label %[[FORCOND107:.+]] + +; CHECK: [[FORCOND107]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY111:.+]], label %[[VLOOPEXIT1:.+]] + +; CHECK: [[FORBODY111]]: +; CHECK: br label %[[FORCOND107]] + +; CHECK: [[VLOOPEXIT]]: +; CHECK: br label %[[V]] + +; CHECK: [[VLOOPEXIT1]]: +; CHECK: br label %[[Q]] + +; CHECK: [[V]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll new file mode 100644 index 0000000000000..67d4e6542cdb5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization13.ll @@ -0,0 +1,251 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization13 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; \ / \ +; | \ +; | d +; | / \ +; | | e +; | \ / +; | f +; | / \ +; | | g +; | \ / +; \ h +; \ / +; i +; +; * where nodes d and f are uniform branches, and nodes a and c are varying +; branches. +; * where nodes b, c, i are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a___________ +; / \ \ +; b c_________ c' +; \ / \ \| +; | \ d' +; | d / \ +; | / \ | e' +; | | e \ / +; | \ / f' +; | f / \ +; | / \ | g' +; | | g \ / +; | \ / h' +; \ h | +; \ / b' +; i | +; `--> & <- i' +; +; where '&' represents merge blocks of BOSCC regions. 
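+;
+; Note that the switches in the kernel below reach this test already lowered
+; to two-way branches. For example, the first switch on
+; leftovers = 1 + (size & 1) appears in the IR as:
+;
+;   %0 = and i64 %call1, 1
+;   %trunc = icmp eq i64 %0, 0
+;   br i1 %trunc, label %sw.bb8, label %sw.bb
+;
+; i.e. a direct branch between the "case 1" (%sw.bb8) and "case 2" (%sw.bb)
+; bodies.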
+; +; __kernel void partial_linearization13(__global int *out, int n) { +; size_t tid = get_global_id(0); +; size_t size = get_global_size(0); +; // a +; if (tid + 1 < size) { +; // b +; out[tid] = n; +; } else if (tid + 1 == size) { // c +; size_t leftovers = 1 + (size & 1); +; switch (leftovers) { // d +; case 2: // e +; out[tid] = 2 * n + 1; +; // fall through +; case 1: // f +; out[tid] += 3 * n - 1; +; break; +; } +; switch (leftovers) { // g +; case 2: +; out[tid] /= n; +; // fall through +; case 1: // h +; out[tid]--; +; break; +; } +; } +; // i +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization13(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %call1 = call i64 @__mux_get_global_size(i32 0) #2 + %add = add i64 %call, 1 + %cmp = icmp ult i64 %add, %call1 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %n, i32 addrspace(1)* %arrayidx, align 4 + br label %if.end17 + +if.else: ; preds = %entry + %add2 = add i64 %call, 1 + %cmp3 = icmp eq i64 %add2, %call1 + br i1 %cmp3, label %if.then4, label %if.end17 + +if.then4: ; preds = %if.else + %0 = and i64 %call1, 1 + %trunc = icmp eq i64 %0, 0 + br i1 %trunc, label %sw.bb8, label %sw.bb + +sw.bb: ; preds = %if.then4 + %mul = shl nsw i32 %n, 1 + %add6 = or i32 %mul, 1 + %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %add6, i32 addrspace(1)* %arrayidx7, align 4 + br label %sw.bb8 + +sw.bb8: ; preds = %sw.bb, %if.then4 + %mul9 = mul nsw i32 %n, 3 + %sub = add nsw i32 %mul9, -1 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx10, align 4 + %add11 = add nsw i32 %sub, %1 + store i32 %add11, i32 addrspace(1)* %arrayidx10, align 4 + %2 = and i64 %call1, 1 + %trunc2 = icmp ne i64 %2, 0 + %trunc2.off = add i1 %trunc2, true + %switch = icmp ult i1 %trunc2.off, true + br i1 %switch, label %sw.bb12, label %sw.bb14 + +sw.bb12: ; preds = %sw.bb8 + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + %3 = load i32, i32 addrspace(1)* %arrayidx13, align 4 + %4 = icmp eq i32 %3, -2147483648 + %5 = icmp eq i32 %n, -1 + %6 = and i1 %5, %4 + %7 = icmp eq i32 %n, 0 + %8 = or i1 %7, %6 + %9 = select i1 %8, i32 1, i32 %n + %div = sdiv i32 %3, %9 + store i32 %div, i32 addrspace(1)* %arrayidx13, align 4 + br label %sw.bb14 + +sw.bb14: ; preds = %sw.bb12, %sw.bb8 + %arrayidx15 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + %10 = load i32, i32 addrspace(1)* %arrayidx15, align 4 + %dec = add nsw i32 %10, -1 + store i32 %dec, i32 addrspace(1)* %arrayidx15, align 4 + br label %if.end17 + +if.end17: ; preds = %sw.bb14, %if.else, %if.then + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_size(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" 
"unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization13, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization13 +; CHECK: br i1 %{{.+}}, label %[[IFTHENUNIFORM:.+]], label %[[ENTRYBOSCCINDIR:.+]] + +; CHECK: [[IFELSEUNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN4UNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFTHEN4UNIFORM]]: +; CHECK: %[[TRUNCUNIFORM:.+]] = icmp +; CHECK: br i1 %[[TRUNCUNIFORM]], label %[[SWBB8UNIFORM:.+]], label %[[SWBBUNIFORM:.+]] + +; CHECK: [[IFELSEUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFEND17UNIFORM:.+]], label %[[IFTHEN4:.+]] + +; CHECK: [[SWBBUNIFORM]]: +; CHECK: br label %[[SWBB8UNIFORM]] + +; CHECK: [[SWBB8UNIFORM]]: +; CHECK: %[[TRUNC2UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TRUNC2UNIFORM]], label %[[SWBB14UNIFORM:.+]], label %[[SWBB12UNIFORM:.+]] + +; CHECK: [[SWBB12UNIFORM]]: +; CHECK: br label %[[SWBB14UNIFORM]] + +; CHECK: [[SWBB14UNIFORM]]: +; CHECK: br label %[[IFEND17UNIFORM]] + +; CHECK: [[IFTHENUNIFORM]]: +; CHECK: br label %[[IFEND17:.+]] + +; CHECK: [[ENTRYBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSEUNIFORM]], label %[[IFELSE:.+]] + +; CHECK: [[IFTHEN:.+]]: +; CHECK: br label %[[IFEND17]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[IFTHEN4]] + +; CHECK: [[IFTHEN4]]: +; CHECK: %[[TRUNC:.+]] = icmp +; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even +; on inactive lanes. 
+; CHECK: %[[TRUNC_ACTIVE:.+]] = select i1 {{%.*}}, i1 %[[TRUNC]], i1 false +; CHECK: %[[TRUNC_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[TRUNC_ACTIVE]]) +; CHECK: br i1 %[[TRUNC_ACTIVE_ANY]], label %[[SWBB8:.+]], label %[[SWBB:.+]] + +; CHECK: [[SWBB]]: +; CHECK: br label %[[SWBB8]] + +; CHECK: [[SWBB8]]: +; CHECK: %[[TRUNC2:.+]] = icmp +; CHECK: br i1 %[[TRUNC2]], label %[[SWBB14:.+]], label %[[SWBB12:.+]] + +; CHECK: [[SWBB12]]: +; CHECK: br label %[[SWBB14]] + +; CHECK: [[SWBB14]]: +; CHECK: br label %[[IFTHEN]] + +; CHECK: [[IFEND17]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll new file mode 100644 index 0000000000000..1a3e5764611b1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization14.ll @@ -0,0 +1,356 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization14 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c <-. +; | / \ | +; | d e | +; |/ \ / | +; f g --' +; \ | +; \ h +; \| +; i +; +; * where nodes a, d and g are uniform branches, and node c is a varying +; branch. +; * where nodes d, e, f, g, h and i are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; / \ +; b c <-. c' <. +; / / \__|__ | | +; | d e | `e' | +; | / \ / | | | +; | f g --' d' | +; \ \ | | | +; \ \ h g' -' +; \ \| | +; \ i h' +; \| / +; \ / +; / \ / +; | \ / +; | f' +; | | +; | i' +; \ / +; & +; +; where '&' represents merge blocks of BOSCC regions. 
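+;
+; Reading aid for the *PUREEXIT labels matched below (an interpretation, not
+; itself checked): after the divergent while loop is linearized, the vector
+; loop keeps iterating while any lane still wants to continue, and all lanes
+; leave together through a single "pure exit" block. With illustrative names:
+;
+;   if.end24:                                ; linearized latch
+;     br i1 %any.lane.continues, label %while.body, label %while.body.pure_exit
+;   while.body.pure_exit:                    ; sole exit once every lane is done
+;     br label %if.end29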
+; +; __kernel void partial_linearization14(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; if (n < 5) { +; for (int i = 0; i < n + 10; i++) ret++; +; goto f; +; } else { +; while (1) { +; if (id + i % 2 == 0) { +; if (n > 2) { +; goto f; +; } +; } else { +; for (int i = 0; i < n + 10; i++) ret++; +; } +; if (n <= 2) break; +; } +; } +; +; ret += n * 2; +; for (int i = 0; i < n * 2; i++) ret -= i; +; ret /= n; +; goto early; +; +; f: +; for (int i = 0; i < n + 5; i++) ret /= 2; +; ret -= n; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization14(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 5 + br i1 %cmp, label %for.cond, label %while.body + +for.cond: ; preds = %for.body, %entry + %ret.0 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %storemerge4 = phi i32 [ %inc5, %for.body ], [ 0, %entry ] + %add = add nsw i32 %n, 10 + %cmp3 = icmp slt i32 %storemerge4, %add + br i1 %cmp3, label %for.body, label %f + +for.body: ; preds = %for.cond + %inc = add nuw nsw i32 %ret.0, 1 + %inc5 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond + +while.body: ; preds = %if.end24, %entry + %ret.1 = phi i32 [ 0, %entry ], [ %ret.3, %if.end24 ] + %cmp7 = icmp eq i32 %conv, 0 + br i1 %cmp7, label %if.then9, label %for.cond15 + +if.then9: ; preds = %while.body + %cmp10 = icmp sgt i32 %n, 2 + br i1 %cmp10, label %f, label %if.end24 + +for.cond15: ; preds = %for.body19, %while.body + %ret.2 = phi i32 [ %inc20, %for.body19 ], [ %ret.1, %while.body ] + %storemerge = phi i32 [ %inc22, %for.body19 ], [ 0, %while.body ] + %add16 = add nsw i32 %n, 10 + %cmp17 = icmp slt i32 %storemerge, %add16 + br i1 %cmp17, label %for.body19, label %if.end24 + +for.body19: ; preds = %for.cond15 + %inc20 = add nsw i32 %ret.2, 1 + %inc22 = add nuw nsw i32 %storemerge, 1 + br label %for.cond15 + +if.end24: ; preds = %for.cond15, %if.then9 + %ret.3 = phi i32 [ %ret.1, %if.then9 ], [ %ret.2, %for.cond15 ] + %cmp25 = icmp slt i32 %n, 3 + br i1 %cmp25, label %if.end29, label %while.body + +if.end29: ; preds = %if.end24 + %mul = mul i32 %n, 2 + %add30 = add nsw i32 %ret.3, %mul + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.end29 + %ret.4 = phi i32 [ %add30, %if.end29 ], [ %sub, %for.body36 ] + %storemerge1 = phi i32 [ 0, %if.end29 ], [ %inc38, %for.body36 ] + %mul33 = shl nsw i32 %n, 1 + %cmp34 = icmp slt i32 %storemerge1, %mul33 + br i1 %cmp34, label %for.body36, label %for.end39 + +for.body36: ; preds = %for.cond32 + %sub = sub nsw i32 %ret.4, %storemerge1 + %inc38 = add nuw nsw i32 %storemerge1, 1 + br label %for.cond32 + +for.end39: ; preds = %for.cond32 + %0 = icmp eq i32 %ret.4, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %ret.4, %5 + br label %early + +f: ; preds = %if.then9, %for.cond + %ret.5 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %if.then9 ] + br label %for.cond41 + +for.cond41: ; preds = %for.body45, %f + %ret.6 = phi i32 [ %ret.5, %f ], [ %div46, %for.body45 ] + %storemerge3 = phi i32 [ 0, %f ], [ %inc48, %for.body45 ] + %add42 = add nsw i32 %n, 5 + %cmp43 = icmp slt i32 %storemerge3, %add42 + br i1 
%cmp43, label %for.body45, label %for.end49 + +for.body45: ; preds = %for.cond41 + %div46 = sdiv i32 %ret.6, 2 + %inc48 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond41 + +for.end49: ; preds = %for.cond41 + %sub50 = sub nsw i32 %ret.6, %n + br label %early + +early: ; preds = %for.end49, %for.end39 + %storemerge2 = phi i32 [ %div, %for.end39 ], [ %sub50, %for.end49 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization14, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization14 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODYPREHEADER:.+]] + +; CHECK: [[WHILEBODYPREHEADER]]: +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCOND15PREHEADER:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN9UNIFORM:.+]], label %[[WHILEBODYUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[FORCOND15PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND15UNIFORM:.+]] + +; CHECK: [[FORCOND15UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19UNIFORM:.+]], label %[[IFEND24LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY19UNIFORM]]: +; CHECK: br label %[[FORCOND15UNIFORM]] + +; CHECK: [[IFEND24LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND24UNIFORM:.+]] + +; CHECK: [[IFTHEN9UNIFORM]]: +; CHECK: %[[CMP10UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FLOOPEXIT1UNIFORM:.+]], label %[[IFEND24UNIFORM]] + +; CHECK: [[WHILEBODYUNIFORMBOSCCINDIR]]: +; 
CHECK: br i1 %{{.+}}, label %[[FORCOND15PREHEADERUNIFORM]], label %[[WHILEBODYUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[WHILEBODYUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[FORCOND15PREHEADER]] + +; CHECK: [[IFEND24UNIFORM]]: +; CHECK: %[[CMP25UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP25UNIFORM]], label %[[IFEND29UNIFORM:.+]], label %[[WHILEBODYUNIFORM]] + +; CHECK: [[IFEND29UNIFORM]]: +; CHECK: br label %[[FORCOND32UNIFORM:.+]] + +; CHECK: [[FORCOND32UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36UNIFORM:.+]], label %[[FOREND39UNIFORM:.+]] + +; CHECK: [[FORBODY36UNIFORM]]: +; CHECK: br label %[[FORCOND32UNIFORM]] + +; CHECK: [[FOREND39UNIFORM]]: +; CHECK: br label %[[EARLYUNIFORM:.+]] + +; CHECK: [[FLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[FUNIFORM:.+]] + +; CHECK: [[FUNIFORM]]: +; CHECK: br label %[[FORCOND41UNIFORM:.+]] + +; CHECK: [[FORCOND41UNIFORM]]: +; CHECK: %[[CMP43UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP43UNIFORM]], label %[[FORBODY45UNIFORM:.+]], label %[[FOREND49UNIFORM:.+]] + +; CHECK: [[FORBODY45UNIFORM]]: +; CHECK: br label %[[FORCOND41UNIFORM]] + +; CHECK: [[FOREND49UNIFORM]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[FORCOND15PREHEADER]]: +; CHECK: br label %[[FORCOND15:.+]] + +; CHECK: [[IFTHEN9:.+]]: +; CHECK: br label %[[IFEND24:.+]] + +; CHECK: [[FORCOND15]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19:.+]], label %[[IFEND24LOOPEXIT:.+]] + +; CHECK: [[FORBODY19]]: +; CHECK: br label %[[FORCOND15]] + +; CHECK: [[IFEND24LOOPEXIT]]: +; CHECK: br label %[[IFTHEN9]] + +; CHECK: [[IFEND24]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFEND29:.+]] + +; CHECK: [[IFEND29]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[IFEND29ELSE:.+]]: +; CHECK: br label %[[FLOOPEXIT2:.+]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36:.+]], label %[[FOREND39:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[FOREND39]]: +; CHECK: br label %[[IFEND29ELSE]] + +; CHECK: [[FLOOPEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[FLOOPEXIT2]]: +; CHECK: br label %[[F]] + +; CHECK: [[F]]: +; CHECK: br label %[[FORCOND41:.+]] + +; CHECK: [[FORCOND41]]: +; CHECK: %[[CMP43:.+]] = icmp +; CHECK: br i1 %[[CMP43]], label %[[FORBODY45:.+]], label %[[FOREND49:.+]] + +; CHECK: [[FORBODY45]]: +; CHECK: br label %[[FORCOND41]] + +; CHECK: [[FOREND49]]: +; CHECK: br label %[[EARLY]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll new file mode 100644 index 0000000000000..b1626a8d0c7cc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization15.ll @@ -0,0 +1,415 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization15 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; / g <---. | +; / / \ | | +; f h i | | +; | / \ / \ | | +; | | j k | | +; | \ / \ / | | +; | l m --' | +; | / | +; | o ----------' +; | | +; n p +; \ / +; q +; +; * where nodes b, c, g, h, j and o are uniform branches, and node i is a +; varying branch. +; * where nodes j, k, m, l, and o are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <------------------. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; f g <---. g' <---. | +; | / \ | / \ | | +; | h i___|__ h' i' | | +; | / \ / \ | \|__ | | | +; | | j k | | `k' | | +; | \ / \ / | \ / | | +; | l m --' j' | | +; | | | | | +; | | m'-----' | +; \ | | | +; \ `----> & <--- l' | +; \ / | +; \ o ----------------' +; | | +; n p +; \ / +; q +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization15(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0) { // b +; // c +; for (int i = 0; i < n * 2; i++) ret++; +; if (n <= 10) { +; // f +; goto f; +; } +; } else { +; // d +; for (int i = 0; i < n / 4; i++) ret++; +; } +; // e +; ret++; +; while (1) { +; if (n & 1) { // g +; // h +; if (n < 3) { +; goto l; +; } +; } else { +; // i +; if (ret + id >= n) { +; // k +; ret /= n * n + ret; +; goto m; +; } +; } +; // j +; if (n & 1) { +; goto l; +; } +; // m +; m: +; ret++; +; } +; l: +; ret *= 4; +; o: +; if (n & 1) { +; // p +; ret++; +; goto p; +; } +; } +; +; p: +; for (int i = 0; i < n / 4; i++) ret++; +; goto q; +; +; f: +; ret /= n; +; goto n; +; +; n: +; for (int i = 0; i < n * 2; i++) ret++; +; +; q: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization15(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %l, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %mul40, %l ] + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %for.cond, label %for.cond9 + +for.cond: ; preds = %for.body, %while.body + %ret.1 = phi i32 [ %inc, %for.body ], [ %ret.0, %while.body ] + %storemerge3 = phi i32 [ %inc4, %for.body ], [ 0, %while.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge3, %mul + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.1, 1 + %inc4 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp5 = icmp slt i32 %n, 11 + br i1 %cmp5, label %f, label %if.end17 + +for.cond9: ; preds = %for.body12, %while.body + %ret.2 = phi i32 [ %inc13, %for.body12 ], [ %ret.0, %while.body ] + %storemerge = phi i32 [ %inc15, %for.body12 ], [ 0, %while.body ] + %div = sdiv i32 %n, 4 + %cmp10 = icmp slt i32 %storemerge, %div + br i1 %cmp10, label %for.body12, label %if.end17 + +for.body12: ; preds = 
%for.cond9 + %inc13 = add nsw i32 %ret.2, 1 + %inc15 = add nuw nsw i32 %storemerge, 1 + br label %for.cond9 + +if.end17: ; preds = %for.cond9, %for.end + %ret.3 = phi i32 [ %ret.1, %for.end ], [ %ret.2, %for.cond9 ] + br label %while.body20 + +while.body20: ; preds = %m, %if.end17 + %storemerge1.in = phi i32 [ %ret.3, %if.end17 ], [ %ret.4, %m ] + %storemerge1 = add nsw i32 %storemerge1.in, 1 + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %if.else26, label %if.then21 + +if.then21: ; preds = %while.body20 + %cmp22 = icmp slt i32 %n, 3 + br i1 %cmp22, label %l, label %if.end34 + +if.else26: ; preds = %while.body20 + %add = add nsw i32 %storemerge1, %conv + %cmp27 = icmp slt i32 %add, %n + br i1 %cmp27, label %if.end34, label %if.then29 + +if.then29: ; preds = %if.else26 + %mul30 = mul nsw i32 %n, %n + %add31 = add nsw i32 %storemerge1, %mul30 + %0 = icmp eq i32 %add31, 0 + %1 = select i1 %0, i32 1, i32 %add31 + %div32 = sdiv i32 %storemerge1, %1 + br label %m + +if.end34: ; preds = %if.else26, %if.then21 + %and35 = and i32 %n, 1 + %tobool36 = icmp eq i32 %and35, 0 + br i1 %tobool36, label %m, label %l + +m: ; preds = %if.end34, %if.then29 + %ret.4 = phi i32 [ %div32, %if.then29 ], [ %storemerge1, %if.end34 ] + br label %while.body20 + +l: ; preds = %if.end34, %if.then21 + %mul40 = shl nsw i32 %storemerge1, 2 + %and41 = and i32 %n, 1 + %tobool42 = icmp eq i32 %and41, 0 + br i1 %tobool42, label %while.body, label %if.then43 + +if.then43: ; preds = %l + %inc44 = or i32 %mul40, 1 + br label %for.cond47 + +for.cond47: ; preds = %for.body51, %if.then43 + %ret.5 = phi i32 [ %inc44, %if.then43 ], [ %inc52, %for.body51 ] + %storemerge2 = phi i32 [ 0, %if.then43 ], [ %inc54, %for.body51 ] + %div48 = sdiv i32 %n, 4 + %cmp49 = icmp slt i32 %storemerge2, %div48 + br i1 %cmp49, label %for.body51, label %q + +for.body51: ; preds = %for.cond47 + %inc52 = add nsw i32 %ret.5, 1 + %inc54 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond47 + +f: ; preds = %for.end + %2 = icmp eq i32 %ret.1, -2147483648 + %3 = icmp eq i32 %n, -1 + %4 = and i1 %3, %2 + %5 = icmp eq i32 %n, 0 + %6 = or i1 %5, %4 + %7 = select i1 %6, i32 1, i32 %n + %div56 = sdiv i32 %ret.1, %7 + br label %for.cond59 + +for.cond59: ; preds = %for.body63, %f + %ret.6 = phi i32 [ %div56, %f ], [ %inc64, %for.body63 ] + %storemerge4 = phi i32 [ 0, %f ], [ %inc66, %for.body63 ] + %mul60 = shl nsw i32 %n, 1 + %cmp61 = icmp slt i32 %storemerge4, %mul60 + br i1 %cmp61, label %for.body63, label %q + +for.body63: ; preds = %for.cond59 + %inc64 = add nsw i32 %ret.6, 1 + %inc66 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond59 + +q: ; preds = %for.cond59, %for.cond47 + %ret.7 = phi i32 [ %ret.5, %for.cond47 ], [ %ret.6, %for.cond59 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.7, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 
= { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization15, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization15 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]] + +; CHECK: [[FORCOND9PREHEADER]]: +; CHECK: br label %[[FORCOND9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 false, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]] + +; CHECK: [[FORCOND9]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND9]] + +; CHECK: [[IFEND17LOOPEXIT]]: +; CHECK: br label %[[IFEND17]] + +; CHECK: [[IFEND17]]: +; CHECK: br i1 true, label %[[WHILEBODY20UNIFORM:.+]], label %[[WHILEBODY20:.+]] + +; CHECK: [[WHILEBODY20]]: +; CHECK: %[[TOBOOL:.+]] = icmp +; CHECK: br i1 %[[TOBOOL]], label %[[IFELSE26:.+]], label %[[IFTHEN21:.+]] + +; CHECK: [[IFTHEN21]]: +; CHECK: br label %[[M:.+]] + +; CHECK: [[IFELSE26]]: +; CHECK: br label %[[IFTHEN29:.+]] + +; CHECK: [[WHILEBODY20UNIFORM]]: +; CHECK: %[[TOBOOLUNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOLUNIFORM]], label %[[IFELSE26UNIFORM:.+]], label %[[IFTHEN21UNIFORM:.+]] + +; CHECK: [[IFTHEN21UNIFORM]]: +; CHECK: %[[CMP22UNIFORM:.+]] = icmp +; CHECK: %[[TOBOOLNEW36UNIFORM:.+]] = icmp +; CHECK: %[[ORCONDUNIFORM:.+]] = and i1 %[[CMP22UNIFORM]], %[[TOBOOLNEW36UNIFORM]] +; CHECK: br i1 %[[ORCONDUNIFORM]], label %[[MUNIFORM:.+]], label %[[L:.+]] + +; CHECK: [[IFELSE26UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[IFEND34UNIFORM:.+]], label %[[IFELSE26UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFTHEN29UNIFORM:.+]]: +; CHECK: br label %[[MUNIFORM:.+]] + +; CHECK: [[IFEND34UNIFORM]]: +; CHECK: %[[TOBOOL36UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOL36UNIFORM]], label %[[MUNIFORM]], label %[[L:.+]] + +; CHECK: [[IFELSE26UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN29UNIFORM]], label %[[IFELSE26UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSE26UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFTHEN29]] + +; CHECK: [[MUNIFORM]]: +; CHECK: br label %[[WHILEBODY20UNIFORM]] + +; CHECK: [[IFTHEN29]]: +; CHECK: br label %[[IFEND34:.+]] + +; CHECK: [[IFEND34]]: +; CHECK: br label %[[M:.+]] + +; CHECK: [[M]]: +; CHECK: br i1 %{{.+}}, label 
%[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]] + +; CHECK: [[WHILEBODY20PUREEXIT]]: +; CHECK: br label %[[L]] + +; CHECK: [[L]]: +; CHECK: %[[TOBOOL42:.+]] = icmp +; CHECK: br i1 %[[TOBOOL42]], label %[[WHILEBODY]], label %[[IFTHEN43:.+]] + +; CHECK: [[IFTHEN43]]: +; CHECK: br label %[[FORCOND47:.+]] + +; CHECK: [[FORCOND47]]: +; CHECK: %[[CMP49:.+]] = icmp +; CHECK: br i1 %[[CMP49]], label %[[FORBODY51:.+]], label %[[QLOOPEXIT2:.+]] + +; CHECK: [[FORBODY51]]: +; CHECK: br label %[[FORCOND47]] + +; CHECK: [[F]]: +; CHECK: br label %[[FORCOND59:.+]] + +; CHECK: [[FORCOND59]]: +; CHECK: br i1 false, label %[[FORBODY63:.+]], label %[[QLOOPEXIT:.+]] + +; CHECK: [[FORBODY63]]: +; CHECK: br label %[[FORCOND59]] + +; CHECK: [[QLOOPEXIT]]: +; CHECK: br label %[[Q:.+]] + +; CHECK: [[QLOOPEXIT2]]: +; CHECK: br label %[[Q]] + +; CHECK: [[Q]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll new file mode 100644 index 0000000000000..e9567cc00d194 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization16.ll @@ -0,0 +1,394 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization16 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c <-. +; / / \ | +; | d e | +; | / \ / | +; | f g --' +; |/ | +; h i +; \ / +; \ / +; j +; +; * where nodes a, d and g are uniform branches, and node c is a varying +; branch. +; * where nodes d, e, f, g, i and j are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; / \ +; b c <-. c' <. +; / / \__|_ | | +; / d e | `e' | +; / / \ / | | | +; / f g --' d' | +; | / | | | +; \ h i g' -' +; \ \ / | +; \ \ / i' +; \ j | +; \| f' +; \ | +; /\ / +; | \ / +; | \ / +; | \ / +; | h' +; | | +; | j' +; \ / +; \ / +; & +; +; where '&' represents merge blocks of BOSCC regions. 
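+;
+; Each primed (uniform) region is entered through a guard that checks
+; dynamically whether every lane agrees on the branch condition. A
+; minimal sketch of such a guard, with illustrative value and block
+; names only (the pass derives the real check from its own masks, and
+; it can fold to the literal `br i1 true` seen on the preheader in the
+; CHECK lines below when the condition is known uniform):
+;
+;   %all.lanes = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %mask)
+;   br i1 %all.lanes, label %while.body.uniform, label %while.body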
+; +; __kernel void partial_linearization16(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; if (n < 5) { +; for (int i = 0; i < n + 10; i++) ret++; +; goto h; +; } else { +; while (1) { +; if (id + i % 2 == 0) { +; if (n > 2) { +; goto f; +; } +; } else { +; for (int i = 0; i < n + 10; i++) ret++; +; } +; if (n <= 2) break; +; } +; } +; +; ret += n * 2; +; for (int i = 0; i < n * 2; i++) ret -= i; +; ret /= n; +; goto early; +; +; f: +; for (int i = 0; i < n + 5; i++) ret /= 2; +; ret -= n; +; +; h: +; for (int i = 0; i < n * 2; i++) ret -= i; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization16(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 5 + br i1 %cmp, label %for.cond, label %while.body + +for.cond: ; preds = %for.body, %entry + %ret.0 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %storemerge4 = phi i32 [ %inc5, %for.body ], [ 0, %entry ] + %add = add nsw i32 %n, 10 + %cmp3 = icmp slt i32 %storemerge4, %add + br i1 %cmp3, label %for.body, label %h + +for.body: ; preds = %for.cond + %inc = add nuw nsw i32 %ret.0, 1 + %inc5 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond + +while.body: ; preds = %if.end24, %entry + %ret.1 = phi i32 [ 0, %entry ], [ %ret.3, %if.end24 ] + %cmp7 = icmp eq i32 %conv, 0 + br i1 %cmp7, label %if.then9, label %for.cond15 + +if.then9: ; preds = %while.body + %cmp10 = icmp sgt i32 %n, 2 + br i1 %cmp10, label %for.cond41, label %if.end24 + +for.cond15: ; preds = %for.body19, %while.body + %ret.2 = phi i32 [ %inc20, %for.body19 ], [ %ret.1, %while.body ] + %storemerge = phi i32 [ %inc22, %for.body19 ], [ 0, %while.body ] + %add16 = add nsw i32 %n, 10 + %cmp17 = icmp slt i32 %storemerge, %add16 + br i1 %cmp17, label %for.body19, label %if.end24 + +for.body19: ; preds = %for.cond15 + %inc20 = add nsw i32 %ret.2, 1 + %inc22 = add nuw nsw i32 %storemerge, 1 + br label %for.cond15 + +if.end24: ; preds = %for.cond15, %if.then9 + %ret.3 = phi i32 [ %ret.1, %if.then9 ], [ %ret.2, %for.cond15 ] + %cmp25 = icmp slt i32 %n, 3 + br i1 %cmp25, label %if.end29, label %while.body + +if.end29: ; preds = %if.end24 + %mul = mul i32 %n, 2 + %add30 = add nsw i32 %ret.3, %mul + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.end29 + %ret.4 = phi i32 [ %add30, %if.end29 ], [ %sub, %for.body36 ] + %storemerge1 = phi i32 [ 0, %if.end29 ], [ %inc38, %for.body36 ] + %mul33 = shl nsw i32 %n, 1 + %cmp34 = icmp slt i32 %storemerge1, %mul33 + br i1 %cmp34, label %for.body36, label %for.end39 + +for.body36: ; preds = %for.cond32 + %sub = sub nsw i32 %ret.4, %storemerge1 + %inc38 = add nuw nsw i32 %storemerge1, 1 + br label %for.cond32 + +for.end39: ; preds = %for.cond32 + %0 = icmp eq i32 %ret.4, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %ret.4, %5 + br label %early + +for.cond41: ; preds = %for.body45, %if.then9 + %ret.5 = phi i32 [ %div46, %for.body45 ], [ %ret.1, %if.then9 ] + %storemerge2 = phi i32 [ %inc48, %for.body45 ], [ 0, %if.then9 ] + %add42 = add nsw i32 %n, 5 + %cmp43 = icmp slt i32 %storemerge2, %add42 + br i1 %cmp43, label %for.body45, label %for.end49 + 
+for.body45: ; preds = %for.cond41 + %div46 = sdiv i32 %ret.5, 2 + %inc48 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond41 + +for.end49: ; preds = %for.cond41 + %sub50 = sub nsw i32 %ret.5, %n + br label %h + +h: ; preds = %for.end49, %for.cond + %ret.6 = phi i32 [ %sub50, %for.end49 ], [ %ret.0, %for.cond ] + br label %for.cond52 + +for.cond52: ; preds = %for.body56, %h + %ret.7 = phi i32 [ %ret.6, %h ], [ %sub57, %for.body56 ] + %storemerge3 = phi i32 [ 0, %h ], [ %inc59, %for.body56 ] + %mul53 = shl nsw i32 %n, 1 + %cmp54 = icmp slt i32 %storemerge3, %mul53 + br i1 %cmp54, label %for.body56, label %early + +for.body56: ; preds = %for.cond52 + %sub57 = sub nsw i32 %ret.7, %storemerge3 + %inc59 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond52 + +early: ; preds = %for.cond52, %for.end39 + %ret.8 = phi i32 [ %div, %for.end39 ], [ %ret.7, %for.cond52 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.8, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization16, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization16 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODYPREHEADER:.+]] + +; CHECK: [[WHILEBODYPREHEADER]]: +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCOND15PREHEADER:.+]] + +; CHECK: [[WHILEBODYUNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN9UNIFORM:.+]], label %[[WHILEBODYUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[FORCOND15PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND15UNIFORM:.+]] + +; CHECK: 
[[FORCOND15UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19UNIFORM:.+]], label %[[IFEND24LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY19UNIFORM]]: +; CHECK: br label %[[FORCOND15UNIFORM]] + +; CHECK: [[IFEND24LOOPEXITUNIFORM]]: +; CHECK: br label %[[IFEND24UNIFORM:.+]] + +; CHECK: [[IFTHEN9UNIFORM:.+]]: +; CHECK: %[[CMP10UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FORCOND41PREHEADERUNIFORM:.+]], label %[[IFEND24UNIFORM]] + +; CHECK: [[WHILEBODYUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND15PREHEADERUNIFORM]], label %[[WHILEBODYUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[WHILEBODYUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[FORCOND15PREHEADER]] + +; CHECK: [[IFEND24UNIFORM]]: +; CHECK: %[[CMP25UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP25UNIFORM]], label %[[IFEND29UNIFORM:.+]], label %[[WHILEBODYUNIFORM]] + +; CHECK: [[IFEND29UNIFORM]]: +; CHECK: br label %[[FORCOND32UNIFORM:.+]] + +; CHECK: [[FORCOND32UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36UNIFORM:.+]], label %[[FOREND39UNIFORM:.+]] + +; CHECK: [[FORBODY36UNIFORM]]: +; CHECK: br label %[[FORCOND32UNIFORM]] + +; CHECK: [[FOREND39UNIFORM]]: +; CHECK: br label %[[EARLYUNIFORM:.+]] + +; CHECK: [[FORCOND41PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND41UNIFORM:.+]] + +; CHECK: [[FORCOND41UNIFORM]]: +; CHECK: %[[CMP43UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP43UNIFORM]], label %[[FORBODY45UNIFORM:.+]], label %[[FOREND49UNIFORM:.+]] + +; CHECK: [[FORBODY45UNIFORM]]: +; CHECK: br label %[[FORCOND41UNIFORM]] + +; CHECK: [[FOREND49UNIFORM]]: +; CHECK: br label %[[HUNIFORM:.+]] + +; CHECK: [[HUNIFORM]]: +; CHECK: br label %[[FORCOND52UNIFORM:.+]] + +; CHECK: [[FORCOND52UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY56UNIFORM:.+]], label %[[EARLYLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY56UNIFORM]]: +; CHECK: br label %[[FORCOND52UNIFORM]] + +; CHECK: [[EARLYLOOPEXITUNIFORM]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[FORCOND15PREHEADER]]: +; CHECK: br label %[[FORCOND15:.+]] + +; CHECK: [[IFTHEN9:.+]]: +; CHECK: br label %[[IFEND24:.+]] + +; CHECK: [[FORCOND41PREHEADER:.+]]: +; CHECK: br label %[[FORCOND41:.+]] + +; CHECK: [[FORCOND15]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19:.+]], label %[[IFEND24LOOPEXIT:.+]] + +; CHECK: [[FORBODY19]]: +; CHECK: br label %[[FORCOND15]] + +; CHECK: [[IFEND24LOOPEXIT]]: +; CHECK: br label %[[IFTHEN9]] + +; CHECK: [[IFEND24]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFEND29:.+]] + +; CHECK: [[IFEND29]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[IFEND29ELSE:.+]]: +; CHECK: br label %[[FORCOND41PREHEADER]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36:.+]], label %[[FOREND39:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[FOREND39]]: +; CHECK: br label %[[IFEND29ELSE]] + +; CHECK: [[FORCOND41]]: +; CHECK: %[[CMP43:.+]] = icmp +; CHECK: br i1 %[[CMP43]], label %[[FORBODY45:.+]], label %[[FOREND49:.+]] + +; CHECK: [[FORBODY45]]: +; CHECK: br label %[[FORCOND41]] + +; CHECK: [[FOREND49]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[H]]: +; CHECK: br label %[[FORCOND52:.+]] + +; CHECK: [[FORCOND52]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY56:.+]], label %[[EARLYLOOPEXIT:.+]] + +; CHECK: [[FORBODY56]]: +; CHECK: br label 
%[[FORCOND52]] + +; CHECK: [[EARLYLOOPEXIT]]: +; CHECK: br label %[[EARLY]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll new file mode 100644 index 0000000000000..2c25911eeba63 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization17.ll @@ -0,0 +1,470 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization17 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <----. +; / \ | +; c d | +; / / \ | +; e f g -' +; / \ | | +; .--> h | i j +; | / \ | \ / +; '- k l '-> m +; | \ / +; n \ / +; \ o +; \ / +; \ / +; p +; +; * where nodes b, d, and h are uniform branches, and nodes e and g are varying +; branches. +; * where nodes h, j, m, o, and p are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <----. .-----------> b' <----. +; / \ | | / \ | +; c d | | c' d' | +; / / \ | | / / \ | +; e f g -' | e' f' g' -' +; / \__|___|\___' _____ / | | +; .--> h | i j\____/ .-->`h' i' | +; | / \ | \ / | / \ | | +; '- k l '-> m '- k' l' | | +; | \ / \ \ | / +; n \ / n' \ | / +; \ o \ \|/ +; \ / `-> j' +; \ / | +; p m' +; | | +; | o' +; | | +; `----------> & <--------- p' +; +; where '&' represents merge blocks of BOSCC regions. 
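+;
+; A note for reading the CHECK lines below (inferred from the branch
+; structure in this test, not a specification of vecz's naming): plain
+; labels such as %[[WHILEBODY]] form the linearized left-hand CFG,
+; *UNIFORM labels are the primed duplicates, and each *BOSCCINDIR block
+; either keeps execution in the uniform region or hands it over to the
+; predicated one through its *BOSCCSTORE twin once lanes diverge. The
+; shape, with made-up names, is:
+;
+;   if.end5.uniform.boscc_indir:
+;     br i1 %still.uniform, label %while.body.uniform, label %if.end5.uniform.boscc_store
+;   if.end5.uniform.boscc_store:
+;     br label %while.body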
+; +; __kernel void partial_linearization17(__global int *out, int n, int x) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (n > 10) { +; goto c; +; } else if (n < 5) { +; goto f; +; } +; if (id + i++ % 2 == 0) { +; break; +; } +; } +; +; // j +; for (int i = 0; i < n + 10; i++) ret++; +; goto m; +; +; f: +; ret += x / 2; +; for (int i = 0; i < x / 2; i++) ret += i; +; goto m; +; +; c: +; for (int i = 0; i < n - 5; i++) ret += 2; +; // e +; if (id % 2 == 0) { +; goto h; +; } else { +; goto m; +; } +; +; m: +; ret <<= 2; +; goto o; +; +; h: +; for (int i = 0; i < x / 2; i++) { +; if (x < 5) { +; goto l; +; } +; } +; // n +; ret += id << 3; +; goto p; +; +; l: +; ret += id << 3; +; +; o: +; for (int i = 0; i < x / 2; i++) ret += i; +; +; p: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 noundef %n, i32 noundef %x) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end5, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end5 ] + %cmp = icmp sgt i32 %n, 10 + br i1 %cmp, label %for.cond28, label %if.else + +if.else: ; preds = %while.body + %cmp2 = icmp slt i32 %n, 5 + br i1 %cmp2, label %f, label %if.end5 + +if.end5: ; preds = %if.else + %inc = add nuw nsw i32 %i.0, 1 + %rem = and i32 %i.0, 1 + %add = sub nsw i32 0, %rem + %cmp6 = icmp eq i32 %conv, %add + br i1 %cmp6, label %for.cond, label %while.body + +for.cond: ; preds = %for.body, %if.end5 + %ret.0 = phi i32 [ %inc14, %for.body ], [ 0, %if.end5 ] + %storemerge = phi i32 [ %inc15, %for.body ], [ 0, %if.end5 ] + %add11 = add nsw i32 %n, 10 + %cmp12 = icmp slt i32 %storemerge, %add11 + br i1 %cmp12, label %for.body, label %m + +for.body: ; preds = %for.cond + %inc14 = add nuw nsw i32 %ret.0, 1 + %inc15 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +f: ; preds = %if.else + %div = sdiv i32 %x, 2 + br label %for.cond18 + +for.cond18: ; preds = %for.body22, %f + %ret.1 = phi i32 [ %div, %f ], [ %add23, %for.body22 ] + %storemerge3 = phi i32 [ 0, %f ], [ %inc25, %for.body22 ] + %div19 = sdiv i32 %x, 2 + %cmp20 = icmp slt i32 %storemerge3, %div19 + br i1 %cmp20, label %for.body22, label %m + +for.body22: ; preds = %for.cond18 + %add23 = add nsw i32 %storemerge3, %ret.1 + %inc25 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond18 + +for.cond28: ; preds = %for.body32, %while.body + %ret.2 = phi i32 [ %add33, %for.body32 ], [ 0, %while.body ] + %storemerge4 = phi i32 [ %inc35, %for.body32 ], [ 0, %while.body ] + %add29 = add nsw i32 %n, 5 + %cmp30 = icmp slt i32 %storemerge4, %add29 + br i1 %cmp30, label %for.body32, label %for.end36 + +for.body32: ; preds = %for.cond28 + %add33 = add nuw nsw i32 %ret.2, 2 + %inc35 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond28 + +for.end36: ; preds = %for.cond28 + %rem375 = and i32 %conv, 1 + %cmp38 = icmp eq i32 %rem375, 0 + br i1 %cmp38, label %for.cond43, label %m + +m: ; preds = %for.end36, %for.cond18, %for.cond + %ret.3 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond18 ], [ %ret.2, %for.end36 ] + %shl = shl i32 %ret.3, 2 + br label %o + +for.cond43: ; preds = %for.inc52, %for.end36 + %storemerge6 = phi i32 [ %inc53, %for.inc52 ], [ 0, %for.end36 ] + %div44 = sdiv i32 %x, 2 + %cmp45 = 
icmp slt i32 %storemerge6, %div44 + br i1 %cmp45, label %for.body47, label %for.end54 + +for.body47: ; preds = %for.cond43 + %cmp48 = icmp slt i32 %x, 5 + br i1 %cmp48, label %l, label %for.inc52 + +for.inc52: ; preds = %for.body47 + %inc53 = add nuw nsw i32 %storemerge6, 1 + br label %for.cond43 + +for.end54: ; preds = %for.cond43 + %shl55 = mul i32 %conv, 8 + %add56 = add nsw i32 %ret.2, %shl55 + br label %p + +l: ; preds = %for.body47 + %shl57 = mul i32 %conv, 8 + %add58 = add nsw i32 %ret.2, %shl57 + br label %o + +o: ; preds = %l, %m + %storemerge1 = phi i32 [ %shl, %m ], [ %add58, %l ] + br label %for.cond60 + +for.cond60: ; preds = %for.body64, %o + %ret.4 = phi i32 [ %storemerge1, %o ], [ %add65, %for.body64 ] + %storemerge2 = phi i32 [ 0, %o ], [ %inc67, %for.body64 ] + %div61 = sdiv i32 %x, 2 + %cmp62 = icmp slt i32 %storemerge2, %div61 + br i1 %cmp62, label %for.body64, label %p + +for.body64: ; preds = %for.cond60 + %add65 = add nsw i32 %storemerge2, %ret.4 + %inc67 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond60 + +p: ; preds = %for.cond60, %for.end54 + %ret.5 = phi i32 [ %add56, %for.end54 ], [ %ret.4, %for.cond60 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.5, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization17, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization17 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCOND28PREHEADER:.+]], label %[[IFELSE:.+]] + +; CHECK: [[FORCOND28PREHEADER]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[FORCOND28PREHEADERELSE:.+]]: +; CHECK: br label %[[M:.+]] + +; CHECK: [[FORCOND28PREHEADERSPLIT:.+]]: +; CHECK: br label %[[FORCOND28:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: %[[CMP2:.+]] = icmp +; CHECK: br 
i1 %[[CMP2]], label %[[F:.+]], label %[[IFEND5:.+]] + +; CHECK: [[IFEND5]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[FORCOND28PREHEADERUNIFORM:.+]], label %[[IFELSEUNIFORM:.+]] + +; CHECK: [[IFELSEUNIFORM]]: +; CHECK: %[[CMP2UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP2UNIFORM]], label %[[FUNIFORM:.+]], label %[[IFEND5UNIFORM:.+]] + +; CHECK: [[IFEND5UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[IFEND5UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[IFEND5UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODYUNIFORM]], label %[[IFEND5UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFEND5UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[WHILEBODY]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[MLOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[MLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[MUNIFORM:.+]] + +; CHECK: [[FUNIFORM]]: +; CHECK: br label %[[FORCOND18UNIFORM:.+]] + +; CHECK: [[FORCOND18UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22UNIFORM:.+]], label %[[MLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY22UNIFORM]]: +; CHECK: br label %[[FORCOND18UNIFORM]] + +; CHECK: [[MLOOPEXITUNIFORM]]: +; CHECK: br label %[[MUNIFORM]] + +; CHECK: [[FORCOND28PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND28UNIFORM:.+]] + +; CHECK: [[FORCOND28UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY32UNIFORM:.+]], label %[[FOREND36UNIFORM:.+]] + +; CHECK: [[FORBODY32UNIFORM]]: +; CHECK: br label %[[FORCOND28UNIFORM]] + +; CHECK: [[FOREND36UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND43PREHEADERUNIFORM:.+]], label %[[FOREND36UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[FORCOND43PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND43UNIFORM:.+]] + +; CHECK: [[FOREND36UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[MUNIFORM]], label %[[FORCOND43PREHEADER:.+]] + +; CHECK: [[FORCOND43UNIFORM]]: +; CHECK: %[[CMP45UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP45UNIFORM]], label %[[FORBODY47UNIFORM:.+]], label %[[FOREND54UNIFORM:.+]] + +; CHECK: [[FORBODY47UNIFORM]]: +; CHECK: %[[CMP48UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP48UNIFORM]], label %[[LUNIFORM:.+]], label %[[FORINC52UNIFORM:.+]] + +; CHECK: [[FORINC52UNIFORM]]: +; CHECK: br label %[[FORCOND43UNIFORM]] + +; CHECK: [[FOREND54UNIFORM]]: +; CHECK: br label %[[PUNIFORM:.+]] + +; CHECK: [[LUNIFORM]]: +; CHECK: br label %[[OUNIFORM:.+]] + +; CHECK: [[MUNIFORM]]: +; CHECK: br label %[[OUNIFORM]] + +; CHECK: [[OUNIFORM]]: +; CHECK: br label %[[FORCOND60UNIFORM:.+]] + +; CHECK: [[FORCOND60UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY64UNIFORM:.+]], label %[[PLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY64UNIFORM]]: +; CHECK: br label %[[FORCOND60UNIFORM]] + +; CHECK: [[PLOOPEXITUNIFORM]]: +; CHECK: br label %[[P:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCONDPREHEADERELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[MLOOPEXIT2:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br 
label %[[FORCOND]] + +; CHECK: [[F]]: +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[FELSE]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND28PREHEADERELSE]], label %[[FORCOND28PREHEADERSPLIT]] + +; CHECK: [[FSPLIT]]: +; CHECK: br label %[[FORCOND18:.+]] + +; CHECK: [[FORCOND18]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22:.+]], label %[[MLOOPEXIT:.+]] + +; CHECK: [[FORBODY22]]: +; CHECK: br label %[[FORCOND18]] + +; CHECK: [[FORCOND28]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY32:.+]], label %[[FOREND36:.+]] + +; CHECK: [[FORBODY32]]: +; CHECK: br label %[[FORCOND28]] + +; CHECK: [[FOREND36]]: +; CHECK: br label %[[FORCOND43PREHEADER]] + +; CHECK: [[FORCOND43PREHEADER]]: +; CHECK: br label %[[FORCOND43:.+]] + +; CHECK: [[MLOOPEXIT]]: +; CHECK: br label %[[M]] + +; CHECK: [[MLOOPEXIT2]]: +; CHECK: br label %[[FORCONDPREHEADERELSE]] + +; CHECK: [[M]]: +; CHECK: br label %[[O:.+]] + +; CHECK: [[FORCOND43]]: +; CHECK: %[[CMP14:.+]] = icmp +; CHECK: br i1 %[[CMP14]], label %[[FORBODY47:.+]], label %[[FOREND54:.+]] + +; CHECK: [[FORBODY47]]: +; CHECK: %[[CMP48:.+]] = icmp +; CHECK: br i1 %[[CMP48]], label %[[L:.+]], label %[[FORINC52:.+]] + +; CHECK: [[FORINC52]]: +; CHECK: br label %[[FORCOND43]] + +; CHECK: [[FOREND54]]: +; CHECK: br label %[[M]] + +; CHECK: [[L]]: +; CHECK: br label %[[M]] + +; CHECK: [[O]]: +; CHECK: br label %[[FORCOND60:.+]] + +; CHECK: [[FORCOND60]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY64:.+]], label %[[PLOOPEXIT:.+]] + +; CHECK: [[FORBODY64]]: +; CHECK: br label %[[FORCOND60]] + +; CHECK: [[PLOOPEXIT]]: +; CHECK: br label %[[P]] + +; CHECK: [[P]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll new file mode 100644 index 0000000000000..f9868c86a2d0b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization18.ll @@ -0,0 +1,357 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization18 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <--. +; / \ | +; c d -' +; / \ | +; e f | +; | \| +; | g +; | / +; | h +; \ / \ +; i j +; \ / +; k +; +; * where nodes b, and h are uniform branches, and nodes c and d are varying +; branches. +; * where nodes e, f, g, i and k are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <--. .-> b' <--. 
+; / \ | | / \ | +; c d -' | c' d' -' +; / \__|\___' | | +; e f |`---> f' | +; | \| | | +; | g e' | +; | / \ / +; | h g' +; \ / \ | +; i j h' +; \ / / \ +; k | j' +; | \ / +; | i' +; | | +; `--> & <-- k' +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization18(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (n > 5) { +; if (id + i % 2 == 0) { +; goto e; +; } else { +; goto f; +; } +; } +; if (++i + id > 3) { +; goto g; +; } +; } +; +; f: +; for (int i = 0; i < n + 5; i++) ret += 2; +; goto g; +; +; g: +; for (int i = 1; i < n * 2; i++) ret *= i; +; goto h; +; +; e: +; for (int i = 0; i < n + 5; i++) ret++; +; goto i; +; +; h: +; if (n > 3) { +; i: +; ret++; +; } else { +; ret *= 3; +; } +; +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization18(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ] + %cmp = icmp sgt i32 %n, 5 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %while.body + %rem = and i32 %i.0, 1 + %add = sub nsw i32 0, %rem + %cmp2 = icmp eq i32 %conv, %add + br i1 %cmp2, label %for.cond26, label %for.cond + +if.end: ; preds = %while.body + %inc = add nuw nsw i32 %i.0, 1 + %add5 = add nsw i32 %inc, %conv + %cmp6 = icmp sgt i32 %add5, 3 + br i1 %cmp6, label %g, label %while.body + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ %add14, %for.body ], [ 0, %if.then ] + %storemerge2 = phi i32 [ %inc15, %for.body ], [ 0, %if.then ] + %add11 = add nsw i32 %n, 5 + %cmp12 = icmp slt i32 %storemerge2, %add11 + br i1 %cmp12, label %for.body, label %g + +for.body: ; preds = %for.cond + %add14 = add nuw nsw i32 %ret.0, 2 + %inc15 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond + +g: ; preds = %for.cond, %if.end + %ret.1 = phi i32 [ 0, %if.end ], [ %ret.0, %for.cond ] + br label %for.cond17 + +for.cond17: ; preds = %for.body20, %g + %ret.2 = phi i32 [ %ret.1, %g ], [ %mul21, %for.body20 ] + %storemerge = phi i32 [ 1, %g ], [ %inc23, %for.body20 ] + %mul = shl nsw i32 %n, 1 + %cmp18 = icmp slt i32 %storemerge, %mul + br i1 %cmp18, label %for.body20, label %h + +for.body20: ; preds = %for.cond17 + %mul21 = mul nsw i32 %storemerge, %ret.2 + %inc23 = add nuw nsw i32 %storemerge, 1 + br label %for.cond17 + +for.cond26: ; preds = %for.body30, %if.then + %ret.3 = phi i32 [ %inc31, %for.body30 ], [ 0, %if.then ] + %storemerge3 = phi i32 [ %inc33, %for.body30 ], [ 0, %if.then ] + %add27 = add nsw i32 %n, 5 + %cmp28 = icmp slt i32 %storemerge3, %add27 + br i1 %cmp28, label %for.body30, label %i38 + +for.body30: ; preds = %for.cond26 + %inc31 = add nuw nsw i32 %ret.3, 1 + %inc33 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond26 + +h: ; preds = %for.cond17 + %cmp35 = icmp sgt i32 %n, 3 + br i1 %cmp35, label %i38, label %if.else40 + +i38: ; preds = %h, %for.cond26 + %ret.4 = phi i32 [ %ret.3, %for.cond26 ], [ %ret.2, %h ] + %inc39 = add nsw i32 %ret.4, 1 + br label %if.end42 + +if.else40: ; preds = %h + %mul41 = mul nsw i32 %ret.2, 3 + br label %if.end42 + +if.end42: ; preds = %if.else40, %i38 + %storemerge1 = phi i32 [ %mul41, 
%if.else40 ], [ %inc39, %i38 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge1, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization18, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization18 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[IFTHENELSE:.+]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[IFTHENSPLIT:.+]]: +; CHECK: br label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND26PREHEADER:.+]]: +; CHECK: br label %[[FORCOND26:.+]] + +; CHECK: [[IFEND]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[GLOOPEXIT2:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[IFTHENUNIFORM:.+]], label %[[IFENDUNIFORM:.+]] + +; CHECK: [[IFENDUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[GLOOPEXIT1UNIFORM:.+]], label %[[IFENDUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[GLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[GUNIFORM:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODYUNIFORM]], label %[[IFENDUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[WHILEBODY]] + +; CHECK: [[IFTHENUNIFORM]] +; CHECK: br i1 %{{.+}}, label %[[FORCOND26PREHEADERUNIFORM:.+]], label %[[IFTHENUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label 
%[[GLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[GLOOPEXITUNIFORM]]: +; CHECK: br label %[[GUNIFORM]] + +; CHECK: [[FORCOND26PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND26UNIFORM:.+]] + +; CHECK: [[IFTHENUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[FORCOND26UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30UNIFORM:.+]], label %[[I38LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY30UNIFORM]]: +; CHECK: br label %[[FORCOND26UNIFORM]] + +; CHECK: [[I38LOOPEXITUNIFORM]]: +; CHECK: br label %[[I38UNIFORM:.+]] + +; CHECK: [[GUNIFORM]]: +; CHECK: br label %[[FORCOND17UNIFORM:.+]] + +; CHECK: [[FORCOND17UNIFORM]]: +; CHECK: %[[CMP18UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP18UNIFORM]], label %[[FORBODY20UNIFORM:.+]], label %[[HUNIFORM:.+]] + +; CHECK: [[FORBODY20UNIFORM]]: +; CHECK: br label %[[FORCOND17UNIFORM]] + +; CHECK: [[HUNIFORM]]: +; CHECK: %[[CMP35UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP35UNIFORM]], label %[[I38UNIFORM]], label %[[IFELSE40UNIFORM:.+]] + +; CHECK: [[IFELSE40UNIFORM]]: +; CHECK: br label %[[IFEND42UNIFORM:.+]] + +; CHECK: [[I38UNIFORM]]: +; CHECK: br label %[[IFEND42:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[GLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[GLOOPEXIT]]: +; CHECK: br label %[[FORCOND26PREHEADER]] + +; CHECK: [[GLOOPEXIT2]]: +; CHECK: br label %[[GLOOPEXIT2ELSE:.+]] + +; CHECK: [[GLOOPEXIT2ELSE]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHENELSE]], label %[[IFTHENSPLIT]] + +; CHECK: [[G]]: +; CHECK: br label %[[FORCOND17:.+]] + +; CHECK: [[FORCOND17]]: +; CHECK: %[[CMP18:.+]] = icmp +; CHECK: br i1 %[[CMP18]], label %[[FORBODY20:.+]], label %[[H:.+]] + +; CHECK: [[FORBODY20]]: +; CHECK: br label %[[FORCOND17]] + +; CHECK: [[FORCOND26]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30:.+]], label %[[I38LOOPEXIT:.+]] + +; CHECK: [[FORBODY30]]: +; CHECK: br label %[[FORCOND26]] + +; CHECK: [[H]]: +; CHECK: %[[CMP35:.+]] = icmp +; CHECK: br i1 %[[CMP35]], label %[[I38:.+]], label %[[IFELSE40:.+]] + +; CHECK: [[I38LOOPEXIT]]: +; CHECK: br label %[[G]] + +; CHECK: [[I38]]: +; CHECK: br label %[[IFEND42]] + +; CHECK: [[IFELSE40]]: +; CHECK: br label %[[I38]] + +; CHECK: [[IFEND42]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll new file mode 100644 index 0000000000000..37ca06b926eca --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization19.ll @@ -0,0 +1,379 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization19 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <----. +; / \ | +; c \ | +; / \ \ | +; d e f -' +; | | | +; \ \ g +; \ \ / \ +; \ h i <, +; \ \ / / +; \ j / +; \ / +; `-' +; +; * where nodes b, c, and g are uniform branches, and node f is a varying +; branch. +; * where nodes g, h, i and j are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <----. .---> b' <----. +; / \ | | / \ | +; c \ | | c' \ | +; / \ \ | | / \ \ | +; d e f -' | d' e' f' -' +; | | |\___' | | | +; \ \ g \ | / +; \ \ / \ \ | / +; \ h i <, \|/ +; \ \ / / g' +; \ j / | +; \ | / i' +; `-' | +; | h' +; | | +; `--> & <- j' +; +; where '&' represents merge blocks of BOSCC regions. +; +; The uniform branch `g` has been linearized because both its successors are +; divergent. Not linearizing `g` would mean that only one of both +; successors could be executed in addition to the other, pending a uniform +; condition evaluates to true, whereas what we want is to possibly execute both +; no matter what the uniform condition evaluates to. +; +; __kernel void partial_linearization19(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (n > 5) { +; if (n == 6) { +; goto d; +; } else { +; goto e; +; } +; } +; if (++i + id > 3) { +; break; +; } +; } +; +; // g +; if (n == 3) { +; goto h; +; } else { +; goto i; +; } +; +; d: +; for (int i = 0; i < n + 5; i++) ret += 2; +; goto i; +; +; e: +; for (int i = 1; i < n * 2; i++) ret += i; +; goto h; +; +; i: +; for (int i = 0; i < n + 5; i++) ret++; +; goto j; +; +; h: +; for (int i = 0; i < n; i++) ret++; +; goto j; +; +; j: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization19(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ] + %cmp = icmp sgt i32 %n, 5 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %while.body + %cmp2 = icmp eq i32 %n, 6 + br i1 %cmp2, label %for.cond, label %for.cond20 + +if.end: ; preds = %while.body + %inc = add nuw nsw i32 %i.0, 1 + %add = add nsw i32 %inc, %conv + %cmp5 = icmp sgt i32 %add, 3 + br i1 %cmp5, label %while.end, label %while.body + +while.end: ; preds = %if.end + %cmp9 = icmp eq i32 %n, 3 + br i1 %cmp9, label %h, label %i28 + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ %add17, %for.body ], [ 0, %if.then ] + %storemerge3 = phi i32 [ %inc18, %for.body ], [ 0, %if.then ] + %add14 = add nsw i32 %n, 5 + %cmp15 = icmp slt i32 %storemerge3, %add14 + br i1 %cmp15, label %for.body, label %i28 + +for.body: ; preds = %for.cond + %add17 = add nuw nsw i32 %ret.0, 2 + %inc18 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond + +for.cond20: ; preds = %for.body23, %if.then + %ret.1 = phi i32 [ %add24, %for.body23 ], [ 0, %if.then ] + %storemerge2 = phi i32 [ %inc26, %for.body23 ], [ 1, %if.then ] + %mul = shl nsw i32 %n, 1 + %cmp21 = icmp slt i32 
%storemerge2, %mul + br i1 %cmp21, label %for.body23, label %h + +for.body23: ; preds = %for.cond20 + %add24 = add nuw nsw i32 %storemerge2, %ret.1 + %inc26 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond20 + +i28: ; preds = %for.cond, %while.end + %ret.2 = phi i32 [ 0, %while.end ], [ %ret.0, %for.cond ] + br label %for.cond30 + +for.cond30: ; preds = %for.body34, %i28 + %ret.3 = phi i32 [ %ret.2, %i28 ], [ %inc35, %for.body34 ] + %storemerge = phi i32 [ 0, %i28 ], [ %inc37, %for.body34 ] + %add31 = add nsw i32 %n, 5 + %cmp32 = icmp slt i32 %storemerge, %add31 + br i1 %cmp32, label %for.body34, label %j + +for.body34: ; preds = %for.cond30 + %inc35 = add nuw nsw i32 %ret.3, 1 + %inc37 = add nuw nsw i32 %storemerge, 1 + br label %for.cond30 + +h: ; preds = %for.cond20, %while.end + %ret.4 = phi i32 [ 0, %while.end ], [ %ret.1, %for.cond20 ] + br label %for.cond40 + +for.cond40: ; preds = %for.body43, %h + %ret.5 = phi i32 [ %ret.4, %h ], [ %inc44, %for.body43 ] + %storemerge1 = phi i32 [ 0, %h ], [ %inc46, %for.body43 ] + %cmp41 = icmp slt i32 %storemerge1, %n + br i1 %cmp41, label %for.body43, label %j + +for.body43: ; preds = %for.cond40 + %inc44 = add nsw i32 %ret.5, 1 + %inc46 = add nuw nsw i32 %storemerge1, 1 + br label %for.cond40 + +j: ; preds = %for.cond40, %for.cond30 + %ret.6 = phi i32 [ %ret.3, %for.cond30 ], [ %ret.5, %for.cond40 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization19, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization19 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: %[[CMP2:.+]] = icmp +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] 
+ +; CHECK: [[IFTHENELSE:.+]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[IFTHENSPLIT:.+]]: +; CHECK: br i1 %[[CMP2MERGE:.+]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND20PREHEADER:.+]] + +; CHECK: [[FORCOND20PREHEADER]]: +; CHECK: br label %[[FORCOND20:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[IFEND]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: %[[CMP2MERGE]] = phi i1 [ %[[CMP2]], %[[IFTHEN]] ], [ false, %[[IFEND]] ] +; CHECK: br label %[[WHILEEND:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[IFTHENUNIFORM:.+]], label %[[IFENDUNIFORM:.+]] + +; CHECK: [[IFENDUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEENDUNIFORM:.+]], label %[[IFENDUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[WHILEENDUNIFORM]]: +; CHECK: %[[CMP9UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP9UNIFORM]], label %[[HUNIFORM:.+]], label %[[I28UNIFORM:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODYUNIFORM]], label %[[IFENDUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[WHILEBODY]] + +; CHECK: [[IFTHENUNIFORM]]: +; CHECK: %[[CMP2UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP2UNIFORM]], label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[FORCOND20PREHEADERUNIFORM:.+]] + +; CHECK: [[FORCOND20PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND20UNIFORM:.+]] + +; CHECK: [[FORCOND20UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY23UNIFORM:.+]], label %[[HLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY23UNIFORM]]: +; CHECK: br label %[[FORCOND20UNIFORM]] + +; CHECK: [[HLOOPEXITUNIFORM]]: +; CHECK: br label %[[HUNIFORM]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODYUNIFORM:.+]], label %[[I28LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[I28LOOPEXITUNIFORM]]: +; CHECK: br label %[[I28UNIFORM]] + +; CHECK: [[HUNIFORM]]: +; CHECK: br label %[[FORCOND40UNIFORM:.+]] + +; CHECK: [[FORCOND40UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY43UNIFORM:.+]], label %[[JLOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODY43UNIFORM]]: +; CHECK: br label %[[FORCOND40UNIFORM]] + +; CHECK: [[JLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[JUNIFORM:.+]] + +; CHECK: [[I28UNIFORM]]: +; CHECK: br label %[[FORCOND30UNIFORM:.+]] + +; CHECK: [[FORCOND30UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY34UNIFORM:.+]], label %[[JLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY34UNIFORM]]: +; CHECK: br label %[[FORCOND30UNIFORM]] + +; CHECK: [[JLOOPEXITUNIFORM]]: +; CHECK: br label %[[J:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: br label %[[WHILEENDELSE:.+]] + +; CHECK: [[WHILEENDELSE]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHENELSE]], label %[[IFTHENSPLIT]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[I28LOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND20]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY23:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY23]]: +; CHECK: br label %[[FORCOND20]] + +; CHECK: [[I28LOOPEXIT]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[I28:.+]]: +; CHECK: br label %[[FORCOND30:.+]] + +; CHECK: [[FORCOND30]]: +; CHECK: br i1 
{{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY34:.+]], label %[[JLOOPEXIT:.+]] + +; CHECK: [[FORBODY34]]: +; CHECK: br label %[[FORCOND30]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[H]]: +; CHECK: br label %[[FORCOND40:.+]] + +; CHECK: [[FORCOND40]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY43:.+]], label %[[JLOOPEXIT2:.+]] + +; CHECK: [[FORBODY43]]: +; CHECK: br label %[[FORCOND40]] + +; CHECK: [[JLOOPEXIT]]: +; CHECK: br label %[[J]] + +; CHECK: [[JLOOPEXIT2]]: +; CHECK: br label %[[I28]] + +; CHECK: [[J]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll new file mode 100644 index 0000000000000..401dfd4781787 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization2.ll @@ -0,0 +1,340 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization2 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; / \ +; / \ +; b c +; / \ / \ +; d e f g +; \ \ / / +; \ X / +; \ / \ / +; h i +; \ / +; j +; +; * where node a is a uniform branch, and nodes b and c are varying branches. +; * where nodes d, e, f, g are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; / \ +; / \ +; / \ +; b__ c________ +; / \ \___/_\___ \ +; d e f g `e' g' +; \ \ / / | | +; \ X / d' f' +; \ / \ / \ / +; h i i' +; \ / | +; j h' +; \ | +; `--> & <- j' +; +; where '&' represents merge blocks of BOSCC regions. 
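+;
+; For reference, lit substitutes %s with this file's path, so the RUN
+; line above corresponds to roughly the following manual invocation
+; (assuming veczc and FileCheck are on PATH):
+;
+;   veczc -k partial_linearization2 \
+;     -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" \
+;     -vecz-choices=LinearizeBOSCC -S < partial_linearization2.ll \
+;     | FileCheck partial_linearization2.ll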
+; +; __kernel void partial_linearization2(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (n < 10) { // uniform +; if (id % 3 == 0) { // varying +; for (int i = 0; i < n - 1; i++) { ret /= 2; } goto h; +; } else { // varying +; for (int i = 0; i < n / 3; i++) { ret -= 2; } goto i; +; } +; } else { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n * 2; i++) { ret += 1; } goto h; +; } else { // varying +; for (int i = 0; i < n + 5; i++) { ret *= 2; } goto i; +; } +; } +; +; h: +; ret += 5; +; goto end; +; +; i: +; ret *= 10; +; goto end; +; +; end: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization2(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 10 + br i1 %cmp, label %if.then, label %if.else17 + +if.then: ; preds = %entry + %rem = srem i32 %conv, 3 + %cmp2 = icmp eq i32 %rem, 0 + br i1 %cmp2, label %if.then4, label %if.else + +if.then4: ; preds = %if.then + br label %for.cond + +for.cond: ; preds = %for.body, %if.then4 + %ret.0 = phi i32 [ 0, %if.then4 ], [ %div, %for.body ] + %storemerge5 = phi i32 [ 0, %if.then4 ], [ %inc, %for.body ] + %sub = add nsw i32 %n, -1 + %cmp5 = icmp slt i32 %storemerge5, %sub + br i1 %cmp5, label %for.body, label %h + +for.body: ; preds = %for.cond + %div = sdiv i32 %ret.0, 2 + %inc = add nsw i32 %storemerge5, 1 + br label %for.cond + +if.else: ; preds = %if.then + br label %for.cond8 + +for.cond8: ; preds = %for.body12, %if.else + %ret.1 = phi i32 [ 0, %if.else ], [ %sub13, %for.body12 ] + %storemerge4 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div9 = sdiv i32 %n, 3 + %cmp10 = icmp slt i32 %storemerge4, %div9 + br i1 %cmp10, label %for.body12, label %i42 + +for.body12: ; preds = %for.cond8 + %sub13 = add nsw i32 %ret.1, -2 + %inc15 = add nsw i32 %storemerge4, 1 + br label %for.cond8 + +if.else17: ; preds = %entry + %rem181 = and i32 %conv, 1 + %cmp19 = icmp eq i32 %rem181, 0 + br i1 %cmp19, label %if.then21, label %if.else30 + +if.then21: ; preds = %if.else17 + br label %for.cond23 + +for.cond23: ; preds = %for.body26, %if.then21 + %ret.2 = phi i32 [ 0, %if.then21 ], [ %add, %for.body26 ] + %storemerge3 = phi i32 [ 0, %if.then21 ], [ %inc28, %for.body26 ] + %mul = shl nsw i32 %n, 1 + %cmp24 = icmp slt i32 %storemerge3, %mul + br i1 %cmp24, label %for.body26, label %h + +for.body26: ; preds = %for.cond23 + %add = add nsw i32 %ret.2, 1 + %inc28 = add nsw i32 %storemerge3, 1 + br label %for.cond23 + +if.else30: ; preds = %if.else17 + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.else30 + %ret.3 = phi i32 [ 0, %if.else30 ], [ %mul37, %for.body36 ] + %storemerge = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ] + %add33 = add nsw i32 %n, 5 + %cmp34 = icmp slt i32 %storemerge, %add33 + br i1 %cmp34, label %for.body36, label %i42 + +for.body36: ; preds = %for.cond32 + %mul37 = shl nsw i32 %ret.3, 1 + %inc39 = add nsw i32 %storemerge, 1 + br label %for.cond32 + +h: ; preds = %for.cond23, %for.cond + %ret.4 = phi i32 [ %ret.0, %for.cond ], [ %ret.2, %for.cond23 ] + %add41 = add nsw i32 %ret.4, 5 + br label %end + +i42: ; preds = %for.cond32, %for.cond8 + %ret.5 = phi i32 [ %ret.1, %for.cond8 ], [ %ret.3, %for.cond32 ] + %mul43 = mul nsw i32 %ret.5, 10 + br label 
%end + +end: ; preds = %i42, %h + %storemerge2 = phi i32 [ %mul43, %i42 ], [ %add41, %h ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization2, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization2 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE17:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[IFTHENBOSCCINDIR:.+]] + +; CHECK: [[FORCOND8PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND8UNIFORM:.+]] + +; CHECK: [[FORCOND8UNIFORM]]: +; CHECK: %[[CMP10UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FORBODY12UNIFORM:.+]], label %[[I42LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY12UNIFORM]]: +; CHECK: br label %[[FORCOND8UNIFORM]] + +; CHECK: [[I42LOOPEXITUNIFORM]]: +; CHECK: br label %[[I42UNIFORM:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[IFTHENBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND8PREHEADERUNIFORM]], label %[[FORCOND8PREHEADER:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: %[[CMP5UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP5UNIFORM]], label %[[FORBODYUNIFORM:.+]], label %[[HLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[HLOOPEXITUNIFORM]]: +; CHECK: br label %[[HUNIFORM:.+]] + +; CHECK: [[FORCOND8PREHEADER]]: +; CHECK: br label %[[FORCOND8:.+]] + +; CHECK: [[FORCONDPREHEADER:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND8]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[I42LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND8]] + +; CHECK: [[IFELSE17]]: +; CHECK: br i1 %{{.+}}, label 
%[[FORCOND23PREHEADERUNIFORM:.+]], label %[[IFELSE17BOSCCINDIR:.+]] + +; CHECK: [[FORCOND32PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND32UNIFORM:.+]] + +; CHECK: [[FORCOND32UNIFORM]]: +; CHECK: br i1 false, label %[[FORBODY36UNIFORM:.+]], label %[[I42LOOPEXIT2UNIFORM:.+]] + +; CHECK: [[FORBODY36UNIFORM]]: +; CHECK: br label %[[FORCOND32UNIFORM]] + +; CHECK: [[I42LOOPEXIT2UNIFORM]]: +; CHECK: br label %[[I42UNIFORM]] + +; CHECK: [[FORCOND23PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND23UNIFORM:.+]] + +; CHECK: [[IFELSE17BOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND32PREHEADERUNIFORM]], label %[[FORCOND32PREHEADER:.+]] + +; CHECK: [[FORCOND23UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26UNIFORM:.+]], label %[[HLOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODY26UNIFORM]]: +; CHECK: br label %[[FORCOND23UNIFORM]] + +; CHECK: [[HLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[HUNIFORM]] + +; CHECK: [[I42UNIFORM]]: +; CHECK: br label %[[ENDUNIFORM:.+]] + +; CHECK: [[HUNIFORM]]: +; CHECK: br label %[[END:.+]] + +; CHECK: [[FORCOND32PREHEADER]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[FORCOND23PREHEADER:.+]]: +; CHECK: br label %[[FORCOND23:.+]] + +; CHECK: [[FORCOND23]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26:.+]], label %[[HLOOPEXIT1:.+]] + +; CHECK: [[FORBODY26]]: +; CHECK: br label %[[FORCOND23]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 false, label %[[FORBODY36:.+]], label %[[I42LOOPEXIT2:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[I42:.+]] + +; CHECK: [[HLOOPEXIT1]]: +; CHECK: br label %[[I42]] + +; CHECK: [[H:.+]]: +; CHECK: br label %[[END]] + +; CHECK: [[I42LOOPEXIT]]: +; CHECK: br label %[[FORCONDPREHEADER]] + +; CHECK: [[I42LOOPEXIT2]]: +; CHECK: br label %[[FORCOND23PREHEADER]] + +; CHECK: [[I42]]: +; CHECK: br label %[[H]] + +; CHECK: [[END]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll new file mode 100644 index 0000000000000..9e7184f5507ce --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization20.ll @@ -0,0 +1,288 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization20 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <--------. +; / \ | +; | c | +; | / \ | +; | f h <--. 
| +; | | / \ | | +; | | | d -' | +; | | | | | +; | | | e ---' +; | | | / +; | | | / +; | | |/ +; | | / +; \|/ +; g +; +; * where nodes b, d, and e are uniform branches, and node h is a varying +; branch. +; * where nodes b, d and g are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <--------. b' <--. +; / \ | | | +; | c | .-. c' | +; | / \ | | \/| | +; | f h <--. | | / h' <. | +; | | / \ | | | f' | | | +; | | | d -' | | | d' -' | +; | | | |\___|_' | | | +; | | | e ---' | e' ---' +; | | | / \ | +; | | | / \| +; | | |/ g' +; | | / | +; \|/ / +; g ----> & <-----' +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization20(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0 && n < 5) { +; goto g; +; } +; if (n == 6) { +; goto f; +; } +; while (1) { +; if (ret++ + id >= n) { +; goto d; +; } +; if (n & 1) { +; goto g; +; } +; +; d: +; if (n > 3) { +; goto e; +; } +; } +; e: +; if (n & 1) { +; goto g; +; } +; } +; +; f: +; for (int i = 0; i < n + 1; i++) ret++; +; g: +; out[id] = ret; +; } + +; ModuleID = 'kernel.opencl' +source_filename = "kernel.opencl" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization20(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %e, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %inc, %e ] + %n.off = add i32 %n, -1 + %0 = icmp ult i32 %n.off, 4 + br i1 %0, label %g, label %if.end + +if.end: ; preds = %while.body + %cmp4 = icmp eq i32 %n, 6 + br i1 %cmp4, label %for.cond, label %while.body9 + +while.body9: ; preds = %d, %if.end + %ret.1 = phi i32 [ %ret.0, %if.end ], [ %inc, %d ] + %inc = add nsw i32 %ret.1, 1 + %add = add nsw i32 %ret.1, %conv + %cmp10 = icmp sge i32 %add, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + %or.cond1 = or i1 %tobool, %cmp10 + br i1 %or.cond1, label %d, label %g + +d: ; preds = %while.body9 + %cmp16 = icmp sgt i32 %n, 3 + br i1 %cmp16, label %e, label %while.body9 + +e: ; preds = %d + %and20 = and i32 %n, 1 + %tobool21 = icmp eq i32 %and20, 0 + br i1 %tobool21, label %while.body, label %g + +for.cond: ; preds = %for.body, %if.end + %ret.2 = phi i32 [ %inc27, %for.body ], [ %ret.0, %if.end ] + %storemerge = phi i32 [ %inc28, %for.body ], [ 0, %if.end ] + %cmp25 = icmp sgt i32 %storemerge, %n + br i1 %cmp25, label %g, label %for.body + +for.body: ; preds = %for.cond + %inc27 = add nsw i32 %ret.2, 1 + %inc28 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +g: ; preds = %for.cond, %e, %while.body9, %while.body + %ret.3 = phi i32 [ %ret.0, %while.body ], [ %inc, %e ], [ %ret.2, %for.cond ], [ %inc, %while.body9 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" 
"no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization20, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization20 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[IFEND:.+]] + +; CHECK: [[IFEND]]: +; CHECK: %[[CMP4:.+]] = icmp +; CHECK: br i1 %[[CMP4]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODY9PREHEADER:.+]] + +; CHECK: [[WHILEBODY9PREHEADER]]: +; CHECK: br label %[[WHILEBODY9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[FORCONDPREHEADERELSE:.+]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[FORCONDPREHEADERSPLIT:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[WHILEBODY9]]: +; CHECK: br label %[[D:.+]] + +; CHECK: [[WHILEBODYUNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[GLOOPEXIT2UNIFORM:.+]], label %[[IFENDUNIFORM:.+]] + +; CHECK: [[IFENDUNIFORM]]: +; CHECK: %[[CMP4UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP4UNIFORM]], label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[WHILEBODY9PREHEADERUNIFORM:.+]] + +; CHECK: [[WHILEBODY9PREHEADERUNIFORM]]: +; CHECK: br label %[[WHILEBODY8UNIFORM:.+]] + +; CHECK: [[WHILEBODY9UNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[DUNIFORM:.+]], label %[[WHILEBODY9UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[DUNIFORM]]: +; CHECK: %[[CMP16UNIFORM:.+]] = icmp +; CHECK: br i1 %{{.+}}, label %[[EUNIFORM:.+]], label %[[WHILEBODY9UNIFORM]] + +; CHECK: [[WHILEBODY9UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[GLOOPEXIT1UNIFORM:.+]], label %[[WHILEBODY9UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[WHILEBODY9UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[D]] + +; CHECK: [[EUNIFORM]]: +; CHECK: %[[TOBOOL21UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOL21UNIFORM]], label %[[WHILEBODYUNIFORM]], label %[[GLOOPEXIT2UNIFORM]] + + +; CHECK: [[GLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[GUNIFORM:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[GLOOPEXITUNIFORM:.+]], label %[[FORBODYUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[GLOOPEXITUNIFORM]]: +; CHECK: br label %[[GUNIFORM]] + +; CHECK: [[GLOOPEXIT2UNIFORM]]: +; CHECK: br label %[[G]] + +; CHECK: [[D]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY9]], label 
%[[WHILEBODY9PUREEXIT:.+]] + +; CHECK: [[WHILEBODY9PUREEXIT]]: +; CHECK: br label %[[E:.+]] + +; CHECK: [[E]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[GLOOPEXIT1:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[GLOOPEXIT:.+]], label %[[FORBODY:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[GLOOPEXIT]]: +; CHECK: br label %[[G]] + +; CHECK: [[GLOOPEXIT1]]: +; CHECK: br label %[[GLOOPEXIT1ELSE:.+]] + +; CHECK: [[GLOOPEXIT1ELSE]]: +; CHECK: br label %[[GLOOPEXIT2:.+]] + +; CHECK: [[GLOOPEXIT2]]: +; CHECK: br label %[[GLOOPEXIT2ELSE:.+]] + +; CHECK: [[GLOOPEXIT2ELSE]]: +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERELSE]], label %[[FORCONDPREHEADERSPLIT]] + +; CHECK: [[G]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll new file mode 100644 index 0000000000000..a91c3e08f752f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization21.ll @@ -0,0 +1,239 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization21 -vecz-passes=vecz-loop-rotate,cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <------. +; / \ | +; | c <--. | +; | / \ | | +; | | d -' | +; | | / \ | +; | | | e -' +; | | | / +; | | | / +; | | |/ +; | | / +; \|/ +; f +; +; * where nodes b, d, and e are uniform branches, and node c is a varying +; branch. +; * where nodes b, d, e and f are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <------. b' <--. +; / \ | | | +; | c <--. | c' <. | +; | / \___|_|__ | | | +; | | d -' | `d' -' | +; | | / \ | | | +; | | | e -' e' ---' +; | | | / | +; | | | / f' +; | | |/ | +; | | / | +; \|/ / +; f --> & <--' +; +; where '&' represents merge blocks of BOSCC regions. 
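+;
+; Reading the diagram: the primed blocks (b', c', ...) are clones taken while
+; every lane agrees on each varying branch, and the unprimed blocks form the
+; linearized fallback; the BOSCCINDIR branches checked below guard the
+; transition between the two. A minimal scalar sketch of that guard in C,
+; with a lane count of 4 to match the __vecz_v4 prefix (the helper name is
+; illustrative, not part of vecz's API):
+;
+;   #include <stdbool.h>
+;
+;   /* True when some, but not all, lanes take the branch: the uniform
+;      clone can no longer be used, so control transfers to the
+;      linearized region. */
+;   bool lanes_diverge(const bool lane_cond[4]) {
+;     bool any = lane_cond[0] || lane_cond[1] || lane_cond[2] || lane_cond[3];
+;     bool all = lane_cond[0] && lane_cond[1] && lane_cond[2] && lane_cond[3];
+;     return any && !all;
+;   }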
+; +; __kernel void partial_linearization21(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0 && n < 5) { +; goto f; +; } +; while (1) { +; if (n <= 2) { +; goto f; +; } else { +; if (ret + id >= n) { +; goto d; +; } +; } +; if (n & 1) { +; goto f; +; } +; +; d: +; if (n > 3) { +; goto e; +; } +; } +; +; e: +; if (n & 1) { +; goto f; +; } +; } +; +; f: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization21(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %e, %entry + %n.off = add i32 %n, -1 + %0 = icmp ult i32 %n.off, 4 + %cmp6 = icmp slt i32 %n, 3 + %or.cond1 = or i1 %cmp6, %0 + br i1 %or.cond1, label %f, label %if.else + +while.body5: ; preds = %d + %cmp6.old = icmp eq i32 %n, 3 + br i1 %cmp6.old, label %if.else, label %f + +if.else: ; preds = %while.body5, %while.body + %cmp9 = icmp sge i32 %conv, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + %or.cond2 = or i1 %tobool, %cmp9 + br i1 %or.cond2, label %d, label %f + +d: ; preds = %if.else + %cmp16 = icmp sgt i32 %n, 3 + br i1 %cmp16, label %e, label %while.body5 + +e: ; preds = %d + %and20 = and i32 %n, 1 + %tobool21 = icmp eq i32 %and20, 0 + br i1 %tobool21, label %while.body, label %f + +f: ; preds = %e, %if.else, %while.body5, %while.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 0, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization21, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization21 +; CHECK: br i1 true, label 
%[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[IFELSEPREHEADER:.+]] + +; CHECK: [[IFELSEPREHEADER]]: +; CHECK: br label %[[IFELSE:.+]] + +; CHECK: [[WHILEBODY5:.+]]: + +; CHECK: br i1 %{{.+}}, label %[[IFELSE]], label %[[IFELSEPUREEXIT:.+]] + +; CHECK: [[IFELSEPUREEXIT]]: +; CHECK: br label %[[E:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[D:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: %[[CMP6UNIFORM:cmp.+]] = icmp +; CHECK: %[[ORCOND1UNIFORM:.+]] = or i1 %[[CMP6UNIFORM]] +; CHECK: br i1 %[[ORCOND1UNIFORM]], label %[[FLOOPEXIT1UNIFORM:.+]], label %[[IFELSEPREHEADERUNIFORM:.+]] + +; CHECK: [[IFELSEPREHEADERUNIFORM]]: +; CHECK: br label %[[IFELSEUNIFORM:.+]] + +; CHECK: [[IFELSEUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[DUNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[DUNIFORM]]: +; CHECK: %[[CMP16UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP16UNIFORM]], label %[[EUNIFORM:.+]], label %[[WHILEBODY5UNIFORM:.+]] + +; CHECK: [[IFELSEUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FLOOPEXITUNIFORM:.+]], label %[[IFELSEUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFELSEUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[D]] + +; CHECK: [[WHILEBODY5UNIFORM]]: +; CHECK: %[[CMP6OLDUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP6OLDUNIFORM]], label %[[IFELSEUNIFORM]], label %[[FLOOPEXITUNIFORM]] + +; CHECK: [[EUNIFORM]]: +; CHECK: %[[TOBOOL21UNIFORM:.+]] = icmp +; CHECK: br i1 %[[TOBOOL21UNIFORM]], label %[[WHILEBODYUNIFORM]], label %[[FLOOPEXIT1UNIFORM]] + + +; CHECK: [[FLOOPEXITUNIFORM]]: +; CHECK: br label %[[FUNIFORM:.+]] + +; CHECK: [[FLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[D]]: +; CHECK: br label %[[WHILEBODY5]] + +; CHECK: [[E]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[FLOOPEXIT:.+]] + +; CHECK: [[FLOOPEXIT]]: +; CHECK: br label %[[FLOOPEXITELSE:.+]] + +; CHECK: [[FLOOPEXITELSE]]: +; CHECK: br label %[[FLOOPEXIT1:.+]] + +; CHECK: [[FLOOPEXIT1]]: +; CHECK: br label %[[F]] + +; CHECK: [[F]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll new file mode 100644 index 0000000000000..acd9dcba0bb7e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization3.ll @@ -0,0 +1,332 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; / \ +; / \ +; b c +; / \ / \ +; d e f g +; \ \ / / +; \ h / +; \ \ / +; \ i +; \ / +; j +; +; * where node a is a uniform branch, and nodes b and c are varying branches. +; * where nodes d, e, f, g, i and j are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; / \ +; / \ +; / \ +; b__ c________ +; / \ \___/_\___ \ +; d e f g `e' g' +; | \ / / | | +; j h / d' f' +; | \ / \ / +; | i h' +; | | | +; | `--> & <- i' +; | | +; `---> & <-- j' +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization3(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (n < 10) { // uniform +; if (id % 3 == 0) { // varying +; for (int i = 0; i < n - 1; i++) { ret /= 2; } goto end; +; } else { // varying +; for (int i = 0; i < n / 3; i++) { ret -= 2; } goto h; +; } +; } else { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n * 2; i++) { ret += 1; } goto h; +; } else { // varying +; for (int i = 0; i < n + 5; i++) { ret *= 2; } goto i; +; } +; } +; +; h: +; ret += 5; +; +; i: +; ret *= 10; +; +; end: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization3(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 10 + br i1 %cmp, label %if.then, label %if.else17 + +if.then: ; preds = %entry + %rem = srem i32 %conv, 3 + %cmp2 = icmp eq i32 %rem, 0 + br i1 %cmp2, label %if.then4, label %if.else + +if.then4: ; preds = %if.then + br label %for.cond + +for.cond: ; preds = %for.body, %if.then4 + %ret.0 = phi i32 [ 0, %if.then4 ], [ %div, %for.body ] + %storemerge4 = phi i32 [ 0, %if.then4 ], [ %inc, %for.body ] + %sub = add nsw i32 %n, -1 + %cmp5 = icmp slt i32 %storemerge4, %sub + br i1 %cmp5, label %for.body, label %end + +for.body: ; preds = %for.cond + %div = sdiv i32 %ret.0, 2 + %inc = add nsw i32 %storemerge4, 1 + br label %for.cond + +if.else: ; preds = %if.then + br label %for.cond8 + +for.cond8: ; preds = %for.body12, %if.else + %ret.1 = phi i32 [ 0, %if.else ], [ %sub13, %for.body12 ] + %storemerge3 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div9 = sdiv i32 %n, 3 + %cmp10 = icmp slt i32 %storemerge3, %div9 + br i1 %cmp10, label %for.body12, label %h + +for.body12: ; preds = %for.cond8 + %sub13 = add nsw i32 %ret.1, -2 + %inc15 = add nsw i32 %storemerge3, 1 + br label %for.cond8 + +if.else17: ; preds = %entry + %rem181 = and i32 %conv, 1 + %cmp19 = icmp eq i32 %rem181, 0 + br i1 %cmp19, label %if.then21, label %if.else30 + +if.then21: ; preds = %if.else17 + br label %for.cond23 + +for.cond23: ; preds = %for.body26, %if.then21 + %ret.2 = phi i32 [ 0, %if.then21 ], [ %add, %for.body26 ] + %storemerge2 = phi i32 [ 0, %if.then21 ], [ %inc28, %for.body26 ] + %mul = shl nsw i32 %n, 1 + %cmp24 = icmp slt i32 %storemerge2, %mul + br i1 %cmp24, label %for.body26, label %h + +for.body26: ; preds = %for.cond23 + %add = add nsw i32 %ret.2, 1 + %inc28 
= add nsw i32 %storemerge2, 1 + br label %for.cond23 + +if.else30: ; preds = %if.else17 + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.else30 + %ret.3 = phi i32 [ 0, %if.else30 ], [ %mul37, %for.body36 ] + %storemerge = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ] + %add33 = add nsw i32 %n, 5 + %cmp34 = icmp slt i32 %storemerge, %add33 + br i1 %cmp34, label %for.body36, label %i42 + +for.body36: ; preds = %for.cond32 + %mul37 = shl nsw i32 %ret.3, 1 + %inc39 = add nsw i32 %storemerge, 1 + br label %for.cond32 + +h: ; preds = %for.cond23, %for.cond8 + %ret.4 = phi i32 [ %ret.1, %for.cond8 ], [ %ret.2, %for.cond23 ] + %add41 = add nsw i32 %ret.4, 5 + br label %i42 + +i42: ; preds = %h, %for.cond32 + %ret.5 = phi i32 [ %add41, %h ], [ %ret.3, %for.cond32 ] + %mul43 = mul nsw i32 %ret.5, 10 + br label %end + +end: ; preds = %i42, %for.cond + %ret.6 = phi i32 [ %mul43, %i42 ], [ %ret.0, %for.cond ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization3, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization3 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE17:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERUNIFORM:.+]], label %[[IFTHENBOSCCINDIR:.+]] + +; CHECK: [[FORCOND8PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND8UNIFORM:.+]] + +; CHECK: [[FORCOND8UNIFORM]]: +; CHECK: %[[CMP10UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP10UNIFORM]], label %[[FORBODY12UNIFORM:.+]], label %[[HLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY12UNIFORM]]: +; CHECK: br label %[[FORCOND8UNIFORM]] + +; CHECK: [[HLOOPEXITUNIFORM]]: +; CHECK: br label %[[HUNIFORM:.+]] + +; CHECK: [[FORCONDPREHEADERUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[IFTHENBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND8PREHEADERUNIFORM]], label %[[FORCOND8PREHEADER:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: %[[CMP5UNIFORM:.+]] = icmp +; 
CHECK: br i1 %[[CMP5UNIFORM]], label %[[FORBODYUNIFORM:.+]], label %[[ENDLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[ENDLOOPEXITUNIFORM]]: +; CHECK: br label %[[END:.+]] + +; CHECK: [[FORCOND8PREHEADER]]: +; CHECK: br label %[[FORCOND8:.+]] + +; CHECK: [[FORCONDPREHEADER:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[EXITCOND:.+]] = icmp +; CHECK: br i1 %[[EXITCOND]], label %[[FORBODY:.+]], label %[[ENDLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND8]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND8]] + +; CHECK: [[IFELSE17]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND23PREHEADERUNIFORM:.+]], label %[[IFELSE17BOSCCINDIR:.+]] + +; CHECK: [[FORCOND32PREHEADERUNIFORM:.+]]: +; CHECK: br label %[[FORCOND32UNIFORM:.+]] + +; CHECK: [[FORCOND32UNIFORM]]: +; CHECK: br i1 false, label %[[FORBODY36UNIFORM:.+]], label %[[ENDLOOPEXIT2UNIFORM:.+]] + +; CHECK: [[FORBODY36UNIFORM]]: +; CHECK: br label %[[FORCOND32UNIFORM]] + +; CHECK: [[ENDLOOPEXIT2UNIFORM]]: +; CHECK: br label %[[END]] + +; CHECK: [[FORCOND23PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND23UNIFORM:.+]] + +; CHECK: [[IFELSE17BOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND32PREHEADERUNIFORM:.+]], label %[[FORCOND32PREHEADER:.+]] + +; CHECK: [[FORCOND23UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26UNIFORM:.+]], label %[[HLOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODY26UNIFORM]]: +; CHECK: br label %[[FORCOND23UNIFORM]] + +; CHECK: [[HLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[HUNIFORM]] + +; CHECK: [[HUNIFORM]]: +; CHECK: br label %[[END]] + +; CHECK: [[FORCOND32PREHEADER]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[FORCOND23PREHEADER:.+]]: +; CHECK: br label %[[FORCOND23:.+]] + +; CHECK: [[FORCOND23]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26:.+]], label %[[HLOOPEXIT1:.+]] + +; CHECK: [[FORBODY26]]: +; CHECK: br label %[[FORCOND23]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 false, label %[[FORBODY36:.+]], label %[[ENDLOOPEXIT2:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[FORCONDPREHEADER]] + +; CHECK: [[HLOOPEXIT1]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[H]]: +; CHECK: br label %[[I42:.+]] + +; CHECK: [[ENDLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[ENDLOOPEXIT2]]: +; CHECK: br label %[[FORCOND23PREHEADER]] + +; CHECK: [[END]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll new file mode 100644 index 0000000000000..5c6f686043c6f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization4.ll @@ -0,0 +1,219 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization4 -vecz-passes=cfg-convert,cleanup-divergence -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-. +; / \ | +; e c | +; | / \| +; | f d +; |/ +; g +; +; * where node b is a uniform branch, and node c is a varying branch. +; * where nodes f, d and g are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <-. b' <--. +; / \ | / \ | +; e c_|_ e' c' | +; | / \| \_|__ | | +; | f d | `d' -' +; |/ \ / +; g f' +; | | +; `---> & <-- g' +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization4(__global int *out, int n) { +; int id = get_global_id(0); +; +; int x = id / n; +; int y = id % n; +; int i = 0; +; for (;;) { +; if (n > 20) goto e; +; if (x + y > n) goto f; +; y++; +; x++; +; i++; +; } +; +; goto g; +; +; e: +; i *= 2 + n; +; goto g; +; +; f: +; i /= i + n; +; +; g: +; out[id] = x + y + i; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization4(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %0 = icmp eq i32 %conv, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %conv, %5 + %6 = icmp eq i32 %conv, -2147483648 + %7 = icmp eq i32 %n, -1 + %8 = and i1 %7, %6 + %9 = icmp eq i32 %n, 0 + %10 = or i1 %9, %8 + %11 = select i1 %10, i32 1, i32 %n + %rem = srem i32 %conv, %11 + br label %for.cond + +for.cond: ; preds = %if.end5, %entry + %x.0 = phi i32 [ %div, %entry ], [ %inc6, %if.end5 ] + %y.0 = phi i32 [ %rem, %entry ], [ %inc, %if.end5 ] + %storemerge = phi i32 [ 0, %entry ], [ %inc7, %if.end5 ] + %cmp = icmp sgt i32 %n, 20 + br i1 %cmp, label %e, label %if.end + +if.end: ; preds = %for.cond + %add = add nsw i32 %y.0, %x.0 + %cmp2 = icmp sgt i32 %add, %n + br i1 %cmp2, label %f, label %if.end5 + +if.end5: ; preds = %if.end + %inc = add nsw i32 %y.0, 1 + %inc6 = add nsw i32 %x.0, 1 + %inc7 = add nsw i32 %storemerge, 1 + br label %for.cond + +e: ; preds = %for.cond + %add8 = add nsw i32 %n, 2 + %mul = mul nsw i32 %storemerge, %add8 + br label %g + +f: ; preds = %if.end + %add9 = add nsw i32 %storemerge, %n + %12 = icmp eq i32 %add9, 0 + %13 = select i1 %12, i32 1, i32 %add9 + %div10 = sdiv i32 %storemerge, %13 + br label %g + +g: ; preds = %f, %e + %storemerge1 = phi i32 [ %div10, %f ], [ %mul, %e ] + %add11 = add i32 %y.0, %x.0 + %add12 = add i32 %add11, %storemerge1 + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %add12, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + 
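+; __mux_get_global_id above is the ComputeMux builtin behind OpenCL's
+; get_global_id; because its result differs per work-item, %conv and any
+; comparison derived from it form the "varying" branches that BOSCC guards,
+; while values computed only from %n stay uniform.
+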
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization4, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization4 +; CHECK: br i1 true, label %[[FORCONDUNIFORM:.+]], label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[E:.+]], label %[[IFEND:.+]] + +; CHECK: [[IFEND]]: +; CHECK: br label %[[IFEND5:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[EUNIFORM:.+]], label %[[IFENDUNIFORM:.+]] + +; CHECK: [[IFENDUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[FUNIFORM:.+]], label %[[IFENDUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFEND5UNIFORM:.+]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[FUNIFORM]]: +; CHECK: br label %[[GUNIFORM:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFEND5UNIFORM:.+]], label %[[IFENDUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFEND5]] + +; CHECK: [[EUNIFORM]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[IFEND5]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND]], label %[[FORCONDPUREEXIT:.+]] + +; CHECK: [[FORCONDPUREEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[E]]: +; CHECK: br label %[[FORCONDPUREEXIT]] + +; CHECK: [[EELSE:.+]]: +; CHECK: br label %[[G]] + +; CHECK: [[ESPLIT:.+]]: +; CHECK: br label %[[G]] + +; CHECK: [[F]]: +; CHECK: br label %[[FELSE:.+]] + +; CHECK: [[FELSE]]: +; CHECK: br i1 %{{.+}}, label %[[EELSE]], label %[[ESPLIT]] + +; CHECK: [[G]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll new file mode 100644 index 0000000000000..f7536ca9ad196 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization5.ll @@ -0,0 +1,264 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization5 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; |\ / \ +; | d e +; | \ / +; | f +; \ / +; g +; +; * where node c is a uniform branch, and nodes a and b are varying branches. +; * where nodes b, c, d, f, g are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a________ +; / \ \ +; b c c' +; |\_/_\__ / \ +; | d e \ | e' +; | \ / \ \ / +; | f \ b' +; \ / \| +; g d' +; | | +; | f' +; | | +; `--> & <- g' +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization5(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (id % 2 == 0) { // a +; if (id == 4) { // b +; goto g; +; } else { +; goto d; +; } +; } else { // c +; if (n % 2 == 0) { +; goto d; +; } else { +; goto e; +; } +; } +; +; d: +; for (int i = 0; i < n / 4; i++) { ret += i - 2; } +; goto f; +; +; e: +; for (int i = 0; i < n + 5; i++) { ret += i + 5; } +; +; f: +; ret *= ret % n; +; ret *= ret + 4; +; +; g: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization5(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %rem1 = and i32 %conv, 1 + %cmp = icmp eq i32 %rem1, 0 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + %cmp2 = icmp eq i32 %conv, 4 + br i1 %cmp2, label %g, label %d + +if.else5: ; preds = %entry + %rem62 = and i32 %n, 1 + %cmp7 = icmp eq i32 %rem62, 0 + br i1 %cmp7, label %d, label %e + +d: ; preds = %if.else5, %if.then + br label %for.cond + +for.cond: ; preds = %for.body, %d + %ret.0 = phi i32 [ 0, %d ], [ %add, %for.body ] + %storemerge3 = phi i32 [ 0, %d ], [ %inc, %for.body ] + %div = sdiv i32 %n, 4 + %cmp11 = icmp slt i32 %storemerge3, %div + br i1 %cmp11, label %for.body, label %f + +for.body: ; preds = %for.cond + %sub = add i32 %ret.0, -2 + %add = add i32 %sub, %storemerge3 + %inc = add nsw i32 %storemerge3, 1 + br label %for.cond + +e: ; preds = %if.else5 + br label %for.cond14 + +for.cond14: ; preds = %for.body18, %e + %ret.1 = phi i32 [ 0, %e ], [ %add20, %for.body18 ] + %storemerge = phi i32 [ 0, %e ], [ %inc22, %for.body18 ] + %add15 = add nsw i32 %n, 5 + %cmp16 = icmp slt i32 %storemerge, %add15 + br i1 %cmp16, label %for.body18, label %f + +for.body18: ; preds = %for.cond14 + %add19 = add i32 %ret.1, 5 + %add20 = add i32 %add19, %storemerge + %inc22 = add nsw i32 %storemerge, 1 + br label %for.cond14 + +f: ; preds = %for.cond14, %for.cond + %ret.2 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond14 ] + %0 = icmp eq i32 %ret.2, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 
+ %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %rem24 = srem i32 %ret.2, %5 + %mul = mul nsw i32 %rem24, %ret.2 + %add25 = add nsw i32 %mul, 4 + %mul26 = mul nsw i32 %add25, %mul + br label %g + +g: ; preds = %f, %if.then + %ret.3 = phi i32 [ %mul26, %f ], [ 0, %if.then ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization5, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization5 +; CHECK: br i1 %{{.+}}, label %[[IFTHENUNIFORM:.+]], label %[[ENTRYBOSCCINDIR:.+]] + +; CHECK: [[IFELSE5UNIFORM:.+]]: +; CHECK: %[[CMP7UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP7UNIFORM]], label %[[DUNIFORM:.+]], label %[[FORCOND14PREHEADERUNIFORM:.+]] + +; CHECK: [[FORCOND14PREHEADERUNIFORM]]: +; CHECK: br label %[[FORCOND14UNIFORM:.+]] + +; CHECK: [[FORCOND14UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY18UNIFORM:.+]], label %[[FLOOPEXIT1UNIFORM:.+]] + +; CHECK: [[FORBODY18UNIFORM]]: +; CHECK: br label %[[FORCOND14UNIFORM]] + +; CHECK: [[FLOOPEXIT1UNIFORM]]: +; CHECK: br label %[[FUNIFORM:.+]] + +; CHECK: [[IFTHENUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[GUNIFORM:.+]], label %[[IFTHENUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[ENTRYBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFELSE5UNIFORM]], label %[[IFELSE5:.+]] + +; CHECK: [[DUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: %[[CMP11UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP11UNIFORM]], label %[[FORBODYUNIFORM:.+]], label %[[FLOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[FLOOPEXITUNIFORM]]: +; CHECK: br label %[[FUNIFORM]] + +; CHECK: [[FUNIFORM]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[IFTHEN:.+]]: +; CHECK: br label %[[D:.+]] + +; CHECK: [[IFELSE5]]: +; CHECK: %[[CMP7:.+]] = icmp +; CHECK: br i1 %[[CMP7]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]] + +; CHECK: [[FORCOND14PREHEADER]]: +; CHECK: br label 
%[[FORCOND14:.+]] + +; CHECK: [[D]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[CMP11:.+]] = icmp +; CHECK: br i1 %[[CMP11]], label %[[FORBODY:.+]], label %[[FLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND14]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY18:.+]], label %[[FLOOPEXIT1:.+]] + +; CHECK: [[FORBODY18]]: +; CHECK: br label %[[FORCOND14]] + +; CHECK: [[FLOOPEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[FLOOPEXIT1]]: +; CHECK: br label %[[IFTHEN]] + +; CHECK: [[F]]: +; CHECK: br label %[[G]] + +; CHECK: [[G]]: +; CHECK: ret void + +; CHECK: [[IFTHENUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[DUNIFORM]], label %[[D]] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll new file mode 100644 index 0000000000000..f1b5f3582dd7a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization6.ll @@ -0,0 +1,228 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization6 -vecz-passes="function(simplifycfg),vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-. +; / \ | +; c d | +; / \ / | +; e f --' +; \ | +; \ g +; \| +; h +; +; * where nodes b and c are uniform branches, and node f is a varying +; branch. +; * where nodes g and h are divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <-. .---> b' <-. +; / \ | | / \ | +; c d | | c' d' | +; / \ / | | / \ / | +; e f --' | e' f' --' +; \ |\____' \ | +; \ g \ | +; \| \| +; h g' +; | | +; `---> & <-- h' +; +; where '&' represents merge blocks of BOSCC regions. 
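+;
+; The merge blocks '&' are where live values from the uniform clone rejoin
+; those from the linearized path; conceptually each merged value behaves
+; like a select on which region actually executed. A scalar sketch in C
+; (the helper and its parameters are assumptions for illustration):
+;
+;   /* Value flowing out of a BOSCC merge block. */
+;   int boscc_merge(_Bool ran_uniform, int uniform_val, int linearized_val) {
+;     return ran_uniform ? uniform_val : linearized_val;
+;   }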
+; +; __kernel void partial_linearization6(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n % 2 == 0) { +; if (n > 2) { +; goto e; +; } +; } else { +; ret += n + 1; +; } +; if (id == n) break; +; } +; +; ret += n * 2; +; ret /= n; +; goto early; +; +; e: +; ret += n * 4; +; ret -= n; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization6(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end10, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %ret.1, %if.end10 ] + %rem1 = and i32 %n, 1 + %cmp = icmp eq i32 %rem1, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + %cmp2 = icmp sgt i32 %n, 2 + br i1 %cmp2, label %e, label %if.end6 + +if.else: ; preds = %while.body + %add = add nsw i32 %n, 1 + %add5 = add nsw i32 %add, %ret.0 + br label %if.end6 + +if.end6: ; preds = %if.else, %if.then + %ret.1 = phi i32 [ %add5, %if.else ], [ %ret.0, %if.then ] + %cmp7 = icmp eq i32 %conv, %n + br i1 %cmp7, label %while.end, label %if.end10 + +if.end10: ; preds = %if.end6 + br label %while.body + +while.end: ; preds = %if.end6 + %mul = shl nsw i32 %n, 1 + %add11 = add nsw i32 %ret.1, %mul + %0 = icmp eq i32 %add11, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %add11, %5 + br label %early + +e: ; preds = %if.then + %mul12 = mul i32 %n, 4 + %n.neg = sub i32 0, %n + %add13 = add i32 %mul12, %n.neg + %sub = add i32 %add13, %ret.0 + br label %early + +early: ; preds = %e, %while.end + %storemerge = phi i32 [ %div, %while.end ], [ %sub, %e ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization6, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = 
!{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization6 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: %[[CMP2:.+]] = icmp +; CHECK: br i1 %[[CMP2]], label %[[E:.+]], label %[[IFEND6:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[IFEND6]] + +; CHECK: [[IFEND6]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[WHILEEND:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: %[[CMPUNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMPUNIFORM]], label %[[IFTHENUNIFORM:.+]], label %[[IFELSEUNIFORM:.+]] + +; CHECK: [[IFELSEUNIFORM]]: +; CHECK: br label %[[IFEND6UNIFORM:.+]] + +; CHECK: [[IFTHENUNIFORM]]: +; CHECK: %[[CMP2UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP2UNIFORM]], label %[[EUNIFORM:.+]], label %[[IFEND6EUNIFORM:.+]] + +; CHECK: [[IFEND6UNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEENDUNIFORM:.+]], label %[[IFEND6UNIFORMBOSCCINDIR:.+]] + +; CHECK: [[WHILEENDUNIFORM]]: +; CHECK: br label %[[EARLYUNIFORM:.+]] + +; CHECK: [[IFEND6UNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODYUNIFORM]], label %[[IFEND6UNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFEND6UNIFORMBOSCCSTORE]]: +; CHECK: br label %[[WHILEBODY]] + +; CHECK: [[EUNIFORM]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: br label %[[WHILEENDELSE:.+]] + +; CHECK: [[WHILEENDELSE]]: +; CHECK: br i1 %{{.+}}, label %[[EELSE:.+]], label %[[ESPLIT:.+]] + +; CHECK: [[E]]: +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[EELSE]]: +; CHECK: br label %[[EARLY]] + +; CHECK: [[ESPLIT]]: +; CHECK: br label %[[EARLY]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll new file mode 100644 index 0000000000000..ab42eddff1897 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization7.ll @@ -0,0 +1,262 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization7 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; / \ / \ +; d e f +; \ / \ / +; g h +; \ / +; i +; +; * where nodes a, c and e are uniform branches, and node b is a varying +; branch. +; * where nodes d, e, g and i are divergent. 
+; +; With BOSCC, it will be transformed as follows: +; +; a +; / \ +; / \ +; / \ +; / \ +; b____ c +; / \ \ / \ +; d e d'| | +; \ / \ \| | +; g h e' f +; \ / \ / +; i h' +; | | +; | g' +; | | +; | i' +; \ / +; \ / +; \ / +; & +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization7(__global int *out, int n) { +; int id = get_global_id(0); +; int i = 0; +; +; if (n > 10) { // a +; if (n + id > 10) { // b +; i = n * 10; // d +; goto g; +; } else { +; goto e; +; } +; } else { +; if (n < 5) { // c +; goto e; +; } else { +; for (int j = 0; j < n; j++) { i++; } +; goto h; +; } +; } +; +; e: +; if (n > 5) { +; goto g; +; } else { +; i = n * 3 / 5; +; goto h; +; } +; +; g: +; for (int j = 0; j < n; j++) { i++; } +; goto i; +; +; h: +; i = n + id / 3; +; +; i: +; out[id] = i; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization7(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %n, 10 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + %add = add nsw i32 %conv, %n + %cmp2 = icmp sgt i32 %add, 10 + br i1 %cmp2, label %if.then4, label %e + +if.then4: ; preds = %if.then + %mul = mul nsw i32 %n, 10 + br label %g + +if.else5: ; preds = %entry + %cmp6 = icmp slt i32 %n, 5 + br i1 %cmp6, label %e, label %if.else9 + +if.else9: ; preds = %if.else5 + br label %for.cond + +for.cond: ; preds = %for.body, %if.else9 + %storemerge = phi i32 [ 0, %if.else9 ], [ %inc12, %for.body ] + %cmp10 = icmp slt i32 %storemerge, %n + br i1 %cmp10, label %for.body, label %h + +for.body: ; preds = %for.cond + %inc12 = add nsw i32 %storemerge, 1 + br label %for.cond + +e: ; preds = %if.else5, %if.then + %cmp13 = icmp sgt i32 %n, 5 + br i1 %cmp13, label %g, label %h + +g: ; preds = %e, %if.then4 + %i.1 = phi i32 [ %mul, %if.then4 ], [ 0, %e ] + br label %for.cond19 + +for.cond19: ; preds = %for.body22, %g + %i.2 = phi i32 [ %i.1, %g ], [ %inc23, %for.body22 ] + %storemerge1 = phi i32 [ 0, %g ], [ %inc25, %for.body22 ] + %cmp20 = icmp slt i32 %storemerge1, %n + br i1 %cmp20, label %for.body22, label %i29 + +for.body22: ; preds = %for.cond19 + %inc23 = add nsw i32 %i.2, 1 + %inc25 = add nsw i32 %storemerge1, 1 + br label %for.cond19 + +h: ; preds = %e, %for.cond + %div27 = sdiv i32 %conv, 3 + %add28 = add nsw i32 %div27, %n + br label %i29 + +i29: ; preds = %h, %for.cond19 + %i.3 = phi i32 [ %add28, %h ], [ %i.2, %for.cond19 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %i.3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" 
"no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization7, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization7 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE5:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN4UNIFORM:.+]], label %[[IFTHENBOSCCINDIR:.+]] + +; CHECK: [[IFTHEN4UNIFORM]]: +; CHECK: br label %[[GUNIFORM:.+]] + +; CHECK: [[IFTHENBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[EUNIFORM:.+]], label %[[IFTHEN4:.+]] + +; CHECK: [[EUNIFORM]]: +; CHECK: %[[CMP13UNIFORM:.+]] = icmp +; CHECK: br i1 %[[CMP13UNIFORM]], label %[[GUNIFORM]], label %[[HUNIFORM:.+]] + +; CHECK: [[HUNIFORM]]: +; CHECK: br label %[[I29UNIFORM:.+]] + +; CHECK: [[GUNIFORM]]: +; CHECK: br label %[[FORCOND19UNIFORM:.+]] + +; CHECK: [[FORCOND19UNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22UNIFORM:.+]], label %[[I29LOOPEXITUNIFORM:.+]] + +; CHECK: [[FORBODY22UNIFORM]]: +; CHECK: br label %[[FORCOND19UNIFORM]] + +; CHECK: [[I29LOOPEXITUNIFORM]]: +; CHECK: br label %[[I29:.+]] + +; CHECK: [[IFTHEN4]]: +; CHECK: br label %[[E:.+]] + +; CHECK: [[IFELSE5]]: +; CHECK: %[[CMP6:.+]] = icmp +; CHECK: br i1 %[[CMP6]], label %[[E]], label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[E]]: +; CHECK: %[[CMP13:.+]] = icmp +; CHECK: br i1 %[[CMP13]], label %[[G:.+]], label %[[H:.+]] + +; CHECK: [[G]]: +; CHECK: br label %[[FORCOND19:.+]] + +; CHECK: [[FORCOND19]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22:.+]], label %[[I29LOOPEXIT:.+]] + +; CHECK: [[FORBODY22]]: +; CHECK: br label %[[FORCOND19]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[H]]: +; CHECK: br label %[[G]] + +; CHECK: [[I29LOOPEXIT]]: +; CHECK: br label %[[I29]] + +; CHECK: [[I29]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll new file mode 100644 index 0000000000000..1245dc2ca0c0e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization8.ll @@ -0,0 +1,220 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k partial_linearization8 -vecz-passes=cfg-convert -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s

+; The CFG of the following kernel is:
+;
+;    a
+;    |
+;    b <-.
+;   / \  |
+;  e   c |
+;  |  / \|
+;  | f   d
+;  |/
+;  g
+;
+; * where nodes b and c are varying branches.
+; * where nodes e, f, d and g are divergent.
+;
+; With BOSCC, it will be transformed as follows:
+;
+;      a
+;      |
+;      b <-.      b' <.
+;     / \__|__    |    |
+;    e   c_|__`c' |    |
+;    |  / \|     \|    |
+;    | f   d      d' -'
+;    |/           |
+;    g            f'
+;    |            |
+;    |            e'
+;    |            |
+;    `--> & <- g'
+;
+; where '&' represents merge blocks of BOSCC regions.
+;
+; __kernel void partial_linearization8(__global int *out, int n) {
+;   int id = get_global_id(0);
+;
+;   int x = id / n;
+;   int y = id % n;
+;   int i = 0;
+;   for (;;) {
+;     if (i + id > n) goto e;
+;     if (x + y > n) goto f;
+;     y++;
+;     x++;
+;     i++;
+;   }
+;
+;   goto g;
+;
+; e:
+;   i *= 2 + n;
+;   goto g;
+;
+; f:
+;   i /= i + n;
+;
+; g:
+;   out[id] = x + y + i;
+; }

+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+; Function Attrs: nounwind
+define spir_kernel void @partial_linearization8(i32 addrspace(1)* %out, i32 %n) #0 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0) #2
+  %conv = trunc i64 %call to i32
+  %0 = icmp eq i32 %conv, -2147483648
+  %1 = icmp eq i32 %n, -1
+  %2 = and i1 %1, %0
+  %3 = icmp eq i32 %n, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %n
+  %div = sdiv i32 %conv, %5
+  %6 = icmp eq i32 %conv, -2147483648
+  %7 = icmp eq i32 %n, -1
+  %8 = and i1 %7, %6
+  %9 = icmp eq i32 %n, 0
+  %10 = or i1 %9, %8
+  %11 = select i1 %10, i32 1, i32 %n
+  %rem = srem i32 %conv, %11
+  br label %for.cond

+for.cond:                                         ; preds = %if.end6, %entry
+  %x.0 = phi i32 [ %div, %entry ], [ %inc7, %if.end6 ]
+  %y.0 = phi i32 [ %rem, %entry ], [ %inc, %if.end6 ]
+  %storemerge = phi i32 [ 0, %entry ], [ %inc8, %if.end6 ]
+  %add = add nsw i32 %storemerge, %conv
+  %cmp = icmp sgt i32 %add, %n
+  br i1 %cmp, label %e, label %if.end

+if.end:                                           ; preds = %for.cond
+  %add2 = add nsw i32 %y.0, %x.0
+  %cmp3 = icmp sgt i32 %add2, %n
+  br i1 %cmp3, label %f, label %if.end6

+if.end6:                                          ; preds = %if.end
+  %inc = add nsw i32 %y.0, 1
+  %inc7 = add nsw i32 %x.0, 1
+  %inc8 = add nsw i32 %storemerge, 1
+  br label %for.cond

+e:                                                ; preds = %for.cond
+  %add9 = add nsw i32 %n, 2
+  %mul = mul nsw i32 %storemerge, %add9
+  br label %g

+f:                                                ; preds = %if.end
+  %add10 = add nsw i32 %storemerge, %n
+  %12 = icmp eq i32 %add10, 0
+  %13 = select i1 %12, i32 1, i32 %add10
+  %div11 = sdiv i32 %storemerge, %13
+  br label %g

+g:                                                ; preds = %f, %e
+  %storemerge1 = phi i32 [ %div11, %f ], [ %mul, %e ]
+  %add12 = add i32 %y.0, %x.0
+  %add13 = add i32 %add12, %storemerge1
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %add13, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}

+; Function Attrs: nounwind readonly
+declare i64 @__mux_get_global_id(i32) #1
+
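+; The icmp/and/or/select chains in the entry block above guard the divisor of
+; the sdiv and srem against the two trapping cases of signed division. A rough
+; C sketch of the guard (variable names are illustrative only):
+;
+;   /* divisor becomes 1 when n == 0, or when id == INT_MIN and n == -1 */
+;   int d = (n == 0 || (id == INT_MIN && n == -1)) ? 1 : n;
+;   int x = id / d;
+;   int y = id % d;
+;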
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization8, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization8 +; CHECK: br i1 true, label %[[FORCONDUNIFORM:.+]], label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br label %[[IFEND:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[EUNIFORM:.+]], label %[[FORCONDUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFENDUNIFORM:.+]]: +; CHECK: br i1 %{{.+}}, label %[[FUNIFORM:.+]], label %[[IFENDUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[IFEND6UNIFORM:.+]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[FUNIFORM]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFEND6UNIFORM]], label %[[IFENDUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[IFENDUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFEND6:.+]] + +; CHECK: [[EUNIFORM]]: +; CHECK: br label %[[G]] + +; CHECK: [[FORCONDUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[IFENDUNIFORM]], label %[[FORCONDUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[FORCONDUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[IFEND]] + +; CHECK: [[IFEND]]: +; CHECK: br label %[[IFEND6]] + +; CHECK: [[IFEND6]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND]], label %[[FORCONDPUREEXIT:.+]] + +; CHECK: [[FORCONDPUREEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[E:.+]]: +; CHECK: br label %[[G]] + +; CHECK: [[F]]: +; CHECK: br label %[[FELSE:.+]] + +; CHECK: [[FELSE]]: +; CHECK: br label %[[E]] + +; CHECK: [[G]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll new file mode 100644 index 0000000000000..43bb9c44eb492 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/partial_linearization9.ll @@ -0,0 +1,173 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization9 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -vecz-choices=LinearizeBOSCC -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <--. +; | | +; c <. | +; | | | +; d -' | +; | | +; e ---' +; | +; f +; +; * where node e is a varying branch. +; * where node f is divergent. +; +; With BOSCC, it will be transformed as follows: +; +; a +; | +; b <--. .> b' <--. +; | | | | | +; c <. | | c' <. | +; | | | | | | | +; d -' | | d' -' | +; | | | | | +; e ---' | e' ---' +; |\_____' | +; f f' +; \ / +; \ / +; \ / +; \ / +; & +; +; where '&' represents merge blocks of BOSCC regions. +; +; __kernel void partial_linearization9(__global int *out, int n) { +; int id = get_global_id(0); +; int i = 0; +; +; while (1) { +; int j = 0; +; for (; ; i++) { +; if (j++ > n) break; +; } +; if (i++ + id > n) break; +; } +; +; out[id] = i; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization9(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end7, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc3, %if.end7 ] + br label %for.cond + +for.cond: ; preds = %for.inc, %while.body + %i.1 = phi i32 [ %i.0, %while.body ], [ %inc3, %for.inc ] + %j.0 = phi i32 [ 0, %while.body ], [ %inc, %for.inc ] + %cmp = icmp sgt i32 %j.0, %n + %inc3 = add nsw i32 %i.1, 1 + br i1 %cmp, label %for.end, label %for.inc + +for.inc: ; preds = %for.cond + %inc = add nsw i32 %j.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %add = add nsw i32 %i.1, %conv + %cmp4 = icmp sgt i32 %add, %n + br i1 %cmp4, label %while.end, label %if.end7 + +if.end7: ; preds = %for.end + br label %while.body + +while.end: ; preds = %for.end + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %inc3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" 
"stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization9, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization9 +; CHECK: br i1 true, label %[[WHILEBODYUNIFORM:.+]], label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[FOREND:.+]], label %[[FORINC:.+]] + +; CHECK: [[FORINC]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[WHILEEND:.+]] + +; CHECK: [[WHILEBODYUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM:.+]] + +; CHECK: [[FORCONDUNIFORM]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[FORENDUNIFORM:.+]], label %[[FORINCUNIFORM:.+]] + +; CHECK: [[FORINCUNIFORM]]: +; CHECK: br label %[[FORCONDUNIFORM]] + +; CHECK: [[FORENDUNIFORM]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEEND]], label %[[FORENDUNIFORMBOSCCINDIR:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: ret void + +; CHECK: [[FORENDUNIFORMBOSCCINDIR]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODYUNIFORM]], label %[[FORENDUNIFORMBOSCCSTORE:.+]] + +; CHECK: [[FORENDUNIFORMBOSCCSTORE]]: +; CHECK: br label %[[WHILEBODY]] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll new file mode 100644 index 0000000000000..83976b565214c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/Boscc/scalable_linearization.ll @@ -0,0 +1,25 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Check that we don't crash when costing a scalable reduction +; RUN: veczc -vecz-scalable -vecz-passes="pre-linearize" -vecz-choices=LinearizeBOSCC -S < %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @boscc_merge() { + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll new file mode 100644 index 0000000000000..a7d72cd259d0d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store.ll @@ -0,0 +1,82 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* 
%arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"} +!4 = !{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"} +!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""} +!6 = !{!"clang version 3.8.1 "} + +; Test if the interleaved store is defined correctly +; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}}) +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer +; CHECK: %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> +; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) #[[ATTRS:[0-9]+]] +; CHECK: ret void + +; CHECK: attributes #[[ATTRS]] = { diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll new file mode 100644 index 0000000000000..1f70bde790233 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/define_interleaved_store_as_masked.ll @@ -0,0 +1,82 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x 
double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"} +!4 = !{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"} +!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""} +!6 = !{!"clang version 3.8.1 "} + +; Test if the interleaved store is defined correctly +; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}}) +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer +; CHECK: %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> +; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) #[[ATTRS:[0-9]+]] +; CHECK: ret void + +; CHECK: attributes #[[ATTRS]] = { diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll new file mode 100644 index 0000000000000..7d9b0385dbb90 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_uniform.ll @@ -0,0 +1,87 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k vector_loop -vecz-simd-width=4 -S < %s | FileCheck %s

+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+; Function Attrs: nounwind
+define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %cmp = icmp eq i64 %call, 0
+  br i1 %cmp, label %for.end, label %for.cond

+for.cond:                                         ; preds = %entry, %for.body
+  %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ]
+  %call1 = call i64 @__mux_get_global_size(i32 0)
+  %conv = trunc i64 %call1 to i32
+  %splat.splatinsert = insertelement <4 x i32> poison, i32 %conv, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
+  %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat
+  %0 = extractelement <4 x i1> %cmp2, i64 0
+  br i1 %0, label %for.body, label %for.end

+for.body:                                         ; preds = %for.cond
+  %1 = extractelement <4 x i32> %storemerge, i64 0
+  %idxprom = sext i32 %1 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %3 = extractelement <4 x i32> %storemerge, i64 0
+  %idxprom3 = sext i32 %3 to i64
+  %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom3
+  store i32 %2, i32 addrspace(1)* %arrayidx4, align 4
+  %4 = extractelement <4 x i32> %storemerge, i64 1
+  %idxprom5 = sext i32 %4 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom5
+  %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4
+  %6 = extractelement <4 x i32> %storemerge, i64 1
+  %idxprom7 = sext i32 %6 to i64
+  %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom7
+  store i32 %5, i32 addrspace(1)* %arrayidx8, align 4
+  %7 = extractelement <4 x i32> %storemerge, i64 2
+  %idxprom9 = sext i32 %7 to i64
+  %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9
+  %8 = load i32, i32 addrspace(1)* %arrayidx10, align 4
+  %9 = extractelement <4 x i32> %storemerge, i64 2
+  %idxprom11 = sext i32 %9 to i64
+  %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11
+  store i32 %8, i32 addrspace(1)* %arrayidx12, align 4
+  %10 = extractelement <4 x i32> %storemerge, i64 3
+  %idxprom13 = sext i32 %10 to i64
+  %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom13
+  %11 = load i32, i32 addrspace(1)* %arrayidx14, align 4
+  %12 = extractelement <4 x i32> %storemerge, i64 3
+  %idxprom15 = sext i32 %12 to i64
+  %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom15
+  store i32 %11, i32 addrspace(1)* %arrayidx16, align 4
+  %inc = add <4 x i32> %storemerge, <i32 1, i32 1, i32 1, i32 1>
+  br label %for.cond

+for.end:                                          ; preds = %entry, %for.cond
+  ret void
+}

+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_global_size(i32)

+; This test checks that a uniform <4 x i32> phi is not scalarized
+; CHECK: define spir_kernel void @__vecz_v4_vector_loop
+; CHECK: %[[STOREMERGE:.+]] = phi <4 x i32> [ %[[INC:.+]], %for.body ], [ zeroinitializer, %entry.ROSCC ]
+; CHECK: %[[INC]] = add <4 x i32> %storemerge, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll new file mode 100644 index 0000000000000..692a8cc7ecc5a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/PartialScalarization/vector_phi_varying.ll @@ -0,0 +1,94 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k vector_loop -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @vector_loop(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %in2, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %initaddr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in2, i64 %call + %init = load <4 x i32>, <4 x i32> addrspace(1)* %initaddr + %cmp = icmp eq i64 %call, 0 + br i1 %cmp, label %for.end, label %for.cond + +for.cond: ; preds = %entry, %for.body + %storemerge = phi <4 x i32> [ %inc, %for.body ], [ %init, %entry ] + %call1 = call i64 @__mux_get_global_size(i32 0) + %conv = trunc i64 %call1 to i32 + %0 = extractelement <4 x i32> %storemerge, i64 0 + %cmp2 = icmp slt i32 %0, %conv + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %3 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom3 = sext i32 %3 to i64 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom3 + store i32 %2, i32 addrspace(1)* %arrayidx4, align 4 + %4 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom5 = sext i32 %4 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom5 + %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %6 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom7 = sext i32 %6 to i64 + %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom7 + store i32 %5, i32 addrspace(1)* %arrayidx8, align 4 + %7 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom9 = sext i32 %7 to i64 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9 + %8 = load i32, i32 addrspace(1)* %arrayidx10, align 4 + %9 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom11 = sext i32 %9 to i64 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 %8, i32 addrspace(1)* %arrayidx12, align 4 + %10 = extractelement <4 x i32> %storemerge, i64 3 + %idxprom13 = sext i32 %10 to i64 + %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom13 + %11 = load i32, i32 addrspace(1)* %arrayidx14, align 
4
+  %12 = extractelement <4 x i32> %storemerge, i64 3
+  %idxprom15 = sext i32 %12 to i64
+  %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom15
+  store i32 %11, i32 addrspace(1)* %arrayidx16, align 4
+  %inc = add <4 x i32> %storemerge, <i32 1, i32 1, i32 1, i32 1>
+  br label %for.cond

+for.end:                                          ; preds = %entry, %for.cond
+  ret void
+}

+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_global_size(i32)

+; This test checks that a varying <4 x i32> phi gets scalarized
+; when it is only accessed through individually extracted elements.
+; CHECK: define spir_kernel void @__vecz_v4_vector_loop
+; CHECK: %storemerge{{[0-9]+}} = phi <4 x i32> [ %{{[0-9]+}}, %entry.ROSCC ], [ %inc{{[0-9]+}}, %for.cond ]
+; CHECK: %storemerge{{[0-9]+}} = phi <4 x i32> [ %{{[0-9]+}}, %entry.ROSCC ], [ %inc{{[0-9]+}}, %for.cond ]
+; CHECK: %storemerge{{[0-9]+}} = phi <4 x i32> [ %{{[0-9]+}}, %entry.ROSCC ], [ %inc{{[0-9]+}}, %for.cond ]
+; CHECK: %storemerge{{[0-9]+}} = phi <4 x i32> [ %{{[0-9]+}}, %entry.ROSCC ], [ %inc{{[0-9]+}}, %for.cond ]
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: %inc{{[0-9]+}} = add <4 x i32> %storemerge{{[0-9]+}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
new file mode 100644
index 0000000000000..36b06c64063d8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/broadcast_vector.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s

+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

+declare i64 @__mux_get_global_id(i32)

+define dso_local spir_kernel void @vector_broadcast_const(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fadd <4 x float> %1, <float 0x7FF8000020000000, float 0x7FF8000020000000, float 0x7FF8000020000000, float 0x7FF8000020000000>
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}

+define dso_local spir_kernel void @vector_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fadd <4 x float> %1, %addend
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}

+define dso_local spir_kernel void @vector_broadcast_illegal(<32 x float> addrspace(1)* nocapture readonly %in, <32 x float> %addend, <32 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <32 x float> addrspace(1)* %arrayidx to <32 x float> addrspace(1)*
+  %1 = load <32 x float>, <32 x float> addrspace(1)* %0, align 64
+  %2 = fadd <32 x float> %1, %addend
+  %arrayidx3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %out, i64 %call
+  store <32 x float> %2, <32 x float> addrspace(1)* %arrayidx3, align 64
+  ret void
+}

+define dso_local spir_kernel void @vector_broadcast_regression(<4 x float> addrspace(1)* nocapture readonly %in, i32 %nancode, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x i32> addrspace(1)*
+  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %0, align 16
+  %and1.i.i.i1.i = and <4 x i32> %1, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+  %cmp.i.i.i2.i = icmp ne <4 x i32> %and1.i.i.i1.i, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+  %and2.i.i.i3.i = and <4 x i32> %1, <i32 8388607, i32 8388607, i32 8388607, i32 8388607>
+  %cmp3.i.i.i4.i = icmp eq <4 x i32> %and2.i.i.i3.i, zeroinitializer
+  %2 = or <4 x i1> %cmp.i.i.i2.i, %cmp3.i.i.i4.i
+  %3 = bitcast <4 x i32> %1 to <4 x float>
+  %4 = select <4 x i1> %2, <4 x float> %3, <4 x float> <float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000>
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %4, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}

+; Check that new instructions aren't inserted before pre-existing allocas
+define dso_local spir_kernel 
void @vector_broadcast_insertpt(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, i32 %nancode, <4 x float> addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) local_unnamed_addr #0 { +entry: + %existing.alloc = alloca <4 x i32> + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + store <4 x i32> zeroinitializer, <4 x i32>* %existing.alloc + %scalar = bitcast <4 x i32>* %existing.alloc to i32* + store i32 1, i32* %scalar + %v = load <4 x i32>, <4 x i32>* %existing.alloc + %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out2, i64 %call + store <4 x i32> %v, <4 x i32> addrspace(1)* %arrayidx4, align 16 + + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %op = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %v4 = fadd <4 x float> %op, %addend + %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call + store <4 x float> %v4, <4 x float> addrspace(1)* %arrayidx3, align 16 + ret void +} + +define dso_local spir_kernel void @vector_mask_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x i1> %input, <4 x float> %woof, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)* + %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16 + %2 = fcmp oeq <4 x float> %1, + %3 = and <4 x i1> %2, %input + %4 = select <4 x i1> %3, <4 x float> %1, <4 x float> %woof + %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call + store <4 x float> %4, <4 x float> addrspace(1)* %arrayidx3, align 16 + ret void +} +; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_const( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]] +; CHECK-NEXT: store splat (float 0x7FF8000020000000), ptr addrspace(1) [[ARRAYIDX3]], align 16 +; CHECK-NEXT: ret void +; +; CHECK-LABEL: @__vecz_nxv4_vector_broadcast( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[VS2:%.*]] = call @llvm.vector.insert.nxv16f32.v4f32( poison, <4 x float> [[ADDEND:%.*]], i64 0) +; CHECK-NEXT: [[IDX0:%.*]] = call @llvm.stepvector.nxv16i32() +; CHECK-NEXT: [[VS1:%.*]] = and [[IDX0]], splat (i32 3) +; CHECK-NEXT: [[XLEN:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP0:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.riscv.vrgather.vv.nxv16f32.i64( poison, [[VS2]], [[VS1]], i64 [[TMP0]]) +; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]] +; CHECK-NEXT: [[TMP3:%.*]] = load , ptr addrspace(1) [[ARRAYIDX]], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = fadd [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]] +; CHECK-NEXT: store [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16 +; CHECK-NEXT: ret void +; +; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_illegal( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[FIXLEN_ALLOC:%.*]] = alloca <32 x float>, align 128 +; CHECK-NEXT: store <32 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC]], align 128 +; CHECK-NEXT: [[IDX0:%.*]] = call 
@llvm.stepvector.nxv128i32() +; CHECK-NEXT: [[IDX1:%.*]] = and [[IDX0]], splat (i32 31) +; CHECK-NEXT: [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} [[IDX1]] to +; CHECK-NEXT: [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.masked.gather.nxv128f32.nxv128p0( [[VEC_ALLOC]], i32 4, splat (i1 true), poison) +; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <32 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]] +; CHECK-NEXT: [[TMP3:%.*]] = load , ptr addrspace(1) [[ARRAYIDX]], align 64 +; CHECK-NEXT: [[TMP4:%.*]] = fadd [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <32 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]] +; CHECK-NEXT: store [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 64 +; CHECK-NEXT: ret void +; +; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_regression( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]] +; CHECK-NEXT: [[TMP1:%.*]] = load , ptr addrspace(1) [[ARRAYIDX]], align 16 +; CHECK-NEXT: [[AND1_I_I_I1_I1:%.*]] = and [[TMP1]], splat (i32 2139095040) +; CHECK-NEXT: [[CMP_I_I_I2_I2:%.*]] = icmp ne [[AND1_I_I_I1_I1]], splat (i32 2139095040) +; CHECK-NEXT: [[AND2_I_I_I3_I3:%.*]] = and [[TMP1]], splat (i32 8388607) +; CHECK-NEXT: [[CMP3_I_I_I4_I4:%.*]] = icmp eq [[AND2_I_I_I3_I3]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = or [[CMP_I_I_I2_I2]], [[CMP3_I_I_I4_I4]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP1]] to +; CHECK-NEXT: [[TMP4:%.*]] = select [[TMP2]], [[TMP3]], splat (float 0x7FF0000020000000) +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]] +; CHECK-NEXT: store [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16 +; CHECK-NEXT: ret void +; +; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_insertpt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[VS22:%.*]] = call @llvm.vector.insert.nxv16f32.v4f32( poison, <4 x float> [[ADDEND]], i64 0) +; CHECK-NEXT: [[IDX03:%.*]] = call @llvm.stepvector.nxv16i32() +; CHECK-NEXT: [[VS14:%.*]] = and [[IDX03]], splat (i32 3) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl {{(nuw )?}}i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.riscv.vrgather.vv.nxv16f32.i64( poison, [[VS22]], [[VS14]], i64 [[TMP1]]) +; CHECK-NEXT: [[VS2:%.*]] = call @llvm.vector.insert.nxv16i32.v4i32( poison, <4 x i32> , i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.riscv.vrgather.vv.nxv16i32.i64( poison, [[VS2]], [[VS14]], i64 [[TMP1]]) +; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]] +; CHECK-NEXT: store [[TMP3]], ptr addrspace(1) [[ARRAYIDX4]], align 16 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN]], i64 [[CALL]] +; CHECK-NEXT: [[TMP4:%.*]] = load , ptr addrspace(1) [[ARRAYIDX]], align 16 +; CHECK-NEXT: [[V45:%.*]] = fadd [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT]], i64 [[CALL]] +; CHECK-NEXT: store [[V45]], ptr addrspace(1) [[ARRAYIDX3]], align 16 +; CHECK-NEXT: ret void + +; CHECK-LABEL: @__vecz_nxv4_vector_mask_broadcast( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[VS21:%.*]] = call @llvm.vector.insert.nxv16f32.v4f32( poison, <4 x float> 
[[WOOF:%.*]], i64 0) +; CHECK-NEXT: [[IDX02:%.*]] = call @llvm.stepvector.nxv16i32() +; CHECK-NEXT: [[VS13:%.*]] = and [[IDX02]], splat (i32 3) +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl {{(nuw )?}}i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.riscv.vrgather.vv.nxv16f32.i64( poison, [[VS21]], [[VS13]], i64 [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i1> [[INPUT:%.*]] to <4 x i8> +; CHECK-NEXT: [[VS2:%.*]] = call @llvm.vector.insert.nxv16i8.v4i8( poison, <4 x i8> [[TMP3]], i64 0) +; CHECK-NEXT: [[IDX0:%.*]] = call @llvm.stepvector.nxv16i16() +; CHECK-NEXT: [[VS1:%.*]] = and [[IDX0]], splat (i16 3) +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.riscv.vrgatherei16.vv.nxv16i8.i64( poison, [[VS2]], [[VS1]], i64 [[TMP1]]) +; CHECK-NEXT: [[TMP5:%.*]] = trunc [[TMP4]] to +; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN]], i64 [[CALL]] +; CHECK-NEXT: [[TMP6:%.*]] = load , ptr addrspace(1) [[ARRAYIDX]], align 16 +; CHECK-NEXT: [[TMP7:%.*]] = fcmp oeq [[TMP6]], splat (float 1.000000e+00) +; CHECK-NEXT: [[TMP8:%.*]] = and [[TMP7]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = select [[TMP8]], [[TMP6]], [[TMP2]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT]], i64 [[CALL]] +; CHECK-NEXT: store [[TMP9]], ptr addrspace(1) [[ARRAYIDX3]], align 16 +; CHECK-NEXT: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll new file mode 100644 index 0000000000000..2895d1848afea --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans.ll @@ -0,0 +1,182 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dummy -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "riscv64-unknown-unknown" + +define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { + ; Dummy uses of the builtins, as we don't define any with zero uses. 
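+  ; The u5nxv4j / u5nxv4f suffixes mangle the scalable operand type,
+  ; <vscale x 4 x i32> and <vscale x 4 x float> respectively. As a reference
+  ; for the checks below, over lanes [a, b, c, d] an add scan computes:
+  ;   inclusive: [a, a+b, a+b+c, a+b+c+d]
+  ;   exclusive: [0, a,   a+b,   a+b+c]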
+  %a = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %b = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %c = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %d = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %e = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %f = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %g = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %h = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %i = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j(<vscale x 4 x i32> zeroinitializer)
+  %j = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %k = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %l = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  %m = call <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f(<vscale x 4 x float> zeroinitializer)
+  ret void
+}

+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j({{.*}}) {
+; CHECK: entry:
+; CHECK: %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK: %[[SCALE:.+]] = call i32 @llvm.vscale.i32()
+; CHECK: %[[SIZE:.+]] = mul {{(nuw )?}}i32 %[[SCALE]], 4
+; CHECK: br label %loop
+; CHECK: loop:
+; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK: %[[MASKPHI:.+]] = phi <vscale x 4 x i32> [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK: %[[N_INS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK: %[[N_SPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[N_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: %[[MASK:.+]] = xor <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]

+;------- target-dependent dynamic shuffle code:
+; CHECK: %[[VLSCALE:.+]] = call i64 @llvm.vscale.i64()
+; CHECK: %[[VL:.+]] = mul {{(nuw )?}}i64 %[[VLSCALE]], 4
+; CHECK: %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> poison, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])

+; CHECK: %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
+; CHECK: %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK: %[[WHICH:.+]] = icmp ne <vscale x 4 x i32> %[[BIT]], zeroinitializer
+; CHECK: %[[NEWVEC]] = select <vscale x 4 x i1> %[[WHICH]], <vscale x 4 x i32> %[[ACCUM]], <vscale x 4 x i32> %[[VEC]]
+; CHECK: %[[NEWMASK]] = or <vscale x 4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %[[SIZE]]
+; CHECK: br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK: %[[RESULT:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]
+; CHECK: ret <vscale x 4 x i32> %[[RESULT]]
+; CHECK: }

+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j({{.*}}) {
+; CHECK: entry:
+; CHECK: %[[STEP:.+]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+; CHECK: %[[SCALE:.+]] = call i32 @llvm.vscale.i32()
+; CHECK: %[[SIZE:.+]] = mul {{(nuw )?}}i32 %[[SCALE]], 4
+; CHECK: br label %loop
+; CHECK: loop:
+; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK: %[[MASKPHI:.+]] = phi <vscale x 4 x i32> [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK: %[[N_INS:.+]] = insertelement <vscale x 4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK: %[[N_SPLAT:.+]] = shufflevector <vscale x 4 x i32> %[[N_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: %[[MASK:.+]] = xor <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]

+;------- target-dependent dynamic shuffle code:
+; CHECK: %[[VLSCALE:.+]] = call i64 @llvm.vscale.i64()
+; CHECK: %[[VL:.+]] = mul {{(nuw )?}}i64 %[[VLSCALE]], 4
+; CHECK: %[[SHUFFLE:.+]] = call <vscale x 4 x i32> @llvm.riscv.vrgather.vv.nxv4i32.i64({{(<vscale x 4 x i32> poison, )?}}<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %[[MASK]], i64 %[[VL]])

+; CHECK: %[[ACCUM:.+]] = add <vscale x 4 x i32> %[[VEC]], %{{.+}}
+; CHECK: %[[BIT:.+]] = and <vscale x 4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK: %[[WHICH:.+]] = icmp ne <vscale x 4 x i32> %[[BIT]], zeroinitializer
+; CHECK: %[[NEWVEC]] = select <vscale x 4 x i1> %[[WHICH]], <vscale x 4 x i32> %[[ACCUM]], <vscale x 4 x i32> %[[VEC]]
+; CHECK: %[[NEWMASK]] = or <vscale x 4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %[[SIZE]]
+; CHECK: br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK: %[[SCAN:.+]] = phi <vscale x 4 x i32> [ %[[NEWVEC]], %loop ]

+;------- target-dependent slide-up code:
+; CHECK: %[[VLSCALE2:.+]] = call i64 @llvm.vscale.i64()
+; CHECK: %[[VL2:.+]] = mul {{(nuw )?}}i64 %[[VLSCALE2]], 4
+; CHECK: %[[RESULT:.+]] = call <vscale x 4 x i32> @llvm.riscv.vslide1up.nxv4i32.i32.i64({{(<vscale x 4 x i32> poison, )?}}<vscale x 4 x i32> %[[SCAN]], i32 0, i64 %[[VL2]])

+; CHECK: ret <vscale x 4 x i32> %[[RESULT]]
+; CHECK: }

+; We know the generated code is correct for one scan type,
+; now verify that all the others use the correct binary operations.

+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}

+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}

+; Make sure the floating point version of the slide1up intrinsic is created
+; CHECK: call <vscale x 4 x float> @llvm.riscv.vfslide1up.nxv4f32.f32.i64({{(<vscale x 4 x float> poison, )?}}<vscale x 4 x float> %{{.+}}, float 0.000000e+00, i64 %{{.+}})

+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})

+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})

+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})

+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})

+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j(<vscale x 4 x i32>)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})

+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})

+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})

+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})

+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f(<vscale x 4 x float>)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f({{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
new file mode 100644
index 0000000000000..fbd4bcf657f63
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/define_subgroup_scans_vp.ll
@@ -0,0 +1,183 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k dummy -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s

+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "riscv64-unknown-unknown"

+define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) {
+  ; Dummy uses of the builtins, as we don't define any with zero uses. 
+ %a = call @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj( zeroinitializer, i32 0) + %b = call @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj( zeroinitializer, i32 0) + %c = call @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj( zeroinitializer, i32 0) + %d = call @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj( zeroinitializer, i32 0) + %e = call @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj( zeroinitializer, i32 0) + %f = call @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj( zeroinitializer, i32 0) + %g = call @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj( zeroinitializer, i32 0) + %h = call @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj( zeroinitializer, i32 0) + %i = call @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj( zeroinitializer, i32 0) + %j = call @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj( zeroinitializer, i32 0) + %k = call @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj( zeroinitializer, i32 0) + %l = call @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj( zeroinitializer, i32 0) + %m = call @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj( zeroinitializer, i32 0) + %n = call @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4dj( zeroinitializer, i32 0) + ret void +} + +declare @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj({{.*}}, i32{{.*}}) { +; CHECK: entry: +; CHECK: %[[STEP:.+]] = call @llvm.stepvector.nxv4i32() +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ] +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ] +; CHECK: %[[MASKPHI:.+]] = phi [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ] +; CHECK: %[[N_INS:.+]] = insertelement poison, i32 %[[IV]], {{i32|i64}} 0 +; CHECK: %[[N_SPLAT:.+]] = shufflevector %[[N_INS]], poison, zeroinitializer +; CHECK: %[[MASK:.+]] = xor %[[MASKPHI]], %[[N_SPLAT]] + +;------- target-dependent dynamic shuffle code: +; CHECK: %[[VL:.+]] = zext i32 %1 to i64 +; CHECK: %[[SHUFFLE:.+]] = call @llvm.riscv.vrgather.vv.nxv4i32.i64({{( poison, )?}} %[[VEC]], %[[MASK]], i64 %[[VL]]) + +; CHECK: %[[ACCUM:.+]] = add %[[VEC]], %[[SHUFFLE]] +; CHECK: %[[BIT:.+]] = and %[[MASKPHI]], %[[N_SPLAT]] +; CHECK: %[[WHICH:.+]] = icmp ne %[[BIT]], zeroinitializer +; CHECK: %[[NEWVEC]] = select %[[WHICH]], %[[ACCUM]], %[[VEC]] +; CHECK: %[[NEWMASK]] = or %[[MASK]], %[[N_SPLAT]] +; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1 +; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %1 +; CHECK: br i1 %[[CMP]], label %loop, label %exit +; CHECK: exit: +; CHECK: %[[RESULT:.+]] = phi [ %[[NEWVEC]], %loop ] +; CHECK: ret %[[RESULT]] +; CHECK: } + +declare @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj({{.*}}, i32{{.*}}) { +; CHECK: entry: +; CHECK: %[[STEP:.+]] = call @llvm.stepvector.nxv4i32() +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ] +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ] +; CHECK: %[[MASKPHI:.+]] = phi [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ] +; CHECK: %[[N_INS:.+]] = insertelement poison, i32 %[[IV]], {{i32|i64}} 0 +; CHECK: %[[N_SPLAT:.+]] = shufflevector %[[N_INS]], poison, zeroinitializer +; CHECK: %[[MASK:.+]] = xor %[[MASKPHI]], %[[N_SPLAT]] + +;------- target-dependent dynamic shuffle code: +; CHECK: %[[VL:.+]] = zext i32 %1 to i64 +; CHECK: %[[SHUFFLE:.+]] = call 
@llvm.riscv.vrgather.vv.nxv4i32.i64({{( poison, )?}} %[[VEC]], %[[MASK]], i64 %[[VL]]) + +; CHECK: %[[ACCUM:.+]] = add %[[VEC]], %{{.+}} +; CHECK: %[[BIT:.+]] = and %[[MASKPHI]], %[[N_SPLAT]] +; CHECK: %[[WHICH:.+]] = icmp ne %[[BIT]], zeroinitializer +; CHECK: %[[NEWVEC]] = select %[[WHICH]], %[[ACCUM]], %[[VEC]] +; CHECK: %[[NEWMASK]] = or %[[MASK]], %[[N_SPLAT]] +; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1 +; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %1 +; CHECK: br i1 %[[CMP]], label %loop, label %exit +; CHECK: exit: +; CHECK: %[[SCAN:.+]] = phi [ %[[NEWVEC]], %loop ] + +;------- target-dependent slide-up code: +; CHECK: %[[VL2:.+]] = zext i32 %1 to i64 +; CHECK: %[[RESULT:.+]] = call @llvm.riscv.vslide1up.nxv4i32.i32.i64({{( poison, )?}} %[[SCAN]], i32 0, i64 %[[VL2]]) + +; CHECK: ret %[[RESULT]] +; CHECK: } + +; We know the generated code is correct for one scan type, +; now verify that all the others use the correct binary operations. + +declare @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = fadd %[[VEC]], %{{.+}} + +declare @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = fadd %[[VEC]], %{{.+}} + +; Make sure the floating point version of the slide1up intrinsic is created +; CHECK: call @llvm.riscv.vfslide1up.nxv4f32.f32.i64({{( poison, )?}} %{{.+}}, float 0.000000e+00, i64 %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.smin.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.smin.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.smax.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.umin.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.umax.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.minnum.nxv4f32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj(, i32) +; CHECK-LABEL: define 
@__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.maxnum.nxv4f32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.minnum.nxv4f32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.maxnum.nxv4f32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4dj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4dj({{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.minnum.nxv4f64( %[[VEC]], %{{.+}}) +; CHECK: call @llvm.riscv.vfslide1up.nxv4f64.f64.i64({{( poison, )?}} %{{.+}}, double 0x7FF0000000000000, i64 %{{.+}}) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll new file mode 100644 index 0000000000000..8c98f98249bf4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/extract_element.ll @@ -0,0 +1,167 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k extract_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE
+; RUN: not veczc -k extract_element_ilegal -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s
+; RUN: veczc -k extract_element_uniform -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-UNI
+; RUN: veczc -k extract_element_uniform_vec -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-UNI-VEC
+; RUN: veczc -k extract_element_varying_indices -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-INDICES
+; RUN: veczc -k extract_element_bool -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-BOOL
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @extract_element(<4 x float> addrspace(1)* nocapture readonly %in, i32 %idx, float addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+ %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+ %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+ %2 = extractelement <4 x float> %1, i32 %idx
+ %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %2, float addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+; NOTE: Base packetization fails for this case.
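As an editorial aside (not part of the patch): the `extract_element` kernel above is the simplest case the EE checks further down verify. With a scalable vectorization factor, vecz concatenates each lane's `<4 x float>` subvector into one wide register group and lowers the varying `extractelement` to a single `riscv.vrgather.vv`, where lane `i` reads element `i*4 + idx`. A minimal scalar model of that index arithmetic, with illustrative names:

    // Scalar model of the gather-based lowering of extractelement.
    // 'data' holds vl contiguous 4-float subvectors (the packetized operand);
    // 'idx' holds one extract index per lane, assumed reduced into [0, 4).
    #include <cstddef>
    #include <vector>

    std::vector<float> gatherExtract(const std::vector<float> &data,
                                     const std::vector<std::size_t> &idx) {
      std::vector<float> out(idx.size());
      for (std::size_t lane = 0; lane < idx.size(); ++lane)
        out[lane] = data[lane * 4 + idx[lane]]; // vrgather index = lane*4 + idx
      return out;
    }

This is the computation the EE checks encode as a `stepvector` shifted left by 2 (the `lane*4` term) combined with the splatted or per-lane extract indices before the gather.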
+ +define spir_kernel void @extract_element_ilegal(<32 x float> addrspace(1)* nocapture readonly %in, i32 %idx, float addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %arrayidx = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %in, i64 %call + %0 = bitcast <32 x float> addrspace(1)* %arrayidx to <32 x float> addrspace(1)* + %1 = load <32 x float>, <32 x float> addrspace(1)* %0, align 64 + %2 = extractelement <32 x float> %1, i32 %idx + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %2, float addrspace(1)* %arrayidx3, align 4 + ret void +} + +define spir_kernel void @extract_element_uniform(<4 x float> %in, i32 %idx, float addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %0 = extractelement <4 x float> %in, i32 %idx + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %0, float addrspace(1)* %arrayidx3, align 4 + ret void +} + +define spir_kernel void @extract_element_uniform_vec(<4 x float> %in, float addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %i = urem i64 %call, 4 + %0 = extractelement <4 x float> %in, i64 %i + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %0, float addrspace(1)* %arrayidx3, align 4 + ret void +} + +define spir_kernel void @extract_element_varying_indices(<4 x float> addrspace(1)* %in, i32 addrspace(1)* %idxs, float addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call + %idx = load i32, i32 addrspace(1)* %arrayidxidx + %i = urem i32 %idx, 4 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx + %1 = extractelement <4 x float> %0, i32 %i + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %1, float addrspace(1)* %arrayidx3, align 4 + ret void +} + +define spir_kernel void @extract_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %idx, i32 addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call + %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4 + %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxb, align 4 + %2 = icmp slt <4 x i32> %0, %1 + %i = urem i64 %call, 4 + %3 = extractelement <4 x i1> %2, i64 %i + %4 = sext i1 %3 to i32 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %4, i32 addrspace(1)* %arrayidx3, align 4 + %5 = sext <4 x i1> %2 to <4 x i32> + %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out2, i64 %call + store <4 x i32> %5, <4 x i32> addrspace(1)* %arrayidx4, align 4 + ret void +} + +; EE-LABEL: @__vecz_nxv4_extract_element( +; EE: [[XLEN:%.*]] = call i64 @llvm.vscale.i64() +; EE-NEXT: [[TMP2:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 2 +; EE-NEXT: [[SPLATINSERT:%.*]] = insertelement poison, i32 [[IDX:%.*]], {{(i32|i64)}} 0 +; EE-NEXT: [[SPLAT:%.*]] = shufflevector [[SPLATINSERT]], poison, zeroinitializer +; EE-NEXT: [[IDX0:%.*]] = call 
@llvm.stepvector.nxv4i32() +; EE-NEXT: [[IDXSCALE:%.*]] = shl [[IDX0]], splat (i32 2) +; EE-NEXT: [[VS1:%.*]] = add [[IDXSCALE]], [[SPLAT]] +; EE-NEXT: [[T3:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[VS1]], i64 0) +; EE-NEXT: [[T4:%.*]] = call @llvm.riscv.vrgather.vv.nxv16f32.i64( poison, [[T1:%.*]], [[T3]], i64 [[TMP2]]) +; EE-NEXT: [[T5:%.*]] = call @llvm.vector.extract.nxv4f32.nxv16f32( [[T4]], i64 0) + +; Both the vector and index are uniform, so check we're not unnecessarily packetizing + +; EE-UNI-LABEL: @__vecz_nxv4_extract_element_uniform( +; EE-UNI: [[T0:%.*]] = extractelement <4 x float> %in, i32 %idx +; EE-UNI: [[T1:%.*]] = insertelement poison, float [[T0]], {{(i32|i64)}} 0 +; EE-UNI: [[T2:%.*]] = shufflevector [[T1]], poison, zeroinitializer +; EE-UNI: store [[T2]], ptr addrspace(1) {{%.*}}, align 4 + +; The vector is uniform and the index is varying, so we must broadcast the vector +; FIXME: Do we really need to broadcast? Can we mod the indices with the original vector length? + +; EE-UNI-VEC-LABEL: @__vecz_nxv4_extract_element_uniform_vec( +; EE-UNI-VEC: [[XLEN:%.*]] = call i64 @llvm.vscale.i64() +; EE-UNI-VEC: [[T3:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 2 +; EE-UNI-VEC-NEXT: [[T:%.*]] = trunc [[T2:%.*]] to +; EE-UNI-VEC-NEXT: [[I1:%.*]] = and [[T]], {{splat \(i32 3\)|trunc \( splat \(i64 3\) to \)}} +; EE-UNI-VEC-NEXT: [[IDX02:%.*]] = call @llvm.stepvector.nxv4i32() +; EE-UNI-VEC-NEXT: [[IDXSCALE:%.*]] = shl [[IDX02]], splat (i32 2) + +; EE-UNI-VEC-NEXT: [[VS1:%.*]] = or disjoint [[IDXSCALE]], [[I1]] + +; EE-UNI-VEC-NEXT: [[T4:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[VS1]], i64 0) +; EE-UNI-VEC-NEXT: [[T5:%.*]] = call @llvm.riscv.vrgather.vv.nxv16f32.i64( poison, [[T1:%.*]], [[T4]], i64 [[T3]]) +; EE-UNI-VEC-NEXT: [[T6:%.*]] = call @llvm.vector.extract.nxv4f32.nxv16f32( [[T5]], i64 0) + +; EE-INDICES-LABEL: @__vecz_nxv4_extract_element_varying_indices( +; EE-INDICES: [[XLEN:%.*]] = call i64 @llvm.vscale.i64() +; EE-INDICES-NEXT: [[T4:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 2 +; EE-INDICES-NEXT: [[IDX0:%.*]] = call @llvm.stepvector.nxv4i32() +; EE-INDICES-NEXT: [[IDXSCALE:%.*]] = shl [[IDX0]], splat (i32 2) +; EE-INDICES-NEXT: [[VS1:%.*]] = or disjoint [[IDXSCALE]], [[I1:%.*]] +; EE-INDICES-NEXT: [[T5:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[VS1]], i64 0) +; EE-INDICES-NEXT: [[T6:%.*]] = call @llvm.riscv.vrgather.vv.nxv16f32.i64( poison, [[T3:%.*]], [[T5]], i64 [[T4]]) +; EE-INDICES-NEXT: [[T7:%.*]] = call @llvm.vector.extract.nxv4f32.nxv16f32( [[T6]], i64 0) + +; Check we promote from i1 to i8 before doing our memops and use vrgatherei16. 
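To make the i1 handling above concrete, a small scalar model (an editorial sketch, not part of the patch): RVV gathers shuffle data registers rather than mask registers, so the packetizer sign-extends the booleans to bytes, shuffles them with `vrgatherei16` (16-bit indices keep the gather indices representable where i8 indices would overflow), and truncates the gathered bytes back to i1.

    // Scalar model of shuffling an i1 vector: sign-extend to i8, gather
    // using 16-bit indices, then truncate the result back to i1.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<bool> gatherBools(const std::vector<bool> &mask,
                                  const std::vector<std::uint16_t> &idx) {
      std::vector<std::int8_t> bytes(mask.size());
      for (std::size_t i = 0; i < mask.size(); ++i)
        bytes[i] = mask[i] ? -1 : 0;   // sext i1 -> i8
      std::vector<bool> out(idx.size());
      for (std::size_t i = 0; i < idx.size(); ++i)
        out[i] = bytes[idx[i]] != 0;   // vrgatherei16.vv, then trunc i8 -> i1
      return out;
    }

The EE-BOOL checks that follow pin down exactly this sequence: a `sext` of the comparison result, a `vrgatherei16` over the widened bytes, and a final `trunc`.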
+; EE-BOOL-LABEL: @__vecz_nxv4_extract_element_bool( +; EE-BOOL: [[T6:%.*]] = sext [[T5:%.*]] to +; EE-BOOL-NEXT: [[XLEN:%.*]] = call i64 @llvm.vscale.i64() +; EE-BOOL-NEXT: [[T7:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 2 +; EE-BOOL-NEXT: [[T8:%.*]] = trunc [[T0:%.*]] to +; EE-BOOL-NEXT: [[T9:%.*]] = and [[T8]], {{splat \(i16 3\)|trunc \( splat \(i64 3\) to \)}} +; EE-BOOL-NEXT: [[T10:%.*]] = call @llvm.stepvector.nxv4i16() +; EE-BOOL-NEXT: [[T11:%.*]] = shl [[T10]], splat (i16 2) +; EE-BOOL-NEXT: [[VS1:%.*]] = or disjoint [[T11]], [[T9]] +; EE-BOOL-NEXT: [[T12:%.*]] = call @llvm.vector.insert.nxv16i16.nxv4i16( poison, [[VS1]], i64 0) +; EE-BOOL-NEXT: [[T13:%.*]] = call @llvm.riscv.vrgatherei16.vv.nxv16i8.i64( poison, [[T6]], [[T12]], i64 [[T7]]) +; EE-BOOL-NEXT: [[T14:%.*]] = call @llvm.vector.extract.nxv4i8.nxv16i8( [[T13]], i64 0) +; EE-BOOL-NEXT: [[T15:%.*]] = trunc [[T14]] to diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll new file mode 100644 index 0000000000000..782982d0447ee --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/insert_element.ll @@ -0,0 +1,136 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k insert_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE +; RUN: veczc -k insert_element_uniform -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-UNI +; RUN: veczc -k insert_element_varying_indices -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-INDICES +; RUN: not veczc -k insert_element_illegal -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s +; RUN: veczc -k insert_element_bool -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-BOOL + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @insert_element(<4 x float> addrspace(1)* nocapture readonly %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)* + %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16 + %2 = insertelement <4 x float> %1, float %val, i32 %idx + %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call + store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 4 + ret void +} + +define spir_kernel void @insert_element_uniform(<4 x float> %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %0 = insertelement <4 x float> %in, float %val, i32 %idx + %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call + store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx3, align 4 + ret void +} + +define spir_kernel void @insert_element_varying_indices(<4 x float> addrspace(1)* nocapture readonly %in, i32 addrspace(1)* %idxs, <4 x float> addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call + %idx = load i32, i32 addrspace(1)* %arrayidxidx + %i = urem i32 %idx, 4 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)* + %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16 + %fidx = uitofp i64 %call to float + %2 = insertelement <4 x float> %1, float %fidx, i32 %i + %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call + store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 4 + ret void +} + +define spir_kernel void @insert_element_illegal(<32 x float> addrspace(1)* nocapture readonly %in, i32 addrspace(1)* %idxs, <32 x float> addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call + %idx = load i32, i32 addrspace(1)* %arrayidxidx, align 4 + %i = urem i32 %idx, 32 + %arrayidx = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %in, i64 %call + %0 = bitcast <32 x float> 
addrspace(1)* %arrayidx to <32 x float> addrspace(1)* + %1 = load <32 x float>, <32 x float> addrspace(1)* %0, align 64 + %fidx = uitofp i64 %call to float + %2 = insertelement <32 x float> %1, float %fidx, i32 %i + %arrayidx3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %out, i64 %call + store <32 x float> %2, <32 x float> addrspace(1)* %arrayidx3, align 64 + ret void +} + +define spir_kernel void @insert_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %val, i32 %idx, <4 x i32> addrspace(1)* nocapture %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #6 + %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call + %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4 + %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxb, align 4 + %2 = icmp slt <4 x i32> %0, %1 + %i = urem i64 %call, 4 + %v = trunc i32 %val to i1 + %3 = insertelement <4 x i1> %2, i1 %v, i64 %i + %4 = sext <4 x i1> %3 to <4 x i32> + %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %4, <4 x i32> addrspace(1)* %arrayidx4, align 4 + ret void +} + +; IE-LABEL: @__vecz_nxv4_insert_element( +; IE: [[SPLATINSERT:%.*]] = insertelement poison, float [[VAL:%.*]], {{(i32|i64)}} 0 +; IE: [[SPLAT:%.*]] = shufflevector [[SPLATINSERT]], poison, zeroinitializer +; IE: [[XLEN:%.*]] = call i64 @llvm.vscale.i64() +; IE-NEXT: [[TMP2:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 4 +; IE-NEXT: [[SPLATINSERT1:%.*]] = insertelement poison, i32 [[IDX:%.*]], {{(i32|i64)}} 0 +; IE-NEXT: [[SPLAT2:%.*]] = shufflevector [[SPLATINSERT1]], poison, zeroinitializer +; IE-NEXT: [[ELTS:%.*]] = call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[SPLAT]], i64 0) +; IE-NEXT: [[STEP:%.*]] = call @llvm.stepvector.nxv16i32() +; IE-NEXT: [[INNER:%.*]] = and [[STEP]], splat (i32 3) +; IE-NEXT: [[OUTER:%.*]] = lshr [[STEP]], splat (i32 2) +; IE-NEXT: [[VM:%.*]] = icmp eq [[SPLAT2]], [[INNER]] +; IE-NEXT: [[TMP8:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv16f32.i64( [[TMP1:%.*]], [[ELTS]], [[OUTER]], [[VM]], i64 [[TMP2]]{{(, i64 1)?}}) + +; Both the vector and index are uniform, so check we're not unnecessarily packetizing + +; IE-UNI-LABEL: @__vecz_nxv4_insert_element_uniform( +; IE-UNI: {{%.*}} = insertelement <4 x float> %in, float %val, {{(i32|i64)}} %idx + +; IE-INDICES-LABEL: @__vecz_nxv4_insert_element_varying_indices( +; IE-INDICES: [[FIDX2:%.*]] = uitofp [[TMP0:%.*]] to +; IE-INDICES-NEXT: [[XLEN:%.*]] = call i64 @llvm.vscale.i64() +; IE-INDICES-NEXT: [[TMP5:%.*]] = shl {{(nuw )?}}i64 [[XLEN]], 4 +; IE-INDICES-NEXT: [[VS2:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, {{%.*}}, i64 0) +; IE-INDICES: [[IDX0:%.*]] = call @llvm.stepvector.nxv16i32() +; IE-INDICES-NEXT: [[IDX1:%.*]] = lshr [[IDX0]], splat (i32 2) +; IE-INDICES-NEXT: [[TMP9:%.*]] = call @llvm.riscv.vrgather.vv.nxv16i32.i64( poison, [[VS2:%.*]], [[IDX1]], i64 [[TMP5]]) +; IE-INDICES-NEXT: [[VS25:%.*]] = call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[FIDX2]], i64 0) +; IE-INDICES-NEXT: [[INNER:%.*]] = and [[IDX0]], splat (i32 3) +; IE-INDICES-NEXT: [[VM:%.*]] = icmp eq [[TMP9]], [[INNER]] +; IE-INDICES-NEXT: [[TMP11:%.*]] = call @llvm.riscv.vrgather.vv.mask.nxv16f32.i64( [[TMP4:%.*]], [[VS25]], [[IDX1]], [[VM]], i64 [[TMP5]]{{(, i64 1)?}}) + +; Check we promote from i1 to i8 before doing our memops +; IE-BOOL-LABEL: 
@__vecz_nxv4_insert_element_bool(
+; IE-BOOL-DAG: [[T1:%.*]] = sext {{%.*}} to
+; IE-BOOL-DAG: [[T0:%.*]] = sext {{%.*}} to
+; IE-BOOL: [[TMP18:%.*]] = call @llvm.riscv.vrgatherei16.vv.mask.nxv16i8.i64( [[TMP7:%.*]], {{%.*}}, [[TMP16:%.*]], [[VM:%.*]], i64 [[TMP8:%.*]])
+; IE-BOOL-NEXT: [[TMP19:%.*]] = trunc [[TMP18]] to
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg
new file mode 100644
index 0000000000000..8b3578af8d21e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/lit.local.cfg
@@ -0,0 +1,20 @@
+# Copyright (C) Codeplay Software Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+# Exceptions; you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+if 'RISCV' not in config.root.targets:
+    config.unsupported = True
+
+config.substitutions.append(('%vattr', '+v'))
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
new file mode 100644
index 0000000000000..c80338ff7de9c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle.ll
@@ -0,0 +1,42 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @f(<4 x i32> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %in.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %gid + %in.data = load <4 x i32>, <4 x i32> addrspace(1)* %in.ptr + %out.data = shufflevector <4 x i32> %in.data, <4 x i32> poison, <4 x i32> + %out.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid + store <4 x i32> %out.data, <4 x i32> addrspace(1)* %out.ptr, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +; It checks that a single-operand shuffle that doesn't change the length is packetized to a gather intrinsic. +; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} { +; CHECK: entry: +; CHECK: %[[DATA:.+]] = load , {{( addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}} +; CHECK: %[[GATHER:.+]] = call @llvm.riscv.vrgather.vv.nxv16i32.i64( poison, %[[DATA]], %{{.+}}, i64 %{{.+}}) +; CHECK: store %[[GATHER]] +; CHECK: ret void +; CHECK: } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll new file mode 100644 index 0000000000000..d0ec0c5e6ce07 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_bool.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @f(<4 x i32> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %in.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %gid + %in.data = load <4 x i32>, <4 x i32> addrspace(1)* %in.ptr + %in.bool = icmp ne <4 x i32> %in.data, zeroinitializer + %out.data = shufflevector <4 x i1> %in.bool, <4 x i1> poison, <4 x i32> + %out.sext = sext <4 x i1> %out.data to <4 x i32> + %out.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid + store <4 x i32> %out.sext, <4 x i32> addrspace(1)* %out.ptr, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +; It checks that a single-operand shuffle that doesn't change the length is packetized to a gather intrinsic, +; and that it works with a vector of i1 type by temporarily extending to i8. +; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} { +; CHECK: entry: +; CHECK: %[[DATA:.+]] = load , {{( addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}} +; CHECK: %[[DATA_i1:.+]] = icmp ne %[[DATA]], zeroinitializer +; CHECK: %[[DATA_i8:.+]] = zext %[[DATA_i1]] to +; CHECK: %[[GATHER:.+]] = call @llvm.riscv.vrgatherei16.vv.nxv16i8.i64( poison, %[[DATA_i8]], %{{.+}}, i64 %{{.+}}) +; CHECK: %[[GATHER_i1:.+]] = trunc %[[GATHER]] to +; CHECK: %[[RESULT:.+]] = sext %[[GATHER_i1]] to +; CHECK: store %[[RESULT]] +; CHECK: ret void +; CHECK: } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll new file mode 100644 index 0000000000000..cf0b5b3ac8d5a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_concat.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @f(<2 x i32> addrspace(1)* %a, <2 x i32> addrspace(1)* %b, <4 x i32> addrspace(1)* %out) {
+entry:
+ %gid = call i64 @__mux_get_global_id(i32 0)
+ %a.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 %gid
+ %a.data = load <2 x i32>, <2 x i32> addrspace(1)* %a.ptr
+ %b.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 %gid
+ %b.data = load <2 x i32>, <2 x i32> addrspace(1)* %b.ptr
+ %out.data = shufflevector <2 x i32> %a.data, <2 x i32> %b.data, <4 x i32>
+ %out.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid
+ store <4 x i32> %out.data, <4 x i32> addrspace(1)* %out.ptr, align 32
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32) #1
+
+; It checks that a two-operand shuffle is packetized to two gather intrinsics and a select.
+; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} {
+; CHECK: entry:
+; CHECK: %[[DATA:.+]] = load , {{( addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
+; CHECK: %[[DATB:.+]] = load , {{( addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}}
+; CHECK: %[[WIDENA:.+]] = call @llvm.vector.insert.nxv16i32.nxv8i32( poison, %[[DATA]], i64 0)
+; CHECK: %[[GATHERA:.+]] = call @llvm.riscv.vrgather.vv.nxv16i32.i64( poison, %[[WIDENA]], %{{.+}}, i64 %{{.+}})
+; CHECK: %[[WIDENB:.+]] = call @llvm.vector.insert.nxv16i32.nxv8i32( poison, %[[DATB]], i64 0)
+; CHECK: %[[GATHERB:.+]] = call @llvm.riscv.vrgather.vv.nxv16i32.i64( poison, %[[WIDENB]], %{{.+}}, i64 %{{.+}})
+; CHECK: %[[SELECT:.+]] = select %{{.+}}, %[[GATHERB]], %[[GATHERA]]
+; CHECK: store %[[SELECT]]
+; CHECK: ret void
+; CHECK: }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
new file mode 100644
index 0000000000000..8c28d3762451d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_narrow.ll
@@ -0,0 +1,43 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @f(<4 x i32> addrspace(1)* %in, <2 x i32> addrspace(1)* %out) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %in.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %gid + %in.data = load <4 x i32>, <4 x i32> addrspace(1)* %in.ptr + %out.data = shufflevector <4 x i32> %in.data, <4 x i32> poison, <2 x i32> + %out.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i64 %gid + store <2 x i32> %out.data, <2 x i32> addrspace(1)* %out.ptr, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +; It checks that a single-operand shuffle that narrows the vector is packetized to a gather intrinsic. +; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} { +; CHECK: entry: +; CHECK: %[[DATA:.+]] = load , {{( addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}} +; CHECK: %[[GATHER:.+]] = call @llvm.riscv.vrgather.vv.nxv16i32.i64( poison, %[[DATA]], %{{.+}}, i64 %{{.+}}) +; CHECK: %[[EXTRACT:.+]] = call @llvm.vector.extract.nxv8i32.nxv16i32( %[[GATHER]], i64 0) +; CHECK: store %[[EXTRACT]] +; CHECK: ret void +; CHECK: } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll new file mode 100644 index 0000000000000..789e091a7e7b3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/packetize_shuffle_wider.ll @@ -0,0 +1,43 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -vecz-passes=packetizer -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @f(<2 x i32> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %in.ptr = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %in, i64 %gid + %in.data = load <2 x i32>, <2 x i32> addrspace(1)* %in.ptr + %out.data = shufflevector <2 x i32> %in.data, <2 x i32> poison, <4 x i32> + %out.ptr = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %gid + store <4 x i32> %out.data, <4 x i32> addrspace(1)* %out.ptr, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +; It checks that a single-operand shuffle that widens the vector is packetized to a gather intrinsic. 
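For reference, an editorial sketch (not part of the patch) of the index arithmetic behind packetizing a length-changing single-operand shuffle: the fixed shuffle mask is replicated once per lane, each copy offset by `lane * input_width`, and the whole shuffle becomes one gather over the packetized input. The mask constant is elided in this rendering of the test, so the `{0, 1, 0, 1}` below is purely illustrative.

    // Scalar model of a packetized single-operand shufflevector; for the
    // widening test above, inWidth would be 2 and the mask something
    // like {0, 1, 0, 1}.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<std::int32_t>
    packetizedShuffle(const std::vector<std::int32_t> &in, // vl * inWidth elements
                      const std::vector<int> &mask,
                      std::size_t inWidth, std::size_t vl) {
      std::vector<std::int32_t> out;
      out.reserve(mask.size() * vl);
      for (std::size_t lane = 0; lane < vl; ++lane)
        for (int m : mask)                       // replicate the mask per lane
          out.push_back(in[lane * inWidth + m]); // gather index = lane*inWidth + m
      return out;
    }

The CHECK lines that follow verify the scalable form of this: the narrowed or widened fixed vectors are inserted into (or extracted from) a wide register group around a single `vrgather` intrinsic.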
+; CHECK: define spir_kernel void @__vecz_nxv4_f({{.*}}) {{.*}} { +; CHECK: entry: +; CHECK: %[[DATA:.+]] = load , {{( addrspace\(1\)\*)|(ptr addrspace\(1\))}} %{{.*}} +; CHECK: %[[WIDEN:.+]] = call @llvm.vector.insert.nxv16i32.nxv8i32( poison, %[[DATA]], i64 0) +; CHECK: %[[GATHER:.+]] = call @llvm.riscv.vrgather.vv.nxv16i32.i64( poison, %[[WIDEN]], %{{.+}}, i64 %{{.+}}) +; CHECK: store %[[GATHER]] +; CHECK: ret void +; CHECK: } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll new file mode 100644 index 0000000000000..8af9cb06320bf --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/select_scalar_vector.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k select_scalar_vector -vecz-target-triple="riscv64-unknown-unknown" -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @select_scalar_vector(i32* %aptr, i32* %bptr, <2 x i32>* %cptr, <2 x i32>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx + %arrayidxc = getelementptr inbounds <2 x i32>, <2 x i32>* %cptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i32>, <2 x i32>* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load i32, i32* %arrayidxb, align 4 + %c = load <2 x i32>, <2 x i32>* %arrayidxc, align 4 + %cmp = icmp slt i32 %a, %b + %sel = select i1 %cmp, <2 x i32> %c, <2 x i32> + store <2 x i32> %sel, <2 x i32>* %arrayidxz, align 4 + ret void +} + +; CHECK: define spir_kernel void @__vecz_nxv4_select_scalar_vector +; CHECK: [[rhs:%.*]] = load , ptr +; CHECK: [[cmp1:%.*]] = icmp slt +; CHECK: [[sext:%.*]] = sext [[cmp1]] to +; CHECK: [[idx0:%.*]] = call @llvm.stepvector.nxv8i16() +; CHECK: [[idx1:%.*]] = lshr [[idx0]], splat (i16 1) +; CHECK: [[gather:%.*]] = call @llvm.riscv.vrgatherei16.vv.nxv8i8.i64( poison, [[vs2:%.*]], [[vs1:%.*]], i64 [[xlen:%.*]]) +; CHECK: [[cmp:%.*]] = trunc [[gather]] to +; CHECK: [[sel:%.*]] = select [[cmp]], [[rhs]], splat (i32 4) +; CHECK: store [[sel]], diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll new file mode 100644 index 0000000000000..7ab9888ec9b91 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_memops.ll @@ -0,0 +1,99 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 
(the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=4 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-STORE-4 +; RUN: veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=8 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-STORE-8 +; RUN: veczc -k store_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=16 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-STORE-16 +; RUN: veczc -k load_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=4 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-LOAD-4 +; RUN: veczc -k load_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=8 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-LOAD-8 +; RUN: veczc -k load_element -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+f,+d,%vattr -vecz-simd-width=16 -vecz-scalable -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-LOAD-16 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @store_element(i32 %0, i32 addrspace(1)* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %cond = icmp ne i64 %call, 0 + br i1 %cond, label %do, label %ret + +do: + %dest = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + store i32 %0, i32 addrspace(1)* %dest, align 4 + br label %ret + +ret: + ret void +} + +; CHECK-STORE-4: define void @__vecz_b_masked_store4_vp_u5nxv4ju3ptrU3AS1u5nxv4bj( [[TMP0:%.*]], ptr addrspace(1) [[TMP1:%.*]], [[TMP2:%.*]], i32 [[TMP3:%.*]]) +; CHECK-STORE-4-NEXT: entry: +; CHECK-STORE-4-NEXT: call void @llvm.vp.store.nxv4i32.p1( [[TMP0]], ptr addrspace(1) [[TMP1]], [[TMP2]], i32 [[TMP3]]) +; CHECK-STORE-4-NEXT: ret void + +; CHECK-STORE-8: define void @__vecz_b_masked_store4_vp_u5nxv8ju3ptrU3AS1u5nxv8bj( [[TMP0:%.*]], ptr addrspace(1) [[TMP1:%.*]], [[TMP2:%.*]], i32 [[TMP3:%.*]]) +; CHECK-STORE-8-NEXT: entry: +; CHECK-STORE-8-NEXT: call void @llvm.vp.store.nxv8i32.p1( [[TMP0]], ptr addrspace(1) [[TMP1]], [[TMP2]], i32 [[TMP3]]) +; CHECK-STORE-8-NEXT: ret void + +; CHECK-STORE-16: define void @__vecz_b_masked_store4_vp_u6nxv16ju3ptrU3AS1u6nxv16bj( [[TMP0:%.*]], ptr addrspace(1) [[TMP1:%.*]], [[TMP2:%.*]], i32 [[TMP3:%.*]]) +; CHECK-STORE-16-NEXT: entry: +; CHECK-STORE-16-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; CHECK-STORE-16-NEXT: [[SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP3]], {{i32|i64}} 0 +; CHECK-STORE-16-NEXT: 
[[SPLAT:%.*]] = shufflevector [[SPLATINSERT]], poison, zeroinitializer +; CHECK-STORE-16-NEXT: [[TMP6:%.*]] = icmp ult [[TMP5]], [[SPLAT]] +; CHECK-STORE-16-NEXT: [[TMP7:%.*]] = select [[TMP2]], [[TMP6]], zeroinitializer +; CHECK-STORE-16-NEXT: call void @llvm.masked.store.nxv16i32.p1( [[TMP0]], ptr addrspace(1) [[TMP1]], i32 4, [[TMP7]]) +; CHECK-STORE-16-NEXT: ret void + +define spir_kernel void @load_element(i32 addrspace(1)* %a, i32 addrspace(1)* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %cond = icmp ne i64 %call, 0 + br i1 %cond, label %do, label %ret + +do: + %src = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call + %dest = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + %do.res = load i32, i32 addrspace(1)* %src, align 4 + store i32 %do.res, i32 addrspace(1)* %dest, align 4 + br label %ret + +ret: + ret void +} + +; CHECK-LOAD-4: define @__vecz_b_masked_load4_vp_u5nxv4ju3ptrU3AS1u5nxv4bj(ptr addrspace(1) [[TMP0:%.*]], [[TMP1:%.*]], i32 [[TMP2:%.*]]) +; CHECK-LOAD-4-NEXT: entry: +; CHECK-LOAD-4-NEXT: [[TMP4:%.*]] = call @llvm.vp.load.nxv4i32.p1(ptr addrspace(1) [[TMP0]], [[TMP1]], i32 [[TMP2]]) +; CHECK-LOAD-4-NEXT: ret [[TMP4]] + +; CHECK-LOAD-8: define @__vecz_b_masked_load4_vp_u5nxv8ju3ptrU3AS1u5nxv8bj(ptr addrspace(1) [[TMP0:%.*]], [[TMP1:%.*]], i32 [[TMP2:%.*]]) +; CHECK-LOAD-8-NEXT: entry: +; CHECK-LOAD-8-NEXT: [[TMP4:%.*]] = call @llvm.vp.load.nxv8i32.p1(ptr addrspace(1) [[TMP0]], [[TMP1]], i32 [[TMP2]]) +; CHECK-LOAD-8-NEXT: ret [[TMP4]] + +; CHECK-LOAD-16: define @__vecz_b_masked_load4_vp_u6nxv16ju3ptrU3AS1u6nxv16bj(ptr addrspace(1) [[TMP0:%.*]], [[TMP1:%.*]], i32 [[TMP2:%.*]]) +; CHECK-LOAD-16-NEXT: entry: +; CHECK-LOAD-16-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() +; CHECK-LOAD-16-NEXT: [[TMPSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP2]], {{i32|i64}} 0 +; CHECK-LOAD-16-NEXT: [[TMPSPLAT:%.*]] = shufflevector [[TMPSPLATINSERT]], poison, zeroinitializer +; CHECK-LOAD-16-NEXT: [[TMP5:%.*]] = icmp ult [[TMP4]], [[TMPSPLAT]] +; CHECK-LOAD-16-NEXT: [[TMP6:%.*]] = select [[TMP1]], [[TMP5]], zeroinitializer +; CHECK-LOAD-16-NEXT: [[TMP7:%.*]] = call @llvm.masked.load.nxv16i32.p1(ptr addrspace(1) [[TMP0]], i32 4, [[TMP6]], poison) +; CHECK-LOAD-16-NEXT: ret [[TMP7]] + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll new file mode 100644 index 0000000000000..7823d56291ac5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/RISCV/vp_vsetvli.ll @@ -0,0 +1,46 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-target-triple="riscv64-unknown-unknown" -vecz-target-features=+v -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @load_add_store(i32* %aptr, i32* %bptr, i32* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load i32, i32* %arrayidxb, align 4 + %sum = add i32 %a, %b + store i32 %sum, i32* %arrayidxz, align 4 + ret void +} + +; CHECK: define spir_kernel void @__vecz_nxv4_vp_load_add_store +; CHECK: %local.id = call i64 @__mux_get_local_id(i32 0) +; CHECK: %local.size = call i64 @__mux_get_local_size(i32 0) +; CHECK: %work.remaining = sub nuw nsw i64 %local.size, %local.id +; CHECK: %[[vli64:.+]] = call i64 @llvm.riscv.vsetvli.i64(i64 %work.remaining, i64 2, i64 1) +; CHECK: %[[vl:.+]] = trunc nuw i64 %[[vli64]] to i32 +; CHECK: %[[lhs:.+]] = call @llvm.vp.load.nxv4i32.p0({{.*}}, i32 %[[vl]]) +; CHECK: %[[rhs:.+]] = call @llvm.vp.load.nxv4i32.p0({{.*}}, i32 %[[vl]]) +; CHECK: %[[sum:.+]] = call @llvm.vp.add.nxv4i32( %[[lhs]], %[[rhs]], {{.*}}, i32 %[[vl]]) +; CHECK: call void @llvm.vp.store.nxv4i32.p0( %[[sum]], {{.*}}, i32 %[[vl]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll new file mode 100644 index 0000000000000..aa4559aad057e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/broadcast_vector.ll @@ -0,0 +1,179 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -vecz-passes="function(instcombine),packetizer,gvn,function(instcombine)" -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @__mux_get_global_id(i32)
+
+define dso_local spir_kernel void @vector_broadcast_const(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fadd <4 x float> %1, <float 0x7FF8000020000000, float 0x7FF8000020000000, float 0x7FF8000020000000, float 0x7FF8000020000000>
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+define dso_local spir_kernel void @vector_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fadd <4 x float> %1, %addend
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+define dso_local spir_kernel void @vector_broadcast_regression(<4 x float> addrspace(1)* nocapture readonly %in, i32 %nancode, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x i32> addrspace(1)*
+  %1 = load <4 x i32>, <4 x i32> addrspace(1)* %0, align 16
+  %and1.i.i.i1.i = and <4 x i32> %1, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+  %cmp.i.i.i2.i = icmp ne <4 x i32> %and1.i.i.i1.i, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+  %and2.i.i.i3.i = and <4 x i32> %1, <i32 8388607, i32 8388607, i32 8388607, i32 8388607>
+  %cmp3.i.i.i4.i = icmp eq <4 x i32> %and2.i.i.i3.i, zeroinitializer
+  %2 = or <4 x i1> %cmp.i.i.i2.i, %cmp3.i.i.i4.i
+  %3 = bitcast <4 x i32> %1 to <4 x float>
+  %4 = select <4 x i1> %2, <4 x float> %3, <4 x float> <float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000, float 0x7FF0000020000000>
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %4, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+; Check that new instructions aren't inserted before pre-existing allocas
+define dso_local spir_kernel void @vector_broadcast_insertpt(<4 x float> addrspace(1)* nocapture readonly %in, <4 x float> %addend, i32 %nancode, <4 x float> addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) local_unnamed_addr #0 {
+entry:
+  %existing.alloc = alloca <4 x i32>
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  store <4 x i32> zeroinitializer, <4 x i32>* %existing.alloc
+  %scalar = bitcast <4 x i32>* %existing.alloc to i32*
+  store i32 1, i32* %scalar
+  %v = load <4 x i32>, <4 x i32>* %existing.alloc
+  %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out2, i64 %call
+  store <4 x i32> %v, <4 x i32> addrspace(1)* %arrayidx4, align 16
+
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %op = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16
+  %v4 = fadd <4 x float> %op, %addend
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %v4, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+
+define dso_local spir_kernel void @vector_mask_broadcast(<4 x float> addrspace(1)* nocapture readonly %in, <4 x i1> %input, <4 x float> %woof, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #6
+  %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+  %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+  %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+  %2 = fcmp oeq <4 x float> %1, 
+  %3 = and <4 x i1> %2, %input
+  %4 = select <4 x i1> %3, <4 x float> %1, <4 x float> %woof
+  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %4, <4 x float> addrspace(1)* %arrayidx3, align 16
+  ret void
+}
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_const(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT: store <vscale x 16 x float> {{shufflevector \(<vscale x 16 x float> insertelement \(<vscale x 16 x float> poison, float 0x7FF8000020000000, (i32|i64) 0\), <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(float 0x7FF8000020000000\)}}, ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT: ret void
+
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[FIXLEN_ALLOC:%.*]] = alloca <4 x float>, align 16
+; CHECK-NEXT: store <4 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC]], align 16
+; CHECK-NEXT: [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
+; CHECK-NEXT: [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 3, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
+; CHECK-NEXT: [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
+; CHECK-NEXT: [[VEC_ALLOC:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x float> poison)
+; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT: [[TMP3:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <vscale x 16 x float> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT: store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT: ret void
+
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_regression(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i32>, ptr addrspace(1) [[ARRAYIDX]], align 16
+; CHECK-NEXT: [[AND1_I_I_I1_I1:%.*]] = and <vscale x 16 x i32> [[TMP1]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 2139095040, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2139095040\)}}
+; CHECK-NEXT: [[CMP_I_I_I2_I2:%.*]] = icmp ne <vscale x 16 x i32> [[AND1_I_I_I1_I1]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 2139095040, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2139095040\)}}
+; CHECK-NEXT: [[AND2_I_I_I3_I3:%.*]] = and <vscale x 16 x i32> [[TMP1]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 8388607, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 8388607\)}}
+; CHECK-NEXT: [[CMP3_I_I_I4_I4:%.*]] = icmp eq <vscale x 16 x i32> [[AND2_I_I_I3_I3]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = or <vscale x 16 x i1> [[CMP_I_I_I2_I2]], [[CMP3_I_I_I4_I4]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 16 x i32> [[TMP1]] to <vscale x 16 x float>
+; CHECK-NEXT: [[TMP4:%.*]] = select <vscale x 16 x i1> [[TMP2]], <vscale x 16 x float> [[TMP3]], <vscale x 16 x float> {{shufflevector \(<vscale x 16 x float> insertelement \(<vscale x 16 x float> poison, float 0x7FF0000020000000, (i32|i64) 0\), <vscale x 16 x float> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(float 0x7FF0000020000000\)}}
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT: store <vscale x 16 x float> [[TMP4]], ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT: ret void
+;
+;
+; CHECK-LABEL: @__vecz_nxv4_vector_broadcast_insertpt(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[EXISTING_ALLOC:%.*]] = alloca <4 x i32>, align 16
+; CHECK-NEXT: [[FIXLEN_ALLOC:%.*]] = alloca <4 x i32>, align 16
+; CHECK-NEXT: [[FIXLEN_ALLOC1:%.*]] = alloca <4 x float>, align 16
+; CHECK-NEXT: store <4 x float> [[ADDEND:%.*]], ptr [[FIXLEN_ALLOC1]], align 16
+; CHECK-NEXT: [[IDX03:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
+; CHECK-NEXT: [[IDX14:%.*]] = and <vscale x 16 x i32> [[IDX03]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 3, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
+; CHECK-NEXT: [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
+; CHECK-NEXT: [[VEC_ALLOC5:%.*]] = getelementptr inbounds float, ptr [[FIXLEN_ALLOC1]], <vscale x 16 x i64> [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 16 x float> @llvm.masked.gather.nxv16f32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC5]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x float> poison)
+; CHECK-NEXT: [[CALL:%.*]] = tail call i64 @__mux_get_global_id(i32 0)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[EXISTING_ALLOC]], align 16
+; CHECK-NEXT: store i32 1, ptr [[EXISTING_ALLOC]], align
+; CHECK-NEXT: [[V:%.*]] = load <4 x i32>, ptr [[EXISTING_ALLOC]], align 16
+; CHECK-NEXT: store <4 x i32> [[V]], ptr [[FIXLEN_ALLOC]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX14]] to <vscale x 16 x i64>
+; CHECK-NEXT: [[VEC_ALLOC:%.*]] = getelementptr inbounds i32, ptr [[FIXLEN_ALLOC]], <vscale x 16 x i64> [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 4, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x i32> poison)
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr <4 x i32>, ptr addrspace(1) [[OUT2:%.*]], i64 [[CALL]]
+; CHECK-NEXT: store <vscale x 16 x i32> [[TMP3]], ptr addrspace(1) [[ARRAYIDX4]], align 16
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[IN:%.*]], i64 [[CALL]]
+; CHECK-NEXT: [[TMP6:%.*]] = load <vscale x 16 x float>, ptr addrspace(1) [[ARRAYIDX]], align 16
+; CHECK-NEXT: [[V46:%.*]] = fadd <vscale x 16 x float> [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr <4 x float>, ptr addrspace(1) [[OUT:%.*]], i64 [[CALL]]
+; CHECK-NEXT: store <vscale x 16 x float> [[V46]], ptr addrspace(1) [[ARRAYIDX3]], align 16
+; CHECK-NEXT: ret void
+;
+; CHECK-LABEL: @__vecz_nxv4_vector_mask_broadcast(
+; CHECK-NEXT: entry:
+; CHECK: [[FIXLEN_MASK_ALLOC:%.*]] = alloca <4 x i8>, align 4
+; CHECK: [[IDX0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
+; CHECK: [[IDX1:%.*]] = and <vscale x 16 x i32> [[IDX0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 3, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 3\)}}
+; CHECK: [[SEXT:%.*]] = sext <4 x i1> [[INPUT:%.*]] to <4 x i8>
+; CHECK: store <4 x i8> [[SEXT]], ptr [[FIXLEN_MASK_ALLOC]], align 4
+; CHECK: [[TMP0:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[IDX1]] to <vscale x 16 x i64>
+; CHECK: [[VEC_ALLOC:%.*]] = getelementptr inbounds i8, ptr [[FIXLEN_MASK_ALLOC]], <vscale x 16 x i64> [[TMP0]]
+; CHECK: [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[VEC_ALLOC]], i32 1, <vscale x 16 x i1> {{shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\)}}, <vscale x 16 x i8> poison)
+; CHECK: [[BMASK:%.*]] = trunc <vscale x 16 x i8> [[TMP1]] to <vscale x 16 x i1>
+; CHECK: {{.*}} = and <vscale x 16 x i1> {{.*}}, [[BMASK]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
new file mode 100644
index 0000000000000..f58b2bd62f539
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/builtins.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k builtins -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @builtins(float* %aptr, float* %bptr, i32* %zptr) {
+entry:
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds float, float* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
+  %a = load float, float* %arrayidxa, align 4
+  %b = load float, float* %arrayidxb, align 4
+  %cmp = call spir_func i32 @_Z9isgreaterff(float %a, float %b)
+  store i32 %cmp, i32* %arrayidxz, align 4
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+declare spir_func i32 @_Z9isgreaterff(float, float)
+
+; CHECK: void @__vecz_nxv4_builtins
+; CHECK: = fcmp ogt <vscale x 4 x float> %{{.*}}, %{{.*}}
+; CHECK: = zext <vscale x 4 x i1> %relational2 to <vscale x 4 x i32>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
new file mode 100644
index 0000000000000..484415bb395db
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cast.ll
@@ -0,0 +1,35 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k cast -vecz-scalable -vecz-simd-width=8 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @cast(i32* %aptr, float* %zptr) {
+entry:
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+  %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
+  %a = load i32, i32* %arrayidxa, align 4
+  %c = sitofp i32 %a to float
+  store float %c, float* %arrayidxz, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv8_cast
+; CHECK: sitofp <vscale x 8 x i32> {{%[0-9]+}} to <vscale x 8 x float>
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
new file mode 100644
index 0000000000000..5d29c785dab6a
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/cmpxchg.ll
@@ -0,0 +1,93 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -w 4 -vecz-scalable -vecz-passes=packetizer,verify -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK: define spir_kernel void @__vecz_nxv4_test_fn(ptr %p, ptr %q, ptr %r)
+define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) {
+entry:
+; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr %p, i64 0
+; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <vscale x 4 x ptr> [[SPLAT_PTR_INS]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
+  %call = call i64 @__mux_get_global_id(i32 0)
+
+; Test that this cmpxchg is packetized by generating a call to an all-true masked version.
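+; A rough sketch of the call being checked for below (the signature is
+; inferred from the mangled name and the CHECK-SAME operands, so treat it as
+; an assumed shape rather than part of the test):
+;   %res = call { <vscale x 4 x i32>, <vscale x 4 x i1> }
+;       @__vecz_b_nxv4_masked_cmpxchg_align4_acquire_monotonic_1_u9nxv4u3ptru5nxv4ju5nxv4ju5nxv4b(
+;       <vscale x 4 x ptr> %ptrs, <vscale x 4 x i32> %cmp, <vscale x 4 x i32> %new, <vscale x 4 x i1> %mask)
+; The mask is all-true because this cmpxchg executes unconditionally.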
+; CHECK: [[A0:%.*]] = call { , } @__vecz_b_nxv4_masked_cmpxchg_align4_acquire_monotonic_1_u9nxv4u3ptru5nxv4ju5nxv4ju5nxv4b( +; CHECK-SAME: [[SPLAT_PTR]], +; CHECK-SAME: {{shufflevector \( insertelement \( poison, i32 1, i64 0\), poison, zeroinitializer\)|splat \(i32 1\)}} +; CHECK-SAME: {{shufflevector \( insertelement \( poison, i32 2, i64 0\), poison, zeroinitializer\)|splat \(i32 2\)}} +; CHECK-SAME: {{shufflevector \( insertelement \( poison, i1 true, i64 0\), poison, zeroinitializer\)|splat \(i1 true\)}} + %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic +; CHECK: [[EXT0:%.*]] = extractvalue { , } [[A0]], 0 + %val0 = extractvalue { i32, i1 } %old0, 0 +; CHECK: [[EXT1:%.*]] = extractvalue { , } [[A0]], 1 + %success0 = extractvalue { i32, i1 } %old0, 1 + + %out = getelementptr i32, ptr %q, i64 %call +; Stored as a vector +; CHECK: store [[EXT0]], ptr + store i32 %val0, ptr %out, align 4 + +; CHECK: [[PTR:%.*]] = getelementptr i8, ptr %r, i64 %call + %outsuccess = getelementptr i8, ptr %r, i64 %call +; CHECK: [[ZEXT0:%.*]] = zext [[EXT1]] to + %outbyte = zext i1 %success0 to i8 +; Stored as a vector +; CHECK: store [[ZEXT0]], ptr [[PTR]], align 1 + store i8 %outbyte, ptr %outsuccess, align 1 + + ; Test a couple of insert/extract patterns + + ; Test inserting a uniform value into a varying literal struct +; CHECK: [[INS0:%.*]] = insertvalue { , } [[A0]], zeroinitializer, 1 +; CHECK: [[EXT2:%.*]] = extractvalue { , } [[INS0]], 1 +; CHECK: [[ZEXT1:%.*]] = zext [[EXT2]] to +; CHECK: store [[ZEXT1]], ptr [[PTR]], align 1 + %testinsertconst = insertvalue { i32, i1 } %old0, i1 false, 1 + %testextract0 = extractvalue { i32, i1 } %testinsertconst, 1 + %outbyte0 = zext i1 %testextract0 to i8 + store i8 %outbyte0, ptr %outsuccess, align 1 + + ; Test inserting a varying value into a varying literal struct +; CHECK: [[LD:%.*]] = load , ptr +; CHECK: [[VBOOL:%.*]] = trunc [[LD]] to +; CHECK: [[INS1:%.*]] = insertvalue { , } [[A0]], [[VBOOL]], 1 +; CHECK: [[EXT3:%.*]] = extractvalue { , } [[INS1]], 1 +; CHECK: [[ZEXT2:%.*]] = zext [[EXT3]] to +; CHECK: store [[ZEXT2]], ptr [[PTR]], align 1 + %byte1 = load i8, ptr %outsuccess, align 1 + %bool1 = trunc i8 %byte1 to i1 + %testinsertvarying0 = insertvalue { i32, i1 } %old0, i1 %bool1, 1 + %testextract1 = extractvalue { i32, i1 } %testinsertvarying0, 1 + %outbyte1 = zext i1 %testextract1 to i8 + store i8 %outbyte1, ptr %outsuccess, align 1 + + ; Test inserting a varying value into a uniform literal struct +; CHECK: [[INS2:%.*]] = insertvalue { , } poison, [[VBOOL]], 1 +; CHECK: [[EXT4:%.*]] = extractvalue { , } [[INS2]], 1 +; CHECK: [[ZEXT3:%.*]] = zext [[EXT4]] to +; CHECK: store [[ZEXT3]], ptr [[PTR]], align 1 + %testinsertvarying1 = insertvalue { i32, i1 } poison, i1 %bool1, 1 + %testextract2 = extractvalue { i32, i1 } %testinsertvarying1, 1 + %outbyte2 = zext i1 %testextract2 to i8 + store i8 %outbyte2, ptr %outsuccess, align 1 + + ret void +} + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll new file mode 100644 index 0000000000000..315e721aea82d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store.ll @@ -0,0 +1,64 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may 
not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +; Test if the interleaved store is defined correctly +; CHECK: define void @__vecz_b_interleaved_store8_4_u5nxv4du3ptrU3AS1({{( %0)?}}, ptr addrspace(1){{( %1)?}}) +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector %BroadcastAddr.splatinsert, poison, zeroinitializer +; CHECK: %2 = call @llvm.{{(experimental\.)?}}stepvector.nxv4i64() +; CHECK: %3 = mul {{shufflevector \( insertelement \( poison, i64 4, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i64 4\)}}, %2 +; CHECK: %4 = getelementptr double, %BroadcastAddr.splat, %3 +; CHECK: call void @llvm.masked.scatter.nxv4f64.nxv4p1( %0, %4, i32{{( immarg)?}} 8, {{shufflevector \( insertelement \( 
poison, i1 true, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i1 true\)}}) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll new file mode 100644 index 0000000000000..314ee922d7be6 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_interleaved_store_as_masked.ll @@ -0,0 +1,66 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare void @__mux_work_group_barrier(i32, i32, i32) + +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) + +; Test if the interleaved store is defined correctly +; CHECK: define void 
@__vecz_b_interleaved_store8_4_u5nxv4du3ptrU3AS1( %0, ptr addrspace(1) %1) [[ATTRS:#[0-9]+]] { +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector %BroadcastAddr.splatinsert, poison, zeroinitializer +; CHECK: %2 = call @llvm.{{(experimental\.)?}}stepvector.nxv4i64() +; CHECK: %3 = mul {{shufflevector \( insertelement \( poison, i64 4, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i64 4\)}}, %2 +; CHECK: %4 = getelementptr double, %BroadcastAddr.splat, %3 +; CHECK: call void @llvm.masked.scatter.nxv4f64.nxv4p1( %0, %4, i32 immarg 8, {{shufflevector \( insertelement \( poison, i1 true, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i1 true\)}}) +; CHECK: ret void +; CHECK: } + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll new file mode 100644 index 0000000000000..bec291abbf638 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_load.ll @@ -0,0 +1,69 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dont_mask_workitem_builtins -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %conv, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %call2 = call i64 @__mux_get_global_id(i32 0) + %conv3 = trunc i64 %call2 to i32 + %idxprom = sext i32 %conv3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 + %idxprom4 = sext i32 %conv3 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom4 + store i32 %0, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.else: ; preds = %entry + %call8 = call i64 @__mux_get_local_size(i32 0) + %call9 = call i64 @__mux_get_group_id(i32 0) + %mul = mul i64 %call9, %call8 + %add = add i64 %mul, %call + %sext = shl i64 %add, 32 + %idxprom11 = ashr exact i64 %sext, 32 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 42, i32 addrspace(1)* %arrayidx12, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare void @__mux_work_group_barrier(i32, i32, i32) + +declare i64 @__mux_get_local_id(i32) + +declare i64 @__mux_get_global_id(i32) + +declare i64 @__mux_get_local_size(i32) + +declare i64 @__mux_get_group_id(i32) + +; Test if the masked load is defined correctly +; CHECK: define @__vecz_b_masked_load4_u5nxv4ju3ptrU3AS2u5nxv4b(ptr addrspace(2){{( %0)?}}, {{( %1)?}}) +; CHECK: entry: +; CHECK: %2 = call @llvm.masked.load.nxv4i32.p2(ptr addrspace(2) %0, i32{{( immarg)?}} 4, %1, poison) +; CHECK: ret %2 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll new file mode 100644 index 0000000000000..24815c1ae56d1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_masked_scatter_gather.ll @@ -0,0 +1,89 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %rem = urem i64 %call, 3 + %cmp = icmp eq i64 %rem, 0 + br i1 %cmp, label %if.else, label %if.then + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %idxprom = sext i32 %1 to i64 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call + %2 = load i32, i32 addrspace(1)* %arrayidx3, align 4 + %idxprom4 = sext i32 %2 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom4 + store i32 42, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %rem = urem i64 %call, 3 + %cmp = icmp eq i64 %rem, 0 + br i1 %cmp, label %if.else, label %if.then + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a_index, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + store i32 42, i32 addrspace(1)* %arrayidx3, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; Test if the masked scatter store is defined correctly +; CHECK: define void @__vecz_b_masked_scatter_store4_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4b({{( %0)?}}, {{( %1)?}}, {{( %2)?}}) +; CHECK: entry: +; CHECK: call void @llvm.masked.scatter.nxv4i32.nxv4p1( %0, %1, i32{{( immarg)?}} 4, %2) +; CHECK: ret void + +; Test if the masked gather load is defined correctly +; CHECK: define @__vecz_b_masked_gather_load4_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4b({{( %0)?}}, {{( %1)?}}) +; CHECK: entry: +; CHECK: %2 = call @llvm.masked.gather.nxv4i32.nxv4p1( %0, i32{{( immarg)?}} 4, %1, poison) +; CHECK: ret %2 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll new file mode 100644 index 0000000000000..e151d82fa7981 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans.ll @@ -0,0 +1,182 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; 
Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dummy -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { + ; Dummy uses of the builtins, as we don't define any with zero uses. + %a = call @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j( zeroinitializer) + %b = call @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j( zeroinitializer) + %c = call @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f( zeroinitializer) + %d = call @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f( zeroinitializer) + %e = call @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j( zeroinitializer) + %f = call @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j( zeroinitializer) + %g = call @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j( zeroinitializer) + %h = call @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j( zeroinitializer) + %i = call @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j( zeroinitializer) + %j = call @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f( zeroinitializer) + %k = call @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f( zeroinitializer) + %l = call @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f( zeroinitializer) + %m = call @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f( zeroinitializer) + ret void +} + +declare @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j({{.*}}) { +; CHECK: entry: +; CHECK: %[[SHUFFLE_ALLOC:.+]] = alloca +; CHECK: %[[STEP:.+]] = call @llvm.{{(experimental\.)?}}stepvector.nxv4i32() +; CHECK: %[[SCALE:.+]] = call i32 @llvm.vscale.i32() +; CHECK: %[[SIZE:.+]] = mul {{(nuw )?}}i32 %[[SCALE]], 4 +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ] +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ] +; CHECK: %[[MASKPHI:.+]] = phi [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ] +; CHECK: %[[N_INS:.+]] = insertelement poison, i32 %[[IV]], {{i32|i64}} 0 +; CHECK: %[[N_SPLAT:.+]] = shufflevector %[[N_INS]], poison, zeroinitializer +; CHECK: %[[MASK:.+]] = xor %[[MASKPHI]], %[[N_SPLAT]] + +;------- target-dependent dynamic shuffle code: +; CHECK: store %[[VEC]], {{(\*)|(ptr)}} %[[SHUFFLE_ALLOC]] +;------- there will be a bitcast here if pointers are typed +; CHECK: %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, %[[MASK]] +; CHECK: %[[SHUFFLE:.+]] = call @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}( %[[INDEX]], i32 4, {{shufflevector \( insertelement \( poison, i1 true, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i1 true\)}}, poison) + +; CHECK: %[[ACCUM:.+]] = add %[[VEC]], %{{.+}} +; CHECK: %[[BIT:.+]] = and %[[MASKPHI]], %[[N_SPLAT]] +; CHECK: %[[WHICH:.+]] = icmp ne %[[BIT]], zeroinitializer +; CHECK: %[[NEWVEC]] = select %[[WHICH]], %[[ACCUM]], 
%[[VEC]] +; CHECK: %[[NEWMASK]] = or %[[MASK]], %[[N_SPLAT]] +; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1 +; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %[[SIZE]] +; CHECK: br i1 %[[CMP]], label %loop, label %exit +; CHECK: exit: +; CHECK: %[[RESULT:.+]] = phi [ %[[NEWVEC]], %loop ] +; CHECK: ret %[[RESULT]] +; CHECK: } + +declare @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_add_u5nxv4j({{.*}}) { +; CHECK: entry: +; CHECK: %[[SHUFFLE_ALLOC:.+]] = alloca +; CHECK: %[[STEP:.+]] = call @llvm.{{(experimental\.)?}}stepvector.nxv4i32() +; CHECK: %[[SCALE:.+]] = call i32 @llvm.vscale.i32() +; CHECK: %[[SIZE:.+]] = mul {{(nuw )?}}i32 %[[SCALE]], 4 +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ] +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ] +; CHECK: %[[MASKPHI:.+]] = phi [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ] +; CHECK: %[[N_INS:.+]] = insertelement poison, i32 %[[IV]], {{i32|i64}} 0 +; CHECK: %[[N_SPLAT:.+]] = shufflevector %[[N_INS]], poison, zeroinitializer +; CHECK: %[[MASK:.+]] = xor %[[MASKPHI]], %[[N_SPLAT]] + +;------- target-dependent dynamic shuffle code: +; CHECK: store %[[VEC]], {{(\*)|(ptr)}} %[[SHUFFLE_ALLOC]] +;------- there will be a bitcast here if pointers are typed +; CHECK: %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, %[[MASK]] +; CHECK: %[[SHUFFLE:.+]] = call @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}( %[[INDEX]], i32 4, {{shufflevector \( insertelement \( poison, i1 true, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i1 true\)}}, poison) + +; CHECK: %[[ACCUM:.+]] = add %[[VEC]], %{{.+}} +; CHECK: %[[BIT:.+]] = and %[[MASKPHI]], %[[N_SPLAT]] +; CHECK: %[[WHICH:.+]] = icmp ne %[[BIT]], zeroinitializer +; CHECK: %[[NEWVEC]] = select %[[WHICH]], %[[ACCUM]], %[[VEC]] +; CHECK: %[[NEWMASK]] = or %[[MASK]], %[[N_SPLAT]] +; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1 +; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %[[SIZE]] +; CHECK: br i1 %[[CMP]], label %loop, label %exit +; CHECK: exit: +; CHECK: %[[SCAN:.+]] = phi [ %[[NEWVEC]], %loop ] + +;------- target-dependent slide-up code: +; CHECK: %[[SLIDE:.+]] = call @llvm{{(\.experimental)?}}.vector.splice.nxv4i32( poison, %[[SCAN]], i32 -1) +; CHECK: %[[RESULT:.+]] = insertelement %[[SLIDE]], i32 0, {{i32|i64}} 0 + +; CHECK: ret %[[RESULT]] +; CHECK: } + +; We know the generated code is correct for one scan type, +; now verify that all the others use the correct binary operations. 
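+; In outline, each scan builtin above lowers to a log2(VL)-step loop (here
+; VL = vscale * 4): at step n = 1, 2, 4, ... every lane whose index has bit n
+; set reads the last lane of the preceding block through the stack slot and
+; accumulates it. A rough scalar sketch of the checked structure (a sketch
+; only, not extra test input; `op` stands for the scan operation):
+;   for (n = 1; n < VL; n <<= 1)
+;     if (lane & n) vec[lane] = op(vec[lane], vec[(lane | (n - 1)) ^ n]);
+; The exclusive variants then splice the result up by one lane and seed lane 0
+; with the identity, as the vector.splice/insertelement checks above show.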
+ +declare @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = fadd %[[VEC]], %{{.+}} + +declare @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_add_u5nxv4f({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = fadd %[[VEC]], %{{.+}} + +declare @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4j({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.smin.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_smin_u5nxv4j({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.smin.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4j({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.smax.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.umin.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.umax.nxv4i32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.minnum.nxv4f32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.maxnum.nxv4f32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_min_u5nxv4f({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.minnum.nxv4f32( %[[VEC]], %{{.+}}) + +declare @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f() +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_max_u5nxv4f({{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], +; CHECK: %{{.+}} = call @llvm.maxnum.nxv4f32( %[[VEC]], %{{.+}}) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll new file mode 100644 index 0000000000000..cdd9ef6de8e02 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/define_subgroup_scans_vp.ll @@ -0,0 +1,186 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dummy -vecz-scalable -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { + ; Dummy uses of the builtins, as we don't define any with zero uses. + %a = call @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj( zeroinitializer, i32 0) + %b = call @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj( zeroinitializer, i32 0) + %c = call @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj( zeroinitializer, i32 0) + %d = call @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj( zeroinitializer, i32 0) + %e = call @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj( zeroinitializer, i32 0) + %f = call @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj( zeroinitializer, i32 0) + %g = call @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj( zeroinitializer, i32 0) + %h = call @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj( zeroinitializer, i32 0) + %i = call @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj( zeroinitializer, i32 0) + %j = call @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj( zeroinitializer, i32 0) + %k = call @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj( zeroinitializer, i32 0) + %l = call @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj( zeroinitializer, i32 0) + %m = call @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj( zeroinitializer, i32 0) + ret void +} + +declare @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj({{.*}}, i32{{.*}}) { +; CHECK: entry: +; CHECK: %[[SHUFFLE_ALLOC:.+]] = alloca +; CHECK: %[[STEP:.+]] = call @llvm.{{(experimental\.)?}}stepvector.nxv4i32() +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ] +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ] +; CHECK: %[[MASKPHI:.+]] = phi [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ] +; CHECK: %[[N_INS:.+]] = insertelement poison, i32 %[[IV]], {{i32|i64}} 0 +; CHECK: %[[N_SPLAT:.+]] = shufflevector %[[N_INS]], poison, zeroinitializer +; CHECK: %[[MASK:.+]] = xor %[[MASKPHI]], %[[N_SPLAT]] + +;------- target-dependent dynamic shuffle code: +; CHECK: store %[[VEC]], {{(\*)|(ptr)}} %[[SHUFFLE_ALLOC]] +;------- there will be a bitcast here if pointers are typed +; CHECK: %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, %[[MASK]] +; CHECK: %[[VLSTEP:.+]] = call @llvm.{{(experimental\.)?}}stepvector.nxv4i32() +; CHECK: %[[VLINS:.+]] = insertelement poison, i32 %1, {{i32|i64}} 0 +; CHECK: %[[VLSPLAT:.+]] = shufflevector %[[VLINS]], poison, zeroinitializer +; CHECK: %[[VLMASK:.+]] = icmp ult %3, %[[VLSPLAT]] +; CHECK: %[[SHUFFLE:.+]] = call @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}( %[[INDEX]], i32 4, %[[VLMASK]], poison) + +; CHECK: %[[ACCUM:.+]] = add %[[VEC]], %[[SHUFFLE]] +; CHECK: 
%[[BIT:.+]] = and %[[MASKPHI]], %[[N_SPLAT]] +; CHECK: %[[WHICH:.+]] = icmp ne %[[BIT]], zeroinitializer +; CHECK: %[[NEWVEC]] = select %[[WHICH]], %[[ACCUM]], %[[VEC]] +; CHECK: %[[NEWMASK]] = or %[[MASK]], %[[N_SPLAT]] +; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1 +; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %1 +; CHECK: br i1 %[[CMP]], label %loop, label %exit +; CHECK: exit: +; CHECK: %[[RESULT:.+]] = phi [ %[[NEWVEC]], %loop ] +; CHECK: ret %[[RESULT]] +; CHECK: } + +declare @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj(, i32) +; CHECK-LABEL: define @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4jj({{.*}}, i32{{.*}}) { +; CHECK: entry: +; CHECK: %[[SHUFFLE_ALLOC:.+]] = alloca +; CHECK: %[[STEP:.+]] = call @llvm.{{(experimental\.)?}}stepvector.nxv4i32() +; CHECK: br label %loop +; CHECK: loop: +; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ] +; CHECK: %[[VEC:.+]] = phi [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ] +; CHECK: %[[MASKPHI:.+]] = phi [ %[[STEP]], %entry ], [ %[[NEWMASK:.+]], %loop ] +; CHECK: %[[N_INS:.+]] = insertelement poison, i32 %[[IV]], {{i32|i64}} 0 +; CHECK: %[[N_SPLAT:.+]] = shufflevector %[[N_INS]], poison, zeroinitializer +; CHECK: %[[MASK:.+]] = xor %[[MASKPHI]], %[[N_SPLAT]] + +;------- target-dependent dynamic shuffle code: +; CHECK: store %[[VEC]], {{(\*)|(ptr)}} %[[SHUFFLE_ALLOC]] +;------- there will be a bitcast here if pointers are typed +; CHECK: %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, %[[MASK]] +; CHECK: %[[VLSTEP:.+]] = call @llvm.{{(experimental\.)?}}stepvector.nxv4i32() +; CHECK: %[[VLINS:.+]] = insertelement poison, i32 %1, {{i32|i64}} 0 +; CHECK: %[[VLSPLAT:.+]] = shufflevector %[[VLINS]], poison, zeroinitializer +; CHECK: %[[VLMASK:.+]] = icmp ult %3, %[[VLSPLAT]] +; CHECK: %[[SHUFFLE:.+]] = call @llvm.masked.gather.nxv4i32.nxv4p0{{(i32)?}}( %[[INDEX]], i32 4, %[[VLMASK]], poison) + +; CHECK: %[[ACCUM:.+]] = add %[[VEC]], %{{.+}} +; CHECK: %[[BIT:.+]] = and %[[MASKPHI]], %[[N_SPLAT]] +; CHECK: %[[WHICH:.+]] = icmp ne %[[BIT]], zeroinitializer +; CHECK: %[[NEWVEC]] = select %[[WHICH]], %[[ACCUM]], %[[VEC]] +; CHECK: %[[NEWMASK]] = or %[[MASK]], %[[N_SPLAT]] +; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1 +; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %1 +; CHECK: br i1 %[[CMP]], label %loop, label %exit +; CHECK: exit: +; CHECK: %[[SCAN:.+]] = phi [ %[[NEWVEC]], %loop ] + +;------- target-dependent slide-up code: +; CHECK: %[[SLIDE:.+]] = call @llvm{{(\.experimental)?}}.vector.splice.nxv4i32( poison, %[[SCAN]], i32 -1) +; CHECK: %[[RESULT:.+]] = insertelement %[[SLIDE]], i32 0, {{i32|i64}} 0 + +; CHECK: ret %[[RESULT]] +; CHECK: } + +; We know the generated code is correct for one scan type, +; now verify that all the others use the correct binary operations. 
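+; The VP scans check the same loop structure as the non-VP versions; the
+; difference verified here is that the trip count and the gather mask are
+; derived from the i32 vector-length argument %1 rather than from vscale * 4,
+; roughly (a sketch, not extra test input):
+;   for (n = 1; n < %1; n <<= 1) ... gather masked by (step < %1) ...
+; so the in-loop shuffles never read lanes at or beyond the active length.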
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = fadd <vscale x 4 x float> %[[VEC]], %{{.+}}
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj(<vscale x 4 x i32>, i32)
+; CHECK-LABEL: define <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj(<vscale x 4 x i32>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x i32> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> %[[VEC]], <vscale x 4 x i32> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
+
+declare <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj(<vscale x 4 x float>, i32)
+; CHECK-LABEL: define <vscale x 4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_u5nxv4fj(<vscale x 4 x float>{{.*}}, i32{{.*}})
+; CHECK: loop:
+; CHECK: %[[VEC:.+]] = phi <vscale x 4 x float> [ %0, %entry ],
+; CHECK: %{{.+}} = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> %[[VEC]], <vscale x 4 x float> %{{.+}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
new file mode 100644
index 0000000000000..28785e54202a0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/extract_element.ll
@@ -0,0 +1,144 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k extract_element -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE
+; RUN: veczc -k extract_element_uniform -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-UNI
+; RUN: veczc -k extract_element_uniform_vec -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-UNI-VEC
+; RUN: veczc -k extract_element_varying_indices -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-INDICES
+; RUN: veczc -k extract_element_bool -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=EE-BOOL
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @extract_element(<4 x float> addrspace(1)* nocapture readonly %in, i32 %idx, float addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+ %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+ %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+ %2 = extractelement <4 x float> %1, i32 %idx
+ %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %2, float addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+define spir_kernel void @extract_element_uniform(<4 x float> %in, i32 %idx, float addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %0 = extractelement <4 x float> %in, i32 %idx
+ %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %0, float addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+define spir_kernel void @extract_element_uniform_vec(<4 x float> %in, float addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %i = urem i64 %call, 4
+ %0 = extractelement <4 x float> %in, i64 %i
+ %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %0, float addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+define spir_kernel void @extract_element_varying_indices(<4 x float> addrspace(1)* %in, i32 addrspace(1)* %idxs, float addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0)
+ %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call
+ %idx = load i32, i32 addrspace(1)* %arrayidxidx
+ %i = urem i32 %idx, 4
+ %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+ %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx
+ %1 = extractelement <4 x float> %0, i32 %i
+ %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %1, float addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+define spir_kernel void @extract_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %idx, i32 addrspace(1)* nocapture %out, <4 x i32> addrspace(1)* nocapture %out2) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call
+ %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call
+ %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4
+ %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxb, align 4
+ %2 = icmp slt <4 x i32> %0, %1
+ %i = urem i64 %call, 4
+ %3 = extractelement <4 x i1> %2, i64 %i
+ %4 = sext i1 %3 to i32
+ %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+ store i32 %4, i32 addrspace(1)* %arrayidx3, align 4
+ %5 = sext <4 x i1> %2 to <4 x i32>
+ %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out2, i64 %call
+ store <4 x i32> %5, <4 x i32> addrspace(1)* %arrayidx4, align 4
+ ret void
+}
+
+
+; EE-LABEL: @__vecz_nxv4_extract_element(
+; EE: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
+; EE: store {{.*}}, ptr [[ALLOC]], align 64
+; EE: [[IDX:%.*]] = sext i32 %idx to i64
+; EE: [[ADDR:%.*]] = getelementptr float, ptr [[ALLOC]], i64 [[IDX]]
+; EE: [[GATHER:%.*]] = call <vscale x 4 x float> @__vecz_b_interleaved_load4_4_u5nxv4fu3ptr(ptr [[ADDR]])
+
+; Both the vector and index are uniform, so check we're not unnecessarily packetizing
+
+; EE-UNI-LABEL: @__vecz_nxv4_extract_element_uniform(
+; EE-UNI: [[T0:%.*]] = extractelement <4 x float> %in, i32 %idx
+; EE-UNI: [[T1:%.*]] = insertelement <vscale x 4 x float> poison, float [[T0]], {{(i32|i64)}} 0
+; EE-UNI: [[T2:%.*]] = shufflevector <vscale x 4 x float> [[T1]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; EE-UNI: store <vscale x 4 x float> [[T2]], ptr addrspace(1) {{%.*}}, align 4
+
+; The vector is uniform and the index is varying, so we must broadcast the vector
+; FIXME: Do we really need to broadcast? Can we mod the indices with the original vector length?
+
+; EE-UNI-VEC-LABEL: @__vecz_nxv4_extract_element_uniform_vec(
+; EE-UNI-VEC: [[T3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 %call, {{(i32|i64)}} 0
+; EE-UNI-VEC: [[T4:%.*]] = shufflevector <vscale x 4 x i64> [[T3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; EE-UNI-VEC: [[STEP:%.*]] = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
+; EE-UNI-VEC: [[T5:%.*]] = add <vscale x 4 x i64> [[T4]], [[STEP]]
+; EE-UNI-VEC: [[MOD:%.*]] = and <vscale x 4 x i64> [[T5]], {{shufflevector \(<vscale x 4 x i64> insertelement \(<vscale x 4 x i64> poison, i64 3, (i32|i64) 0\), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i64 3\)}}
+; EE-UNI-VEC: [[T6:%.*]] = shl <vscale x 4 x i64> [[STEP]], {{shufflevector \(<vscale x 4 x i64> insertelement \(<vscale x 4 x i64> poison, i64 2, (i32|i64) 0\), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i64 2\)}}
+
+; LLVM 16 deduces add/or equivalence and uses `or` instead.
+; EE-UNI-VEC: [[T7:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i64> [[T6]], [[MOD]]
+
+; EE-UNI-VEC: [[T8:%.*]] = getelementptr float, ptr {{%.*}}, <vscale x 4 x i64> [[T7]]
+; EE-UNI-VEC: [[T9:%.*]] = call <vscale x 4 x float> @__vecz_b_gather_load4_u5nxv4fu9nxv4u3ptr(<vscale x 4 x ptr> [[T8]])
+; EE-UNI-VEC: store <vscale x 4 x float> [[T9]], ptr addrspace(1) {{%.*}}, align 4
+
+; EE-INDICES-LABEL: @__vecz_nxv4_extract_element_varying_indices(
+; EE-INDICES: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
+; EE-INDICES: [[T0:%.*]] = getelementptr i32, ptr addrspace(1) %idxs, i64 %call
+; EE-INDICES: [[T2:%.*]] = load <vscale x 4 x i32>, ptr addrspace(1) [[T0]], align 4
+; EE-INDICES: [[T3:%.*]] = and <vscale x 4 x i32> [[T2]], {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> poison, i32 3, (i32|i64) 0\), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i32 3\)}}
+; EE-INDICES: store {{.*}}, ptr [[ALLOC]], align 64
+; EE-INDICES: [[STEP:%.*]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
+; EE-INDICES: [[T4:%.*]] = shl <vscale x 4 x i32> [[STEP]], {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> poison, i32 2, (i32|i64) 0\), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i32 2\)}}
+; EE-INDICES: [[T5:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i32> [[T4]], [[T3]]
+; EE-INDICES: [[IDX:%.*]] = sext <vscale x 4 x i32> [[T5]] to <vscale x 4 x i64>
+; EE-INDICES: [[ADDR:%.*]] = getelementptr float, ptr [[ALLOC]], <vscale x 4 x i64> [[IDX]]
+; EE-INDICES: [[GATHER:%.*]] = call <vscale x 4 x float> @__vecz_b_gather_load4_u5nxv4fu9nxv4u3ptr(<vscale x 4 x ptr> [[ADDR]])
+
+; Check we promote from i1 to i8 before doing our memops
+; EE-BOOL-LABEL: @__vecz_nxv4_extract_element_bool(
+; EE-BOOL: [[T0:%.*]] = sext <vscale x 16 x i1> {{%.*}} to <vscale x 16 x i8>
+; EE-BOOL: store {{.*}}
+; EE-BOOL: [[T1:%.*]] = call <vscale x 4 x i8> @__vecz_b_gather_load1_u5nxv4hu9nxv4u3ptr(<vscale x 4 x ptr> {{%.*}}
+; EE-BOOL: [[T2:%.*]] = trunc <vscale x 4 x i8> [[T1]] to <vscale x 4 x i1>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
new file mode 100644
index 0000000000000..023a617b6e2bc
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fadd.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k fadd -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @fadd(float* %aptr, float* %bptr, float* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds float, float* %bptr, i64 %idx
+ %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
+ %a = load float, float* %arrayidxa, align 4
+ %b = load float, float* %arrayidxb, align 4
+ %sum = fadd float %a, %b
+ store float %sum, float* %arrayidxz, align 4
+ ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_fadd
+; CHECK: load <vscale x 4 x float>, ptr
+; CHECK: load <vscale x 4 x float>, ptr
+; CHECK: fadd <vscale x 4 x float>
+; CHECK: store <vscale x 4 x float>
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
new file mode 100644
index 0000000000000..9528dd86c8a77
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/fail_builtins.ll
@@ -0,0 +1,37 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: not veczc -k fail_builtins -vecz-scalable -vecz-simd-width=4 -S < %s 2>&1 | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @fail_builtins(float* %aptr, float* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
+ %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
+ %a = load float, float* %arrayidxa, align 4
+ %math = call spir_func float @_Z4tanff(float %a)
+ store float %math, float* %arrayidxz, align 4
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+declare spir_func float @_Z4tanff(float)
+
+; We can't scalarize this builtin call
+; CHECK: Error: Failed to vectorize function 'fail_builtins'
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
new file mode 100644
index 0000000000000..107603f898c7e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/insert_element.ll
@@ -0,0 +1,120 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k insert_element -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE
+; RUN: veczc -k insert_element_uniform -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-UNI
+; RUN: veczc -k insert_element_varying_indices -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-INDICES
+; RUN: veczc -k insert_element_bool -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix=IE-BOOL
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @insert_element(<4 x float> addrspace(1)* nocapture readonly %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+ %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+ %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+ %2 = insertelement <4 x float> %1, float %val, i32 %idx
+ %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+ store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+define spir_kernel void @insert_element_uniform(<4 x float> %in, float %val, i32 %idx, <4 x float> addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %0 = insertelement <4 x float> %in, float %val, i32 %idx
+ %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+ store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+define spir_kernel void @insert_element_varying_indices(<4 x float> addrspace(1)* nocapture readonly %in, i32 addrspace(1)* %idxs, <4 x float> addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %arrayidxidx = getelementptr inbounds i32, i32 addrspace(1)* %idxs, i64 %call
+ %idx = load i32, i32 addrspace(1)* %arrayidxidx
+ %i = urem i32 %idx, 4
+ %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call
+ %0 = bitcast <4 x float> addrspace(1)* %arrayidx to <4 x float> addrspace(1)*
+ %1 = load <4 x float>, <4 x float> addrspace(1)* %0, align 16
+ %fidx = uitofp i64 %call to float
+ %2 = insertelement <4 x float> %1, float %fidx, i32 %i
+ %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+ store <4 x float> %2, <4 x float> addrspace(1)* %arrayidx3, align 4
+ ret void
+}
+
+define spir_kernel void @insert_element_bool(<4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b, i32 %val, i32 %idx, <4 x i32> addrspace(1)* nocapture %out) {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #6
+ %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %a, i64 %call
+ %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %b, i64 %call
+ %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxa, align 4
+ %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidxb, align 4
+ %2 = icmp slt <4 x i32> %0, %1
+ %i = urem i64 %call, 4
+ %v = trunc i32 %val to i1
+ %3 = insertelement <4 x i1> %2, i1 %v, i64 %i
+ %4 = sext <4 x i1> %3 to <4 x i32>
+ %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call
+ store <4 x i32> %4, <4 x i32> addrspace(1)* %arrayidx4, align 4
+ ret void
+}
+
+; IE-LABEL: @__vecz_nxv4_insert_element(
+; IE: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
+; IE: [[VAL0:%.*]] = insertelement <vscale x 4 x float> poison, float %val, {{(i32|i64)}} 0
+; IE: [[VAL1:%.*]] = shufflevector <vscale x 4 x float> [[VAL0]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; IE: store {{.*}}, ptr [[ALLOC]], align 64
+; IE: [[IDX:%.*]] = sext i32 %idx to i64
+; IE: [[ADDR:%.*]] = getelementptr float, ptr [[ALLOC]], i64 [[IDX]]
+; IE: call void @__vecz_b_interleaved_store4_4_u5nxv4fu3ptr(<vscale x 4 x float> [[VAL1]], ptr [[ADDR]])
+; IE: = load <vscale x 16 x float>, ptr [[ALLOC]], align 64
+
+; Both the vector and index are uniform, so check we're not unnecessarily packetizing
+
+; IE-UNI-LABEL: @__vecz_nxv4_insert_element_uniform(
+; IE-UNI: {{%.*}} = insertelement <4 x float> %in, float %val, {{(i32|i64)}} %idx
+
+; IE-INDICES-LABEL: @__vecz_nxv4_insert_element_varying_indices(
+; IE-INDICES: [[ALLOC:%.*]] = alloca <vscale x 16 x float>, align 64
+; IE-INDICES: [[VAL:%.*]] = uitofp <vscale x 4 x i64> {{%.*}} to <vscale x 4 x float>
+; IE-INDICES: store <vscale x 16 x float> {{%.*}}, ptr [[ALLOC]], align 64
+; IE-INDICES: [[T1:%.*]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
+; IE-INDICES: [[T2:%.*]] = shl <vscale x 4 x i32> [[T1]], {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> poison, i32 2, (i32|i64) 0\), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i32 2\)}}
+
+; LLVM 16 deduces add/or equivalence and uses `or` instead.
+; IE-INDICES: [[T3:%.*]] = {{add|or}} {{(disjoint )?}}<vscale x 4 x i32> [[T2]], {{%.*}}
+
+; IE-INDICES: [[T4:%.*]] = sext <vscale x 4 x i32> [[T3]] to <vscale x 4 x i64>
+; IE-INDICES: [[ADDR:%.*]] = getelementptr float, ptr %0, <vscale x 4 x i64> [[T4]]
+; IE-INDICES: call void @__vecz_b_scatter_store4_u5nxv4fu9nxv4u3ptr(<vscale x 4 x float> [[VAL]], <vscale x 4 x ptr> [[ADDR]])
+; IE-INDICES: = load <vscale x 16 x float>, ptr [[ALLOC]], align 64
+
+; Check we promote from i1 to i8 before doing our memops
+; IE-BOOL-LABEL: @__vecz_nxv4_insert_element_bool(
+; IE-BOOL: [[ALLOC:%.*]] = alloca <vscale x 16 x i8>, align 16
+; IE-BOOL-DAG: [[T0:%.*]] = sext <vscale x 4 x i1> {{%.*}} to <vscale x 4 x i8>
+; IE-BOOL-DAG: [[T1:%.*]] = sext <vscale x 16 x i1> {{%.*}} to <vscale x 16 x i8>
+; IE-BOOL: store <vscale x 16 x i8> [[T1]], ptr [[ALLOC]], align 16
+; IE-BOOL: call void @__vecz_b_scatter_store1_u5nxv4hu9nxv4u3ptr(<vscale x 4 x i8> [[T0]], <vscale x 4 x ptr> {{%.*}})
+; IE-BOOL: [[T2:%.*]] = load <vscale x 16 x i8>, ptr [[ALLOC]], align 16
+; IE-BOOL: [[T3:%.*]] = trunc <vscale x 16 x i8> [[T2]] to <vscale x 16 x i1>
+
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
new file mode 100644
index 0000000000000..212adee0fff9f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/interleaved_load.ll
@@ -0,0 +1,60 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k load_interleaved -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @load_interleaved(i32 addrspace(1)* nocapture readonly %input, i32 addrspace(1)* nocapture %output, i32 %stride) local_unnamed_addr {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0) #2
+ %0 = trunc i64 %call to i32
+ %conv1 = mul i32 %0, %stride
+ %idxprom = sext i32 %conv1 to i64
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %idxprom
+ %1 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %idxprom
+ store i32 %1, i32 addrspace(1)* %arrayidx3, align 4
+ %add = add nsw i32 %conv1, 1
+ %idxprom4 = sext i32 %add to i64
+ %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %idxprom4
+ store i32 1, i32 addrspace(1)* %arrayidx5, align 4
+ %add6 = add nsw i32 %conv1, 2
+ %idxprom7 = sext i32 %add6 to i64
+ %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %idxprom7
+ store i32 1, i32 addrspace(1)* %arrayidx8, align 4
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK: define void @__vecz_b_interleaved_store4_V_u5nxv4ju3ptrU3AS1(<vscale x 4 x i32> [[ARG0:%.*]], ptr addrspace(1) [[ARG1:%.*]], i64 [[ARG2:%.*]]) [[ATTRS:#[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <vscale x 4 x ptr addrspace(1)> poison, ptr addrspace(1) [[ARG1]], {{i32|i64}} 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <vscale x 4 x ptr addrspace(1)> [[TMP0]], <vscale x 4 x ptr addrspace(1)> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[ARG2]], {{i32|i64}} 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <vscale x 4 x i64> [[TMP2]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul <vscale x 4 x i64> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <vscale x 4 x ptr addrspace(1)> [[TMP1]], <vscale x 4 x i64> [[TMP5]]
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> [[ARG0]], <vscale x 4 x ptr addrspace(1)> [[TMP6]], i32 immarg 4, <vscale x 4 x i1> {{shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\)}}) [[MASKED_ATTRS:#[0-9]+]]
+; CHECK-NEXT: ret void
+; CHECK-NEXT: }
+
+; CHECK-DAG: attributes [[ATTRS]] = { norecurse nounwind }
+; CHECK-DAG: attributes [[MASKED_ATTRS]] = {
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
new file mode 100644
index 0000000000000..021b103ac4e56
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/intrinsics.ll
@@ -0,0 +1,195 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k ctpop -vecz-scalable -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix CTPOP
+; RUN: veczc -k ctlz -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s --check-prefix CTLZ
+; RUN: veczc -k cttz -vecz-scalable -vecz-simd-width=8 -S < %s | FileCheck %s --check-prefix CTTZ
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @ctpop(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+ %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+ %ctpopi32 = call i32 @llvm.ctpop.i32(i32 %a)
+ %ctpopv2i8 = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %b)
+ store i32 %ctpopi32, i32* %arrayidxy, align 4
+ store <2 x i8> %ctpopv2i8, <2 x i8>* %arrayidxz, align 2
+ ret void
+}
+
+define spir_kernel void @ctlz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+ %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+ %ctlzi32 = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
+ %ctlzv2i8 = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %b, i1 false)
+ store i32 %ctlzi32, i32* %arrayidxy, align 4
+ store <2 x i8> %ctlzv2i8, <2 x i8>* %arrayidxz, align 2
+ ret void
+}
+
+define spir_kernel void @cttz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+ %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+ %cttzi32 = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+ %cttzv2i8 = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %b, i1 false)
+ store i32 %cttzi32, i32* %arrayidxy, align 4
+ store <2 x i8> %cttzv2i8, <2 x i8>* %arrayidxz, align 2
+ ret void
+}
+
+define spir_kernel void @sadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %y = load i32, i32* %arrayidxy, align 4
+ %v_i32 = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %y)
+ %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+ %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+ %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+ %v_v2i8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+ store i32 %v_i32, i32* %arrayidxy, align 4
+ store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+ ret void
+}
+
+define spir_kernel void @uadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %y = load i32, i32* %arrayidxy, align 4
+ %v_i32 = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %y)
+ %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+ %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+ %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+ %v_v2i8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+ store i32 %v_i32, i32* %arrayidxy, align 4
+ store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+ ret void
+}
+
+define spir_kernel void @ssub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %y = load i32, i32* %arrayidxy, align 4
+ %v_i32 = call i32 @llvm.ssub.sat.i32(i32 %a, i32 %y)
+ %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+ %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+ %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+ %v_v2i8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+ store i32 %v_i32, i32* %arrayidxy, align 4
+ store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+ ret void
+}
+
+define spir_kernel void @usub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %y = load i32, i32* %arrayidxy, align 4
+ %v_i32 = call i32 @llvm.usub.sat.i32(i32 %a, i32 %y)
+ %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx
+ %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2
+ %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2
+ %v_v2i8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %b, <2 x i8> %z)
+ store i32 %v_i32, i32* %arrayidxy, align 4
+ store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2
+ ret void
+}
+
+declare i32 @llvm.ctpop.i32(i32)
+declare <2 x i8> @llvm.ctpop.v2i8(<2 x i8>)
+
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1)
+
+declare i32 @llvm.cttz.i32(i32, i1)
+declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1)
+
+declare i32 @llvm.sadd.sat.i32(i32, i32)
+declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i32 @llvm.uadd.sat.i32(i32, i32)
+declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i32 @llvm.ssub.sat.i32(i32, i32)
+declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i32 @llvm.usub.sat.i32(i32, i32)
+declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>)
+
+declare i64 @__mux_get_global_id(i32)
+
+; CTPOP: void @__vecz_nxv2_ctpop
+; CTPOP: = call {{.*}} @llvm.ctpop.nxv2i32(<vscale x 2 x i32> %{{.*}})
+; CTPOP: = call {{.*}} @llvm.ctpop.nxv4i8(<vscale x 4 x i8> %{{.*}})
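+; Note on the widths in these checks: at a scalable factor of nxv2 a scalar
+; i32 operand packetizes to <vscale x 2 x i32>, while a <2 x i8> operand is
+; widened by the same factor to <vscale x 4 x i8> (two i8 elements per
+; work-item); the nxv4 and nxv8 runs below scale the element counts likewise.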
+
+; CTLZ: void @__vecz_nxv4_ctlz
+; ... but it does widen ctlz
+; CTLZ: = call {{.*}} @llvm.ctlz.nxv4i32(<vscale x 4 x i32> %{{.*}}, i1 false)
+; CTLZ: = call {{.*}} @llvm.ctlz.nxv8i8(<vscale x 8 x i8> %{{.*}}, i1 false)
+
+; CTTZ: void @__vecz_nxv8_cttz
+; ... and cttz
+; CTTZ: = call {{.*}} @llvm.cttz.nxv8i32(<vscale x 8 x i32> %{{.*}}, i1 false)
+; CTTZ: = call {{.*}} @llvm.cttz.nxv16i8(<vscale x 16 x i8> %{{.*}}, i1 false)
+
+; SADD_SAT: void @__vecz_nxv2_sadd_sat
+; SADD_SAT: = call <vscale x 2 x i32> @llvm.sadd.sat.nxv2i32(
+; SADD_SAT: = call <vscale x 4 x i8> @llvm.sadd.sat.nxv4i8(
+
+; UADD_SAT: void @__vecz_nxv2_uadd_sat
+; UADD_SAT: = call <vscale x 2 x i32> @llvm.uadd.sat.nxv2i32(
+; UADD_SAT: = call <vscale x 4 x i8> @llvm.uadd.sat.nxv4i8(
+
+; SSUB_SAT: void @__vecz_nxv2_ssub_sat
+; SSUB_SAT: = call <vscale x 2 x i32> @llvm.ssub.sat.nxv2i32(
+; SSUB_SAT: = call <vscale x 4 x i8> @llvm.ssub.sat.nxv4i8(
+
+; USUB_SAT: void @__vecz_nxv2_usub_sat
+; USUB_SAT: = call <vscale x 2 x i32> @llvm.usub.sat.nxv2i32(
+; USUB_SAT: = call <vscale x 4 x i8> @llvm.usub.sat.nxv4i8(
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
new file mode 100644
index 0000000000000..d44cbf1bf4a12
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_add_store.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k load_add_store -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load_add_store(i32* %aptr, i32* %bptr, i32* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
+ %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load i32, i32* %arrayidxb, align 4
+ %sum = add i32 %a, %b
+ store i32 %sum, i32* %arrayidxz, align 4
+ ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_load_add_store
+; CHECK: [[lhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
+; CHECK: [[rhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
+; CHECK: [[sum:%[0-9a-z]+]] = add <vscale x 4 x i32> [[lhs]], [[rhs]]
+; CHECK: store <vscale x 4 x i32> [[sum]],
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
new file mode 100644
index 0000000000000..a3026450fd767
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/load_binops_store.ll
@@ -0,0 +1,47 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k load_binops_store -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @load_binops_store(i32* %aptr, i32* %bptr, i32* %cptr, i32* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
+ %arrayidxc = getelementptr inbounds i32, i32* %cptr, i64 %idx
+ %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load i32, i32* %arrayidxb, align 4
+ %c = load i32, i32* %arrayidxc, align 4
+ %sum = add i32 %a, %b
+ %mpy = mul i32 %sum, %c
+ %shf = ashr i32 %mpy, 3
+ %dvu = udiv i32 %shf, %sum
+ store i32 %dvu, i32* %arrayidxz, align 4
+ ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_load_binops_store
+; CHECK: load <vscale x 4 x i32>, ptr
+; CHECK: load <vscale x 4 x i32>, ptr
+; CHECK: add <vscale x 4 x i32>
+; CHECK: mul <vscale x 4 x i32>
+; CHECK: ashr <vscale x 4 x i32>
+; CHECK: store <vscale x 4 x i32>
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
new file mode 100644
index 0000000000000..e97fd6da75ffe
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/metadata.ll
@@ -0,0 +1,41 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test -vecz-scalable -vecz-simd-width=8 -S < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @test(i32 addrspace(1)* %in) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %load = load i32, i32 addrspace(1)* %in
+ %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idx
+ store i32 %load, i32 addrspace(1)* %slot
+ ret void
+}
+
+; CHECK: define spir_kernel void @test(ptr addrspace(1) %in) !codeplay_ca_vecz.base !0
+; CHECK: define spir_kernel void @__vecz_nxv8_test(ptr addrspace(1) %in) #0 !codeplay_ca_vecz.derived !2
+
+; CHECK: attributes #0 = { "mux-base-fn-name"="__vecz_nxv8_test" }
+
+; CHECK: !0 = !{!1, ptr @__vecz_nxv8_test}
+
+; CHECK: !1 = !{i32 8, i32 1, i32 0, i32 0}
+; CHECK: !2 = !{!1, ptr @test}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
new file mode 100644
index 0000000000000..fbadfebf05d4f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/packetize_mask_varying.ll
@@ -0,0 +1,55 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k mask_varying -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; A kernel which should produce a uniform masked vector load where the mask is
+; a single varying splatted bit.
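+; Since an i1 mask cannot be loaded or stored directly, the CHECK lines below
+; expect the packetizer to extend it to i8 around the memory operation and
+; truncate back to i1 before feeding the masked store.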
+define spir_kernel void @mask_varying(<4 x i32>* %aptr, <4 x i32>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %mod_idx = urem i64 %idx, 2
+ %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx
+ %ins = insertelement <4 x i1> poison, i1 true, i32 0
+ %cmp = icmp slt i64 %idx, 64
+ br i1 %cmp, label %if.then, label %if.end
+if.then:
+ %v = load <4 x i32>, <4 x i32>* %aptr
+ %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+ store <4 x i32> %v, <4 x i32>* %arrayidxz, align 16
+ br label %if.end
+if.end:
+ ret void
+; CHECK: define spir_kernel void @__vecz_nxv4_mask_varying
+; CHECK: [[idx0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
+; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 2, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2\)}}
+
+; Note that since we just did a lshr 2 on the input of the extend, it doesn't
+; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
+; CHECK: [[idx2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[idx1]] to <vscale x 16 x i64>
+
+; CHECK: [[t1:%.*]] = getelementptr i8, ptr {{.*}}, <vscale x 16 x i64> [[idx2]]
+; CHECK: [[t2:%.*]] = call <vscale x 16 x i8> @llvm.masked.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> [[t1]],
+; CHECK: [[splat:%.*]] = trunc <vscale x 16 x i8> [[t2]] to <vscale x 16 x i1>
+; CHECK: call void @__vecz_b_masked_store16_u6nxv16ju3ptru6nxv16b(<vscale x 16 x i32> {{.*}}, ptr %arrayidxz, <vscale x 16 x i1> [[splat]])
+
+}
+
+declare i64 @__mux_get_global_id(i32)
+declare <4 x i32> @__vecz_b_masked_load4_Dv4_jPDv4_jDv4_b(<4 x i32>*, <4 x i1>)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
new file mode 100644
index 0000000000000..c6e25c5f327e1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/scalable_auto.ll
@@ -0,0 +1,37 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k cast -vecz-scalable -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @cast(i32* %aptr, float* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %c = sitofp i32 %a to float
+ store float %c, float* %arrayidxz, align 4
+ ret void
+}
+
+; Check that passing -vecz-scalable with no width automatically chooses an
+; appropriate scalable vectorization factor.
+; CHECK: define spir_kernel void @__vecz_nxv[[VF:[0-9]+]]_cast
+; CHECK: sitofp <vscale x [[VF]] x i32> {{%[0-9]+}} to <vscale x [[VF]] x float>
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
new file mode 100644
index 0000000000000..55d888bf1b4b8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select.ll
@@ -0,0 +1,67 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @select_scalar_scalar(i32* %aptr, i32* %bptr, i32* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
+ %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load i32, i32* %arrayidxb, align 4
+ %cmp = icmp slt i32 %a, %b
+ %sel = select i1 %cmp, i32 %b, i32 4
+ store i32 %sel, i32* %arrayidxz, align 4
+ ret void
+}
+
+define spir_kernel void @select_vector_vector(<2 x i32>* %aptr, <2 x i32>* %bptr, <2 x i32>* %cptr, <2 x i32>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds <2 x i32>, <2 x i32>* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds <2 x i32>, <2 x i32>* %bptr, i64 %idx
+ %arrayidxc = getelementptr inbounds <2 x i32>, <2 x i32>* %cptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i32>, <2 x i32>* %zptr, i64 %idx
+ %a = load <2 x i32>, <2 x i32>* %arrayidxa, align 4
+ %b = load <2 x i32>, <2 x i32>* %arrayidxb, align 4
+ %c = load <2 x i32>, <2 x i32>* %arrayidxc, align 4
+ %cmp = icmp slt <2 x i32> %a, %b
+ %sel = select <2 x i1> %cmp, <2 x i32> %c, <2 x i32> <i32 4, i32 4>
+ store <2 x i32> %sel, <2 x i32>* %arrayidxz, align 4
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK: define spir_kernel void @__vecz_nxv4_select_scalar_scalar
+; CHECK: [[lhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
+; CHECK: [[rhs:%[0-9a-z]+]] = load <vscale x 4 x i32>, ptr
+; CHECK: [[cmp:%[0-9a-z]+]] = icmp slt <vscale x 4 x i32> [[lhs]], [[rhs]]
+; CHECK: [[sel:%[0-9a-z]+]] = select <vscale x 4 x i1> [[cmp]], <vscale x 4 x i32> [[rhs]], <vscale x 4 x i32> {{shufflevector \(<vscale x 4 x i32> insertelement \(<vscale x 4 x i32> poison, i32 4, (i32|i64) 0\), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i32 4\)}}
+; CHECK: store <vscale x 4 x i32> [[sel]],
+
+; CHECK: define spir_kernel void @__vecz_nxv4_select_vector_vector
+; CHECK: [[x:%[0-9a-z]+]] = load <vscale x 8 x i32>, ptr
+; CHECK: [[y:%[0-9a-z]+]] = load <vscale x 8 x i32>, ptr
+; CHECK: [[z:%[0-9a-z]+]] = load <vscale x 8 x i32>, ptr
+; CHECK: [[cmp:%[0-9a-z]+]] = icmp slt <vscale x 8 x i32> [[x]], [[y]]
+; CHECK: [[sel:%[0-9a-z]+]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[z]], <vscale x 8 x i32> {{shufflevector \(<vscale x 8 x i32> insertelement \(<vscale x 8 x i32> poison, i32 4, (i32|i64) 0\), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer\)|splat \(i32 4\)}}
+; CHECK: store <vscale x 8 x i32> [[sel]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
new file mode 100644
index 0000000000000..501f4245ec090
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/select_scalar_vector.ll
@@ -0,0 +1,56 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k select_scalar_vector -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @select_scalar_vector(i32* %aptr, i32* %bptr, <2 x i32>* %cptr, <2 x i32>* %zptr) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx
+ %arrayidxc = getelementptr inbounds <2 x i32>, <2 x i32>* %cptr, i64 %idx
+ %arrayidxz = getelementptr inbounds <2 x i32>, <2 x i32>* %zptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load i32, i32* %arrayidxb, align 4
+ %c = load <2 x i32>, <2 x i32>* %arrayidxc, align 4
+ %cmp = icmp slt i32 %a, %b
+ %sel = select i1 %cmp, <2 x i32> %c, <2 x i32> <i32 4, i32 4>
+ store <2 x i32> %sel, <2 x i32>* %arrayidxz, align 4
+ ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_select_scalar_vector
+; CHECK: [[rhs:%.*]] = load <vscale x 8 x i32>, ptr
+; CHECK: [[cmp1:%.*]] = icmp slt <vscale x 4 x i32>
+; CHECK: [[sext:%.*]] = sext <vscale x 4 x i1> [[cmp1]] to <vscale x 4 x i8>
+; CHECK: store <vscale x 4 x i8> [[sext]], ptr [[alloc:%.*]], align 4
+; CHECK: [[idx0:%.*]] = call <vscale x 8 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv8i32()
+; CHECK: [[idx1:%.*]] = lshr <vscale x 8 x i32> [[idx0]], {{shufflevector \(<vscale x 8 x i32> insertelement \(<vscale x 8 x i32> poison, i32 1, (i32|i64) 0\), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer\)|splat \(i32 1\)}}
+
+; Note that since we just did a lshr 1 on the input of the extend, it doesn't
+; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
+; CHECK: [[sext2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 8 x i32> [[idx1]] to <vscale x 8 x i64>
+
+; CHECK: [[addrs:%.*]] = getelementptr i8, ptr [[alloc]], <vscale x 8 x i64> [[sext2]]
+; CHECK: [[gather:%.*]] = call <vscale x 8 x i8> @llvm.masked.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> [[addrs]],
+; CHECK: [[cmp:%.*]] = trunc <vscale x 8 x i8> [[gather]] to <vscale x 8 x i1>
+; CHECK: [[sel:%.*]] = select <vscale x 8 x i1> [[cmp]], <vscale x 8 x i32> [[rhs]], <vscale x 8 x i32> {{shufflevector \(<vscale x 8 x i32> insertelement \(<vscale x 8 x i32> poison, i32 4, (i32|i64) 0\), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer\)|splat \(i32 4\)}}
+; CHECK: store <vscale x 8 x i32> [[sel]],
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
new file mode 100644
index 0000000000000..9d9f141cf12ff
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/shuffle.ll
@@ -0,0 +1,61 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @do_shuffle_splat(i32* %aptr, <4 x i32>* %bptr, <4 x i32>* %zptr) {
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx
+ %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
+ %a = load i32, i32* %arrayidxa, align 4
+ %b = load <4 x i32>, <4 x i32>* %arrayidxb, align 16
+ %insert = insertelement <4 x i32> poison, i32 %a, i32 0
+ %splat = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer
+ %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+ store <4 x i32> %splat, <4 x i32>* %arrayidxz
+ ret void
+; CHECK: define spir_kernel void @__vecz_nxv4_do_shuffle_splat
+; CHECK: [[idx0:%.*]] = call <vscale x 16 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv16i32()
+; CHECK: [[idx1:%.*]] = lshr <vscale x 16 x i32> [[idx0]], {{shufflevector \(<vscale x 16 x i32> insertelement \(<vscale x 16 x i32> poison, i32 2, (i32|i64) 0\), <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i32 2\)}}
+
+; Note that since we just did a lshr 2 on the input of the extend, it doesn't
+; make any difference whether it's a zext or sext, but LLVM 16 prefers zext.
+; CHECK: [[idx2:%.*]] = {{s|z}}ext{{( nneg)?}} <vscale x 16 x i32> [[idx1]] to <vscale x 16 x i64>
+
+; CHECK: [[alloc:%.*]] = getelementptr i32, ptr %{{.*}}, <vscale x 16 x i64> [[idx2]]
+; CHECK: [[splat:%.*]] = call <vscale x 16 x i32> @llvm.masked.gather.nxv16i32.nxv16p0(<vscale x 16 x ptr> [[alloc]],
+; CHECK: store <vscale x 16 x i32> [[splat]], ptr
+}
+
+define spir_kernel void @do_shuffle_splat_uniform(i32 %a, <4 x i32>* %bptr, <4 x i32>* %zptr) {
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
+ %b = load <4 x i32>, <4 x i32>* %arrayidxb, align 16
+ %insert = insertelement <4 x i32> poison, i32 %a, i32 0
+ %splat = shufflevector <4 x i32> %insert, <4 x i32> poison, <4 x i32> zeroinitializer
+ %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+ store <4 x i32> %splat, <4 x i32>* %arrayidxz
+ ret void
+; CHECK: define spir_kernel void @__vecz_nxv4_do_shuffle_splat_uniform
+; CHECK: [[ins:%.*]] = insertelement <vscale x 16 x i32> poison, i32 %a, {{(i32|i64)}} 0
+; CHECK: [[splat:%.*]] = shufflevector <vscale x 16 x i32> [[ins]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+; CHECK: store <vscale x 16 x i32> [[splat]], ptr
+}
+
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll
new file mode 100644
index 0000000000000..28f4e99f7fb28
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/store_literal_struct.ll
@@ -0,0 +1,38 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that we do something correct when scalably packetizing struct literals.
+; Right now we fail to packetize, but if we could packetize this we'd have to
+; be careful as storing a struct literal containing scalable vectors is invalid
+; IR.
+; RUN: veczc -w 4 -vecz-scalable -vecz-passes=verify,packetizer,verify \
+; RUN: --pass-remarks-missed=vecz -S < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK: Vecz: Could not packetize %v = load { i32, i32 }, ptr %arrayidx.p, align 4
+define spir_kernel void @test_fn(ptr %p, ptr %q) {
+entry:
+ %idx = call i64 @__mux_get_global_id(i32 0)
+ %arrayidx.p = getelementptr { i32, i32 }, ptr %p, i64 %idx
+ %v = load { i32, i32 }, ptr %arrayidx.p, align 4
+ %arrayidx.q = getelementptr { i32, i32 }, ptr %q, i64 %idx
+ store { i32, i32 } %v, ptr %arrayidx.q, align 4
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
new file mode 100644
index 0000000000000..994c87fce14f5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_builtins.ll
@@ -0,0 +1,79 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i32 @__mux_get_sub_group_id()
+declare spir_func i32 @__mux_get_sub_group_size()
+declare spir_func i32 @__mux_get_sub_group_local_id()
+declare spir_func i32 @__mux_sub_group_broadcast_i32(i32, i32)
+
+define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+ %call.i = tail call spir_func i32 @__mux_get_sub_group_id()
+ %conv = zext i32 %call.i to i64
+ %call2 = tail call spir_func i32 @__mux_get_sub_group_size()
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
+ store i32 %call2, i32 addrspace(1)* %arrayidx, align 4
+ ret void
+; CHECK-LABEL: define spir_kernel void @__vecz_nxv4_get_sub_group_size(
+; CHECK: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK: [[W:%.*]] = shl {{(nuw )?}}i32 [[VSCALE]], 2
+; CHECK: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 [[W]])
+; CHECK: store i32 [[RED]], ptr addrspace(1) {{.*}}
+}
+
+define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+ %call = tail call spir_func i32 @__mux_get_sub_group_local_id()
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call
+ store i32 %call, i32 addrspace(1)* %arrayidx, align 4
+ ret void
+; CHECK-LABEL: define spir_kernel void @__vecz_nxv4_get_sub_group_local_id(
+; CHECK: %call = tail call spir_func i32 @__mux_get_sub_group_local_id()
+; CHECK: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK: [[SHL:%.*]] = shl {{(nuw )?}}i32 [[VSCALE]], 2
+; CHECK: [[MUL:%.*]] = mul i32 %call, [[SHL]]
+; CHECK: [[SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[MUL]], i64 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[STEPVEC:%.*]] = call <vscale x 4 x i32> @llvm.{{(experimental\.)?}}stepvector.nxv4i32()
+; CHECK: [[LID:%.*]] = add <vscale x 4 x i32> [[SPLAT]], [[STEPVEC]]
+; CHECK: [[EXT:%.*]] = sext i32 %call to i64
+; CHECK: %arrayidx = getelementptr i32, ptr addrspace(1) %out, i64 [[EXT]]
+; CHECK: store <vscale x 4 x i32> [[LID]], ptr addrspace(1) %arrayidx
+}
+
+define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+ %call = tail call spir_func i32 @__mux_get_sub_group_local_id()
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call
+ %v = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %broadcast = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 %v, i32 0)
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call
+ store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: define spir_kernel void @__vecz_nxv4_sub_group_broadcast(
+; CHECK: [[LD:%.*]] = load <vscale x 4 x i32>, ptr addrspace(1) {{%.*}}, align 4
+; CHECK: [[EXT:%.*]] = extractelement <vscale x 4 x i32> [[LD]], {{(i32|i64)}} 0
+; CHECK: [[BDCAST:%.*]] = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 [[EXT]], i32 0)
+; CHECK: [[INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[BDCAST]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: store <vscale x 4 x i32> [[SPLAT]], ptr addrspace(1)
+}
+
+!opencl.ocl.version = !{!0}
+
+!0 = !{i32 3, i32 0}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
new file mode 100644
index 0000000000000..612a67f496406
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans.ll
@@ -0,0 +1,204 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -S -vecz-passes=packetizer < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i64 @__mux_get_global_id(i32)
+
+declare spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32)
+declare spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64)
+declare spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float)
+
+declare spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32)
+declare spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float)
+declare spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float)
+
+define spir_kernel void @reduce_scan_incl_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32 %0)
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+ store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_add_i32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4j(<vscale x 4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_add_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[HEAD]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = add <vscale x 4 x i32> [[SCAN]], [[SPLAT]]
+; CHECK: store <vscale x 4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call
+ %0 = load i64, i64 addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64 %0)
+ %arrayidx2 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call
+ store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_add_i64(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i64> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4m(<vscale x 4 x i64> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i64 @__mux_sub_group_scan_exclusive_add_i64(i64 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[HEAD]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = add <vscale x 4 x i64> [[SCAN]], [[SPLAT]]
+; CHECK: store <vscale x 4 x i64> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+ %0 = load float, float addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float %0)
+ %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %call1, float addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_add_f32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_add_u5nxv4f(<vscale x 4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.0{{.*}}, <vscale x 4 x float> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fadd_f32(float [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[HEAD]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = fadd <vscale x 4 x float> [[SCAN]], [[SPLAT]]
+; CHECK: store <vscale x 4 x float> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32 %0)
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+ store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_smin_i32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_u5nxv4i(<vscale x 4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_smin_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[HEAD]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x i32> @llvm.smin.nxv4i32(<vscale x 4 x i32> [[SCAN]], <vscale x 4 x i32> [[SPLAT]])
+; CHECK: store <vscale x 4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32 %0)
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+ store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_umin_i32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_u5nxv4j(<vscale x 4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_umin_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[HEAD]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x i32> @llvm.umin.nxv4i32(<vscale x 4 x i32> [[SCAN]], <vscale x 4 x i32> [[SPLAT]])
+; CHECK: store <vscale x 4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32 %0)
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+ store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_smax_i32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_u5nxv4i(<vscale x 4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_smax_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[HEAD]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> [[SCAN]], <vscale x 4 x i32> [[SPLAT]])
+; CHECK: store <vscale x 4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+ %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32 %0)
+ %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+ store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_umax_i32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_u5nxv4j(<vscale x 4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_umax_i32(i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[HEAD]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> [[SCAN]], <vscale x 4 x i32> [[SPLAT]])
+; CHECK: store <vscale x 4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+ %0 = load float, float addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float %0)
+ %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %call1, float addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_fmin_f32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_min_u5nxv4f(<vscale x 4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fmin_f32(float [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <vscale x 4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[HEAD]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <vscale x 4 x float> @llvm.minnum.nxv4f32(<vscale x 4 x float> [[SCAN]], <vscale x 4 x float> [[SPLAT]])
+; CHECK: store <vscale x 4 x float> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
+entry:
+ %call = tail call spir_func i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+ %0 = load float, float addrspace(1)* %arrayidx, align 4
+ %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float %0)
+ %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+ store float %call1, float addrspace(1)* %arrayidx2, align 4
+ ret void
+; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_fmax_f32(
+; CHECK: [[SCAN:%.*]] = call <vscale x 4 x float> @__vecz_b_sub_group_scan_inclusive_max_u5nxv4f(<vscale x 4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> [[INPUT]])
+; CHECK:
[[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fmax_f32(float [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector [[HEAD]], poison, zeroinitializer +; CHECK: [[FINAL:%.*]] = call @llvm.maxnum.nxv4f32( [[SCAN]], [[SPLAT]]) +; CHECK: store [[FINAL]], +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll new file mode 100644 index 0000000000000..06d079f2128ac --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions.ll @@ -0,0 +1,171 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-scalable -w 4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1) + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_mul_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_mul_u5nxv4j( %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_excl_mul_i32( +; CHECK: call @__vecz_b_sub_group_scan_exclusive_mul_u5nxv4j( %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) 
%arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_mul_f32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_mul_u5nxv4f( %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float %0) + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_excl_mul_f32( +; CHECK: call @__vecz_b_sub_group_scan_exclusive_mul_u5nxv4f( %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float %0) + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_and_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_and_u5nxv4j( %{{.*}}) +define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_or_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_or_u5nxv4j( %{{.*}}) +define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_xor_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_xor_u5nxv4j( %{{.*}}) +define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; 
CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_logical_and( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_and_u5nxv4b( %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_logical_or( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_or_u5nxv4b( %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_reduce_scan_incl_logical_xor( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_xor_u5nxv4b( %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll new file mode 100644 index 0000000000000..7ad386dedb3e4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_spv_khr_uniform_group_instructions_vp.ll @@ -0,0 +1,171 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-scalable -w 4 -S -vecz-choices=VectorPredication < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1) + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_mul_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_mul_vp_u5nxv4jj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_excl_mul_i32( +; CHECK: call @__vecz_b_sub_group_scan_exclusive_mul_vp_u5nxv4jj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_mul_f32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_mul_vp_u5nxv4fj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float %0) + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_excl_mul_f32( +; CHECK: call @__vecz_b_sub_group_scan_exclusive_mul_vp_u5nxv4fj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float 
@__mux_sub_group_scan_exclusive_fmul_f32(float %0) + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_and_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_and_vp_u5nxv4jj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_or_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_or_vp_u5nxv4jj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_xor_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_xor_vp_u5nxv4jj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_logical_and( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_and_vp_u5nxv4bj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_logical_or( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_or_vp_u5nxv4bj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1 %1) + %arrayidx2 = getelementptr 
inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_logical_xor( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_xor_vp_u5nxv4bj( %{{.*}}, i32 %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll new file mode 100644 index 0000000000000..14bee4967bfbc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/subgroup_scans_vp.ll @@ -0,0 +1,150 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S -vecz-passes=packetizer < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32) +declare spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64) +declare spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32) +declare spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float) +declare spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float) + +define spir_kernel void @reduce_scan_incl_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_add_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4jj( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call + %0 = load i64, i64 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64 %0) + %arrayidx2 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call + store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_add_i64( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4mj( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_add_f32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_add_vp_u5nxv4fj( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* 
%arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_smin_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_smin_vp_u5nxv4ij( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_umin_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_umin_vp_u5nxv4jj( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_smax_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_smax_vp_u5nxv4ij( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_umax_i32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_umax_vp_u5nxv4jj( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_nxv4_vp_reduce_scan_incl_fmin_f32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_min_vp_u5nxv4fj( %{{.*}}, i32 %{{.+}}) +} + +define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: 
@__vecz_nxv4_vp_reduce_scan_incl_fmax_f32( +; CHECK: call @__vecz_b_sub_group_scan_inclusive_max_vp_u5nxv4fj( %{{.*}}, i32 %{{.+}}) +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll new file mode 100644 index 0000000000000..f8ed17cf10c67 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/vectors.ll @@ -0,0 +1,41 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k load_add_store -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load_add_store(<4 x i32>* %aptr, <4 x i32>* %bptr, <4 x i32>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx + %a = load <4 x i32>, <4 x i32>* %arrayidxa, align 4 + %b = load <4 x i32>, <4 x i32>* %arrayidxb, align 4 + %sum = add <4 x i32> %a, %b + store <4 x i32> %sum, <4 x i32>* %arrayidxz, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: define spir_kernel void @__vecz_nxv4_load_add_store +; CHECK: [[lhs:%[0-9a-z]+]] = load , ptr +; CHECK: [[rhs:%[0-9a-z]+]] = load , ptr +; CHECK: [[sum:%[0-9a-z]+]] = add [[lhs]], [[rhs]] +; CHECK: store [[sum]], diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll new file mode 100644 index 0000000000000..a4ff1d7c228f4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/verification_fail_phi.ll @@ -0,0 +1,48 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Check that we fail to vectorize but don't leave behind an invalid function. 
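+;
+; As an illustrative sketch (hypothetical IR, not part of the original test):
+; a PHI must carry exactly one incoming value per predecessor of its block,
+; so bailing out of packetization part-way through PHI fixup could leave
+; behind something like
+;
+;   exit:                              ; preds = %entry, %if.then
+;     %retval = phi i64 [ %x, %entry ] ; no incoming value for %if.then
+;
+; which the LLVM verifier rejects ("PHINode should have one entry for each
+; predecessor of its parent basic block!"). The RUN line below therefore
+; runs the verify pass after the packetizer and expects veczc to fail.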
+; RUN: not veczc -k regression_phis -vecz-scalable -w 1 -vecz-passes=packetizer,verify -S < %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @regression_phis(i64 addrspace(1)* %xs, i64 addrspace(1)* %ys, i32 addrspace(1)* %out, i64 %lim) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.x = getelementptr inbounds i64, i64 addrspace(1)* %xs, i64 %call
+  %x = load i64, i64 addrspace(1)* %arrayidx.x, align 4
+  %cond = icmp eq i64 %call, 0
+  br i1 %cond, label %if.then, label %exit
+
+if.then:
+  %arrayidx.y = getelementptr inbounds i64, i64 addrspace(1)* %ys, i64 %call
+  %y = load i64, i64 addrspace(1)* %arrayidx.y, align 4
+  br label %exit
+
+exit:
+  ; We previously left behind an invalid PHI with too few operands, owing to
+  ; us bailing out while PHIs were still pending post-vectorization fixup.
+  %retval = phi i64 [ %x, %entry ], [ %y, %if.then ]
+  %0 = icmp eq i64 %lim, 0
+  %1 = select i1 %0, i64 1, i64 %lim
+  %rem = urem i64 %retval, %1
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %rem
+  %2 = atomicrmw add i32 addrspace(1)* %arrayidx, i32 1 monotonic
+  ret void
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
new file mode 100644
index 0000000000000..43f40444837b3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/widen_vload.ll
@@ -0,0 +1,34 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k widen_vload -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @widen_vload(<4 x i32>* %aptr, <4 x i32>* %zptr) {
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %mod_idx = urem i64 %idx, 2
+  %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %mod_idx
+  %v = load <4 x i32>, <4 x i32>* %arrayidxa, align 16
+  %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+  store <4 x i32> %v, <4 x i32>* %arrayidxz, align 16
+  ret void
+; CHECK: define spir_kernel void @__vecz_nxv4_widen_vload(
+; CHECK: %v4 = call <vscale x 16 x i32> @__vecz_b_gather_load16_u6nxv16ju10nxv16u3ptr(<vscale x 16 x ptr> %{{.*}})
+}
+
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
new file mode 100644
index 0000000000000..c8e7e27514fa9
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ScalableVectors/workitem_funcs.ll
@@ -0,0 +1,40 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k store_ult -vecz-scalable -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; Check that we can scalably-vectorize a call to get_global_id by using the
+; stepvector intrinsic
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @store_ult(i32* %out, i64* %N) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #2
+  %0 = load i64, i64* %N, align 8
+  %cmp = icmp ult i64 %call, %0
+  %conv = zext i1 %cmp to i32
+  %arrayidx = getelementptr inbounds i32, i32* %out, i64 %call
+  store i32 %conv, i32* %arrayidx, align 4
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK: define spir_kernel void @__vecz_nxv4_store_ult
+; CHECK: [[step:%[0-9.a-z]+]] = call <vscale x 4 x i64> @llvm.{{(experimental\.)?}}stepvector.nxv4i64()
+; CHECK: %{{.*}} = add <vscale x 4 x i64> %{{.*}}, [[step]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
new file mode 100644
index 0000000000000..05f7c9483f2f8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/boscc_reduction.ll
@@ -0,0 +1,45 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k foo -vecz-scalable -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @foo(float addrspace(1)* readonly %a, i32 addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #2
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %cmp = fcmp oeq float %0, 0.000000e+00
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %add = add nsw i32 %1, 42
+  store i32 %add, i32 addrspace(1)* %arrayidx1, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv2_vp_foo(ptr addrspace(1) readonly %a, ptr addrspace(1) %out)
+; CHECK: [[CMP:%.*]] = fcmp oeq <vscale x 2 x float> %{{.*}}, zeroinitializer
+; CHECK: %{{.*}} = call i1 @llvm.vp.reduce.or.nxv2i1(i1 false, <vscale x 2 x i1> [[CMP]], <vscale x 2 x i1> {{.*}}, i32 {{.*}})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
new file mode 100644
index 0000000000000..9835c56732a32
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/choice.ll
@@ -0,0 +1,33 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Just check that the VectorPredication choice is valid
+; RUN: veczc -k foo -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @foo(float* %aptr, float* %zptr) {
+entry:
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %arrayidxa = getelementptr inbounds float, float* %aptr, i64 %idx
+  %arrayidxz = getelementptr inbounds float, float* %zptr, i64 %idx
+  %a = load float, float* %arrayidxa, align 4
+  store float %a, float* %arrayidxz, align 4
+  ret void
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
new file mode 100644
index 0000000000000..38abafeb2cb77
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/compute_vector_length.ll
@@ -0,0 +1,55 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k get_sub_group_size -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-F2
+; RUN: veczc -k get_sub_group_size -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK-S4
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare spir_func i32 @__mux_get_sub_group_id()
+declare spir_func i32 @__mux_get_sub_group_size()
+
+define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+  %call.i = tail call spir_func i32 @__mux_get_sub_group_id()
+  %conv = zext i32 %call.i to i64
+  %call2 = tail call spir_func i32 @__mux_get_sub_group_size()
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv
+  store i32 %call2, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Makes sure the vector length is properly computed and substituted for get_sub_group_size()
+
+; CHECK-F2-LABEL: define spir_kernel void @__vecz_v2_vp_get_sub_group_size(
+; CHECK-F2: [[ID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK-F2: [[SZ:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK-F2: [[WL:%.*]] = sub {{.*}} i64 [[SZ]], [[ID]]
+; CHECK-F2: [[VL0:%.*]] = call i64 @llvm.umin.i64(i64 [[WL]], i64 2)
+; CHECK-F2: [[VL1:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[VL0]] to i32
+; CHECK-F2: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 [[VL1]])
+; CHECK-F2: store i32 [[RED]], ptr addrspace(1) {{.*}}
+
+; CHECK-S4-LABEL: define spir_kernel void @__vecz_nxv4_vp_get_sub_group_size(
+; CHECK-S4: [[ID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK-S4: [[SZ:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK-S4: [[WL:%.*]] = sub {{.*}} i64 [[SZ]], [[ID]]
+; CHECK-S4: [[VF0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-S4: [[VF1:%.*]] = shl {{(nuw )?}}i64 [[VF0]], 2
+; CHECK-S4: [[VL0:%.*]] = call i64 @llvm.umin.i64(i64 [[WL]], i64 [[VF1]])
+; CHECK-S4: [[VL1:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[VL0]] to i32
+; CHECK-S4: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 [[VL1]])
+; CHECK-S4: store i32 [[RED]], ptr addrspace(1) {{.*}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
new file mode 100644
index 0000000000000..a913198ca3f2b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_interleaved_load_store.ll
@@ -0,0 +1,81 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication:FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +; Test if the interleaved load is defined correctly +; Vector-predicated interleaved loads are always masked +; CHECK: define @__vecz_b_masked_interleaved_load8_vp_4_u5nxv4du3ptrU3AS1u5nxv4bj(ptr addrspace(1){{( %0)?}}, {{( %1)?}}, i32{{( %2)?}}) [[ATTRS:#[0-9]+]] { +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement poison, ptr addrspace(1) %0, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector %BroadcastAddr.splatinsert, poison, zeroinitializer +; CHECK: %3 = call @llvm.{{(experimental\.)?}}stepvector.nxv4i64() +; CHECK: %4 = mul {{shufflevector \( insertelement \( poison, i64 4, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i64 4\)}}, %3 +; CHECK: %5 = getelementptr double, %BroadcastAddr.splat, %4 +; CHECK: %6 = call @llvm.vp.gather.nxv4f64.nxv4p1( %5, %1, i32 %2) +; CHECK: ret %6 +; CHECK: } + + +; Test if the interleaved store is defined correctly +; Vector-predicated interleaved stores are always masked +; CHECK: define void @__vecz_b_masked_interleaved_store8_vp_4_u5nxv4du3ptrU3AS1u5nxv4bj({{( %0)?}}, ptr addrspace(1){{( %1)?}}, {{( %2)?}}, i32{{( %3)?}}) [[ATTRS]] +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector 
%BroadcastAddr.splatinsert, poison, zeroinitializer +; CHECK: %4 = call @llvm.{{(experimental\.)?}}stepvector.nxv4i64() +; CHECK: %5 = mul {{shufflevector \( insertelement \( poison, i64 4, (i32|i64) 0\), poison, zeroinitializer\)|splat \(i64 4\)}}, %4 +; CHECK: %6 = getelementptr double, %BroadcastAddr.splat, %5 +; CHECK: call void @llvm.vp.scatter.nxv4f64.nxv4p1( %0, %6, %2, i32 %3) +; CHECK: ret void +; CHECK: } + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll new file mode 100644 index 0000000000000..7ef8742f87f58 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_load_store.ll @@ -0,0 +1,77 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %conv, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %call2 = call i64 @__mux_get_global_id(i32 0) + %conv3 = trunc i64 %call2 to i32 + %idxprom = sext i32 %conv3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 + %idxprom4 = sext i32 %conv3 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom4 + store i32 %0, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.else: ; preds = %entry + %call8 = call i64 @__mux_get_local_size(i32 0) + %call9 = call i64 @__mux_get_group_id(i32 0) + %mul = mul i64 %call9, %call8 + %add = add i64 %mul, %call + %sext = shl i64 %add, 32 + %idxprom11 = ashr exact i64 %sext, 32 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 42, i32 addrspace(1)* %arrayidx12, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_local_id(i32) + +declare i64 @__mux_get_global_id(i32) + +declare i64 @__mux_get_local_size(i32) + +declare i64 @__mux_get_group_id(i32) + +; Test if the masked store is defined correctly +; CHECK: define void @__vecz_b_masked_store4_vp_Dv4_ju3ptrU3AS1Dv4_bj(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}, i32{{( %3)?}}) [[ATTRS:#[0-9]+]] { +; CHECK: entry: +; CHECK: call void 
+
+; Test if the masked load is defined correctly
+; CHECK: define <4 x i32> @__vecz_b_masked_load4_vp_Dv4_ju3ptrU3AS2Dv4_bj(ptr addrspace(2){{( %0)?}}, <4 x i1>{{( %1)?}}, i32{{( %2)?}}) [[ATTRS]] {
+; CHECK: entry:
+; CHECK: %3 = call <4 x i32> @llvm.vp.load.v4i32.p2(ptr addrspace(2) %0, <4 x i1> %1, i32 %2)
+; CHECK: ret <4 x i32> %3
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
new file mode 100644
index 0000000000000..5353ab9a90aae
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_masked_scatter_gather.ll
@@ -0,0 +1,89 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %rem = urem i64 %call, 3
+  %cmp = icmp eq i64 %rem, 0
+  br i1 %cmp, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %idxprom = sext i32 %1 to i64
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom
+  store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call
+  %2 = load i32, i32 addrspace(1)* %arrayidx3, align 4
+  %idxprom4 = sext i32 %2 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom4
+  store i32 42, i32 addrspace(1)* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+; Test if the vector-predicated scatter store is defined correctly
+; CHECK: define void @__vecz_b_masked_scatter_store4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x i32>{{( %0)?}}, <vscale x 4 x ptr addrspace(1)>{{( %1)?}}, <vscale x 4 x i1>{{( %2)?}}, i32{{( %3)?}})
+; CHECK: entry:
+; CHECK: call void @llvm.vp.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> %0, <vscale x 4 x ptr addrspace(1)> %1, <vscale x 4 x i1> %2, i32 %3)
+; CHECK: ret void
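+
+; The stores above go through an index loaded from %b_index, so they cannot
+; be widened into a contiguous store and become a (VL-masked) scatter; the
+; gather case below is analogous, with the data-dependent index on the load
+; side instead.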
+
+define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %rem = urem i64 %call, 3
+  %cmp = icmp eq i64 %rem, 0
+  br i1 %cmp, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a_index, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 42, i32 addrspace(1)* %arrayidx3, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; Test if the vector-predicated gather load is defined correctly
+; CHECK: define <vscale x 4 x i32> @__vecz_b_masked_gather_load4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x ptr addrspace(1)>{{( %0)?}}, <vscale x 4 x i1>{{( %1)?}}, i32{{( %2)?}})
+; CHECK: entry:
+; CHECK: %3 = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p1(<vscale x 4 x ptr addrspace(1)> %0, <vscale x 4 x i1> %1, i32 %2)
+; CHECK: ret <vscale x 4 x i32> %3
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
new file mode 100644
index 0000000000000..0d1b86390d6d1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/define_subgroup_scans.ll
@@ -0,0 +1,183 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) {
+  ; Dummy uses of the builtins, as we don't define any with zero uses.
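+  ; (The define-builtins pass only materializes bodies for __vecz_b_* builtins
+  ; that are actually referenced, hence one call per builtin under test here.)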
+  %a = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %b = call <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %c = call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  %d = call <4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  %e = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %f = call <4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %g = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %h = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %i = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_Dv4_jj(<4 x i32> zeroinitializer, i32 0)
+  %j = call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  %k = call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  %l = call <4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  %m = call <4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_Dv4_fj(<4 x float> zeroinitializer, i32 0)
+  ret void
+}
+
+declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_jj(<4 x i32>, i32)
+; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) {
+; CHECK: entry:
+; CHECK: %[[SHUFFLE_ALLOC:.+]] = alloca <4 x i32>
+; CHECK: br label %loop
+; CHECK: loop:
+; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK: %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK: %[[MASKPHI:.+]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK: %[[N_INS:.+]] = insertelement <4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK: %[[N_SPLAT:.+]] = shufflevector <4 x i32> %[[N_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: %[[MASK:.+]] = xor <4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+
+;------- target-dependent dynamic shuffle code:
+; CHECK: store <4 x i32> %[[VEC]], {{(<4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
+;------- there will be a bitcast here if pointers are typed
+; CHECK: %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <4 x i32> %[[MASK]]
+; CHECK: %[[VLINS:.+]] = insertelement <4 x i32> poison, i32 %1, {{i32|i64}} 0
+; CHECK: %[[VLSPLAT:.+]] = shufflevector <4 x i32> %[[VLINS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: %[[VLMASK:.+]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %[[VLSPLAT]]
+; CHECK: %[[SHUFFLE:.+]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0{{(i32)?}}(<4 x [[PTRTY]]> %[[INDEX]], i32 4, <4 x i1> %[[VLMASK]], <4 x i32> poison)

+; CHECK: %[[ACCUM:.+]] = add <4 x i32> %[[VEC]], %{{.+}}
+; CHECK: %[[BIT:.+]] = and <4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK: %[[WHICH:.+]] = icmp ne <4 x i32> %[[BIT]], zeroinitializer
+; CHECK: %[[NEWVEC]] = select <4 x i1> %[[WHICH]], <4 x i32> %[[ACCUM]], <4 x i32> %[[VEC]]
+; CHECK: %[[NEWMASK]] = or <4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %1
+; CHECK: br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK: %[[RESULT:.+]] = phi <4 x i32> [ %[[NEWVEC]], %loop ]
+; CHECK: ret <4 x i32> %[[RESULT]]
+; CHECK: }
+
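+; For reference, the loop checked above is a log-step inclusive scan: starting
+; from lanes <a, b, c, d>, the first iteration updates the odd lanes to give
+; <a, a+b, c, c+d>, and the second updates the upper lanes to give
+; <a, a+b, a+b+c, a+b+c+d>. The masked gather acts as a dynamic lane shuffle,
+; and the select keeps lanes whose partial sum is already complete.
+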
+declare <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_jj(<4 x i32>, i32)
+; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) {
+; CHECK: entry:
+; CHECK: %[[SHUFFLE_ALLOC:.+]] = alloca <4 x i32>
+; CHECK: br label %loop
+; CHECK: loop:
+; CHECK: %[[IV:.+]] = phi i32 [ 1, %entry ], [ %[[N2:.+]], %loop ]
+; CHECK: %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], [ %[[NEWVEC:.+]], %loop ]
+; CHECK: %[[MASKPHI:.+]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %entry ], [ %[[NEWMASK:.+]], %loop ]
+; CHECK: %[[N_INS:.+]] = insertelement <4 x i32> poison, i32 %[[IV]], {{i32|i64}} 0
+; CHECK: %[[N_SPLAT:.+]] = shufflevector <4 x i32> %[[N_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: %[[MASK:.+]] = xor <4 x i32> %[[MASKPHI]], %[[N_SPLAT]]

+;------- target-dependent dynamic shuffle code:
+; CHECK: store <4 x i32> %[[VEC]], {{(<4 x i32>\*)|(ptr)}} %[[SHUFFLE_ALLOC]]
+;------- there will be a bitcast here if pointers are typed
+; CHECK: %[[INDEX:.+]] = getelementptr inbounds i32, [[PTRTY:(i32\*)|ptr]] %{{.+}}, <4 x i32> %[[MASK]]
+; CHECK: %[[VLINS:.+]] = insertelement <4 x i32> poison, i32 %1, {{i32|i64}} 0
+; CHECK: %[[VLSPLAT:.+]] = shufflevector <4 x i32> %[[VLINS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: %[[VLMASK:.+]] = icmp ult <4 x i32> <i32 0, i32 1, i32 2, i32 3>, %[[VLSPLAT]]
+; CHECK: %[[SHUFFLE:.+]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0{{(i32)?}}(<4 x [[PTRTY]]> %[[INDEX]], i32 4, <4 x i1> %[[VLMASK]], <4 x i32> poison)

+; CHECK: %[[ACCUM:.+]] = add <4 x i32> %[[VEC]], %{{.+}}
+; CHECK: %[[BIT:.+]] = and <4 x i32> %[[MASKPHI]], %[[N_SPLAT]]
+; CHECK: %[[WHICH:.+]] = icmp ne <4 x i32> %[[BIT]], zeroinitializer
+; CHECK: %[[NEWVEC]] = select <4 x i1> %[[WHICH]], <4 x i32> %[[ACCUM]], <4 x i32> %[[VEC]]
+; CHECK: %[[NEWMASK]] = or <4 x i32> %[[MASK]], %[[N_SPLAT]]
+; CHECK: %[[N2]] = shl nuw nsw i32 %[[IV]], 1
+; CHECK: %[[CMP:.+]] = icmp ult i32 %[[N2]], %1
+; CHECK: br i1 %[[CMP]], label %loop, label %exit
+; CHECK: exit:
+; CHECK: %[[SCAN:.+]] = phi <4 x i32> [ %[[NEWVEC]], %loop ]

+;------- target-dependent slide-up goes here
+; CHECK: %[[SLIDE:.+]] = shufflevector <4 x i32> %[[SCAN]], <4 x i32> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
+; CHECK: %[[RESULT:.+]] = insertelement <4 x i32> %[[SLIDE]], i32 0, {{i32|i64}} 0

+; CHECK: ret <4 x i32> %[[RESULT]]
+; CHECK: }
+
+
+; We know the generated code is correct for one scan type,
+; now verify that all the others use the correct binary operations.
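+; Each scan below should differ from the add scans above only in its
+; combining operation: fadd for the float add scans, the llvm.smin/umin/
+; smax/umax intrinsics for the integer min/max scans, and llvm.minnum/
+; llvm.maxnum for the float min/max scans.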
+ +declare <4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_fj(<4 x float>, i32) +; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_inclusive_add_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x float> [ %0, %entry ], +; CHECK: %{{.+}} = fadd <4 x float> %[[VEC]], %{{.+}} + +declare <4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_fj(<4 x float>, i32) +; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_exclusive_add_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x float> [ %0, %entry ], +; CHECK: %{{.+}} = fadd <4 x float> %[[VEC]], %{{.+}} + +declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_Dv4_jj(<4 x i32>, i32) +; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %[[VEC]], <4 x i32> %{{.+}}) + +declare <4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_Dv4_jj(<4 x i32>, i32) +; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_exclusive_smin_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %[[VEC]], <4 x i32> %{{.+}}) + +declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_Dv4_jj(<4 x i32>, i32) +; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %[[VEC]], <4 x i32> %{{.+}}) + +declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_Dv4_jj(<4 x i32>, i32) +; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %[[VEC]], <4 x i32> %{{.+}}) + +declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_Dv4_jj(<4 x i32>, i32) +; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_vp_Dv4_jj(<4 x i32>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x i32> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %[[VEC]], <4 x i32> %{{.+}}) + +declare <4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_Dv4_fj(<4 x float>, i32) +; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_inclusive_min_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x float> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x float> @llvm.minnum.v4f32(<4 x float> %[[VEC]], <4 x float> %{{.+}}) + +declare <4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_Dv4_fj(<4 x float>, i32) +; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_inclusive_max_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x float> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %[[VEC]], <4 x float> %{{.+}}) + +declare <4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_Dv4_fj(<4 x float>, i32) +; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_exclusive_min_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x float> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x float> @llvm.minnum.v4f32(<4 x float> 
%[[VEC]], <4 x float> %{{.+}}) + +declare <4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_Dv4_fj(<4 x float>, i32) +; CHECK-LABEL: define <4 x float> @__vecz_b_sub_group_scan_exclusive_max_vp_Dv4_fj(<4 x float>{{.*}}, i32{{.*}}) +; CHECK: loop: +; CHECK: %[[VEC:.+]] = phi <4 x float> [ %0, %entry ], +; CHECK: %{{.+}} = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %[[VEC]], <4 x float> %{{.+}}) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll new file mode 100644 index 0000000000000..92ac9161e8c89 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/load_add_store.ll @@ -0,0 +1,103 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k load_add_store_i32 -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK_4F +; RUN: veczc -k load_add_store_i32 -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK_1S +; RUN: veczc -k load_add_store_v4i32 -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK_V4_2F +; RUN: veczc -k load_add_store_v4i32 -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s --check-prefix CHECK_V4_1S + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @load_add_store_i32(i32* %aptr, i32* %bptr, i32* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load i32, i32* %arrayidxb, align 4 + %sum = add i32 %a, %b + store i32 %sum, i32* %arrayidxz, align 4 + ret void +} + +; CHECK_4F: define spir_kernel void @__vecz_v4_vp_load_add_store_i32( +; CHECK_4F: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0) +; CHECK_4F: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0) +; CHECK_4F: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]] +; CHECK_4F: [[T0:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 4) +; CHECK_4F: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T0]] to i32 +; CHECK_4F: [[LHS:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0(ptr {{%.*}}, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[VL]]) +; CHECK_4F: [[RHS:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0(ptr {{%.*}}, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[VL]]) +; CHECK_4F: [[ADD:%.*]] = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> [[LHS]], <4 x i32> [[RHS]], <4 x i1> {{<(i1 true(, 
)?)+>|splat \(i1 true\)}}, i32 [[VL]])
+; CHECK_4F: call void @llvm.vp.store.v4i32.p0(<4 x i32> [[ADD]], ptr {{%.*}}, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[VL]])
+
+; CHECK_1S: define spir_kernel void @__vecz_nxv4_vp_load_add_store_i32(
+; CHECK_1S: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK_1S: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK_1S: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
+; CHECK_1S: [[T0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK_1S: [[T1:%.*]] = shl {{(nuw )?}}i64 [[T0]], 2
+; CHECK_1S: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
+; CHECK_1S: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T2]] to i32
+; CHECK_1S: [[LHS:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr {{%.*}}, <vscale x 4 x i1> [[TRUEMASK: (shufflevector \(<vscale x 4 x i1> insertelement \(<vscale x 4 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[VL]])
+; CHECK_1S: [[RHS:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr {{%.*}}, <vscale x 4 x i1> [[TRUEMASK]], i32 [[VL]])
+; CHECK_1S: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.vp.add.nxv4i32(<vscale x 4 x i32> [[LHS]], <vscale x 4 x i32> [[RHS]], <vscale x 4 x i1> [[TRUEMASK]], i32 [[VL]])
+; CHECK_1S: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[ADD]], ptr {{%.*}}, <vscale x 4 x i1> [[TRUEMASK]], i32 [[VL]])
+
+define spir_kernel void @load_add_store_v4i32(<4 x i32>* %aptr, <4 x i32>* %bptr, <4 x i32>* %zptr) {
+entry:
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx
+  %arrayidxb = getelementptr inbounds <4 x i32>, <4 x i32>* %bptr, i64 %idx
+  %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+  %a = load <4 x i32>, <4 x i32>* %arrayidxa, align 16
+  %b = load <4 x i32>, <4 x i32>* %arrayidxb, align 16
+  %sum = add <4 x i32> %a, %b
+  store <4 x i32> %sum, <4 x i32>* %arrayidxz, align 16
+  ret void
+}
+
+; CHECK_V4_2F: define spir_kernel void @__vecz_v2_vp_load_add_store_v4i32(
+; CHECK_V4_2F: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK_V4_2F: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK_V4_2F: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
+; CHECK_V4_2F: [[T0:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 2)
+; CHECK_V4_2F: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T0]] to i32
+; Each WI performs 4 elements, so multiply the VL by 4
+; CHECK_V4_2F: [[SVL:%.*]] = shl nuw nsw i32 [[VL]], 2
+; CHECK_V4_2F: [[LHS:%.*]] = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr {{%.*}}, <8 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[SVL]])
+; CHECK_V4_2F: [[RHS:%.*]] = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr {{%.*}}, <8 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[SVL]])
+; CHECK_V4_2F: [[ADD:%.*]] = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> [[LHS]], <8 x i32> [[RHS]], <8 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[SVL]])
+; CHECK_V4_2F: call void @llvm.vp.store.v8i32.p0(<8 x i32> [[ADD]], ptr {{%.*}}, <8 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, i32 [[SVL]])
+
+; CHECK_V4_1S: define spir_kernel void @__vecz_nxv4_vp_load_add_store_v4i32(
+; CHECK_V4_1S: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0)
+; CHECK_V4_1S: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK_V4_1S: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]]
+; CHECK_V4_1S: [[T0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK_V4_1S: [[T1:%.*]] = shl {{(nuw )?}}i64 [[T0]], 2
+; CHECK_V4_1S: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]])
+; CHECK_V4_1S: [[VL:%.*]] = trunc {{(nuw )?(nsw )?}}i64 [[T2]] to i32
+; Each WI performs 4 elements, so multiply the VL by 4
+; CHECK_V4_1S: [[SVL:%.*]] = shl i32 [[VL]], 2
+; CHECK_V4_1S: [[LHS:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr {{%.*}}, <vscale x 16 x i1> [[TRUEMASK: (shufflevector \(<vscale x 16 x i1> insertelement \(<vscale x 16 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[SVL]])
+; CHECK_V4_1S: [[RHS:%.*]] = call <vscale x 16 x i32> @llvm.vp.load.nxv16i32.p0(ptr {{%.*}}, <vscale x 16 x i1> [[TRUEMASK]], i32 [[SVL]])
+; CHECK_V4_1S: [[ADD:%.*]] = call <vscale x 16 x i32> @llvm.vp.add.nxv16i32(<vscale x 16 x i32> [[LHS]], <vscale x 16 x i32> [[RHS]], <vscale x 16 x i1> [[TRUEMASK]], i32 [[SVL]])
+; CHECK_V4_1S: call void @llvm.vp.store.nxv16i32.p0(<vscale x 16 x i32> [[ADD]], ptr {{%.*}}, <vscale x 16 x i1> [[TRUEMASK]], i32 [[SVL]])
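+
+; In every variant above the vector length is min(local_size - local_id, VF),
+; where VF is the vectorization factor (4, 4 x vscale, 2, or 4 x vscale); for
+; the <4 x i32> kernels it is then shifted left by 2, since each work-item
+; handles four elements.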
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll
new file mode 100644
index 0000000000000..03492705536f4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/masked_atomics.ll
@@ -0,0 +1,106 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes=define-builtins,verify -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_fn(<vscale x 1 x ptr> %p) {
+  %ret0 = call <vscale x 1 x i32> @__vecz_b_nxv1_vp_masked_atomicrmw_add_align4_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 4)
+  %ret1 = call { <vscale x 1 x i32>, <vscale x 1 x i1> } @__vecz_b_nxv1_vp_masked_cmpxchg_align4_acquire_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> zeroinitializer, <vscale x 1 x i32> zeroinitializer, <vscale x 1 x i1> zeroinitializer, i32 4)
+  ret void
+}
+
+declare <vscale x 1 x i32> @__vecz_b_nxv1_vp_masked_atomicrmw_add_align4_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> %val, <vscale x 1 x i1> %mask, i32 %vl)
+
+declare { <vscale x 1 x i32>, <vscale x 1 x i1> } @__vecz_b_nxv1_vp_masked_cmpxchg_align4_acquire_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> %cmp, <vscale x 1 x i32> %newval, <vscale x 1 x i1> %mask, i32 %vl)
+
+; CHECK: define <vscale x 1 x i32> @__vecz_b_nxv1_vp_masked_atomicrmw_add_align4_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> %val, <vscale x 1 x i1> %mask, i32 %vl) {
+; CHECK: entry:
+; CHECK: [[VLZERO:%.*]] = icmp eq i32 %vl, 0
+; CHECK: br i1 [[VLZERO]], label %earlyexit, label %loopentry

+; CHECK: earlyexit:
+; CHECK: ret <vscale x 1 x i32> poison

+; CHECK: loopentry:
+; CHECK: br label %loopIR

+; CHECK: loopIR:
+; CHECK: [[IDX:%.*]] = phi i32 [ 0, %loopentry ], [ [[INC:%.*]], %if.else ]
+; CHECK: [[RET_PREV:%.*]] = phi <vscale x 1 x i32> [ poison, %loopentry ], [ [[MERGE:%.*]], %if.else ]
+; CHECK: [[MASKELT:%.*]] = extractelement <vscale x 1 x i1> %mask, i32 [[IDX]]
+; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false
+; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else

+; CHECK: if.then:
+; CHECK: [[PTR:%.*]] = extractelement <vscale x 1 x ptr> %p, i32 [[IDX]]
+; CHECK: [[VAL:%.*]] = extractelement <vscale x 1 x i32> %val, i32 [[IDX]]
+; CHECK: [[ATOM:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VAL]] acquire, align 4
+; CHECK: [[RET_NEXT:%.*]] = insertelement <vscale x 1 x i32> [[RET_PREV]], i32 [[ATOM]], i32 [[IDX]]
+; CHECK: br label %if.else

+; CHECK: if.else:
+; CHECK: [[MERGE:%.*]] = phi <vscale x 1 x i32> [ [[RET_PREV]], %loopIR ], [ [[RET_NEXT]], %if.then ]
+; CHECK: [[INC]] = add i32 [[IDX]], 1
+; CHECK: [[CMP:%.*]] = icmp ult i32 [[INC]], %vl
+; CHECK: br i1 [[CMP]], label %loopIR, label %exit

+; CHECK: exit:
+; CHECK: ret <vscale x 1 x i32> [[MERGE]]

+; CHECK: define { <vscale x 1 x i32>, <vscale x 1 x i1> } @__vecz_b_nxv1_vp_masked_cmpxchg_align4_acquire_acquire_1_u9nxv1u3ptru5nxv1ju5nxv1ju5nxv1b(<vscale x 1 x ptr> %p, <vscale x 1 x i32> %cmp, <vscale x 1 x i32> %newval, <vscale x 1 x i1> %mask, i32 %vl) {
+; CHECK: entry:
+; CHECK: [[VLZERO:%.*]] = icmp eq i32 %vl, 0
+; CHECK: br i1 [[VLZERO]], label %earlyexit, label %loopentry

+; CHECK: earlyexit:
+; CHECK: ret { <vscale x 1 x i32>, <vscale x 1 x i1> } poison

+; CHECK: loopentry:
+; CHECK: br label %loopIR

+; CHECK: loopIR:
+; CHECK: [[IDX:%.*]] = phi i32 [ 0, %loopentry ], [ [[INC:%.*]], %if.else ]
+; CHECK: [[RET_PREV:%.*]] = phi <vscale x 1 x i32> [ poison, %loopentry ], [ [[MERGE:%.*]], %if.else ]
+; CHECK: [[SUCCESS_PREV:%.*]] = phi <vscale x 1 x i1> [ poison, %loopentry ], [ [[MERGE_SUCCESS:%.*]], %if.else ]
+; CHECK: [[MASKELT:%.*]] = extractelement <vscale x 1 x i1> %mask, i32 [[IDX]]
+; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false
+; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else

+; CHECK: if.then:
+; CHECK: [[PTR:%.*]] = extractelement <vscale x 1 x ptr> %p, i32 [[IDX]]
+; CHECK: [[CMP:%.*]] = extractelement <vscale x 1 x i32> %cmp, i32 [[IDX]]
+; CHECK: [[NEWVAL:%.*]] = extractelement <vscale x 1 x i32> %newval, i32 [[IDX]]
+; CHECK: [[ATOM:%.*]] = cmpxchg ptr [[PTR]], i32 [[CMP]], i32 [[NEWVAL]] acquire acquire, align 4
+; CHECK: [[EXT0:%.*]] = extractvalue { i32, i1 } [[ATOM]], 0
+; CHECK: [[RET:%.*]] = insertelement <vscale x 1 x i32> [[RET_PREV]], i32 [[EXT0]], i32 [[IDX]]
+; CHECK: [[EXT1:%.*]] = extractvalue { i32, i1 } [[ATOM]], 1
+; CHECK: [[SUCCESS:%.*]] = insertelement <vscale x 1 x i1> [[SUCCESS_PREV]], i1 [[EXT1]], i32 [[IDX]]
+; CHECK: br label %if.else

+; CHECK: if.else:
+; CHECK: [[MERGE:%.*]] = phi <vscale x 1 x i32> [ [[RET_PREV]], %loopIR ], [ [[RET]], %if.then ]
+; CHECK: [[MERGE_SUCCESS:%.*]] = phi <vscale x 1 x i1> [ [[SUCCESS_PREV]], %loopIR ], [ [[SUCCESS]], %if.then ]
+; CHECK: [[INC]] = add i32 [[IDX]], 1
+; CHECK: [[CMP:%.*]] = icmp ult i32 [[INC]], %vl
+; CHECK: br i1 [[CMP]], label %loopIR, label %exit

+; CHECK: exit:
+; CHECK: [[RETTMP:%.*]] = insertvalue { <vscale x 1 x i32>, <vscale x 1 x i1> } poison, <vscale x 1 x i32> [[MERGE]], 0
+; CHECK: [[RETVAL:%.*]] = insertvalue { <vscale x 1 x i32>, <vscale x 1 x i1> } [[RETTMP]], <vscale x 1 x i1> [[MERGE_SUCCESS]], 1
+; CHECK: ret { <vscale x 1 x i32>, <vscale x 1 x i1> } [[RETVAL]]
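+
+; Both builtins above are defined as a scalarization loop: lanes 0 to %vl-1
+; are visited in turn, the atomic operation runs only where the mask bit is
+; set, and a zero %vl returns poison via the early exit. The cmpxchg variant
+; additionally accumulates the success bits and returns both vectors as a
+; two-element struct.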
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
new file mode 100644
index 0000000000000..0ce65b9f4ca00
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/packetize_mask_varying.ll
@@ -0,0 +1,46 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k mask_varying -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; A kernel which should produce a uniform masked vector load where the mask is
+; a single varying splatted bit.
+define spir_kernel void @mask_varying(<4 x i32>* %aptr, <4 x i32>* %zptr) {
+entry:
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %mod_idx = urem i64 %idx, 2
+  %arrayidxa = getelementptr inbounds <4 x i32>, <4 x i32>* %aptr, i64 %idx
+  %ins = insertelement <4 x i1> poison, i1 true, i32 0
+  %cmp = icmp slt i64 %idx, 64
+  br i1 %cmp, label %if.then, label %if.end
+if.then:
+  %v = load <4 x i32>, <4 x i32>* %aptr
+  %arrayidxz = getelementptr inbounds <4 x i32>, <4 x i32>* %zptr, i64 %idx
+  store <4 x i32> %v, <4 x i32>* %arrayidxz, align 16
+  br label %if.end
+if.end:
+  ret void
+; CHECK: define spir_kernel void @__vecz_nxv4_vp_mask_varying
+; CHECK: [[CMP:%.*]] = icmp slt <vscale x 4 x i64> %{{.*}},
+; CHECK: [[RED:%.*]] = call i1 @llvm.vp.reduce.or.nxv4i1(i1 false, <vscale x 4 x i1> [[CMP]], <vscale x 4 x i1> {{.*}}, i32 {{.*}})
+; CHECK: [[VAL:%.*]] = load <4 x i32>, ptr %aptr
+}
+
+declare i64 @__mux_get_global_id(i32)
+declare <4 x i32> @__vecz_b_masked_load4_Dv4_jPDv4_jDv4_b(<4 x i32>*, <4 x i1>)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
new file mode 100644
index 0000000000000..9660f9a601365
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/scatter_gather.ll
@@ -0,0 +1,64 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-scalable -vecz-simd-width=4 -vecz-choices=VectorPredication -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare i64 @__mux_get_global_id(i32)
+
+; With VP all gathers become masked ones.
+define spir_kernel void @unmasked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %rem = urem i64 %call, 3
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %rem
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call
+  store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_vp_unmasked_gather(
+; CHECK: [[v:%.*]] = call <vscale x 4 x i32> @__vecz_b_masked_gather_load4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x ptr addrspace(1)> %{{.*}})
+; CHECK: call void @llvm.vp.store.nxv4i32.p1(<vscale x 4 x i32> [[v]],
+
+
+; With VP all scatters become masked ones.
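+; As with the gather above, the %rem index is not contiguous in the work-item
+; id, so the packetizer falls back to the masked scatter builtin whose
+; definition is checked at the end of this file.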
+define spir_kernel void @unmasked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %rem = urem i64 %call, 3
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %rem
+  store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv4_vp_unmasked_scatter(
+; CHECK: [[v:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p1(
+; CHECK: call void @__vecz_b_masked_scatter_store4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x i32> [[v]],
+
+; CHECK: define <vscale x 4 x i32> @__vecz_b_masked_gather_load4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x ptr addrspace(1)> %0, <vscale x 4 x i1> %1, i32 %2) [[ATTRS:#[0-9]+]] {
+; CHECK: %3 = call <vscale x 4 x i32> @llvm.vp.gather.nxv4i32.nxv4p1(<vscale x 4 x ptr addrspace(1)> %0, <vscale x 4 x i1> %1, i32 %2)
+; CHECK: ret <vscale x 4 x i32> %3
+
+; CHECK: define void @__vecz_b_masked_scatter_store4_vp_u5nxv4ju14nxv4u3ptrU3AS1u5nxv4bj(<vscale x 4 x i32> %0, <vscale x 4 x ptr addrspace(1)> %1, <vscale x 4 x i1> %2, i32 %3) [[ATTRS]] {
+; CHECK: entry:
+; CHECK: call void @llvm.vp.scatter.nxv4i32.nxv4p1(<vscale x 4 x i32> %0, <vscale x 4 x ptr addrspace(1)> %1, <vscale x 4 x i1> %2, i32 %3)
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
new file mode 100644
index 0000000000000..c5f015913aead
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions.ll
@@ -0,0 +1,240 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-choices=VectorPredication -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) +declare spir_func i32 @__mux_get_sub_group_id() + +declare spir_func i1 @__mux_sub_group_all_i1(i1) +declare spir_func i1 @__mux_sub_group_any_i1(i1) + +declare spir_func i32 @__mux_sub_group_reduce_add_i32(i32) +declare spir_func i64 @__mux_sub_group_reduce_add_i64(i64) +declare spir_func float @__mux_sub_group_reduce_fadd_f32(float) +declare spir_func i32 @__mux_sub_group_reduce_smin_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_umin_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_smax_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_umax_i32(i32) +declare spir_func float @__mux_sub_group_reduce_fmin_f32(float) +declare spir_func float @__mux_sub_group_reduce_fmax_f32(float) + +define spir_kernel void @reduce_all_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %1 = icmp ne i32 %0, 0 + %call2 = tail call spir_func i1 @__mux_sub_group_all_i1(i1 %1) + %2 = sext i1 %call2 to i32 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_all_i32( +; CHECK: [[C:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer +; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.and.v4i1(i1 true, <4 x i1> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_all_i1(i1 [[R]]) +; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32 +; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_any_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %1 = icmp ne i32 %0, 0 + %call2 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 %1) + %2 = sext i1 %call2 to i32 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_any_i32( +; CHECK: [[C:%.*]] = icmp ne <4 x i32> {{%.*}}, zeroinitializer +; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.or.v4i1(i1 false, <4 x i1> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 [[R]]) +; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32 +; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 %0) + %arrayidx3 
= getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_add_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.add.v4i32(i32 0, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call + %0 = load i64, i64 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i64 @__mux_sub_group_reduce_add_i64(i64 %0) + %arrayidx3 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %conv + store i64 %call2, i64 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_add_i64( +; CHECK: [[C:%.*]] = call <4 x i64> @llvm.vp.load.v4i64.p1( +; CHECK: [[R:%.*]] = call i64 @llvm.vp.reduce.add.v4i64(i64 0, <4 x i64> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_add_i64(i64 [[R]]) +; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fadd_f32(float %0) + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv + store float %call2, float addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_add_f32( +; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1( +; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fadd_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smin_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_smin_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.smin.v4i32(i32 2147483647, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smin_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 
@__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umin_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_umin_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.umin.v4i32(i32 -1, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umin_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smax_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_smax_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.smax.v4i32(i32 -2147483648, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smax_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umax_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_umax_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.umax.v4i32(i32 0, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umax_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float %0) + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv + store float %call2, float addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_fmin_f32( +; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1( +; CHECK: [[R:%.*]] = call float 
@llvm.vp.reduce.fmin.v4f32(float 0x7FF8000000000000, <4 x float> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fmax_f32(float %0) + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv + store float %call2, float addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_fmax_f32( +; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1( +; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fmax.v4f32(float 0xFFF8000000000000, <4 x float> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmax_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll new file mode 100644 index 0000000000000..c632bbefc304d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_reductions_spv_khr_uniform_group_instructions.ll @@ -0,0 +1,203 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-choices=VectorPredication -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) +declare spir_func i32 @__mux_get_sub_group_id() + +declare spir_func i32 @__mux_sub_group_reduce_mul_i32(i32) +declare spir_func i64 @__mux_sub_group_reduce_mul_i64(i64) +declare spir_func float @__mux_sub_group_reduce_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_reduce_and_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_or_i32(i32) +declare spir_func i64 @__mux_sub_group_reduce_xor_i64(i64) + +declare spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1) +declare spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1) +declare spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1) + +; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.mul.v4i32(i32 1, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_mul_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_mul_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + store i32 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_i64( +; CHECK: [[C:%.*]] = call <4 x i64> @llvm.vp.load.v4i64.p1( +; CHECK: [[R:%.*]] = call i64 @llvm.vp.reduce.mul.v4i64(i64 1, <4 x i64> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_mul_i64(i64 [[R]]) +; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_mul_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call + %0 = load i64, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i64 @__mux_sub_group_reduce_mul_i64(i64 %0) + %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv + store i64 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_mul_f32( +; CHECK: [[C:%.*]] = call <4 x float> @llvm.vp.load.v4f32.p1( +; CHECK: [[R:%.*]] = call float @llvm.vp.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmul_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) 
%arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fmul_f32(float %0) + %arrayidx3 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %conv + store float %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_and_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.and.v4i32(i32 -1, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_and_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_and_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + store i32 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_or_i32( +; CHECK: [[C:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p1( +; CHECK: [[R:%.*]] = call i32 @llvm.vp.reduce.or.v4i32(i32 0, <4 x i32> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_or_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_or_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + store i32 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_xor_i64( +; CHECK: [[C:%.*]] = call <4 x i64> @llvm.vp.load.v4i64.p1( +; CHECK: [[R:%.*]] = call i64 @llvm.vp.reduce.xor.v4i64(i64 0, <4 x i64> [[C]], {{.*}}) +; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 [[R]]) +; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 8 +define spir_kernel void @reduce_xor_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call + %0 = load i64, ptr addrspace(1) %arrayidx, align 8 + %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 %0) + %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv + store i64 %call2, ptr addrspace(1) %arrayidx3, align 8 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_and( +; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.and.v4i1(i1 true, <4 x i1> [[T:%.*]], {{.*}}) +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 [[R]]) +; CHECK: [[R:%.*]] = zext i1 %call2 to i32 +; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail 
call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 %1) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + %zext = zext i1 %call2 to i32 + store i32 %zext, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_or( +; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.or.v4i1(i1 false, <4 x i1> [[T:%.*]], {{.*}}) +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 [[R]]) +; CHECK: [[R:%.*]] = zext i1 %call2 to i32 +; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 %1) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + %zext = zext i1 %call2 to i32 + store i32 %zext, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_logical_xor( +; CHECK: [[R:%.*]] = call i1 @llvm.vp.reduce.xor.v4i1(i1 false, <4 x i1> [[T:%.*]], {{.*}}) +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[R]]) +; CHECK: [[R:%.*]] = zext i1 %call2 to i32 +; CHECK: store i32 [[R]], ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 %1) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + %zext = zext i1 %call2 to i32 + store i32 %zext, ptr addrspace(1) %arrayidx3, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll new file mode 100644 index 0000000000000..a2da3addcbccf --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans.ll @@ -0,0 +1,153 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S -vecz-passes=packetizer -vecz-choices=VectorPredication < %s | FileCheck %s + +; Tests the use of the VectorPredication choice. However, note that this option +; currently makes no difference on fixed length vectors. + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32) +declare spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64) +declare spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32) +declare spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float) +declare spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float) + +define spir_kernel void @reduce_scan_incl_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_add_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call + %0 = load i64, i64 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64 %0) + %arrayidx2 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call + store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_add_i64( +; CHECK: call <4 x i64> @__vecz_b_sub_group_scan_inclusive_add_Dv4_m(<4 x i64> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_add_f32( +; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_Dv4_f(<4 x float> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* 
%arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_smin_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_Dv4_i(<4 x i32> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_umin_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_Dv4_j(<4 x i32> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_smax_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_Dv4_i(<4 x i32> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_umax_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_Dv4_j(<4 x i32> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_fmin_f32( +; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_Dv4_f(<4 x float> %{{.*}}) +} + +define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float 
%0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_fmax_f32( +; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_Dv4_f(<4 x float> %{{.*}}) +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll new file mode 100644 index 0000000000000..3ec97bda6fb12 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/subgroup_scans_spv_khr_uniform_group_instructions.ll @@ -0,0 +1,174 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S -vecz-passes=packetizer -vecz-choices=VectorPredication < %s | FileCheck %s + +; Tests the use of the VectorPredication choice. However, note that this option +; currently makes no difference on fixed length vectors. 
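+;
+; Illustrative, hedged sketch (not part of the imported test and not checked
+; by FileCheck): with a scalable factor, where VectorPredication does change
+; the generated code, the packetizer emits VP intrinsics that take a mask and
+; an explicit effective-vector-length operand, as the scalable udiv.ll test
+; later in this patch demonstrates. The operands %a, %b, %mask and %evl below
+; are hypothetical:
+;
+;   %prod = call <vscale x 4 x i32> @llvm.vp.mul.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, i32 %evl)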
+ +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1) + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_mul_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_excl_mul_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_mul_f32( +; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_f(<4 x float> %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float %0) + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_excl_mul_f32( +; CHECK: call <4 x float> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_f(<4 x float> %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float %0) + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) 
%arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_and_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_and_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_or_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_or_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_xor_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_logical_and( +; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_and_Dv4_b(<4 x i1> %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_vp_reduce_scan_incl_logical_or( +; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_or_Dv4_b(<4 x i1> %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: 
@__vecz_v4_vp_reduce_scan_incl_logical_xor( +; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_b(<4 x i1> %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll new file mode 100644 index 0000000000000..e28025d5bccfc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorPredication/udiv.ll @@ -0,0 +1,48 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k udiv -vecz-scalable -vecz-simd-width=2 -vecz-choices=VectorPredication -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @udiv(i32* %aptr, i32* %bptr, i32* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds i32, i32* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds i32, i32* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load i32, i32* %arrayidxb, align 4 + %sum = udiv i32 %a, %b + store i32 %sum, i32* %arrayidxz, align 4 + ret void +} + +; CHECK: define spir_kernel void @__vecz_nxv2_vp_udiv( +; CHECK: [[LID:%.*]] = call i64 @__mux_get_local_id(i32 0) +; CHECK: [[LSIZE:%.*]] = call i64 @__mux_get_local_size(i32 0) +; CHECK: [[WREM:%.*]] = sub nuw nsw i64 [[LSIZE]], [[LID]] +; CHECK: [[T0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK: [[T1:%.*]] = shl {{(nuw )?}}i64 [[T0]], 1 +; CHECK: [[T2:%.*]] = call i64 @llvm.umin.i64(i64 [[WREM]], i64 [[T1]]) +; CHECK: [[VL:%.*]] = trunc i64 [[T2]] to i32 +; CHECK: [[LHS:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr {{%.*}}, <vscale x 2 x i1> [[TRUEMASK: (shufflevector \(<vscale x 2 x i1> insertelement \(<vscale x 2 x i1> poison, i1 true, (i32|i64) 0\), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer\)|splat \(i1 true\))]], i32 [[VL]]) +; CHECK: [[RHS:%.*]] = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr {{%.*}}, <vscale x 2 x i1> [[TRUEMASK]], i32 [[VL]]) +; CHECK: [[ADD:%.*]] = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> [[LHS]], <vscale x 2 x i32> [[RHS]], <vscale x 2 x i1> [[TRUEMASK]], i32 [[VL]]) +; CHECK: call void @llvm.vp.store.nxv2i32.p0(<vscale x 2 x i32> [[ADD]], ptr {{%.*}}, <vscale x 2 x i1> [[TRUEMASK]], i32 [[VL]]) diff --git
a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll new file mode 100644 index 0000000000000..7c71b06f530c3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load.ll @@ -0,0 +1,66 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" 
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { nobuiltin nounwind } + +; Test if the interleaved load is NOT defined +; CHECK-NOT: define <4 x double> @__vecz_b_interleaved_load4_Dv4_du3ptrU3AS1(ptr addrspace(1){{( %0)?}}) + +; Wide load instead +; CHECK: load <16 x double> diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll new file mode 100644 index 0000000000000..7c71b06f530c3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/define_interleaved_load_as_masked.ll @@ -0,0 +1,66 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { nobuiltin nounwind } + +; Test if the interleaved load is NOT defined +; CHECK-NOT: define <4 x double> @__vecz_b_interleaved_load4_Dv4_du3ptrU3AS1(ptr addrspace(1){{( %0)?}}) + +; Wide load instead +; CHECK: load <16 x double> diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll new file mode 100644 index 0000000000000..c2c7b68912910 --- /dev/null +++ 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/delete_packetized_memop.ll @@ -0,0 +1,70 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k memop_loop_dep -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @memop_loop_dep(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %i, i32 %e) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.addr.0 = phi i32 [ %i, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.addr.0, %e + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %call1 = call spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64 %call, i32 addrspace(1)* %in) + call spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32> %call1, i64 %call, i32 addrspace(1)* %out) + %0 = extractelement <4 x i32> %call1, i64 0 + %tobool = icmp ne i32 %0, 0 + %tobool2 = icmp eq i64 %call, 0 + %or.cond = and i1 %tobool2, %tobool + br i1 %or.cond, label %while.cond, label %for.inc + +while.cond: ; preds = %while.cond, %for.body + %tobool3 = icmp eq i64 %call, 0 + br i1 %tobool3, label %for.inc, label %while.cond + +for.inc: ; preds = %for.body, %while.cond + %inc = add nsw i32 %i.addr.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64, i32 addrspace(1)*) + +declare spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32>, i64, i32 addrspace(1)*) + +; CHECK: define spir_kernel void @__vecz_v4_memop_loop_dep + +; Vector widening results in a single load +; CHECK: load <16 x i32> +; CHECK-NOT: call {{.*}}i32 @__vecz_b_interleaved_load4_ju3ptrU3AS1 + +; CHECK: ret void + +; Check that the declaration is missing as well +; CHECK-NOT: @__vecz_b_interleaved_load4_ju3ptrU3AS1 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll new file mode 100644 index 0000000000000..2d28ba251b055 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_constant_index.ll @@ -0,0 +1,40 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k extract_constant_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @extract_constant_index(<4 x i64> addrspace(1)* %in, i32 %x, i64 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %in, i64 %call + %0 = load <4 x i64>, <4 x i64> addrspace(1)* %arrayidx, align 4 + %vecext = extractelement <4 x i64> %0, i32 0 + %arrayidx1 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call + store i64 %vecext, i64 addrspace(1)* %arrayidx1, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +; CHECK: define spir_kernel void @__vecz_v4_extract_constant_index +; CHECK: %[[LD:.+]] = load <16 x i64> +; CHECK: %[[EXT:.+]] = shufflevector <16 x i64> %[[LD]], <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12> +; CHECK: store <4 x i64> %[[EXT]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll new file mode 100644 index 0000000000000..7739407d482f8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index.ll @@ -0,0 +1,55 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: nounwind +define spir_kernel void @extract_runtime_index(<4 x float> addrspace(1)* %in, i32 %x, float addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 4 + %vecext = extractelement <4 x float> %0, i32 %x + %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %vecext, float addrspace(1)* %arrayidx1, align 4 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_extract_runtime_index +; CHECK: %[[LD:.+]] = load <16 x float>, ptr addrspace(1) % + +; No splitting of the widened source vector +; CHECK-NOT: shufflevector + +; Extract directly from the widened source and insert directly into result +; CHECK: %[[EXT0:.+]] = extractelement <16 x float> %[[LD]], i32 %x +; CHECK: %[[INS0:.+]] = insertelement <4 x float> poison, float %[[EXT0]], i32 0 +; CHECK: %[[IDX1:.+]] = add i32 %x, 4 +; CHECK: %[[EXT1:.+]] = extractelement <16 x float> %[[LD]], i32 %[[IDX1]] +; CHECK: %[[INS1:.+]] = insertelement <4 x float> %[[INS0]], float %[[EXT1]], i32 1 +; CHECK: %[[IDX2:.+]] = add i32 %x, 8 +; CHECK: %[[EXT2:.+]] = extractelement <16 x float> %[[LD]], i32 %[[IDX2]] +; CHECK: %[[INS2:.+]] = insertelement <4 x float> %[[INS1]], float %[[EXT2]], i32 2 +; CHECK: %[[IDX3:.+]] = add i32 %x, 12 +; CHECK: %[[EXT3:.+]] = extractelement <16 x float> %[[LD]], i32 %[[IDX3]] +; CHECK: %[[INS3:.+]] = insertelement <4 x float> %[[INS2]], float %[[EXT3]], i32 3 +; CHECK: store <4 x float> %[[INS3]], ptr addrspace(1) %{{.+}} +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll new file mode 100644 index 0000000000000..405cff01c2e34 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index2.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: nounwind +define spir_kernel void @extract_runtime_index(i32 addrspace(1)* %in, <4 x i8> %x, i8 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %vecext = extractelement <4 x i8> %x, i32 %0 + %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %call + store i8 %vecext, i8 addrspace(1)* %arrayidx1, align 1 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_extract_runtime_index +; CHECK: %[[LD:.+]] = load <4 x i32>, ptr addrspace(1) % + +; No splitting of the widened source vector +; CHECK-NOT: shufflevector + +; Extract directly from the uniform source with vectorized indices and insert directly into result +; CHECK: %[[IND0:.+]] = extractelement <4 x i32> %[[LD]], i32 0 +; CHECK: %[[EXT0:.+]] = extractelement <4 x i8> %x, i32 %[[IND0]] +; CHECK: %[[INS0:.+]] = insertelement <4 x i8> poison, i8 %[[EXT0]], i32 0 +; CHECK: %[[IND1:.+]] = extractelement <4 x i32> %[[LD]], i32 1 +; CHECK: %[[EXT1:.+]] = extractelement <4 x i8> %x, i32 %[[IND1]] +; CHECK: %[[INS1:.+]] = insertelement <4 x i8> %[[INS0]], i8 %[[EXT1]], i32 1 +; CHECK: %[[IND2:.+]] = extractelement <4 x i32> %[[LD]], i32 2 +; CHECK: %[[EXT2:.+]] = extractelement <4 x i8> %x, i32 %[[IND2]] +; CHECK: %[[INS2:.+]] = insertelement <4 x i8> %[[INS1]], i8 %[[EXT2]], i32 2 +; CHECK: %[[IND3:.+]] = extractelement <4 x i32> %[[LD]], i32 3 +; CHECK: %[[EXT3:.+]] = extractelement <4 x i8> %x, i32 %[[IND3]] +; CHECK: %[[INS3:.+]] = insertelement <4 x i8> %[[INS2]], i8 %[[EXT3]], i32 3 +; CHECK: store <4 x i8> %[[INS3]], ptr addrspace(1) %{{.+}}, align 1 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll new file mode 100644 index 0000000000000..70d1908c8a9ab --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/extractelement_runtime_index3.ll @@ -0,0 +1,62 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: nounwind +define spir_kernel void @extract_runtime_index(<4 x float> addrspace(1)* %in, i32 addrspace(1)* %x, float addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %x, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 4 + %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 + %vecext = extractelement <4 x float> %0, i32 %1 + %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %vecext, float addrspace(1)* %arrayidx1, align 4 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_extract_runtime_index +; CHECK: %[[SRC:.+]] = load <16 x float>, ptr addrspace(1) % +; CHECK: %[[IDX:.+]] = load <4 x i32>, ptr addrspace(1) % + +; No splitting of the widened source vector +; CHECK-NOT: shufflevector + +; Offset the indices +; CHECK: %[[ADD:.+]] = add <4 x i32> %[[IDX]], <i32 0, i32 4, i32 8, i32 12> + +; Extract directly from the widened source with vectorized indices and insert directly into result +; CHECK: %[[IND0:.+]] = extractelement <4 x i32> %[[ADD]], i32 0 +; CHECK: %[[EXT0:.+]] = extractelement <16 x float> %[[SRC]], i32 %[[IND0]] +; CHECK: %[[INS0:.+]] = insertelement <4 x float> poison, float %[[EXT0]], i32 0 +; CHECK: %[[IND1:.+]] = extractelement <4 x i32> %[[ADD]], i32 1 +; CHECK: %[[EXT1:.+]] = extractelement <16 x float> %[[SRC]], i32 %[[IND1]] +; CHECK: %[[INS1:.+]] = insertelement <4 x float> %[[INS0]], float %[[EXT1]], i32 1 +; CHECK: %[[IND2:.+]] = extractelement <4 x i32> %[[ADD]], i32 2 +; CHECK: %[[EXT2:.+]] = extractelement <16 x float> %[[SRC]], i32 %[[IND2]] +; CHECK: %[[INS2:.+]] = insertelement <4 x float> %[[INS1]], float %[[EXT2]], i32 2 +; CHECK: %[[IND3:.+]] = extractelement <4 x i32> %[[ADD]], i32 3 +; CHECK: %[[EXT3:.+]] = extractelement <16 x float> %[[SRC]], i32 %[[IND3]] +; CHECK: %[[INS3:.+]] = insertelement <4 x float> %[[INS2]], float %[[EXT3]], i32 3 +; CHECK: store <4 x float> %[[INS3]], ptr addrspace(1) %{{.+}} +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll new file mode 100644 index 0000000000000..2d767313e0ddc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k constant_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @constant_index(<4 x i32>* %in, i32* %inval, <4 x i32>* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call + %0 = load <4 x i32>, <4 x i32>* %arrayidx + %arrayidx2 = getelementptr inbounds i32, i32* %inval, i64 %call + %ldval = load i32, i32* %arrayidx2 + %arrayidx3 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call + %vecins = insertelement <4 x i32> %0, i32 %ldval, i32 2 + store <4 x i32> %vecins, <4 x i32>* %arrayidx3 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_constant_index + +; A single wide load +; CHECK: %[[INTO:.+]] = load <16 x i32>, ptr % + +; The vectorized element load: +; CHECK: %[[ELTS:.+]] = load <4 x i32>, ptr % + +; No interleaved loads +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load4_Dv4_ju3ptr + +; Insert elements turned into shufflevectors +; CHECK: %[[WIDE:.+]] = shufflevector <4 x i32> %[[ELTS]], <4 x i32> poison, <16 x i32> +; CHECK: %[[INS:.+]] = shufflevector <16 x i32> %[[WIDE]], <16 x i32> %[[INTO]], <16 x i32> + +; No more shuffles.. +; CHECK-NOT: shufflevector + +; We should have one widened store +; CHECK: store <16 x i32> %[[INS]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll new file mode 100644 index 0000000000000..9c023a64e57ed --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_constant_index_constant_value.ll @@ -0,0 +1,54 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k constant_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @constant_index(<4 x i32>* %in, <4 x i32>* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call + %0 = load <4 x i32>, <4 x i32>* %arrayidx + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call + %vecins = insertelement <4 x i32> %0, i32 42, i32 2 + store <4 x i32> %vecins, <4 x i32>* %arrayidx2 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_constant_index + +; A single wide load +; CHECK: %[[INTO:.+]] = load <16 x i32>, ptr % + +; No interleaved loads +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load4_Dv4_ju3ptr + +; Insert constant elements into the widened vector: +; CHECK: %[[INS0:.+]] = insertelement <16 x i32> %[[INTO]], i32 42, i32 2 +; CHECK: %[[INS1:.+]] = insertelement <16 x i32> %[[INS0]], i32 42, i32 6 +; CHECK: %[[INS2:.+]] = insertelement <16 x i32> %[[INS1]], i32 42, i32 10 +; CHECK: %[[INS3:.+]] = insertelement <16 x i32> %[[INS2]], i32 42, i32 14 + +; No shuffles.. +; CHECK-NOT: shufflevector + +; We should have one widened store +; CHECK: store <16 x i32> %[[INS3]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll new file mode 100644 index 0000000000000..05ccf997a7d0a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/insertelement_runtime_index.ll @@ -0,0 +1,60 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k runtime_index -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @runtime_index(<4 x i32>* %in, <4 x i32>* %out, i32* %index) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call + %0 = load <4 x i32>, <4 x i32>* %arrayidx + %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call + store <4 x i32> %0, <4 x i32>* %arrayidx1 + %arrayidx2 = getelementptr inbounds i32, i32* %index, i64 %call + %1 = load i32, i32* %arrayidx2 + %arrayidx3 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call + %vecins = insertelement <4 x i32> %0, i32 42, i32 %1 + store <4 x i32> %vecins, <4 x i32>* %arrayidx3 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_runtime_index + +; CHECK: %[[INTO:.+]] = load <16 x i32>, ptr %arrayidx, align 16 +; CHECK: %[[LD:.+]] = load <4 x i32>, ptr +; CHECK: %[[ADD:.+]] = add <4 x i32> %[[LD]], <i32 0, i32 4, i32 8, i32 12> + +; The inserts got widened +; CHECK: %[[ELT0:.+]] = extractelement <4 x i32> %[[ADD]], i32 0 +; CHECK: %[[INS0:.+1]] = insertelement <16 x i32> %[[INTO]], i32 42, i32 %[[ELT0]] +; CHECK: %[[ELT1:.+]] = extractelement <4 x i32> %[[ADD]], i32 1 +; CHECK: %[[INS1:.+]] = insertelement <16 x i32> %[[INS0]], i32 42, i32 %[[ELT1]] +; CHECK: %[[ELT2:.+]] = extractelement <4 x i32> %[[ADD]], i32 2 +; CHECK: %[[INS2:.+]] = insertelement <16 x i32> %[[INS1]], i32 42, i32 %[[ELT2]] +; CHECK: %[[ELT3:.+]] = extractelement <4 x i32> %[[ADD]], i32 3 +; CHECK: %[[INS3:.+]] = insertelement <16 x i32> %[[INS2]], i32 42, i32 %[[ELT3]] + +; No shuffles.. +; CHECK-NOT: shufflevector + +; One widened store directly storing the result +; CHECK: store <16 x i32> %[[INS3]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll new file mode 100644 index 0000000000000..00853a9b28b94 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/interleaved_safety.ll @@ -0,0 +1,98 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-simd-width 4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"} +!4 = 
!{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"} +!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""} +!6 = !{!"clang version 3.8.1 "} + +; Function start +; CHECK: define spir_kernel void @__vecz_v4_f +; CHECK: call i64 @__mux_get_global_id(i32 0) + +; There should be exactly six vector loads and one store in the code +; CHECK: load <16 x double> + +; And in between them there should be a barrier call +; CHECK: call void @__mux_work_group_barrier +; CHECK: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> {{<(double 1.600000e\+01(, )?)+>|splat \(double 1.600000e\+01\)}} +; CHECK: load <16 x double> +; CHECK: load <16 x double> +; CHECK: load <16 x double> +; CHECK: load <16 x double> + +; The fmuladd instrinsic will be widened.. +; CHECK: call <16 x double> @llvm.fmuladd.v16f64 +; CHECK: load <16 x double> +; CHECK: store <16 x double> + +; There shouldn't be any interleaved loads or stores left +; CHECK-NOT: call <4 x double> @__vecz_b_interleaved_load +; CHECK-NOT: call void @__vecz_b_interleaved_store + +; Function end +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll new file mode 100644 index 0000000000000..3aae73704e7c9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_d.ll @@ -0,0 +1,41 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isfiniteDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isfiniteDv4_d +; CHECK: call i64 @__mux_get_global_id(i32 0) +; CHECK: and <16 x i64> +; CHECK: icmp slt <16 x i64> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll new file mode 100644 index 0000000000000..08a97d76842e7 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isfiniteDv4_f.ll @@ -0,0 +1,40 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isfiniteDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isfiniteDv4_f +; CHECK: and <16 x i32> +; CHECK: icmp slt <16 x i32> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll new file mode 100644 index 0000000000000..1431fa1c19573 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_d.ll @@ -0,0 +1,40 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isinfDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isinfDv4_d +; CHECK: and <16 x i64> +; CHECK: icmp eq <16 x i64> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll new file mode 100644 index 0000000000000..83054e694801a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isinfDv4_f.ll @@ -0,0 +1,40 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isinfDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isinfDv4_f +; CHECK: and <16 x i32> +; CHECK: icmp eq <16 x i32> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll new file mode 100644 index 0000000000000..945ac791355c0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_d.ll @@ -0,0 +1,43 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnanDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnanDv4_d +; CHECK: and <16 x i64> +; CHECK: icmp eq <16 x i64> +; CHECK: and <16 x i64> +; CHECK: icmp sgt <16 x i64> +; CHECK: and <16 x i1> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll new file mode 100644 index 0000000000000..86139d572338b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnanDv4_f.ll @@ -0,0 +1,43 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnanDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnanDv4_f +; CHECK: and <16 x i32> +; CHECK: icmp eq <16 x i32> +; CHECK: and <16 x i32> +; CHECK: icmp sgt <16 x i32> +; CHECK: and <16 x i1> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll new file mode 100644 index 0000000000000..05117b1b691dc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_d.ll @@ -0,0 +1,42 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnormalDv4_d -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnormalDv4_d +; CHECK: and <16 x i64> +; CHECK: icmp slt <16 x i64> +; CHECK: icmp sgt <16 x i64> +; CHECK: and <16 x i1> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll new file mode 100644 index 0000000000000..d33853b4e8d32 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/onearg_relationals_isnormalDv4_f.ll @@ -0,0 +1,42 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnormalDv4_f -vecz-simd-width=4 -vecz-passes=builtin-inlining,packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnormalDv4_f +; CHECK: and <16 x i32> +; CHECK: icmp slt <16 x i32> +; CHECK: icmp sgt <16 x i32> +; CHECK: and <16 x i1> +; CHECK: sext <16 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll new file mode 100644 index 0000000000000..4c887d9c66be5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/scalar_vector_user.ll @@ -0,0 +1,79 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k scalar_vector_user -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind readnone +declare i64 @__mux_get_local_id(i32) #0 + +; Function Attrs: nounwind readnone +declare spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float>, <4 x float>, <4 x float>) #0 + +declare spir_func void @_Z7vstore4Dv4_fmPU3AS1f(<4 x float>, i64, float addrspace(1)*) + +declare spir_func <4 x float> @_Z6vload4mPU3AS3Kf(i64, float addrspace(1)*) +; Function Attrs: inlinehint norecurse nounwind readnone +declare spir_func float @_Z3madfff(float, float, float) local_unnamed_addr #2 + +define spir_kernel void @scalar_vector_user(float addrspace(1)* %inout, i64 %n) { +entry: + %lid = tail call i64 @__mux_get_local_id(i32 0) #0 + %inout.address = getelementptr inbounds float, float addrspace(1)* %inout, i64 %lid + br label %loop + +loop: ; preds = %entry, %loop + %madv4.prev = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4, %loop ] + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ] + %i.inc = add nuw nsw i64 %i, 1 + %cmp = icmp slt i64 %i.inc, %n + %inout.vload = tail call spir_func <4 x float> @_Z6vload4mPU3AS3Kf(i64 0, float addrspace(1)* %inout.address) + %inout.vec0 = shufflevector <4 x float> %inout.vload, <4 x float> poison, <4 x i32> zeroinitializer + %madv4 = tail call spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float> %inout.vload, <4 x float> %inout.vec0, <4 x float> %madv4.prev) #0 + br i1 %cmp, label %loop, label %end + +end: ; preds = %loop + %mad.vec0 = extractelement <4 x float> %madv4, i32 0 + store float %mad.vec0, float addrspace(1)* %inout.address, align 4 + tail call spir_func void @_Z7vstore4Dv4_fmPU3AS1f(<4 x float> %madv4, i64 0, float addrspace(1)* %inout.address) + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { noduplicate } +attributes #2 = { inlinehint norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } + +; The purpose of this test is to make sure we correctly scalarize an instruction +; used by both a scalar and vector instruction. We would previously try to +; scalarize its users twice, resulting in invalid IR.
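+; With -vecz-simd-width=4 the <4 x float> accumulator should therefore become a single <16 x float> phi, built by concatenating the four per-work-item mad results; that shuffle chain is what the checks trace.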
+ +; CHECK: define spir_kernel void @__vecz_v4_scalar_vector_user +; CHECK: loop: +; CHECK: %madv4.prev{{.*}} = phi <16 x float> [ zeroinitializer, %entry ], [ %[[CONCAT:.+]], %loop ]{{$}} + +; make sure the above PHI incomings are unique by looking for their definitions +; one day we might be able to super-vectorize this call, but for now we instantiate and concatenate it +; CHECK: %madv4[[S0:[0-9]+]] = +; CHECK: %madv4[[S1:[0-9]+]] = +; CHECK: %madv4[[S2:[0-9]+]] = +; CHECK: %madv4[[S3:[0-9]+]] = +; CHECK: %[[C0:.+]] = shufflevector <4 x float> %madv4[[S0]], <4 x float> %madv4[[S1]], <8 x i32> +; CHECK: %[[C1:.+]] = shufflevector <4 x float> %madv4[[S2]], <4 x float> %madv4[[S3]], <8 x i32> +; CHECK: %[[CONCAT]] = shufflevector <8 x float> %[[C0]], <8 x float> %[[C1]], <16 x i32> diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll new file mode 100644 index 0000000000000..6fab62e9ca4a8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_copy.ll @@ -0,0 +1,47 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k vector_copy -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @vector_copy(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %call + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, align 16 + %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %0, <4 x i32> addrspace(1)* %arrayidx1, align 16 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +; It makes sure the vector load and store are preserved right through to packetization +; and then widened, instead of being scalarized across work-items first +; and then getting de-interleaved by the Interleaved Group Combine Pass. +; We expect a single vector load feeding directly into a single vector store.
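+; In other words, after packetization at width 4 the kernel should contain exactly one <16 x i32> load feeding one <16 x i32> store, with no de-interleaving or re-interleaving shuffles in between.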
+ +; CHECK: define spir_kernel void @__vecz_v4_vector_copy +; CHECK: load <16 x i32> +; CHECK-NOT: load +; CHECK-NOT: %deinterleave{{[0-9]*}} = shufflevector +; CHECK-NOT: %interleave{{[0-9]*}} = shufflevector +; CHECK: store <16 x i32> +; CHECK-NOT: store diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll new file mode 100644 index 0000000000000..e5054ae1201e7 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/vector_phi_varying.ll @@ -0,0 +1,91 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k vector_loop -vecz-simd-width=4 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %call.trunc = trunc i64 %call to i32 + %call.splatinsert = insertelement <4 x i32> poison, i32 %call.trunc, i32 0 + %call.splat = shufflevector <4 x i32> %call.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + %cmp = icmp eq i64 %call, 0 + br i1 %cmp, label %for.end, label %for.cond + +for.cond: ; preds = %entry, %for.body + %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ] + %call1 = call i64 @__mux_get_global_size(i32 0) + %conv = trunc i64 %call1 to i32 + %splat.splatinsert = insertelement <4 x i32> poison, i32 %conv, i32 0 + %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat + %0 = extractelement <4 x i1> %cmp2, i64 0 + br i1 %0, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %3 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom3 = sext i32 %3 to i64 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom3 + store i32 %2, i32 addrspace(1)* %arrayidx4, align 4 + %4 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom5 = sext i32 %4 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom5 + %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %6 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom7 = sext i32 %6 to i64 + %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom7 + store i32 %5, i32 addrspace(1)* %arrayidx8, align 4 
+ %7 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom9 = sext i32 %7 to i64 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9 + %8 = load i32, i32 addrspace(1)* %arrayidx10, align 4 + %9 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom11 = sext i32 %9 to i64 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 %8, i32 addrspace(1)* %arrayidx12, align 4 + %10 = extractelement <4 x i32> %storemerge, i64 3 + %idxprom13 = sext i32 %10 to i64 + %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom13 + %11 = load i32, i32 addrspace(1)* %arrayidx14, align 4 + %12 = extractelement <4 x i32> %storemerge, i64 3 + %idxprom15 = sext i32 %12 to i64 + %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom15 + store i32 %11, i32 addrspace(1)* %arrayidx16, align 4 + %inc = add <4 x i32> %storemerge, %call.splat + br label %for.cond + +for.end: ; preds = %entry, %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_global_size(i32) + +; This test checks if a varying <4 x i32> phi is scalarized into 4 i32 phis +; and then re-packetized +; CHECK: define spir_kernel void @__vecz_v4_vector_loop +; CHECK: %[[STOREMERGE1:.+]] = phi <16 x i32> [ %[[INC2:.+]], %for.body ], [ zeroinitializer, %entry ] +; CHECK: %[[INC2]] = add <16 x i32> %[[STOREMERGE1]], [[CALL:.+]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll new file mode 100644 index 0000000000000..67965785ba932 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_abs.ll @@ -0,0 +1,67 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +declare i32 @llvm.abs.i32(i32, i1) +declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1) + +define spir_kernel void @absff(i32* %pa, i32* %pb) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr i32, i32* %pa, i64 %idx + %b = getelementptr i32, i32* %pb, i64 %idx + %la = load i32, i32* %a, align 16 + %res = call spir_func i32 @llvm.abs.i32(i32 %la, i1 true) + store i32 %res, i32* %b, align 16 + ret void +} + +define spir_kernel void @absvf(<2 x i32>* %pa, <2 x i32>* %pb) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <2 x i32>, <2 x i32>* %pa, i64 %idx + %b = getelementptr <2 x i32>, <2 x i32>* %pb, i64 %idx + %la = load <2 x i32>, <2 x i32>* %a, align 16 + %res = call spir_func <2 x i32> @llvm.abs.v2i32(<2 x i32> %la, i1 true) + store <2 x i32> %res, <2 x i32>* %b, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_absff(ptr %pa, ptr %pb) +; CHECK: entry: +; CHECK: %idx = call i64 @__mux_get_global_id(i32 0) +; CHECK: %a = getelementptr i32, ptr %pa, i64 %idx +; CHECK: %b = getelementptr i32, ptr %pb, i64 %idx +; CHECK: %[[T0:.*]] = load <4 x i32>, ptr %a, align 4 +; CHECK: %[[RES1:.+]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %[[T0]], i1 true) +; CHECK: store <4 x i32> %[[RES1]], ptr %b, align 4 +; CHECK: ret void + +; CHECK: define spir_kernel void @__vecz_v4_absvf(ptr %pa, ptr %pb) +; CHECK: entry: +; CHECK: %idx = call i64 @__mux_get_global_id(i32 0) +; CHECK: %a = getelementptr <2 x i32>, ptr %pa, i64 %idx +; CHECK: %b = getelementptr <2 x i32>, ptr %pb, i64 %idx +; CHECK: %[[T0:.*]] = load <8 x i32>, ptr %a, align 8 +; CHECK: %[[RES2:.+]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %[[T0]], i1 true) +; CHECK: store <8 x i32> %[[RES2]], ptr %b, align 8 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll new file mode 100644 index 0000000000000..96fbeb3ad959c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_binops.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k widen_binops -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @widen_binops(<4 x i32>* %pa, <4 x i32>* %pb, <4 x i64>* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <4 x i32>, <4 x i32>* %pa, i64 %idx + %b = getelementptr <4 x i32>, <4 x i32>* %pb, i64 %idx + %d = getelementptr <4 x i64>, <4 x i64>* %pd, i64 %idx + %la = load <4 x i32>, <4 x i32>* %a, align 16 + %lb = load <4 x i32>, <4 x i32>* %b, align 16 + %xa = zext <4 x i32> %la to <4 x i64> + %xb = zext <4 x i32> %lb to <4 x i64> + %add = add nuw nsw <4 x i64> %xa, %xb + store <4 x i64> %add, <4 x i64>* %d, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v8_widen_binops(ptr %pa, ptr %pb, ptr %pd) +; CHECK: entry: + +; It checks that the zexts and add of <4 x i32> get widened by a factor of 8, +; to produce PAIRs of <16 x i32>s. +; CHECK: %[[LDA0:.+]] = load <16 x i32>, ptr %{{.+}}, align 16 +; CHECK: %[[LDA1:.+]] = load <16 x i32>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB0:.+]] = load <16 x i32>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB1:.+]] = load <16 x i32>, ptr %{{.+}}, align 16 +; CHECK: %[[XA0:.+]] = zext <16 x i32> %[[LDA0]] to <16 x i64> +; CHECK: %[[XA1:.+]] = zext <16 x i32> %[[LDA1]] to <16 x i64> +; CHECK: %[[XB0:.+]] = zext <16 x i32> %[[LDB0]] to <16 x i64> +; CHECK: %[[XB1:.+]] = zext <16 x i32> %[[LDB1]] to <16 x i64> +; CHECK: %[[ADD0:.+]] = add nuw nsw <16 x i64> %[[XA0]], %[[XB0]] +; CHECK: %[[ADD1:.+]] = add nuw nsw <16 x i64> %[[XA1]], %[[XB1]] +; CHECK: store <16 x i64> %[[ADD0]], ptr %{{.+}} +; CHECK: store <16 x i64> %[[ADD1]], ptr %{{.+}} + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll new file mode 100644 index 0000000000000..4d07f4a90a961 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_copysign.ll @@ -0,0 +1,75 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +declare float @llvm.copysign.f32(float, float) +declare <2 x float> @llvm.copysign.v2f32(<2 x float>, <2 x float>) + +define spir_kernel void @copysignff(float* %pa, float* %pb, float* %pc) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr float, float* %pa, i64 %idx + %b = getelementptr float, float* %pb, i64 %idx + %c = getelementptr float, float* %pc, i64 %idx + %la = load float, float* %a, align 16 + %lb = load float, float* %b, align 16 + %res = call float @llvm.copysign.f32(float %la, float %lb) + store float %res, float* %c, align 16 + ret void +} + +define spir_kernel void @copysignvf(<2 x float>* %pa, <2 x float>* %pb, <2 x float>* %pc) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <2 x float>, <2 x float>* %pa, i64 %idx + %b = getelementptr <2 x float>, <2 x float>* %pb, i64 %idx + %c = getelementptr <2 x float>, <2 x float>* %pc, i64 %idx + %la = load <2 x float>, <2 x float>* %a, align 16 + %lb = load <2 x float>, <2 x float>* %b, align 16 + %res = call <2 x float> @llvm.copysign.v2f32(<2 x float> %la, <2 x float> %lb) + store <2 x float> %res, <2 x float>* %c, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_copysignff(ptr %pa, ptr %pb, ptr %pc) +; CHECK: entry: +; CHECK: %idx = call i64 @__mux_get_global_id(i32 0) +; CHECK: %a = getelementptr float, ptr %pa, i64 %idx +; CHECK: %b = getelementptr float, ptr %pb, i64 %idx +; CHECK: %c = getelementptr float, ptr %pc, i64 %idx +; CHECK: [[T0:%.*]] = load <4 x float>, ptr %a, align 4 +; CHECK: [[T1:%.*]] = load <4 x float>, ptr %b, align 4 +; CHECK: %res1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[T0]], <4 x float> [[T1]]) +; CHECK: store <4 x float> %res1, ptr %c, align 4 +; CHECK: ret void + +; CHECK: define spir_kernel void @__vecz_v4_copysignvf(ptr %pa, ptr %pb, ptr %pc) +; CHECK: entry: +; CHECK: %idx = call i64 @__mux_get_global_id(i32 0) +; CHECK: %a = getelementptr <2 x float>, ptr %pa, i64 %idx +; CHECK: %b = getelementptr <2 x float>, ptr %pb, i64 %idx +; CHECK: %c = getelementptr <2 x float>, ptr %pc, i64 %idx +; CHECK: [[T0:%.*]] = load <8 x float>, ptr %a, align 8 +; CHECK: [[T1:%.*]] = load <8 x float>, ptr %b, align 8 +; CHECK: %res1 = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[T0]], <8 x float> [[T1]]) +; CHECK: store <8 x float> %res1, ptr %c, align 8 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll new file mode 100644 index 0000000000000..7b11bc9e63808 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fma.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx + %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx + %c = getelementptr <4 x float>, <4 x float>* %pc, i64 %idx + %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx + %la = load <4 x float>, <4 x float>* %a, align 16 + %lb = load <4 x float>, <4 x float>* %b, align 16 + %lc = load <4 x float>, <4 x float>* %c, align 16 + %fma = call <4 x float> @llvm.fma.v4f32(<4 x float> %la, <4 x float> %lb, <4 x float> %lc) + store <4 x float> %fma, <4 x float>* %d, align 16 + ret void +} + +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; CHECK: define spir_kernel void @__vecz_v8_test_calls(ptr %pa, ptr %pb, ptr %pc, ptr %pd) +; CHECK: entry: + +; It checks that the fma intrinsic of <4 x float> gets widened by a factor of 8, +; to produce a PAIR of <16 x float>s. +; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[FMA0:.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %[[LDA0]], <16 x float> %[[LDB0]], <16 x float> %[[LDC0]]) +; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fma.v16f32(<16 x float> %[[LDA1]], <16 x float> %[[LDB1]], <16 x float> %[[LDC1]]) +; CHECK: store <16 x float> %[[FMA0]], ptr %{{.+}}, align 16 +; CHECK: store <16 x float> %[[FMA1]], ptr %{{.+}}, align 16 + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll new file mode 100644 index 0000000000000..1251115351205 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmin_vector_scalar.ll @@ -0,0 +1,64 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k fmin_vector_scalar -vecz-simd-width=4 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +; Function Attrs: nounwind readnone +declare spir_func <4 x float> @_Z4fminDv4_ff(<4 x float>, float) + +; Note that we have to declare the scalar version, because when we vectorize +; an already-vector builtin, we have to scalarize it first. This is the case +; even for Vector Widening, where we don't actually create a call to the +; scalar version, but we retrieve the wide version via the scalar version, +; so the declaration still needs to exist. + +; Function Attrs: inlinehint nounwind readnone +declare spir_func float @_Z4fminff(float, float) + +; Function Attrs: inlinehint nounwind readnone +declare spir_func <16 x float> @_Z4fminDv16_fS_(<16 x float>, <16 x float>) + +define spir_kernel void @fmin_vector_scalar(<4 x float>* %pa, float* %pb, <4 x float>* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx + %b = getelementptr float, float* %pb, i64 %idx + %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx + %la = load <4 x float>, <4 x float>* %a, align 16 + %lb = load float, float* %b, align 4 + %res = tail call spir_func <4 x float> @_Z4fminDv4_ff(<4 x float> %la, float %lb) + store <4 x float> %res, <4 x float>* %d, align 16 + ret void +} + + +; CHECK: define spir_kernel void @__vecz_v4_fmin_vector_scalar(ptr %pa, ptr %pb, ptr %pd) +; CHECK: entry: + +; It checks that the fmin builtin gets widened by a factor of 4, while its +; scalar operand is sub-splatted to the required <16 x float>. +; CHECK: %[[LDA:.+]] = load <16 x float>, ptr %{{.+}} +; CHECK: %[[LDB:.+]] = load <4 x float>, ptr %{{.+}} +; CHECK: %[[SPL:.+]] = shufflevector <4 x float> %[[LDB]], <4 x float> poison, <16 x i32> +; CHECK: %[[RES:.+]] = call <16 x float> @llvm.minnum.v16f32(<16 x float> %[[LDA]], <16 x float> %[[SPL]]) +; CHECK: store <16 x float> %[[RES]], ptr %{{.+}} + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll new file mode 100644 index 0000000000000..2760239937542 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx + %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx + %c = getelementptr <4 x float>, <4 x float>* %pc, i64 %idx + %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx + %la = load <4 x float>, <4 x float>* %a, align 16 + %lb = load <4 x float>, <4 x float>* %b, align 16 + %lc = load <4 x float>, <4 x float>* %c, align 16 + %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %la, <4 x float> %lb, <4 x float> %lc) + store <4 x float> %fma, <4 x float>* %d, align 16 + ret void +} + +declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; CHECK: define spir_kernel void @__vecz_v8_test_calls(ptr %pa, ptr %pb, ptr %pc, ptr %pd) +; CHECK: entry: + +; It checks that the fmuladd intrinsic of <4 x float> gets widened by a factor of 8, +; to produce a PAIR of <16 x float>s. +; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[FMA0:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[LDA0]], <16 x float> %[[LDB0]], <16 x float> %[[LDC0]]) +; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[LDA1]], <16 x float> %[[LDB1]], <16 x float> %[[LDC1]]) +; CHECK: store <16 x float> %[[FMA0]], ptr %{{.+}}, align 16 +; CHECK: store <16 x float> %[[FMA1]], ptr %{{.+}}, align 16 + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll new file mode 100644 index 0000000000000..c092dbd97ca09 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd2.ll @@ -0,0 +1,91 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %idx2 = shl i64 %idx, 1 + %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx2 + %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx2 + %c = getelementptr <4 x float>, <4 x float>* %pc, i64 %idx2 + %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx2 + %la = load <4 x float>, <4 x float>* %a, align 16 + %lb = load <4 x float>, <4 x float>* %b, align 16 + %lc = load <4 x float>, <4 x float>* %c, align 16 + %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %la, <4 x float> %lb, <4 x float> %lc) + store <4 x float> %fma, <4 x float>* %d, align 16 + ret void +} + +declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; CHECK: define spir_kernel void @__vecz_v8_test_calls(ptr %pa, ptr %pb, ptr %pc, ptr %pd) +; CHECK: entry: + +; It checks that the fmuladd intrinsic of <4 x float> gets widened by a factor of 8, +; to produce a PAIR of <16 x float>s.
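+; Note that %idx2 doubles the index, so the eight work-items' <4 x float> values are not contiguous in memory and the memory accesses themselves cannot be widened; instead, the per-work-item loads are concatenated into <16 x float>s for the widened fmuladd and the results split back up for the stores, as the checks below spell out.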
+ +; It concatenates the 8 x <4 x float> inputs into 2 x <16 x float> values +; CHECK: %[[CA0:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CA1:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CA2:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CA3:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[SA0:.+]] = shufflevector <8 x float> %[[CA0]], <8 x float> %[[CA1]], <16 x i32> +; CHECK: %[[SA1:.+]] = shufflevector <8 x float> %[[CA2]], <8 x float> %[[CA3]], <16 x i32> + +; CHECK: %[[CB0:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CB1:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CB2:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CB3:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[SB0:.+]] = shufflevector <8 x float> %[[CB0]], <8 x float> %[[CB1]], <16 x i32> +; CHECK: %[[SB1:.+]] = shufflevector <8 x float> %[[CB2]], <8 x float> %[[CB3]], <16 x i32> + +; CHECK: %[[CC0:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CC1:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CC2:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[CC3:.+]] = shufflevector <4 x float> %{{.+}}, <4 x float> %{{.+}}, <8 x i32> +; CHECK: %[[SC0:.+]] = shufflevector <8 x float> %[[CC0]], <8 x float> %[[CC1]], <16 x i32> +; CHECK: %[[SC1:.+]] = shufflevector <8 x float> %[[CC2]], <8 x float> %[[CC3]], <16 x i32> + +; CHECK: %[[FMA0:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[SA0]], <16 x float> %[[SB0]], <16 x float> %[[SC0]]) +; CHECK: %[[FMA1:.+]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[SA1]], <16 x float> %[[SB1]], <16 x float> %[[SC1]]) + +; It splits the 2 x <16 x float> results into 8 <4 x float> values +; CHECK: %[[RES0:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> poison, <4 x i32> +; CHECK: %[[RES1:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> poison, <4 x i32> +; CHECK: %[[RES2:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> poison, <4 x i32> +; CHECK: %[[RES3:.+]] = shufflevector <16 x float> %[[FMA0]], <16 x float> poison, <4 x i32> +; CHECK: %[[RES4:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> poison, <4 x i32> +; CHECK: %[[RES5:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> poison, <4 x i32> +; CHECK: %[[RES6:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> poison, <4 x i32> +; CHECK: %[[RES7:.+]] = shufflevector <16 x float> %[[FMA1]], <16 x float> poison, <4 x i32> +; CHECK: store <4 x float> %[[RES0]], ptr %{{.+}}, align 16 +; CHECK: store <4 x float> %[[RES1]], ptr %{{.+}}, align 16 +; CHECK: store <4 x float> %[[RES2]], ptr %{{.+}}, align 16 +; CHECK: store <4 x float> %[[RES3]], ptr %{{.+}}, align 16 +; CHECK: store <4 x float> %[[RES4]], ptr %{{.+}}, align 16 +; CHECK: store <4 x float> %[[RES5]], ptr %{{.+}}, align 16 +; CHECK: store <4 x float> %[[RES6]], ptr %{{.+}}, align 16 +; CHECK: store <4 x float> %[[RES7]], ptr %{{.+}}, align 16 + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll new file mode 100644 
index 0000000000000..daf71de9b2446 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fmuladd_phi.ll @@ -0,0 +1,74 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x float>* %pc, <4 x float>* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx + %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx + %c = getelementptr <4 x float>, <4 x float>* %pc, i64 %idx + %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx + %la = load <4 x float>, <4 x float>* %a, align 16 + %lb = load <4 x float>, <4 x float>* %b, align 16 + %lc = load <4 x float>, <4 x float>* %c, align 16 + br label %loop + +loop: + %n = phi i32 [ %dec, %loop ], [ 10, %entry ] + %acc = phi <4 x float> [ %fma, %loop ], [ %la, %entry ] + %fma = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %acc, <4 x float> %lb, <4 x float> %lc) + %dec = sub i32 %n, 1 + %cmp = icmp ne i32 %dec, 0 + br i1 %cmp, label %loop, label %end + +end: + store <4 x float> %fma, <4 x float>* %d, align 16 + ret void +} + +declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; CHECK: define spir_kernel void @__vecz_v8_test_calls(ptr %pa, ptr %pb, ptr %pc, ptr %pd) +; CHECK: entry: + +; It checks that the fmuladd intrinsic of <4 x float> gets widened by a factor of 8, +; to produce a PAIR of <16 x float>s.
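+; The loop-carried accumulator is widened as well: each of the two <16 x float> phis should take a widened load from the entry block and its own widened fmuladd from the loop back edge, which is what the %[[ACC{{.*}}]] and %[[FMA{{.*}}]] captures below tie together.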
+; CHECK: %[[LDA0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDA1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDB1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDC0:.+]] = load <16 x float>, ptr %{{.+}}, align 16 +; CHECK: %[[LDC1:.+]] = load <16 x float>, ptr %{{.+}}, align 16 + +; CHECK: loop: +; CHECK: %[[ACC0:.+]] = phi <16 x float> [ %[[FMA0:.+]], %loop ], [ %[[LDA0]], %entry ] +; CHECK: %[[ACC1:.+]] = phi <16 x float> [ %[[FMA1:.+]], %loop ], [ %[[LDA1]], %entry ] + +; CHECK: %[[FMA0]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[ACC0]], <16 x float> %[[LDB0]], <16 x float> %[[LDC0]]) +; CHECK: %[[FMA1]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> %[[ACC1]], <16 x float> %[[LDB1]], <16 x float> %[[LDC1]]) + +; CHECK: end: +; CHECK: store <16 x float> %[[FMA0]], ptr %{{.+}}, align 16 +; CHECK: store <16 x float> %[[FMA1]], ptr %{{.+}}, align 16 + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll new file mode 100644 index 0000000000000..1974c22c15a81 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshl.ll @@ -0,0 +1,48 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=16 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_calls(i8* %pa, i8* %pb, i8* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr i8, i8* %pa, i64 %idx + %b = getelementptr i8, i8* %pb, i64 %idx + %d = getelementptr i8, i8* %pd, i64 %idx + %la = load i8, i8* %a, align 16 + %lb = load i8, i8* %b, align 16 + %res = tail call i8 @llvm.fshl.i8(i8 %la, i8 %lb, i8 4) + store i8 %res, i8* %d, align 16 + ret void +} + +declare i8 @llvm.fshl.i8(i8, i8, i8) + +; CHECK: define spir_kernel void @__vecz_v16_test_calls(ptr %pa, ptr %pb, ptr %pd) +; CHECK: entry: + +; It checks that the fshl intrinsic of i8 gets widened by a factor of 16 +; CHECK: %[[LDA:.+]] = load <16 x i8>, ptr %{{.+}} +; CHECK: %[[LDB:.+]] = load <16 x i8>, ptr %{{.+}} +; CHECK: %[[RES:.+]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %[[LDA]], <16 x i8> %[[LDB]], <16 x i8> {{<(i8 4(, )?)+>|splat \(i8 4\)}}) +; CHECK: store <16 x i8> %[[RES]], ptr %{{.+}} + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll new file mode 100644 index 0000000000000..6b6f41e066ae1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_fshr.ll @@ -0,0 +1,48 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=packetizer -vecz-simd-width=16 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_calls(i8* %pa, i8* %pb, i8* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr i8, i8* %pa, i64 %idx + %b = getelementptr i8, i8* %pb, i64 %idx + %d = getelementptr i8, i8* %pd, i64 %idx + %la = load i8, i8* %a, align 16 + %lb = load i8, i8* %b, align 16 + %res = tail call i8 @llvm.fshr.i8(i8 %la, i8 %lb, i8 2) + store i8 %res, i8* %d, align 16 + ret void +} + +declare i8 @llvm.fshr.i8(i8, i8, i8) + +; CHECK: define spir_kernel void @__vecz_v16_test_calls(ptr %pa, ptr %pb, ptr %pd) +; CHECK: entry: + +; It checks that the fshr intrinsic of i8 gets widened by a factor of 16 +; CHECK: %[[LDA:.+]] = load <16 x i8>, ptr %{{.+}} +; CHECK: %[[LDB:.+]] = load <16 x i8>, ptr %{{.+}} +; CHECK: %[[RES:.+]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %[[LDA]], <16 x i8> %[[LDB]], <16 x i8> {{<(i8 2(, )?)+>|splat \(i8 2\)}}) +; CHECK: store <16 x i8> %[[RES]], ptr %{{.+}} + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll new file mode 100644 index 0000000000000..38ea8eb57c60e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_shufflevector.ll @@ -0,0 +1,43 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k widen_shufflevector -vecz-simd-width=2 -vecz-passes=packetizer -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare i64 @__mux_get_global_id(i32) #1
+
+; Function Attrs: nounwind
+define spir_kernel void @widen_shufflevector(<2 x float> addrspace(1)* %a, <2 x float> addrspace(1)* %b, <4 x float> addrspace(1)* %out) #0 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0) #2
+  %arrayidxa = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i64 %call
+  %arrayidxb = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %b, i64 %call
+  %la = load <2 x float>, <2 x float> addrspace(1)* %arrayidxa, align 4
+  %lb = load <2 x float>, <2 x float> addrspace(1)* %arrayidxb, align 4
+  ; NOTE: the mask constant below is an assumption (a plain concatenation);
+  ; any 4-element mask exercises the widening that the CHECKs verify.
+  %shuffle = shufflevector <2 x float> %la, <2 x float> %lb, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %arrayidx1 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+  store <4 x float> %shuffle, <4 x float> addrspace(1)* %arrayidx1, align 1
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v2_widen_shufflevector
+; CHECK: %[[LDA:.+]] = load <4 x float>, ptr addrspace(1) %
+; CHECK: %[[LDB:.+]] = load <4 x float>, ptr addrspace(1) %
+; CHECK: %[[SHF:.+]] = shufflevector <4 x float> %[[LDA]], <4 x float> %[[LDB]], <8 x i32>
+; CHECK: store <8 x float> %[[SHF]], ptr addrspace(1) %
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
new file mode 100644
index 0000000000000..15ce1517417b2
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/VectorWidening/widen_sqrt.ll
@@ -0,0 +1,53 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_sqrt -vecz-simd-width=4 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func float @_Z4sqrtf(float) +declare spir_func <2 x float> @_Z4sqrtDv2_f(<2 x float>) +declare spir_func <4 x float> @_Z4sqrtDv4_f(<4 x float>) +declare spir_func <8 x float> @_Z4sqrtDv8_f(<8 x float>) +declare spir_func <16 x float> @_Z4sqrtDv16_f(<16 x float>) + +define spir_kernel void @test_sqrt(<2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %out2, + <4 x float> addrspace(1)* %in4, <4 x float> addrspace(1)* %out4) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %arrayin2 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %in2, i64 %gid + %arrayin4 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in4, i64 %gid + %arrayout2 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out2, i64 %gid + %arrayout4 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out4, i64 %gid + %ld2 = load <2 x float>, <2 x float> addrspace(1)* %arrayin2, align 16 + %ld4 = load <4 x float>, <4 x float> addrspace(1)* %arrayin4, align 16 + %sqrt2 = call spir_func <2 x float> @_Z4sqrtDv2_f(<2 x float> %ld2) + %sqrt4 = call spir_func <4 x float> @_Z4sqrtDv4_f(<4 x float> %ld4) + store <2 x float> %sqrt2, <2 x float> addrspace(1)* %arrayout2, align 16 + store <4 x float> %sqrt4, <4 x float> addrspace(1)* %arrayout4, align 16 + ret void +} + +; The purpose of this test is to check that the vector context is able to +; supply the packetizer with two versions of the builtin vectorized to two +; different widths. +; +; CHECK: define spir_kernel void @__vecz_v4_test_sqrt +; CHECK: call spir_func <8 x float> @_Z4sqrtDv8_f(<8 x float> %{{.*}}) +; CHECK: call spir_func <16 x float> @_Z4sqrtDv16_f(<16 x float> %{{.*}}) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll new file mode 100644 index 0000000000000..7a7d4428bdd4c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/alloca_alias.ll @@ -0,0 +1,69 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%struct.testStruct = type { <3 x i32> } + +define spir_kernel void @alloca_alias(i32 addrspace(1)* %out, i32 %index) { +entry: + %myStructs = alloca [2 x %struct.testStruct], align 16 + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = bitcast [2 x %struct.testStruct]* %myStructs to i8* + call void @llvm.lifetime.start.p0i8(i64 32, i8* nonnull %0) + %1 = trunc i64 %call to i32 + %conv = add nuw nsw i32 %1, 2 + %2 = insertelement <4 x i32> poison, i32 %conv, i64 0 + %conv2 = add nuw nsw i32 %1, 3 + %3 = insertelement <4 x i32> %2, i32 %conv2, i64 1 + %4 = insertelement <4 x i32> %3, i32 %1, i64 2 + %i = getelementptr inbounds [2 x %struct.testStruct], [2 x %struct.testStruct]* %myStructs, i64 0, i64 1, i32 0 + %storetmp8 = bitcast <3 x i32>* %i to <4 x i32>* + store <4 x i32> %4, <4 x i32>* %storetmp8, align 16 + %idxprom = sext i32 %index to i64 + %i9 = getelementptr inbounds [2 x %struct.testStruct], [2 x %struct.testStruct]* %myStructs, i64 0, i64 %idxprom, i32 0 + %castToVec410 = bitcast <3 x i32>* %i9 to <4 x i32>* + %loadVec411 = load <4 x i32>, <4 x i32>* %castToVec410, align 16 + %extractVec12 = shufflevector <4 x i32> %loadVec411, <4 x i32> poison, <3 x i32> + %5 = mul i64 %call, 3 + %vstore_base = getelementptr i32, i32 addrspace(1)* %out, i64 %5 + %vstore_extract = extractelement <3 x i32> %extractVec12, i32 0 + %6 = getelementptr i32, i32 addrspace(1)* %vstore_base, i32 0 + store i32 %vstore_extract, i32 addrspace(1)* %6, align 4 + %vstore_extract1 = extractelement <3 x i32> %extractVec12, i32 1 + %7 = getelementptr i32, i32 addrspace(1)* %vstore_base, i32 1 + store i32 %vstore_extract1, i32 addrspace(1)* %7, align 4 + %vstore_extract2 = extractelement <3 x i32> %extractVec12, i32 2 + %8 = getelementptr i32, i32 addrspace(1)* %vstore_base, i32 2 + store i32 %vstore_extract2, i32 addrspace(1)* %8, align 4 + call void @llvm.lifetime.end.p0i8(i64 32, i8* nonnull %0) + ret void +} + +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8*) + +declare i64 @__mux_get_global_id(i32) + +declare spir_func void @_Z7vstore3Dv3_imPU3AS1i(<3 x i32>, i64, i32 addrspace(1)*) + +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8*) + +; CHECK: spir_kernel void @__vecz_v4_alloca_alias +; CHECK: alloca [4 x [2 x %struct.testStruct{{.*}}]] +; CHECK-NOT: = alloca .* diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll new file mode 100644 index 0000000000000..3d39ca518818f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/arm_neon_store.ll @@ -0,0 +1,65 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; REQUIRES: arm + +; RUN: veczc -k short3_char3_codegen -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv7-unknown-linux-gnueabihf" + +; Function Attrs: nounwind +define spir_kernel void @short3_char3_codegen(i8 addrspace(1)* %src, i16 addrspace(1)* %dest) #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 { +entry: + %call = call i32 @__mux_get_global_id(i32 0) #3 + %call1 = call spir_func <3 x i8> @_Z6vload3jPU3AS1Kc(i32 %call, i8 addrspace(1)* %src) #3 + %call3 = call spir_func <3 x i16> @_Z14convert_short3Dv3_c(<3 x i8> %call1) #3 + call spir_func void @_Z7vstore3Dv3_sjPU3AS1s(<3 x i16> %call3, i32 %call, i16 addrspace(1)* %dest) #3 + ret void +} + +declare i32 @__mux_get_global_id(i32) #1 + +declare spir_func <3 x i8> @_Z6vload3jPU3AS1Kc(i32, i8 addrspace(1)*) #1 + +declare spir_func <3 x i16> @_Z14convert_short3Dv3_c(<3 x i8>) #1 + +declare spir_func void @_Z7vstore3Dv3_sjPU3AS1s(<3 x i16>, i32, i16 addrspace(1)*) #1 + +; Function Attrs: inlinehint nounwind +declare spir_func signext i16 @_Z13convert_shortc(i8 signext) #2 + +; Function Attrs: inlinehint nounwind +declare spir_func <16 x i16> @_Z15convert_short16Dv16_c(<16 x i8>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { inlinehint nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nobuiltin nounwind } + +!opencl.spir.version = !{!0, !0, !0, !0, !0} +!opencl.ocl.version = !{!1, !1, !1, !1, !1} + +!0 = !{i32 2, i32 0} +!1 = !{i32 1, i32 2} +!2 = !{i32 1, i32 1} +!3 = !{!"none", !"none"} +!4 = !{!"char*", !"short*"} +!5 = !{!"", !""} + +; Assert call to neon intrinsic exists +; CHECK: call void @llvm.arm.neon.vst3.p1.v4i16 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll new file mode 100644 index 0000000000000..4a3f38ba7ad0c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/async_workgroup_copy_uniform.ll @@ -0,0 +1,60 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+%opencl.event_t = type opaque
+
+; Function Attrs: nounwind
+define spir_kernel void @test(i32 addrspace(1)* %input, i32 addrspace(3)* %output, i32 addrspace(1)* %elements) {
+  %ev = alloca %opencl.event_t*, align 8
+  %1 = call i64 @__mux_get_global_id(i32 0)
+  %2 = call i64 @__mux_get_group_id(i32 0)
+  %3 = call i64 @__mux_get_local_size(i32 0)
+  %4 = mul i64 %3, %2
+  %5 = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %4
+  %6 = mul i64 %3, %2
+  %7 = getelementptr inbounds i32, i32 addrspace(3)* %output, i64 %6
+  %8 = getelementptr inbounds i32, i32 addrspace(1)* %elements, i64 %2
+  %9 = load i32, i32 addrspace(1)* %8, align 4
+  %10 = sext i32 %9 to i64
+  %11 = load %opencl.event_t*, %opencl.event_t** %ev, align 8
+  %12 = call spir_func %opencl.event_t* @_Z21async_work_group_copyPU3AS1iPKU3AS3im9ocl_event(i32 addrspace(1)* %5, i32 addrspace(3)* %7, i64 %10, %opencl.event_t* %11)
+  %13 = trunc i64 %3 to i32
+  call spir_func void @_Z17wait_group_eventsiP9ocl_event(i32 %13, %opencl.event_t** nonnull %ev)
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_group_id(i32)
+declare i64 @__mux_get_local_size(i32)
+declare spir_func %opencl.event_t* @_Z21async_work_group_copyPU3AS1iPKU3AS3im9ocl_event(i32 addrspace(1)*, i32 addrspace(3)*, i64, %opencl.event_t*)
+declare spir_func void @_Z17wait_group_eventsiP9ocl_event(i32, %opencl.event_t**)
+
+; CHECK: define spir_kernel void @__vecz_v4_test
+
+; Check that we have one and only one call to async_work_group_copy
+; CHECK: call spir_func ptr @_Z21async_work_group_copyPU3AS1iPKU3AS3im9ocl_event
+; CHECK-NOT: async_workgroup_copy
+
+; Check that we have one and only one call to wait_group_events
+; CHECK: call spir_func void @_Z17wait_group_eventsiP9ocl_event
+; CHECK-NOT: wait_group_events
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
new file mode 100644
index 0000000000000..786c7236e1585
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomic_cmpxchg.ll
@@ -0,0 +1,83 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k atomic_rmw -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @atomic_cmpxchg_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + br label %do.body + +do.body: ; preds = %do.body, %entry + %sub = add nsw i32 %conv, -1 + %0 = cmpxchg i32 addrspace(1)* %counter, i32 %sub, i32 %conv seq_cst acquire + %1 = extractvalue { i32, i1 } %0, 0 + %sub2 = add nsw i32 %conv, -1 + %cmp = icmp eq i32 %1, %sub2 + br i1 %cmp, label %do.end, label %do.body + +do.end: ; preds = %do.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store volatile i32 %1, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +define spir_kernel void @atomic_atomicrmw_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + br label %do.body + +do.body: ; preds = %do.body, %entry + %sub = add nsw i32 %conv, -1 + %0 = atomicrmw nand i32 addrspace(1)* %counter, i32 %sub acq_rel + %sub2 = add nsw i32 %conv, -1 + %cmp = icmp eq i32 %0, %sub2 + br i1 %cmp, label %do.end, label %do.body + +do.end: ; preds = %do.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store volatile i32 %0, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +define spir_kernel void @atomic_rmw(i32 addrspace(1)* %counter2, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = atomicrmw add i32 addrspace(1)* %counter2, i32 1 seq_cst + %idxprom = sext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %conv, i32 addrspace(1)* %arrayidx + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; We no longer support instantiating atomic instructions in diverged blocks, +; since they require masking. FileCheck does not support comments, so the CHECKs +; have been removed or reversed in the following lines +; CHECK-NOT: define spir_kernel void @__vecz_v4_atomic_cmpxchg_builtin +; cmpxchg i32 addrspace(1)* %counter, i32 %{{.+}}, i32 %{{.+}} seq_cst acquire +; cmpxchg i32 addrspace(1)* %counter, i32 %{{.+}}, i32 %{{.+}} seq_cst acquire +; cmpxchg i32 addrspace(1)* %counter, i32 %{{.+}}, i32 %{{.+}} seq_cst acquire +; cmpxchg i32 addrspace(1)* %counter, i32 %{{.+}}, i32 %{{.+}} seq_cst acquire diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll new file mode 100644 index 0000000000000..c403cf419d301 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw.ll @@ -0,0 +1,83 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k atomic_rmw -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @atomic_cmpxchg_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + br label %do.body + +do.body: ; preds = %do.body, %entry + %sub = add nsw i32 %conv, -1 + %0 = cmpxchg i32 addrspace(1)* %counter, i32 %sub, i32 %conv seq_cst acquire + %1 = extractvalue { i32, i1 } %0, 0 + %sub2 = add nsw i32 %conv, -1 + %cmp = icmp eq i32 %1, %sub2 + br i1 %cmp, label %do.end, label %do.body + +do.end: ; preds = %do.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store volatile i32 %1, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +define spir_kernel void @atomic_atomicrmw_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + br label %do.body + +do.body: ; preds = %do.body, %entry + %sub = add nsw i32 %conv, -1 + %0 = atomicrmw nand i32 addrspace(1)* %counter, i32 %sub acq_rel + %sub2 = add nsw i32 %conv, -1 + %cmp = icmp eq i32 %0, %sub2 + br i1 %cmp, label %do.end, label %do.body + +do.end: ; preds = %do.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store volatile i32 %0, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +define spir_kernel void @atomic_rmw(i32 addrspace(1)* %counter2, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = atomicrmw add i32 addrspace(1)* %counter2, i32 1 seq_cst + %idxprom = sext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %conv, i32 addrspace(1)* %arrayidx + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; We no longer support instantiating atomic instructions in diverged blocks, +; since they require masking. 
FileCheck does not support comments, so the CHECKs +; have been removed or reversed in the following lines +; CHECK-NOT: define spir_kernel void @__vecz_v4_atomic_atomicrmw_builtin +; atomicrmw nand i32 addrspace(1)* %counter, i32 %{{.+}} acq_rel +; atomicrmw nand i32 addrspace(1)* %counter, i32 %{{.+}} acq_rel +; atomicrmw nand i32 addrspace(1)* %counter, i32 %{{.+}} acq_rel +; atomicrmw nand i32 addrspace(1)* %counter, i32 %{{.+}} acq_rel diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll new file mode 100644 index 0000000000000..e87ff74f7a6e3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/atomicrmw_uniform.ll @@ -0,0 +1,81 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k atomic_rmw -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @atomic_cmpxchg_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + br label %do.body + +do.body: ; preds = %do.body, %entry + %sub = add nsw i32 %conv, -1 + %0 = cmpxchg i32 addrspace(1)* %counter, i32 %sub, i32 %conv seq_cst acquire + %1 = extractvalue { i32, i1 } %0, 0 + %sub2 = add nsw i32 %conv, -1 + %cmp = icmp eq i32 %1, %sub2 + br i1 %cmp, label %do.end, label %do.body + +do.end: ; preds = %do.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store volatile i32 %1, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +define spir_kernel void @atomic_atomicrmw_builtin(i32 addrspace(1)* %counter, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + br label %do.body + +do.body: ; preds = %do.body, %entry + %sub = add nsw i32 %conv, -1 + %0 = atomicrmw nand i32 addrspace(1)* %counter, i32 %sub acq_rel + %sub2 = add nsw i32 %conv, -1 + %cmp = icmp eq i32 %0, %sub2 + br i1 %cmp, label %do.end, label %do.body + +do.end: ; preds = %do.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store volatile i32 %0, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +define spir_kernel void @atomic_rmw(i32 addrspace(1)* %counter2, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = atomicrmw add i32 addrspace(1)* %counter2, i32 1 seq_cst + %idxprom = sext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %conv, i32 addrspace(1)* %arrayidx + ret void +} + +declare i64 
@__mux_get_global_id(i32) + +; CHECK: define spir_kernel void @__vecz_v4_atomic_rmw +; CHECK: atomicrmw add ptr addrspace(1) %counter2, i32 1 seq_cst +; CHECK: atomicrmw add ptr addrspace(1) %counter2, i32 1 seq_cst +; CHECK: atomicrmw add ptr addrspace(1) %counter2, i32 1 seq_cst +; CHECK: atomicrmw add ptr addrspace(1) %counter2, i32 1 seq_cst +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll new file mode 100644 index 0000000000000..08fc176beee7f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_mem2reg.ll @@ -0,0 +1,64 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-passes="function(mem2reg),vecz-mem2reg" -vecz-simd-width=4 -vecz-handle-declaration-only-calls -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(i32 %a, i32 %b, i32* %c, float %rf) { +entry: + %d = alloca i32 + %e = alloca i32 + %f = alloca float + %gid = call i64 @__mux_get_global_id(i32 0) + %sum = add i32 %a, %b + store i32 %sum, i32* %d, align 4 + store i32 %sum, i32* %e, align 4 + %call = call spir_func i32 @foo(i32* %e) + %d.load = load i32, i32* %d, align 4 + %e.load = load i32, i32* %e, align 4 + %c0 = getelementptr i32, i32* %c, i64 %gid + store i32 %d.load, i32* %c0, align 4 + %c1 = getelementptr i32, i32* %c0, i64 1 + store i32 %e.load, i32* %c1, align 4 + store float %rf, float* %f + %ri = bitcast float* %f to i32* + %ri.load = load i32, i32* %ri, align 4 + %c2 = getelementptr i32, i32* %c1, i64 2 + store i32 %ri.load, i32* %c2, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @foo(i32*) + +; CHECK: define spir_kernel void @__vecz_v4_test(i32 %a, i32 %b, ptr %c, float %rf) +; CHECK: entry: +; CHECK: %e = alloca i32 +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %sum = add i32 %a, %b +; CHECK: store i32 %sum, ptr %e +; CHECK: %call = call spir_func i32 @foo(ptr{{.*}} %e) +; CHECK: %e.load = load i32, ptr %e +; CHECK: %c0 = getelementptr i32, ptr %c, i64 %gid +; CHECK: store i32 %sum, ptr %c0 +; CHECK: %c1 = getelementptr i32, ptr %c0, i64 1 +; CHECK: store i32 %e.load, ptr %c1 +; CHECK: %0 = bitcast float %rf to i32 +; CHECK: %c2 = getelementptr i32, ptr %c1, i64 2 +; CHECK: store i32 %0, ptr %c2, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll new file mode 100644 index 0000000000000..71035cb07e9e8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/basic_vecz_mem2reg.ll @@ -0,0 +1,73 @@ +; 
Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Note: *not* running LLVM's mem2reg pass as before LLVM 15 it crashes for the
+; same reason we used to!
+; RUN: veczc -vecz-passes=vecz-mem2reg -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @load_store_type_mismatch_no_bitcast(ptr addrspace(1) %p) {
+  %data = alloca i32, align 4
+  %1 = tail call i64 @__mux_get_global_id(i32 0) #4
+  %2 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %1
+  %3 = load i32, ptr addrspace(1) %2, align 4
+  store i32 %3, ptr %data, align 4
+  %4 = load <2 x i16>, ptr %data, align 2
+  ret void
+}
+
+define spir_kernel void @load_type_size_mismatch_no_bitcast(ptr addrspace(1) %p) {
+  %data = alloca i32, align 4
+  %1 = tail call i64 @__mux_get_global_id(i32 0) #4
+  %2 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %1
+  %3 = load i32, ptr addrspace(1) %2, align 4
+  store i32 %3, ptr %data, align 4
+  %4 = load i16, ptr %data, align 2
+  ret void
+}
+
+define spir_kernel void @store_type_size_mismatch_no_bitcast(ptr addrspace(1) %p) {
+  %data = alloca i32, align 4
+  %1 = tail call i64 @__mux_get_global_id(i32 0) #4
+  %2 = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %1
+  %3 = load i16, ptr addrspace(1) %2, align 4
+  store i16 %3, ptr %data, align 2
+  %4 = load i32, ptr %data, align 4
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK: define spir_kernel void @__vecz_v4_load_store_type_mismatch_no_bitcast(ptr addrspace(1) %p)
+; CHECK-NOT: alloca i32
+; CHECK: %3 = load i32, ptr addrspace(1) %2, align 4
+; CHECK: %4 = bitcast i32 %3 to <2 x i16>

+; Note: we can't optimize this as the allocated type size and loaded type sizes
+; don't match. Maybe we could trunc %3 from i32 to i16?

+; CHECK: define spir_kernel void @__vecz_v4_load_type_size_mismatch_no_bitcast(ptr addrspace(1) %p)
+; CHECK: %data = alloca i32, align 4
+; CHECK: %4 = load i16, ptr %data, align 2

+; Note: we can't optimize this either, as the allocated and loaded type sizes
+; don't match. Here %3 is an i16, so an extend rather than a trunc would be
+; needed; see the sketch below.
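+;
+; Rough sketch of the size reasoning (illustrative only, not matched by
+; FileCheck): the pass can only fold a store/load pair through the alloca
+; when the value sizes agree, as in the first kernel:
+;
+;   store i32 %3, ptr %data          ; 32 bits in
+;   %4 = load <2 x i16>, ptr %data   ; 32 bits out
+;     ==> %4 = bitcast i32 %3 to <2 x i16>
+;
+; When the sizes differ, it would have to synthesize a resize such as
+; "%4 = trunc i32 %3 to i16" (load case) or an extend (store case), which
+; vecz-mem2reg does not currently do, so those allocas are left in place.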
+ +; CHECK: define spir_kernel void @__vecz_v4_store_type_size_mismatch_no_bitcast(ptr addrspace(1) %p) +; CHECK: %data = alloca i32, align 4 +; CHECK: %4 = load i32, ptr %data, align 4 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll new file mode 100644 index 0000000000000..4a2c09ca9ff69 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/bitcast_function.ll @@ -0,0 +1,79 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -vecz-passes=cfg-convert,packetizer -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @test(i32* %in, i32* %out) { +entry: + %in.addr = alloca i32*, align 8 + %out.addr = alloca i32*, align 8 + %gid = alloca i64, align 8 + store i32* %in, i32** %in.addr, align 8 + store i32* %out, i32** %out.addr, align 8 + %call = call i64 @__mux_get_global_id(i32 0) + store i64 %call, i64* %gid, align 8 + %0 = load i64, i64* %gid, align 8 + %rem = urem i64 %0, 16 + %cmp = icmp eq i64 %rem, 1 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %1 = load i64, i64* %gid, align 8 + %2 = load i32*, i32** %in.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %1 + %3 = load i32, i32* %arrayidx, align 4 + %4 = load i64, i64* %gid, align 8 + %5 = load i32*, i32** %in.addr, align 8 + %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 %4 + %call2 = call spir_func i32 bitcast (i32 (i32, i32 addrspace(1)*)* @foo to i32 (i32, i32*)*)(i32 %3, i32* %arrayidx1) + %6 = load i64, i64* %gid, align 8 + %7 = load i32*, i32** %out.addr, align 8 + %arrayidx3 = getelementptr inbounds i32, i32* %7, i64 %6 + store i32 %call2, i32* %arrayidx3, align 4 + br label %if.end + +if.else: ; preds = %entry + %8 = load i64, i64* %gid, align 8 + %9 = load i32*, i32** %in.addr, align 8 + %arrayidx4 = getelementptr inbounds i32, i32* %9, i64 %8 + %10 = load i32, i32* %arrayidx4, align 4 + %11 = load i64, i64* %gid, align 8 + %12 = load i32*, i32** %out.addr, align 8 + %arrayidx5 = getelementptr inbounds i32, i32* %12, i64 %11 + store i32 %10, i32* %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @foo(i32, i32 addrspace(1)*) + +; CHECK: define spir_kernel void @__vecz_v4_test( +; CHECK: call spir_func i32 @__vecz_b_masked_foo( +; CHECK: call spir_func i32 @__vecz_b_masked_foo( +; CHECK: call spir_func i32 @__vecz_b_masked_foo( +; CHECK: call spir_func i32 @__vecz_b_masked_foo( +; CHECK: ret void + +; CHECK: define private spir_func i32 
@__vecz_b_masked_foo(i32{{( %0)?}}, ptr{{( %1)?}}, i1{{( %2)?}} +; CHECK: call spir_func i32 @foo(i32 %0, ptr %1) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll new file mode 100644 index 0000000000000..890f63e748592 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_and.ll @@ -0,0 +1,70 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k split_branch -vecz-simd-width=4 -vecz-passes=uniform-reassoc -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @split_branch(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 { +entry: + %x = call i64 @__mux_get_global_id(i32 0) #2 + %y = call i64 @__mux_get_global_id(i32 1) #2 + %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x + %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %y + %varying = load i32, i32 addrspace(1)* %a_gep + %uniform = load i32, i32 addrspace(1)* %b_gep + %cmp_v = icmp sgt i32 %varying, 0 + %cmp_u = icmp sgt i32 %uniform, 0 + %and_vu = and i1 %cmp_v, %cmp_u + br i1 %and_vu, label %if.then, label %if.end + +if.then: ; preds = %entry + %inc = add i32 %uniform, 1 + br label %if.end + +if.end: ; preds = %if.then, %entry + %result = phi i32 [ %inc, %if.then ], [ %varying, %entry ] + %d_gep = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %x + store i32 %result, i32 addrspace(1)* %d_gep + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This test checks that a conditional branch based on an AND of both +; a uniform and a varying value gets split into two separate branches +; CHECK: define spir_kernel void @__vecz_v4_split_branch + +; CHECK: %cmp_v = icmp sgt i32 %varying, 0 +; CHECK: %cmp_u = icmp sgt i32 %uniform, 0 + +; ensure the original binary operator got deleted +; CHECK-NOT: and i1 +; CHECK: br i1 %cmp_u, label %entry.cond_split, label %if.end + +; CHECK: entry.cond_split: +; CHECK: br i1 %cmp_v, label %if.then, label %if.end + +; CHECK: if.then: +; CHECK: %inc = add i32 %uniform, 1 +; CHECK: br label %if.end + +; CHECK: if.end: +; CHECK: %[[RESULT:.+]] = phi i32 [ %inc, %if.then ], [ %varying, %entry.cond_split ], [ %varying, %entry ] +; CHECK: store i32 %[[RESULT]], ptr addrspace(1) %{{.+}} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll new file mode 100644 index 0000000000000..37d1ff7cebffa --- /dev/null +++ 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/branch_splitting_or.ll @@ -0,0 +1,70 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k split_branch -vecz-simd-width=4 -vecz-passes=uniform-reassoc -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @split_branch(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 { +entry: + %x = call i64 @__mux_get_global_id(i32 0) #2 + %y = call i64 @__mux_get_global_id(i32 1) #2 + %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x + %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %y + %varying = load i32, i32 addrspace(1)* %a_gep + %uniform = load i32, i32 addrspace(1)* %b_gep + %cmp_v = icmp sgt i32 %varying, 0 + %cmp_u = icmp sgt i32 %uniform, 0 + %or_vu = or i1 %cmp_v, %cmp_u + br i1 %or_vu, label %if.then, label %if.end + +if.then: ; preds = %entry + %inc = add i32 %uniform, 1 + br label %if.end + +if.end: ; preds = %if.then, %entry + %result = phi i32 [ %inc, %if.then ], [ %varying, %entry ] + %d_gep = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %x + store i32 %result, i32 addrspace(1)* %d_gep + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This test checks that a conditional branch based on an OR of both +; a uniform and a varying value gets split into two separate branches +; CHECK: define spir_kernel void @__vecz_v4_split_branch + +; CHECK: %cmp_v = icmp sgt i32 %varying, 0 +; CHECK: %cmp_u = icmp sgt i32 %uniform, 0 + +; ensure the original binary operator got deleted +; CHECK-NOT: or i1 +; CHECK: br i1 %cmp_u, label %if.then, label %entry.cond_split + +; CHECK: entry.cond_split: +; CHECK: br i1 %cmp_v, label %if.then, label %if.end + +; CHECK: if.then: +; CHECK: %inc = add i32 %uniform, 1 +; CHECK: br label %if.end + +; CHECK: if.end: +; CHECK: %[[RESULT:.+]] = phi i32 [ %inc, %if.then ], [ %varying, %entry.cond_split ] +; CHECK: store i32 %[[RESULT]], ptr addrspace(1) %{{.+}} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll new file mode 100644 index 0000000000000..141543d69b0fd --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_addsat.ll @@ -0,0 +1,118 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @saddsatc(i8 addrspace(1)* %lhs, i8 addrspace(1)* %rhs) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %lhs, i64 %call + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %rhs, i64 %call + %1 = load i8, i8 addrspace(1)* %arrayidx1, align 1 + %call2 = tail call spir_func i8 @_Z7add_satcc(i8 %0, i8 %1) + store i8 %call2, i8 addrspace(1)* %arrayidx1, align 1 + ret void +} + +define spir_kernel void @uaddsatc(i8 addrspace(1)* %lhs, i8 addrspace(1)* %rhs) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %lhs, i64 %call + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %rhs, i64 %call + %1 = load i8, i8 addrspace(1)* %arrayidx1, align 1 + %call2 = tail call spir_func i8 @_Z7add_sathh(i8 %0, i8 %1) + store i8 %call2, i8 addrspace(1)* %arrayidx1, align 1 + ret void +} + +define spir_kernel void @saddsati(i32 addrspace(1)* %lhs, i32 addrspace(1)* %rhs) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %lhs, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %rhs, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 1 + %call2 = tail call spir_func i32 @_Z7add_satii(i32 %0, i32 %1) + store i32 %call2, i32 addrspace(1)* %arrayidx1, align 1 + ret void +} + +define spir_kernel void @uaddsati(i32 addrspace(1)* %lhs, i32 addrspace(1)* %rhs) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %lhs, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %rhs, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 1 + %call2 = tail call spir_func i32 @_Z7add_satjj(i32 %0, i32 %1) + store i32 %call2, i32 addrspace(1)* %arrayidx1, align 1 + ret void +} + +define spir_kernel void @saddsati4(<4 x i32> addrspace(1)* %lhs, <4 x i32> addrspace(1)* %rhs) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %lhs, i64 %call + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %rhs, i64 %call + %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx1, align 1 + %call2 = tail call spir_func <4 x i32> @_Z7add_satDv2_iS_(<4 x i32> %0, <4 x i32> %1) + store <4 x i32> %call2, <4 x i32> addrspace(1)* %arrayidx1, align 1 + ret void +} + +define spir_kernel void @uaddsati4(<4 x i32> 
addrspace(1)* %lhs, <4 x i32> addrspace(1)* %rhs) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %lhs, i64 %call + %0 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %rhs, i64 %call + %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx1, align 1 + %call2 = tail call spir_func <4 x i32> @_Z7add_satDv2_jS_(<4 x i32> %0, <4 x i32> %1) + store <4 x i32> %call2, <4 x i32> addrspace(1)* %arrayidx1, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare spir_func i8 @_Z7add_satcc(i8, i8) +declare spir_func i8 @_Z7add_sathh(i8, i8) +declare spir_func i32 @_Z7add_satii(i32, i32) +declare spir_func i32 @_Z7add_satjj(i32, i32) +declare spir_func <4 x i32> @_Z7add_satDv2_iS_(<4 x i32>, <4 x i32>) +declare spir_func <4 x i32> @_Z7add_satDv2_jS_(<4 x i32>, <4 x i32>) + +; CHECK: define spir_kernel void @__vecz_v4_saddsatc( +; CHECK: = call i8 @llvm.sadd.sat.i8(i8 %{{.*}}, i8 %{{.*}}) + +; CHECK: define spir_kernel void @__vecz_v4_uaddsatc( +; CHECK: = call i8 @llvm.uadd.sat.i8(i8 %{{.*}}, i8 %{{.*}}) + +; CHECK: define spir_kernel void @__vecz_v4_saddsati( +; CHECK: = call i32 @llvm.sadd.sat.i32(i32 %{{.*}}, i32 %{{.*}}) + +; CHECK: define spir_kernel void @__vecz_v4_uaddsati( +; CHECK: = call i32 @llvm.uadd.sat.i32(i32 %{{.*}}, i32 %{{.*}}) + +; CHECK: define spir_kernel void @__vecz_v4_saddsati4( +; CHECK: = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + +; CHECK: define spir_kernel void @__vecz_v4_uaddsati4( +; CHECK: = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll new file mode 100644 index 0000000000000..1bcc968885303 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_clamp.ll @@ -0,0 +1,41 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k clampkernel -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @clampkernel(float %a, float* %c) { +entry: + %clmp = call spir_func float @_Z5clampfff(float %a, float 0.0, float 1.0) + store float %clmp, float* %c, align 4 + ret void +} + +define spir_func float @_Z5clampfff(float %x, float %y, float %z) { +entry: + %call.i.i = tail call spir_func float @_Z13__abacus_fmaxff(float %x, float %y) + %call1.i.i = tail call spir_func float @_Z13__abacus_fminff(float %call.i.i, float %z) + ret float %call1.i.i +; CHECK-LABEL: float @_Z5clampfff( +; CHECK: [[TMP:%.*]] = call float @llvm.maxnum.f32(float %x, float %y) +; CHECK: = call float @llvm.minnum.f32(float [[TMP]], float %z) +} + +declare spir_func float @_Z13__abacus_fminff(float, float) +declare spir_func float @_Z13__abacus_fmaxff(float, float) + diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll new file mode 100644 index 0000000000000..e99d01d477e1f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmax.ll @@ -0,0 +1,53 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @fmaxff(float %a, float %b, float* %c) { +entry: + %max = call spir_func float @_Z4fmaxff(float %a, float %b) + store float %max, float* %c, align 4 + ret void +} + +define spir_kernel void @fmaxvf(<2 x float> %a, float %b, <2 x float>* %c) { +entry: + %max = call spir_func <2 x float> @_Z4fmaxDv2_ff(<2 x float> %a, float %b) + store <2 x float> %max, <2 x float>* %c, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare spir_func float @_Z4fmaxff(float, float) +declare spir_func <2 x float> @_Z4fmaxDv2_ff(<2 x float>, float) + +; CHECK: define spir_kernel void @__vecz_v4_fmaxff(float %a, float %b, ptr %c) +; CHECK: entry: +; CHECK: %0 = call float @llvm.maxnum.f32(float %a, float %b) +; CHECK: store float %0, ptr %c, align 4 +; CHECK: ret void + +; CHECK: define spir_kernel void @__vecz_v4_fmaxvf(<2 x float> %a, float %b, ptr %c) +; CHECK: entry: +; CHECK: %.splatinsert = insertelement <2 x float> {{.*}}, float %b, {{(i32|i64)}} 0 +; CHECK: %.splat = shufflevector <2 x float> %.splatinsert, <2 x float> {{.*}}, <2 x i32> zeroinitializer +; CHECK: %0 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %.splat) +; CHECK: store <2 x float> %0, ptr %c, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll new file mode 100644 index 0000000000000..65b7e5697a68b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_fmin.ll @@ -0,0 +1,53 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @fminff(float %a, float %b, float* %c) { +entry: + %min = call spir_func float @_Z4fminff(float %a, float %b) + store float %min, float* %c, align 4 + ret void +} + +define spir_kernel void @fminvf(<2 x float> %a, float %b, <2 x float>* %c) { +entry: + %min = call spir_func <2 x float> @_Z4fminDv2_ff(<2 x float> %a, float %b) + store <2 x float> %min, <2 x float>* %c, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare spir_func float @_Z4fminff(float, float) +declare spir_func <2 x float> @_Z4fminDv2_ff(<2 x float>, float) + +; CHECK: define spir_kernel void @__vecz_v4_fminff(float %a, float %b, ptr %c) +; CHECK: entry: +; CHECK: %0 = call float @llvm.minnum.f32(float %a, float %b) +; CHECK: store float %0, ptr %c, align 4 +; CHECK: ret void + +; CHECK: define spir_kernel void @__vecz_v4_fminvf(<2 x float> %a, float %b, ptr %c) +; CHECK: entry: +; CHECK: %.splatinsert = insertelement <2 x float> {{.*}}, float %b, {{(i32|i64)}} 0 +; CHECK: %.splat = shufflevector <2 x float> %.splatinsert, <2 x float> {{.*}}, <2 x i32> zeroinitializer +; CHECK: %0 = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %.splat) +; CHECK: store <2 x float> %0, ptr %c, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll new file mode 100644 index 0000000000000..86591570fbcab --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_mem.ll @@ -0,0 +1,122 @@ + +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
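+; NOTE: In the memset tests below, an 18-byte llvm.memset of the byte 42
+; (0x2A) is expanded as 18 = 2*8 + 2*1: two i64 stores followed by two i8
+; stores. The i64 constant is eight copies of 0x2A, i.e.
+; 0x2A2A2A2A2A2A2A2A = 3038287259199220266, matching the CHECK lines. The
+; memcpy tests are expanded with the same 8-byte/1-byte split.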
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=builtin-inlining,verify -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_memset_i8(i64* %z) { + %dst = bitcast i64* %z to i8* + call void @llvm.memset.p0i8.i64(i8* %dst, i8 42, i64 18, i32 8, i1 false) + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memset_i8(ptr %z) +; CHECK: %dst = bitcast ptr %z to ptr +; CHECK: %1 = getelementptr inbounds i8, ptr %dst, i64 0 +; CHECK: store i64 3038287259199220266, ptr %1, align 8 +; CHECK: %2 = getelementptr inbounds i8, ptr %dst, i64 8 +; CHECK: store i64 3038287259199220266, ptr %2, align 8 +; CHECK: %dst1 = getelementptr inbounds i8, ptr %dst, i64 16 +; CHECK: store i8 42, ptr %dst1, align 1 +; CHECK: %dst2 = getelementptr inbounds i8, ptr %dst, i64 17 +; CHECK: store i8 42, ptr %dst2, align 1 +; CHECK: ret void +; CHECK: } + +define spir_kernel void @test_memset_i16(i64* %z) { + %dst = bitcast i64* %z to i16* + call void @llvm.memset.p0i16.i64(i16* %dst, i8 42, i64 18, i32 8, i1 false) + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memset_i16(ptr %z) +; CHECK: %dst = bitcast ptr %z to ptr +; CHECK: %1 = getelementptr inbounds i8, ptr %dst, i64 0 +; CHECK: store i64 3038287259199220266, ptr %1, align 8 +; CHECK: %2 = getelementptr inbounds i8, ptr %dst, i64 8 +; CHECK: store i64 3038287259199220266, ptr %2, align 8 +; CHECK: %dst1 = getelementptr inbounds i8, ptr %dst, i64 16 +; CHECK: store i8 42, ptr %dst1, align 1 +; CHECK: %dst2 = getelementptr inbounds i8, ptr %dst, i64 17 +; CHECK: store i8 42, ptr %dst2, align 1 +; CHECK: ret void +; CHECK: } + +define spir_kernel void @test_memcpy_i8(i64* %a, i64* %z) { + %src = bitcast i64* %a to i8* + %dst = bitcast i64* %z to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 18, i32 8, i1 false) + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memcpy_i8(ptr %a, ptr %z) +; CHECK: %src = bitcast ptr %a to ptr +; CHECK: %dst = bitcast ptr %z to ptr +; CHECK: %1 = getelementptr inbounds i8, ptr %src, i64 0 +; CHECK: %2 = getelementptr inbounds i8, ptr %dst, i64 0 +; CHECK: %src1 = load i64, ptr %1, align 8 +; CHECK: store i64 %src1, ptr %2, align 8 +; CHECK: %3 = getelementptr inbounds i8, ptr %src, i64 8 +; CHECK: %4 = getelementptr inbounds i8, ptr %dst, i64 8 +; CHECK: %src2 = load i64, ptr %3, align 8 +; CHECK: store i64 %src2, ptr %4, align 8 +; CHECK: %5 = getelementptr inbounds i8, ptr %src, i64 16 +; CHECK: %dst3 = getelementptr inbounds i8, ptr %dst, i64 16 +; CHECK: %src4 = load i8, ptr %5, align 1 +; CHECK: store i8 %src4, ptr %dst3, align 1 +; CHECK: %6 = getelementptr inbounds i8, ptr %src, i64 17 +; CHECK: %dst5 = getelementptr inbounds i8, ptr %dst, i64 17 +; CHECK: %src6 = load i8, ptr %6, align 1 +; CHECK: store i8 %src6, ptr %dst5, align 1 +; CHECK: ret void +; CHECK: } + +define spir_kernel void @test_memcpy_i16(i64* %a, i64* %z) { + %src = bitcast i64* %a to i16* + %dst = bitcast i64* %z to i16* + call void @llvm.memcpy.p0i16.p0i16.i64(i16* %dst, i16* %src, i64 18, i32 8, i1 false) + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_memcpy_i16(ptr %a, ptr %z) +; CHECK: %src = bitcast ptr %a to ptr +; CHECK: %dst = bitcast ptr %z to ptr +; CHECK: %1 = getelementptr inbounds i8, ptr %src, i64 0 +; CHECK: %2 = getelementptr inbounds i8, ptr %dst, i64 0 +; CHECK: 
%src1 = load i64, ptr %1, align 8 +; CHECK: store i64 %src1, ptr %2, align 8 +; CHECK: %3 = getelementptr inbounds i8, ptr %src, i64 8 +; CHECK: %4 = getelementptr inbounds i8, ptr %dst, i64 8 +; CHECK: %src2 = load i64, ptr %3, align 8 +; CHECK: store i64 %src2, ptr %4, align 8 +; CHECK: %5 = getelementptr inbounds i8, ptr %src, i64 16 +; CHECK: %dst3 = getelementptr inbounds i8, ptr %dst, i64 16 +; CHECK: %src4 = load i8, ptr %5, align 1 +; CHECK: store i8 %src4, ptr %dst3, align 1 +; CHECK: %6 = getelementptr inbounds i8, ptr %src, i64 17 +; CHECK: %dst5 = getelementptr inbounds i8, ptr %dst, i64 17 +; CHECK: %src6 = load i8, ptr %6, align 1 +; CHECK: store i8 %src6, ptr %dst5, align 1 +; CHECK: ret void +; CHECK: } + +declare void @llvm.memset.p0i8.i64(i8*, i8, i64, i32, i1) +declare void @llvm.memset.p0i16.i64(i16*, i8, i64, i32, i1) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1) +declare void @llvm.memcpy.p0i16.p0i16.i64(i16*, i16*, i64, i32, i1) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_memcpy.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_memcpy.ll new file mode 100644 index 0000000000000..7ad572a7cebed --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_memcpy.ll @@ -0,0 +1,37 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
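+; NOTE: The memcpy_align test below copies 16 bytes from an align(8) source
+; to an align(16) destination as two i64 load/store pairs. The first store
+; can use the full destination alignment (16), but the second is at byte
+; offset 8 from an align-16 base, so only align 8 is provable; the CHECK
+; lines assert exactly that split.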
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k memcpy_align -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @memcpy_align(ptr align(16) %out, ptr align(8) %in) { +entry: +; CHECK: %[[A:.*]] = getelementptr inbounds i8, ptr %in, i64 0 +; CHECK: %[[B:.*]] = getelementptr inbounds i8, ptr %out, i64 0 +; CHECK: %[[C:.*]] = load i64, ptr %[[A]], align 8 +; CHECK: store i64 %[[C]], ptr %[[B]], align 16 + +; CHECK: %[[D:.*]] = getelementptr inbounds i8, ptr %in, i64 8 +; CHECK: %[[E:.*]] = getelementptr inbounds i8, ptr %out, i64 8 +; CHECK: %[[F:.*]] = load i64, ptr %[[D]], align 8 +; CHECK: store i64 %[[F]], ptr %[[E]], align 8 + call void @llvm.memcpy.p0.p0.i32(ptr noundef align(16) %out, ptr noundef align(8) %in, i32 16, i1 false) + ret void +} + +declare void @llvm.memcpy.p0.p0.i32(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i32, i1 immarg) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll new file mode 100644 index 0000000000000..0a1c85af00cda --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_negative.ll @@ -0,0 +1,53 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
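+; NOTE: The "negative" test below checks a builtin that should not be
+; inlined: rhadd has no generic LLVM intrinsic counterpart, so the call to
+; _Z5rhaddjj is expected to survive builtin-inlining unchanged. Since the
+; RUN line passes -k test_rhadd, only that kernel is vectorized, hence the
+; CHECK-NOT on __vecz_v4_test_normalize.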
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_rhadd -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_normalize(float %a, float %b, i32* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %norm = call spir_func float @_Z9normalizef(float %a) + %normi = fptosi float %norm to i32 + %c0 = getelementptr i32, i32* %c, i64 %gid + store i32 %normi, i32* %c0, align 4 + ret void +} + +define spir_kernel void @test_rhadd(i32 %a, i32 %b, i32* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %add = call spir_func i32 @_Z5rhaddjj(i32 %a, i32 %b) + %c0 = getelementptr i32, i32* %c, i64 %gid + store i32 %add, i32* %c0, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare spir_func float @_Z9normalizef(float) +declare spir_func i32 @_Z5rhaddjj(i32, i32) + +; CHECK-NOT: define spir_kernel void @__vecz_v4_test_normalize(float %a, float %b, ptr %c) + +; CHECK: define spir_kernel void @__vecz_v4_test_rhadd(i32 %a, i32 %b, ptr %c) +; CHECK: entry: +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %add = call spir_func i32 @_Z5rhaddjj(i32 %a, i32 %b) +; CHECK: %c0 = getelementptr i32, ptr %c, i64 %gid +; CHECK: store i32 %add, ptr %c0, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll new file mode 100644 index 0000000000000..379428725bb39 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_inlining_positive.ll @@ -0,0 +1,67 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
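+; NOTE: The "positive" test below checks relational builtins that inline to
+; a bare compare plus zext: isgreater -> fcmp ogt, isless -> fcmp olt,
+; isequal -> fcmp oeq; the non-builtin opt_Z7isequalff is inlined outright.
+; An illustrative OpenCL C shape (an assumption, not the original source):
+;
+;   kernel void test(float a, float b, global int *c) {
+;     size_t gid = get_global_id(0);
+;     c[gid]     = isgreater(a, b); // fcmp ogt + zext
+;     c[gid + 1] = isless(a, b);    // fcmp olt + zext
+;     c[gid + 2] = isequal(a, b);   // fcmp oeq + zext
+;   }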
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(float %a, float %b, i32* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %cmp = call spir_func i32 @_Z9isgreaterff(float %a, float %b) + %c0 = getelementptr i32, i32* %c, i64 %gid + store i32 %cmp, i32* %c0, align 4 + %cmp1 = call spir_func i32 @_Z6islessff(float %a, float %b) + %c1 = getelementptr i32, i32* %c0, i32 1 + store i32 %cmp1, i32* %c1, align 4 + %cmp2 = call spir_func i32 @_Z7isequalff(float %a, float %b) + %c2 = getelementptr i32, i32* %c0, i32 2 + store i32 %cmp2, i32* %c2, align 4 + %cmp3 = call spir_func i32 @opt_Z7isequalff(float %a, float %b) + %c3 = getelementptr i32, i32* %c0, i32 3 + store i32 %cmp3, i32* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z9isgreaterff(float, float) +declare spir_func i32 @_Z6islessff(float, float) +declare spir_func i32 @_Z7isequalff(float, float) + +; Test that a non-builtin function is inlined. +define spir_func i32 @opt_Z7isequalff(float, float) { + ret i32 zeroinitializer +} + +; CHECK: define spir_kernel void @__vecz_v4_test(float %a, float %b, ptr %c) +; CHECK: entry: +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %relational = fcmp ogt float %a, %b +; CHECK: %relational[[R1:[0-9]+]] = zext i1 %relational to i32 +; CHECK: %c0 = getelementptr i32, ptr %c, i64 %gid +; CHECK: store i32 %relational[[R1]], ptr %c0, align 4 +; CHECK: %relational[[R2:[0-9]+]] = fcmp olt float %a, %b +; CHECK: %relational[[R3:[0-9]+]] = zext i1 %relational[[R2:[0-9]+]] to i32 +; CHECK: %c1 = getelementptr i32, ptr %c0, {{(i32|i64)}} 1 +; CHECK: store i32 %relational[[R3:[0-9]+]], ptr %c1, align 4 +; CHECK: %relational[[R4:[0-9]+]] = fcmp oeq float %a, %b +; CHECK: %relational[[R5:[0-9]+]] = zext i1 %relational[[R4:[0-9]+]] to i32 +; CHECK: %c2 = getelementptr i32, ptr %c0, {{(i32|i64)}} 2 +; CHECK: store i32 %relational[[R5:[0-9]+]], ptr %c2, align 4 +; CHECK: %c3 = getelementptr i32, ptr %c0, {{(i32|i64)}} 3 +; CHECK: store i32 0, ptr %c3, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll new file mode 100644 index 0000000000000..d6bc1e0d2c71d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/builtin_pointer_return.ll @@ -0,0 +1,65 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
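+; NOTE: The fract tests below exercise a builtin with a pointer result
+; parameter (fract writes the integral part through its second argument).
+; As the FIXME in the file records, the packetizer currently instantiates
+; one scalar (or <2 x float>) call per lane rather than using the wider
+; vector declarations that are present in the module.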
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +declare i64 @__mux_get_global_id(i32) + +declare spir_func float @_Z5fractfPf(float, float*) +declare spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float>, <2 x float>*) +declare spir_func <4 x float> @_Z5fractDv4_fPS_(<4 x float>, <4 x float>*) +declare spir_func <8 x float> @_Z5fractDv8_fPS_(<8 x float>, <8 x float>*) + +; FIXME: Both of these are instantiating when we have vector equivalents. + +define spir_kernel void @fract_v1(float* %xptr, float* %outptr, float* %ioutptr) { + %iouta = alloca float + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidx.x = getelementptr inbounds float, float* %xptr, i64 %idx + %x = load float, float* %arrayidx.x, align 4 + %out = call spir_func float @_Z5fractfPf(float %x, float* %iouta) + %arrayidx.out = getelementptr inbounds float, float* %outptr, i64 %idx + %arrayidx.iout = getelementptr inbounds float, float* %ioutptr, i64 %idx + store float %out, float* %arrayidx.out, align 4 + %iout = load float, float* %iouta, align 4 + store float %iout, float* %arrayidx.iout, align 4 + ret void +; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}}) +; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}}) +; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}}) +; CHECK: call spir_func float @_Z5fractfPf(float {{%.*}}, ptr nonnull {{%.*}}) +} + +define spir_kernel void @fract_v2(<2 x float>* %xptr, <2 x float>* %outptr, <2 x float>* %ioutptr) { + %iouta = alloca <2 x float> + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidx.x = getelementptr inbounds <2 x float>, <2 x float>* %xptr, i64 %idx + %x = load <2 x float>, <2 x float>* %arrayidx.x, align 8 + %out = call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> %x, <2 x float>* %iouta) + %arrayidx.out = getelementptr inbounds <2 x float>, <2 x float>* %outptr, i64 %idx + %arrayidx.iout = getelementptr inbounds <2 x float>, <2 x float>* %ioutptr, i64 %idx + store <2 x float> %out, <2 x float>* %arrayidx.out, align 8 + %iout = load <2 x float>, <2 x float>* %iouta, align 8 + store <2 x float> %iout, <2 x float>* %arrayidx.iout, align 8 + ret void +; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}}) +; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}}) +; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}}) +; CHECK: call spir_func <2 x float> @_Z5fractDv2_fPS_(<2 x float> {{%.*}}, ptr nonnull {{%.*}}) +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation.ll new file mode 100644 index 0000000000000..6ee06a5479108 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/call_instantiation.ll @@ -0,0 +1,160 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
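+; NOTE: The call_instantiation tests below cover the packetizer's main call
+; handling strategies: known intrinsics (llvm.fmuladd) are widened to their
+; vector forms, builtins with a vector declaration in the module (_Z3absi)
+; are redirected to it, and opaque or noinline user functions are
+; instantiated once per lane (four scalar calls at width 4). A noduplicate
+; callee blocks packetization of its kernel entirely, while optnone is
+; deliberately ignored at this pass level, as the in-file comments explain.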
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Kernels + +; We should be able to handle intrinsics +; CHECK-LABEL: define spir_kernel void @__vecz_v4_instrinsic(ptr %in1, ptr %in2, ptr %in3, ptr %out) +; CHECK: call <4 x float> @llvm.fmuladd.v4f32(<4 x float> {{%.*}}, <4 x float> {{%.*}}, <4 x float> {{%.*}}) +define spir_kernel void @instrinsic(ptr %in1, ptr %in2, ptr %in3, ptr %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr %in1, i64 %call + %0 = load float, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %in2, i64 %call + %1 = load float, ptr %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds float, ptr %in3, i64 %call + %2 = load float, ptr %arrayidx2, align 4 + %3 = tail call float @llvm.fmuladd.f32(float %0, float %1, float %2) + %arrayidx3 = getelementptr inbounds float, ptr %out, i64 %call + store float %3, ptr %arrayidx3, align 4 + ret void +} + +; We should be able to handle builtins for which we have a vector declaration +; in the module. +; CHECK-LABEL: define spir_kernel void @__vecz_v4_builtin(ptr %in, ptr %out) +; CHECK: = call spir_func <4 x i32> @_Z3absDv4_i(<4 x i32> {{%.*}}) +define spir_kernel void @builtin(ptr %in, ptr %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr %in, i64 %call + %0 = load i32, ptr %arrayidx, align 4 + %call1 = tail call spir_func i32 @_Z3absi(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr %out, i64 %call + store i32 %call1, ptr %arrayidx2, align 4 + ret void +} + +; We should be able to handle user functions for which we have a definition +; CHECK-LABEL: define spir_kernel void @__vecz_v4_user_defined(ptr %in, ptr %out) +; CHECK: call spir_func void @defined(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @defined(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @defined(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @defined(ptr {{%.*}}, ptr {{%.*}}) +define spir_kernel void @user_defined(ptr %in, ptr %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %add.ptr = getelementptr inbounds i32, ptr %in, i64 %call + %add.ptr1 = getelementptr inbounds i32, ptr %out, i64 %call + call spir_func void @defined(ptr %add.ptr, ptr %add.ptr1) + ret void +} + +; We should be able to handle user functions (or builtins) for which we have no +; definition +; CHECK-LABEL: define spir_kernel void @__vecz_v4_user_undefined(ptr %in, ptr %out) +; CHECK: call spir_func void @undefined(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @undefined(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @undefined(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @undefined(ptr {{%.*}}, ptr {{%.*}}) +define spir_kernel void @user_undefined(ptr %in, ptr %out) { +entry: + %call = tail call i64 
@__mux_get_global_id(i32 0) + %add.ptr = getelementptr inbounds i32, ptr %in, i64 %call + %add.ptr1 = getelementptr inbounds i32, ptr %out, i64 %call + call spir_func void @undefined(ptr %add.ptr, ptr %add.ptr1) + ret void +} + +; We should be able to handle user functions (or builtins) which we can't +; inline +; CHECK-LABEL: define spir_kernel void @__vecz_v4_cantinline(ptr %in, ptr %out) +; CHECK: call spir_func void @dontinline(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @dontinline(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @dontinline(ptr {{%.*}}, ptr {{%.*}}) +; CHECK: call spir_func void @dontinline(ptr {{%.*}}, ptr {{%.*}}) +define spir_kernel void @cantinline(ptr %in, ptr %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %add.ptr = getelementptr inbounds i32, ptr %in, i64 %call + %add.ptr1 = getelementptr inbounds i32, ptr %out, i64 %call + call spir_func void @dontinline(ptr %add.ptr, ptr %add.ptr1) + ret void +} + +; If we can't duplicate a function, we can't packetize it. +; CHECK-NOT: @__vecz_v4_cantduplicate +define spir_kernel void @cantduplicate(ptr %in, ptr %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr %in, i64 %call + %0 = load i32, ptr %arrayidx, align 4 + %call1 = tail call spir_func i32 @_Z3clzi(i32 %0) #1 + %arrayidx2 = getelementptr inbounds i32, ptr %out, i64 %call + store i32 %call1, ptr %arrayidx2, align 4 + ret void +} + +; The optnone attribute has no impact when directly running the packetizer +; pass. The higher-level vectorization factor decisions must take this into +; account instead. +; CHECK-LABEL: define spir_kernel void @__vecz_v4_optnone(ptr %in, ptr %out) +define spir_kernel void @optnone(ptr %in, ptr %out) #2 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr %in, i64 %call + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, ptr %out, i64 %call + store i32 %0, ptr %arrayidx1, align 4 + ret void +} + +; Declaration only functions + +declare float @llvm.fmuladd.f32(float, float, float) +declare spir_func i32 @_Z3absi(i32) +declare spir_func <4 x i32> @_Z3absDv4_i(<4 x i32>) +declare spir_func i32 @_Z3clzi(i32) #1 +declare i64 @__mux_get_global_id(i32) +declare spir_func void @undefined(ptr, ptr) + +; Functions with definitions + +define spir_func void @defined(ptr %in, ptr %out) { +entry: + %0 = load i32, ptr %in, align 4 + store i32 %0, ptr %out, align 4 + ret void +} + +define spir_func void @dontinline(ptr %in, ptr %out) #0 { +entry: + %0 = load i32, ptr %in, align 4 + store i32 %0, ptr %out, align 4 + ret void +} + +; Attributes + +attributes #0 = { noinline } +attributes #1 = { noduplicate } +attributes #2 = { optnone noinline } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll new file mode 100644 index 0000000000000..2df00a15e33cf --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/cmpxchg.ll @@ -0,0 +1,92 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
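+; NOTE: In the cmpxchg test below, a scalar cmpxchg returning { i32, i1 } is
+; packetized into a call to a masked builtin returning { <4 x i32>, <4 x i1> },
+; with an all-true <4 x i1> mask because the original operation is
+; unconditional. The uniform pointer %p is first splatted to <4 x ptr>, and
+; the insertvalue/extractvalue patterns check that literal structs mixing
+; uniform and varying members are rebuilt correctly.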
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p, ptr %q, ptr %r) +define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) { +entry: +; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0 +; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer + %call = call i64 @__mux_get_global_id(i32 0) + +; Test that this cmpxchg is packetized by generating a call to an all-true masked version. +; CHECK: [[A0:%.*]] = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> {{<(i32 1(, )?)+>|splat \(i32 1\)}}, +; CHECK-SAME: <4 x i32> {{<(i32 2(, )?)+>|splat \(i32 2\)}}, +; CHECK-SAME: <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}} + %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic +; CHECK: [[EXT0:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[A0]], 0 + %val0 = extractvalue { i32, i1 } %old0, 0 +; CHECK: [[EXT1:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[A0]], 1 + %success0 = extractvalue { i32, i1 } %old0, 1 + + %out = getelementptr i32, ptr %q, i64 %call +; Stored as a vector +; CHECK: store <4 x i32> [[EXT0]], ptr + store i32 %val0, ptr %out, align 4 + +; CHECK: [[PTR:%.*]] = getelementptr i8, ptr %r, i64 %call + %outsuccess = getelementptr i8, ptr %r, i64 %call +; CHECK: [[ZEXT0:%.*]] = zext <4 x i1> [[EXT1]] to <4 x i8> + %outbyte = zext i1 %success0 to i8 +; Stored as a vector +; CHECK: store <4 x i8> [[ZEXT0]], ptr [[PTR]], align 1 + store i8 %outbyte, ptr %outsuccess, align 1 + + ; Test a couple of insert/extract patterns + + ; Test inserting a uniform value into a varying literal struct +; CHECK: [[INS0:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[A0]], <4 x i1> zeroinitializer, 1 +; CHECK: [[EXT2:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS0]], 1 +; CHECK: [[ZEXT1:%.*]] = zext <4 x i1> [[EXT2]] to <4 x i8> +; CHECK: store <4 x i8> [[ZEXT1]], ptr [[PTR]], align 1 + %testinsertconst = insertvalue { i32, i1 } %old0, i1 false, 1 + %testextract0 = extractvalue { i32, i1 } %testinsertconst, 1 + %outbyte0 = zext i1 %testextract0 to i8 + store i8 %outbyte0, ptr %outsuccess, align 1 + + ; Test inserting a varying value into a varying literal struct +; CHECK: [[LD:%.*]] = load <4 x i8>, ptr +; CHECK: [[VBOOL:%.*]] = trunc <4 x i8> [[LD]] to <4 x i1> +; CHECK: [[INS1:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[A0]], <4 x i1> [[VBOOL]], 1 +; CHECK: [[EXT3:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS1]], 1 +; CHECK: [[ZEXT2:%.*]] = zext <4 x i1> [[EXT3]] to <4 x i8> +; CHECK: store <4 x i8> [[ZEXT2]], ptr [[PTR]], align 1 + %byte1 = load i8, ptr %outsuccess, align 1 + %bool1 = trunc i8 %byte1 to i1 + %testinsertvarying0 = insertvalue { i32, i1 } %old0, i1 %bool1, 1 + %testextract1 = extractvalue { 
i32, i1 } %testinsertvarying0, 1 + %outbyte1 = zext i1 %testextract1 to i8 + store i8 %outbyte1, ptr %outsuccess, align 1 + + ; Test inserting a varying value into a uniform literal struct +; CHECK: [[INS2:%.*]] = insertvalue { <4 x i32>, <4 x i1> } poison, <4 x i1> [[VBOOL]], 1 +; CHECK: [[EXT4:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS2]], 1 +; CHECK: [[ZEXT3:%.*]] = zext <4 x i1> [[EXT4]] to <4 x i8> +; CHECK: store <4 x i8> [[ZEXT3]], ptr [[PTR]], align 1 + %testinsertvarying1 = insertvalue { i32, i1 } poison, i1 %bool1, 1 + %testextract2 = extractvalue { i32, i1 } %testinsertvarying1, 1 + %outbyte2 = zext i1 %testextract2 to i8 + store i8 %outbyte2, ptr %outsuccess, align 1 + + ret void +} + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll new file mode 100644 index 0000000000000..0894b60d9fc7a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address.ll @@ -0,0 +1,58 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(1)* %out) #0 { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) #1 + %conv = trunc i64 %gid to i32 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3 + store i32 %conv, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } + +!opencl.kernels = !{!0} +!opencl.spir.version = !{!7} +!opencl.ocl.version = !{!7} +!opencl.used.extensions = !{!8} +!opencl.used.optional.core.features = !{!8} +!opencl.compiler.options = !{!8} + +!0 = !{void (i32 addrspace(1)*)* @test, !1, !2, !3, !4, !5, !6} +!1 = !{!"kernel_arg_addr_space", i32 1} +!2 = !{!"kernel_arg_access_qual", !"none"} +!3 = !{!"kernel_arg_type", !"int*"} +!4 = !{!"kernel_arg_base_type", !"int*"} +!5 = !{!"kernel_arg_type_qual", !""} +!6 = !{!"kernel_arg_name", !"out"} +!7 = !{i32 1, i32 2} +!8 = !{} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK-NEXT: entry: +; CHECK-NEXT: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: %conv = trunc i64 %gid to i32 +; CHECK-NEXT: %arrayidx = getelementptr inbounds {{(nuw )?}}{{i32|i8}}, ptr addrspace(1) %out, i64 {{3|12}} +; CHECK-NEXT: store i32 %conv, ptr addrspace(1) %arrayidx, align 4 diff --git 
a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll new file mode 100644 index 0000000000000..e2d1ef91aec8e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/constant_address_with_uniform.ll @@ -0,0 +1,41 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare spir_func i32 @__mux_get_global_id(i32); + +define spir_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %out2) { +entry: + %gid = call i32 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 3 + store i32 %gid, i32 addrspace(1)* %arrayidx, align 4 + + %arrayidx2 = getelementptr inbounds i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %out2, i32 %gid + store i32 addrspace(1)* %arrayidx, i32 addrspace(1)* addrspace(1)* %arrayidx2, align 4 + + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK-NEXT: entry: +; CHECK-NEXT: %gid = call i32 @__mux_get_global_id(i32 0) +; CHECK-NEXT: %arrayidx = getelementptr inbounds {{(nuw )?}}{{i32|i8}}, ptr addrspace(1) %out, i32 {{3|12}} +; CHECK: store i32 %gid, ptr addrspace(1) %arrayidx, align 4 +; CHECK: store <4 x ptr addrspace(1)> %{{.+}}, ptr addrspace(1) %{{.+}} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll new file mode 100644 index 0000000000000..bcc6bfd84b57a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/contiguous_allocas.ll @@ -0,0 +1,74 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
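+; NOTE: The contiguous_allocas test below checks that, under the
+; FullScalarization choice at width 4, each per-work-item <2 x float> alloca
+; is widened into a single <8 x float> alloca (2 elements x 4 lanes), and
+; that all allocas are grouped at the top of the entry block, as the
+; CHECK-NEXT lines require.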
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -vecz-auto -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@entry_test_alloca.lm = external unnamed_addr addrspace(3) constant [16 x <2 x float>], align 8 + +define spir_kernel void @test(<2 x float> addrspace(1)* nocapture readonly %in, <2 x float> addrspace(1)* nocapture %out, i32 %offset) local_unnamed_addr { +entry: + %a.sroa.0 = alloca <2 x float>, align 8 + %b.sroa.2 = alloca <2 x float>, align 8 + %call = tail call i64 @__mux_get_global_id(i32 0) + %call1 = tail call i64 @__mux_get_local_id(i32 0) + %a.sroa.0.0..sroa_cast = bitcast <2 x float>* %a.sroa.0 to i8* + %b.sroa.2.0..sroa_cast = bitcast <2 x float>* %b.sroa.2 to i8* + %arrayidx2 = getelementptr inbounds [16 x <2 x float>], [16 x <2 x float>] addrspace(3)* @entry_test_alloca.lm, i64 0, i64 %call1 + %0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx2, align 8 + %conv = sext i32 %offset to i64 + %add = add i64 %call1, %conv + %arrayidx4 = getelementptr inbounds [16 x <2 x float>], [16 x <2 x float>] addrspace(3)* @entry_test_alloca.lm, i64 0, i64 %add + %1 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx4, align 8 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup10 + %mul.le.le = fmul <2 x float> %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0., %b.sroa.2.0.b.sroa.2.0.b.sroa.2.8. + %arrayidx17 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i64 %call + store <2 x float> %mul.le.le, <2 x float> addrspace(1)* %arrayidx17, align 8 + ret void + +for.body: ; preds = %for.cond.cleanup10, %entry + %i.038 = phi i32 [ 0, %entry ], [ %inc15, %for.cond.cleanup10 ] + store volatile <2 x float> %0, <2 x float>* %a.sroa.0, align 8 + store volatile <2 x float> %1, <2 x float>* %b.sroa.2, align 8 + br label %for.body11 + +for.cond.cleanup10: ; preds = %for.body11 + %inc15 = add nuw nsw i32 %i.038, 1 + %cmp = icmp ult i32 %inc15, 16 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body11: ; preds = %for.body11, %for.body + %i6.037 = phi i32 [ 0, %for.body ], [ %inc, %for.body11 ] + %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0. = load volatile <2 x float>, <2 x float>* %a.sroa.0, align 8 + %b.sroa.2.0.b.sroa.2.0.b.sroa.2.8. 
= load volatile <2 x float>, <2 x float>* %b.sroa.2, align 8 + %inc = add nuw nsw i32 %i6.037, 1 + %cmp8 = icmp ult i32 %inc, 16 + br i1 %cmp8, label %for.body11, label %for.cond.cleanup10 +} + +declare i64 @__mux_get_global_id(i32) local_unnamed_addr +declare i64 @__mux_get_local_id(i32) local_unnamed_addr + +; Check that all the allocas come before anything else +; CHECK: define spir_kernel void @__vecz_v4_test( +; CHECK-NEXT: entry: +; CHECK-NEXT: %a.sroa.{{[0-9]+}} = alloca <8 x float>, align 16 +; CHECK-NEXT: %b.sroa.{{[0-9]+}} = alloca <8 x float>, align 16 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll new file mode 100644 index 0000000000000..7108df3732999 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_nested_loops.ll @@ -0,0 +1,208 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_nested_loops -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_uniform_if(i32 %a, i32* %b) { +entry: + %cmp = icmp eq i32 %a, 1 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42 + store i32 13, i32* %arrayidx1, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +define spir_kernel void @test_varying_if(i32 %a, i32* %b) { +entry: + %conv = sext i32 %a to i64 + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %conv, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42 + store i32 13, i32* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +define spir_kernel void @test_uniform_loop(i32 %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %storemerge, 16 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %add = add nsw i32 %storemerge, %a + %add2 = add nsw i32 %storemerge, %conv + %idxprom = sext i32 %add2 to 
i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 %add, i32* %arrayidx, align 4 + %inc = add nsw i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define spir_kernel void @test_varying_loop(i32 %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %sub = sub nsw i32 16, %conv + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %storemerge, 16 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %add = add nsw i32 %storemerge, %a + %add2 = add nsw i32 %storemerge, %conv + %idxprom = sext i32 %add2 to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 %add, i32* %arrayidx, align 4 + %inc = add nsw i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define spir_kernel void @test_nested_loops(i32* %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %sub = sub nsw i32 16, %conv + br label %for.cond + +for.cond: ; preds = %for.inc12, %entry + %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ] + %cmp = icmp slt i32 %storemerge, 16 + br i1 %cmp, label %for.body, label %for.end14 + +for.body: ; preds = %for.cond + %sub2 = sub nsw i32 24, %conv + br label %for.cond3 + +for.cond3: ; preds = %for.body6, %for.body + %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ] + %cmp4 = icmp slt i32 %storemerge, 24 + br i1 %cmp4, label %for.body6, label %for.inc12 + +for.body6: ; preds = %for.cond3 + %add = add nsw i32 %storemerge1, %conv + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %add7 = add i32 %storemerge1, %storemerge + %add8 = add i32 %add7, %0 + %add9 = add nsw i32 %storemerge, %conv + %idxprom10 = sext i32 %add9 to i64 + %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10 + store i32 %add8, i32* %arrayidx11, align 4 + %inc = add nsw i32 %storemerge1, 1 + br label %for.cond3 + +for.inc12: ; preds = %for.cond3 + %inc13 = add nsw i32 %storemerge, 1 + br label %for.cond + +for.end14: ; preds = %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; A nested loop, in the form of +; +; int gid = get_global_id(0); +; for (int i = 16 - gid; i < 16; ++i) { +; for (int j = 24 - gid; i < 24; ++j) { +; b[i + gid] = a[j + gid] + i + j; +; } +; } +; +; The important bit is that both of the loops have their iterations dependent on +; the global ID +; CHECK: define spir_kernel void @__vecz_v4_test_nested_loops(ptr %a, ptr %b) +; CHECK: entry: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[ENTRYMASK_FORCOND:.+]] = phi i1 [ true, %entry ], [ %[[FORINC12EXITMASK3:.+]], %[[FORINC12:.+]] ] +; CHECK: %[[EXITMASK1:.+]] = phi i1 [ false, %entry ], [ %[[LOOPEXITMASK2:.+]], %[[FORINC12]] ] +; CHECK: %[[CMP:.+]] = icmp slt i32 %[[STOREMERGE:.+]], 16 +; CHECK: %[[EDGEMASK_FORBODY:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[CMP]], i1 false +; CHECK: %[[NOT_CMP:.+]] = xor i1 %[[CMP]], true +; CHECK: %[[EDGEMASK_FOREND14:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[NOT_CMP]], i1 false +; CHECK: %[[LOOPEXITMASK2]] = or i1 %[[EXITMASK1]], %[[EDGEMASK_FOREND14]] +; CHECK: br label %[[FORBODY:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND3:.+]] + +; CHECK: [[FORCOND3]]: +; CHECK: 
%[[ENTRYMASK_FORCOND3:.+]] = phi i1 [ %[[EDGEMASK_FORBODY:.+]], %[[FORBODY]] ], [ %[[FORBODY6EXITMASK:.+]], %[[FORBODY6:.+]] ] +; CHECK: %[[PREVEXITMASK:.+]] = phi i1 [ false, %[[FORBODY]] ], [ %[[FORINC12LOOPEXITMASKUPDATE:.+]], %[[FORBODY6]] ] +; CHECK: %[[CMP4:.+]] = icmp slt i32 %[[STOREMERGE]], 24 +; CHECK: %[[EDGEMASK_FORBODY6:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[CMP4]], i1 false +; CHECK: %[[NOT_CMP4:.+]] = xor i1 %[[CMP4]], true +; CHECK: %[[EDGEMASK_FORINC12:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[NOT_CMP4]], i1 false +; CHECK: %[[FORINC12LOOPEXITMASKUPDATE]] = or i1 %[[PREVEXITMASK]], %[[EDGEMASK_FORINC12]] +; CHECK: br label %[[FORBODY6:.+]] + +; CHECK: [[FORBODY6]]: +; CHECK: %[[MGL:.+]] = call i32 @__vecz_b_masked_load4_ju3ptrb(ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]]) +; CHECK: %[[ADD8:.+]] = add i32 %{{.+}}, %[[MGL]] +; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 %[[ADD8]], ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]]) +; CHECK: %[[FORBODY6EXITMASK_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORBODY6EXITMASK]]) +; CHECK: br i1 %[[FORBODY6EXITMASK_ANY]], label %[[FORCOND3:.+]], label %[[FORINC12:.+]] + +; CHECK: [[FORINC12]]: +; CHECK: %[[FORINC12LOOPEXITMASKUPDATE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORINC12LOOPEXITMASKUPDATE]]) +; CHECK: br i1 %[[FORINC12LOOPEXITMASKUPDATE_ANY]], label %[[FORCOND:.+]], label %[[FOREND14:.+]] + +; CHECK: [[FOREND14]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll new file mode 100644 index 0000000000000..0384d9959e24a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_y.ll @@ -0,0 +1,208 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
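+; NOTE: The file below repeats the nested-loop CFG-conversion test, but the
+; kernels take their global ID from dimension 1 and the RUN line passes
+; -d 1, which appears to select the work-item dimension to vectorize along.
+; The expected entry/exit mask and divergence structure is otherwise
+; identical to the dimension-0 variant above.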
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_nested_loops -vecz-passes=cfg-convert -vecz-simd-width=4 -d 1 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_uniform_if(i32 %a, i32* %b) { +entry: + %cmp = icmp eq i32 %a, 1 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42 + store i32 13, i32* %arrayidx1, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +define spir_kernel void @test_varying_if(i32 %a, i32* %b) { +entry: + %conv = sext i32 %a to i64 + %call = call i64 @__mux_get_global_id(i32 1) + %cmp = icmp eq i64 %conv, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42 + store i32 13, i32* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +define spir_kernel void @test_uniform_loop(i32 %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 1) + %conv = trunc i64 %call to i32 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %storemerge, 16 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %add = add nsw i32 %storemerge, %a + %add2 = add nsw i32 %storemerge, %conv + %idxprom = sext i32 %add2 to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 %add, i32* %arrayidx, align 4 + %inc = add nsw i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define spir_kernel void @test_varying_loop(i32 %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 1) + %conv = trunc i64 %call to i32 + %sub = sub nsw i32 16, %conv + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %storemerge, 16 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %add = add nsw i32 %storemerge, %a + %add2 = add nsw i32 %storemerge, %conv + %idxprom = sext i32 %add2 to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 %add, i32* %arrayidx, align 4 + %inc = add nsw i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +define spir_kernel void @test_nested_loops(i32* %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 1) + %conv = trunc i64 %call to i32 + %sub = sub nsw i32 16, %conv + br label %for.cond + +for.cond: ; preds = %for.inc12, %entry + %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ] + %cmp = icmp slt i32 %storemerge, 16 + br i1 %cmp, label %for.body, label %for.end14 + +for.body: ; preds = %for.cond + %sub2 = sub nsw i32 24, %conv + br label %for.cond3 + +for.cond3: ; preds = %for.body6, %for.body + %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ] + %cmp4 = icmp slt i32 %storemerge, 24 + br i1 %cmp4, label %for.body6, label 
%for.inc12 + +for.body6: ; preds = %for.cond3 + %add = add nsw i32 %storemerge1, %conv + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %add7 = add i32 %storemerge1, %storemerge + %add8 = add i32 %add7, %0 + %add9 = add nsw i32 %storemerge, %conv + %idxprom10 = sext i32 %add9 to i64 + %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10 + store i32 %add8, i32* %arrayidx11, align 4 + %inc = add nsw i32 %storemerge1, 1 + br label %for.cond3 + +for.inc12: ; preds = %for.cond3 + %inc13 = add nsw i32 %storemerge, 1 + br label %for.cond + +for.end14: ; preds = %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; A nested loop, in the form of +; +; int gid = get_global_id(1); +; for (int i = 16 - gid; i < 16; ++i) { +; for (int j = 24 - gid; i < 24; ++j) { +; b[i + gid] = a[j + gid] + i + j; +; } +; } +; +; The important bit is that both of the loops have their iterations dependent on +; the global ID +; CHECK: define spir_kernel void @__vecz_v4_test_nested_loops(ptr %a, ptr %b) +; CHECK: entry: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[ENTRYMASK_FORCOND:.+]] = phi i1 [ true, %entry ], [ %[[FORINC12EXITMASK3:.+]], %[[FORINC12:.+]] ] +; CHECK: %[[EXITMASK1:.+]] = phi i1 [ false, %entry ], [ %[[LOOPEXITMASK2:.+]], %[[FORINC12]] ] +; CHECK: %[[CMP:.+]] = icmp slt i32 %[[STOREMERGE:.+]], 16 +; CHECK: %[[EDGEMASK_FORBODY:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[CMP]], i1 false +; CHECK: %[[NOT_CMP:.+]] = xor i1 %[[CMP]], true +; CHECK: %[[EDGEMASK_FOREND14:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[NOT_CMP]], i1 false +; CHECK: %[[LOOPEXITMASK2]] = or i1 %[[EXITMASK1]], %[[EDGEMASK_FOREND14]] +; CHECK: br label %[[FORBODY:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND3:.+]] + +; CHECK: [[FORCOND3]]: +; CHECK: %[[ENTRYMASK_FORCOND3:.+]] = phi i1 [ %[[EDGEMASK_FORBODY:.+]], %[[FORBODY]] ], [ %[[FORBODY6EXITMASK:.+]], %[[FORBODY6:.+]] ] +; CHECK: %[[PREVEXITMASK:.+]] = phi i1 [ false, %[[FORBODY]] ], [ %[[FORINC12LOOPEXITMASKUPDATE:.+]], %[[FORBODY6]] ] +; CHECK: %[[CMP4:.+]] = icmp slt i32 %[[STOREMERGE]], 24 +; CHECK: %[[EDGEMASK_FORBODY6:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[CMP4]], i1 false +; CHECK: %[[NOT_CMP4:.+]] = xor i1 %[[CMP4]], true +; CHECK: %[[EDGEMASK_FORINC12:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[NOT_CMP4]], i1 false +; CHECK: %[[FORINC12LOOPEXITMASKUPDATE]] = or i1 %[[PREVEXITMASK]], %[[EDGEMASK_FORINC12]] +; CHECK: br label %[[FORBODY6:.+]] + +; CHECK: [[FORBODY6]]: +; CHECK: %[[MGL:.+]] = call i32 @__vecz_b_masked_load4_ju3ptrb(ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]]) +; CHECK: %[[ADD8:.+]] = add i32 %{{.+}}, %[[MGL]] +; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 %[[ADD8]], ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]]) +; CHECK: %[[FORBODY6EXITMASK_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORBODY6EXITMASK]]) +; CHECK: br i1 %[[FORBODY6EXITMASK_ANY]], label %[[FORCOND3:.+]], label %[[FORINC12:.+]] + +; CHECK: [[FORINC12]]: +; CHECK: %[[FORINC12LOOPEXITMASKUPDATE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORINC12LOOPEXITMASKUPDATE]]) +; CHECK: br i1 %[[FORINC12LOOPEXITMASKUPDATE_ANY]], label %[[FORCOND:.+]], label %[[FOREND14:.+]] + +; CHECK: [[FOREND14]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll 
new file mode 100644
index 0000000000000..e6c92b8290d92
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_order_z.ll
@@ -0,0 +1,208 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test_nested_loops -vecz-passes=cfg-convert -vecz-simd-width=4 -d 2 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) {
+entry:
+ %cmp = icmp eq i32 %a, 1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx1, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
+entry:
+ %conv = sext i32 %a to i64
+ %call = call i64 @__mux_get_global_id(i32 2)
+ %cmp = icmp eq i64 %conv, %call
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx2, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 2)
+ %conv = trunc i64 %call to i32
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 2)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 2)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.inc12, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end14
+
+for.body: ; preds = %for.cond
+ %sub2 = sub nsw i32 24, %conv
+ br label %for.cond3
+
+for.cond3: ; preds = %for.body6, %for.body
+ %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ]
+ %cmp4 = icmp slt i32 %storemerge, 24
+ br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6: ; preds = %for.cond3
+ %add = add nsw i32 %storemerge1, %conv
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %add7 = add i32 %storemerge1, %storemerge
+ %add8 = add i32 %add7, %0
+ %add9 = add nsw i32 %storemerge, %conv
+ %idxprom10 = sext i32 %add9 to i64
+ %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+ store i32 %add8, i32* %arrayidx11, align 4
+ %inc = add nsw i32 %storemerge1, 1
+ br label %for.cond3
+
+for.inc12: ; preds = %for.cond3
+ %inc13 = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end14: ; preds = %for.cond
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; A nested loop, in the form of
+;
+; int gid = get_global_id(2);
+; for (int i = 16 - gid; i < 16; ++i) {
+; for (int j = 24 - gid; i < 24; ++j) {
+; b[i + gid] = a[j + gid] + i + j;
+; }
+; }
+;
+; The important point is that the iteration ranges of both loops depend on
+; the global ID.
+; CHECK: define spir_kernel void @__vecz_v4_test_nested_loops(ptr %a, ptr %b)
+; CHECK: entry:
+; CHECK: br label %[[FORCOND:.+]]
+
+; CHECK: [[FORCOND]]:
+; CHECK: %[[ENTRYMASK_FORCOND:.+]] = phi i1 [ true, %entry ], [ %[[FORINC12EXITMASK3:.+]], %[[FORINC12:.+]] ]
+; CHECK: %[[EXITMASK1:.+]] = phi i1 [ false, %entry ], [ %[[LOOPEXITMASK2:.+]], %[[FORINC12]] ]
+; CHECK: %[[CMP:.+]] = icmp slt i32 %[[STOREMERGE:.+]], 16
+; CHECK: %[[EDGEMASK_FORBODY:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[CMP]], i1 false
+; CHECK: %[[NOT_CMP:.+]] = xor i1 %[[CMP]], true
+; CHECK: %[[EDGEMASK_FOREND14:.+]] = select i1 %[[ENTRYMASK_FORCOND]], i1 %[[NOT_CMP]], i1 false
+; CHECK: %[[LOOPEXITMASK2]] = or i1 %[[EXITMASK1]], %[[EDGEMASK_FOREND14]]
+; CHECK: br label %[[FORBODY:.+]]
+
+; CHECK: [[FORBODY]]:
+; CHECK: br label %[[FORCOND3:.+]]
+
+; CHECK: [[FORCOND3]]:
+; CHECK: %[[ENTRYMASK_FORCOND3:.+]] = phi i1 [ %[[EDGEMASK_FORBODY:.+]], %[[FORBODY]] ], [ %[[FORBODY6EXITMASK:.+]], %[[FORBODY6:.+]] ]
+; CHECK: %[[PREVEXITMASK:.+]] = phi i1 [ false, %[[FORBODY]] ], [ %[[FORINC12LOOPEXITMASKUPDATE:.+]], %[[FORBODY6]] ]
+; CHECK: %[[CMP4:.+]] = icmp slt i32 %[[STOREMERGE]], 24
+; CHECK: %[[EDGEMASK_FORBODY6:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[CMP4]], i1 false
+; CHECK: %[[NOT_CMP4:.+]] = xor i1 %[[CMP4]], true
+; CHECK: %[[EDGEMASK_FORINC12:.+]] = select i1 %[[ENTRYMASK_FORCOND3]], i1 %[[NOT_CMP4]], i1 false
+; CHECK: %[[FORINC12LOOPEXITMASKUPDATE]] = or i1 %[[PREVEXITMASK]], %[[EDGEMASK_FORINC12]]
+; CHECK: br label %[[FORBODY6:.+]]
+
+; CHECK: [[FORBODY6]]:
+; CHECK: %[[MGL:.+]] = call i32 @__vecz_b_masked_load4_ju3ptrb(ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]])
+; CHECK: %[[ADD8:.+]] = add i32 %{{.+}}, %[[MGL]]
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 %[[ADD8]], ptr %{{.+}}, i1 %[[EDGEMASK_FORBODY6]])
+; CHECK: %[[FORBODY6EXITMASK_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORBODY6EXITMASK]])
+; CHECK: br i1 %[[FORBODY6EXITMASK_ANY]], label %[[FORCOND3:.+]], label %[[FORINC12:.+]]
+
+; CHECK: [[FORINC12]]:
+; CHECK: %[[FORINC12LOOPEXITMASKUPDATE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[FORINC12LOOPEXITMASKUPDATE]])
+; CHECK: br i1 %[[FORINC12LOOPEXITMASKUPDATE_ANY]], label %[[FORCOND:.+]], label %[[FOREND14:.+]]
+
+; CHECK: [[FOREND14]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
new file mode 100644
index 0000000000000..99c0a220d0727
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_ptrs.ll
@@ -0,0 +1,54 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes=cfg-convert,define-builtins -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @test_varying_if_ptr(i32 %a, ptr %b, ptr %on_true, ptr %on_false) {
+entry:
+ %conv = sext i32 %a to i64
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %cmp = icmp eq i64 %conv, %call
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds ptr, ptr %b, i64 %idxprom
+ store ptr %on_true, ptr %arrayidx, align 4
+ br label %if.end
+
+if.else:
+ %arrayidx2 = getelementptr inbounds ptr, ptr %b, i64 42
+ store ptr %on_false, ptr %arrayidx2, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+; CHECK: define void @__vecz_b_masked_store4_u3ptru3ptrb(ptr [[A:%.*]], ptr [[B:%.*]], i1 [[MASK:%.*]]) [[ATTRS:#[0-9]+]] {
+; CHECK: br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]]
+; CHECK: [[IF]]:
+; CHECK-NEXT: store ptr [[A]], ptr [[B]], align 4
+; CHECK-NEXT: br label %[[EXIT]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
new file mode 100644
index 0000000000000..b8a23afb5a39c
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_if.ll
@@ -0,0 +1,168 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test_uniform_if -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) {
+entry:
+ %cmp = icmp eq i32 %a, 1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx1, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
+entry:
+ %conv = sext i32 %a to i64
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %cmp = icmp eq i64 %conv, %call
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx2, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.inc12, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end14
+
+for.body: ; preds = %for.cond
+ %sub2 = sub nsw i32 24, %conv
+ br label %for.cond3
+
+for.cond3: ; preds = %for.body6, %for.body
+ %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ]
+ %cmp4 = icmp slt i32 %storemerge, 24
+ br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6: ; preds = %for.cond3
+ %add = add nsw i32 %storemerge1, %conv
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %add7 = add i32 %storemerge1, %storemerge
+ %add8 = add i32 %add7, %0
+ %add9 = add nsw i32 %storemerge, %conv
+ %idxprom10 = sext i32 %add9 to i64
+ %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+ store i32 %add8, i32* %arrayidx11, align 4
+ %inc = add nsw i32 %storemerge1, 1
+ br label %for.cond3
+
+for.inc12: ; preds = %for.cond3
+ %inc13 = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end14: ; preds = %for.cond
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; This tests a uniform if statement that shouldn't be touched by the CFC pass
+; CHECK: define spir_kernel void @__vecz_v4_test_uniform_if(i32 %a, ptr %b)
+; CHECK: br i1 %cmp, label %if.then, label %if.else
+
+; CHECK: if.then:
+; CHECK: store i32 11, ptr %arrayidx, align 4
+
+; CHECK: if.else:
+; CHECK: store i32 13, ptr %arrayidx1, align 4
+
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
new file mode 100644
index 0000000000000..508d105fa78f7
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_uniform_loop.ll
@@ -0,0 +1,176 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test_uniform_loop -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) {
+entry:
+ %cmp = icmp eq i32 %a, 1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx1, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
+entry:
+ %conv = sext i32 %a to i64
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %cmp = icmp eq i64 %conv, %call
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx2, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.inc12, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end14
+
+for.body: ; preds = %for.cond
+ %sub2 = sub nsw i32 24, %conv
+ br label %for.cond3
+
+for.cond3: ; preds = %for.body6, %for.body
+ %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ]
+ %cmp4 = icmp slt i32 %storemerge, 24
+ br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6: ; preds = %for.cond3
+ %add = add nsw i32 %storemerge1, %conv
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %add7 = add i32 %storemerge1, %storemerge
+ %add8 = add i32 %add7, %0
+ %add9 = add nsw i32 %storemerge, %conv
+ %idxprom10 = sext i32 %add9 to i64
+ %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+ store i32 %add8, i32* %arrayidx11, align 4
+ %inc = add nsw i32 %storemerge1, 1
+ br label %for.cond3
+
+for.inc12: ; preds = %for.cond3
+ %inc13 = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end14: ; preds = %for.cond
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; This tests a uniform loop that should remain untouched by the CFC pass
+; CHECK: define spir_kernel void @__vecz_v4_test_uniform_loop(i32 %a, ptr %b)
+; CHECK: br label %for.cond
+
+; CHECK: for.cond:
+; CHECK: %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+; CHECK: %cmp = icmp slt i32 %storemerge, 16
+; CHECK: br i1 %cmp, label %for.body, label %for.end
+
+; CHECK: for.body:
+; CHECK: %add = add nsw i32 %storemerge, %a
+; CHECK: %idxprom = sext i32 %add2 to i64
+; CHECK: %arrayidx = getelementptr i32, ptr %b, i64 %idxprom
+; CHECK: store i32 %add, ptr %arrayidx, align 4
+; CHECK: %inc = add nsw i32 %storemerge, 1
+; CHECK: br label %for.cond
+
+; CHECK: for.end:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
new file mode 100644
index 0000000000000..c4a2b075b4664
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_if.ll
@@ -0,0 +1,166 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test_varying_if -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) {
+entry:
+ %cmp = icmp eq i32 %a, 1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx1, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
+entry:
+ %conv = sext i32 %a to i64
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %cmp = icmp eq i64 %conv, %call
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx2, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.inc12, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end14
+
+for.body: ; preds = %for.cond
+ %sub2 = sub nsw i32 24, %conv
+ br label %for.cond3
+
+for.cond3: ; preds = %for.body6, %for.body
+ %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ]
+ %cmp4 = icmp slt i32 %storemerge, 24
+ br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6: ; preds = %for.cond3
+ %add = add nsw i32 %storemerge1, %conv
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %add7 = add i32 %storemerge1, %storemerge
+ %add8 = add i32 %add7, %0
+ %add9 = add nsw i32 %storemerge, %conv
+ %idxprom10 = sext i32 %add9 to i64
+ %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+ store i32 %add8, i32* %arrayidx11, align 4
+ %inc = add nsw i32 %storemerge1, 1
+ br label %for.cond3
+
+for.inc12: ; preds = %for.cond3
+ %inc13 = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end14: ; preds = %for.cond
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; Check for a varying if that needs masked operations (see the note after the
+; CHECK lines)
+; CHECK: define spir_kernel void @__vecz_v4_test_varying_if(i32 %a, ptr %b)
+; CHECK: %cmp = icmp eq i64 %conv, %call
+; CHECK: %cmp.not = xor i1 %cmp, true
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 11, ptr %arrayidx, i1 %cmp)
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 13, ptr %arrayidx2, i1 %cmp.not)
+
+; Note that the entry mask would be removed by any DCE pass
+; CHECK: ret void
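+; For illustration only (not checked by FileCheck): conceptually, the pass
+; linearizes the varying if/else so that every lane executes both sides under
+; complementary masks, roughly
+;
+;   bool mask = (a == gid);
+;   masked_store(11, &b[a],  mask);
+;   masked_store(13, &b[42], !mask);
+;
+; where masked_store stands in for the __vecz_b_masked_store4_ju3ptrb builtin
+; checked above; the names in this sketch are illustrative, not vecz output.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
new file mode 100644
index 0000000000000..77184596228ce
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/control_flow_conversion_varying_loop.ll
@@ -0,0 +1,185 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.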
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test_varying_loop -vecz-passes=cfg-convert -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_uniform_if(i32 %a, i32* %b) {
+entry:
+ %cmp = icmp eq i32 %a, 1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx1 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx1, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_varying_if(i32 %a, i32* %b) {
+entry:
+ %conv = sext i32 %a to i64
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %cmp = icmp eq i64 %conv, %call
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ %idxprom = sext i32 %a to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 11, i32* %arrayidx, align 4
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42
+ store i32 13, i32* %arrayidx2, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ ret void
+}
+
+define spir_kernel void @test_uniform_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_varying_loop(i32 %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc, %for.body ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %add = add nsw i32 %storemerge, %a
+ %add2 = add nsw i32 %storemerge, %conv
+ %idxprom = sext i32 %add2 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom
+ store i32 %add, i32* %arrayidx, align 4
+ %inc = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ ret void
+}
+
+define spir_kernel void @test_nested_loops(i32* %a, i32* %b) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %conv = trunc i64 %call to i32
+ %sub = sub nsw i32 16, %conv
+ br label %for.cond
+
+for.cond: ; preds = %for.inc12, %entry
+ %storemerge = phi i32 [ %sub, %entry ], [ %inc13, %for.inc12 ]
+ %cmp = icmp slt i32 %storemerge, 16
+ br i1 %cmp, label %for.body, label %for.end14
+
+for.body: ; preds = %for.cond
+ %sub2 = sub nsw i32 24, %conv
+ br label %for.cond3
+
+for.cond3: ; preds = %for.body6, %for.body
+ %storemerge1 = phi i32 [ %sub2, %for.body ], [ %inc, %for.body6 ]
+ %cmp4 = icmp slt i32 %storemerge, 24
+ br i1 %cmp4, label %for.body6, label %for.inc12
+
+for.body6: ; preds = %for.cond3
+ %add = add nsw i32 %storemerge1, %conv
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+ %0 = load i32, i32* %arrayidx, align 4
+ %add7 = add i32 %storemerge1, %storemerge
+ %add8 = add i32 %add7, %0
+ %add9 = add nsw i32 %storemerge, %conv
+ %idxprom10 = sext i32 %add9 to i64
+ %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %idxprom10
+ store i32 %add8, i32* %arrayidx11, align 4
+ %inc = add nsw i32 %storemerge1, 1
+ br label %for.cond3
+
+for.inc12: ; preds = %for.cond3
+ %inc13 = add nsw i32 %storemerge, 1
+ br label %for.cond
+
+for.end14: ; preds = %for.cond
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; The loop's starting value depends on the global ID.
+; Note that the mask names are hardcoded in vecz; if they change, they need to
+; be changed here as well. We do need them, though, to make sure that we are
+; checking the correct values. Since we don't have any duplicate names, they
+; should all be deterministic. (A sketch of the per-lane semantics follows the
+; CHECK lines.)
+; CHECK: define spir_kernel void @__vecz_v4_test_varying_loop(i32 %a, ptr %b)
+; CHECK: br label %for.cond
+
+; CHECK: for.cond:
+; CHECK: %for.cond.entry_mask = phi i1 [ true, %entry ], [ %for.body.exit_mask, %for.body ]
+; CHECK: %for.end.loop_exit_mask = phi i1 [ false, %entry ], [ %for.end.loop_exit_mask.update, %for.body ]
+; CHECK: %cmp = icmp slt i32 %storemerge, 16
+; CHECK: %for.body.exit_mask = select i1 %for.cond.entry_mask, i1 %cmp, i1 false
+; CHECK: %cmp.not = xor i1 %cmp, true
+; CHECK: %for.end.exit_mask = select i1 %for.cond.entry_mask, i1 %cmp.not, i1 false
+; CHECK: %for.end.loop_exit_mask.update = or i1 %for.end.loop_exit_mask, %for.end.exit_mask
+; CHECK: br label %for.body

+; CHECK: for.body:
+; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 %add, ptr %arrayidx, i1 %for.body.exit_mask)
+; CHECK: %[[EXIT_MASK_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %for.body.exit_mask)
+; CHECK: br i1 %[[EXIT_MASK_ANY]], label %for.cond, label %for.cond.pure_exit
+
+; CHECK: for.cond.pure_exit:
+; CHECK: br label %for.end
+
+; CHECK: for.end:
+; CHECK: ret void
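+; For illustration only (not checked by FileCheck): each lane effectively runs
+;
+;   for (int i = 16 - gid; i < 16; ++i)
+;     b[i + gid] = i + a;
+;
+; so lanes enter the loop with different trip counts. The vectorized loop
+; keeps iterating while __vecz_b_divergence_any reports any lane still
+; active, and the store is guarded by the per-lane mask checked above.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
new file mode 100644
index 0000000000000..07d638f131350
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert3.ll
@@ -0,0 +1,65 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.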
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k convert3 -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @convert3(i64 addrspace(1)* %src, float addrspace(1)* %dest) local_unnamed_addr {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0)
+ %call1 = tail call spir_func <3 x i64> @_Z6vload3mPU3AS1Kl(i64 %call, i64 addrspace(1)* %src)
+ %call2 = tail call spir_func <3 x float> @_Z14convert_float3Dv3_l(<3 x i64> %call1)
+ tail call spir_func void @_Z7vstore3Dv3_fmPU3AS1f(<3 x float> %call2, i64 %call, float addrspace(1)* %dest)
+ ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func void @_Z7vstore3Dv3_fmPU3AS1f(<3 x float>, i64, float addrspace(1)*) local_unnamed_addr
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func <3 x float> @_Z14convert_float3Dv3_l(<3 x i64>) local_unnamed_addr
+
+; Function Attrs: convergent nounwind
+declare spir_func <3 x i64> @_Z6vload3mPU3AS1Kl(i64, i64 addrspace(1)*) local_unnamed_addr
+
+; Note that we have to declare the scalar version, because when we vectorize
+; an already-vector builtin, we have to scalarize it first. The scalar call
+; exists during the intermediate stage between scalarization and packetization,
+; and so has to exist in the module. (A rough sketch of this intermediate
+; stage follows the CHECK lines below.)
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func float @_Z13convert_floatl(i64) local_unnamed_addr
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func <2 x float> @_Z14convert_float2Dv2_l(<2 x i64>) local_unnamed_addr
+
+; With SIMD width 2, should have 3 x convert_float2.
+
+; CHECK: define spir_kernel void @__vecz_v2_convert3
+; CHECK: call <2 x i64> @__vecz_b_interleaved_load8_3
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK-NOT: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call void @__vecz_b_interleaved_store4_3_Dv2_fu3ptrU3AS1(<2 x float>
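+; For illustration only (not checked by FileCheck): a rough sketch of the
+; intermediate stage. Scalarization first splits the vector builtin into
+; lane-wise scalar calls, e.g. for lane 0:
+;
+;   %l0 = extractelement <3 x i64> %call1, i32 0
+;   %f0 = call spir_func float @_Z13convert_floatl(i64 %l0)
+;
+; and packetization then re-widens each lane across the two work-items into
+; the <2 x i64> -> <2 x float> calls checked above. The value names %l0 and
+; %f0 are illustrative, not taken from actual vecz output.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
new file mode 100644
index 0000000000000..422e2be0e3237
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert4.ll
@@ -0,0 +1,61 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.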
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k convert4 -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nofree nounwind
+define spir_kernel void @convert4(<4 x i64> addrspace(1)* nocapture readonly %in, <4 x float> addrspace(1)* nocapture %out) local_unnamed_addr {
+entry:
+ %call = tail call i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %in, i64 %call
+ %0 = load <4 x i64>, <4 x i64> addrspace(1)* %arrayidx, align 32
+ %call1 = tail call spir_func <4 x float> @_Z14convert_float4Dv4_l(<4 x i64> %0)
+ %arrayidx2 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %out, i64 %call
+ store <4 x float> %call1, <4 x float> addrspace(1)* %arrayidx2, align 16
+ ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func <4 x float> @_Z14convert_float4Dv4_l(<4 x i64>) local_unnamed_addr
+
+; Note that we have to declare the scalar version, because when we vectorize
+; an already-vector builtin, we have to scalarize it first. The scalar call
+; exists during the intermediate stage between scalarization and packetization,
+; and so has to exist in the module.
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func float @_Z13convert_floatl(i64) local_unnamed_addr
+
+; Function Attrs: convergent nounwind readnone
+declare spir_func <2 x float> @_Z14convert_float2Dv2_l(<2 x i64>) local_unnamed_addr
+
+; With SIMD width 2, should have 4 x convert_float2.
+
+; CHECK: call <2 x i64> @__vecz_b_interleaved_load8_4
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK-NOT: call spir_func <2 x float> @_Z14convert_float2Dv2_l
+; CHECK: call void @__vecz_b_interleaved_store4_4_Dv2_fu3ptrU3AS1(<2 x float>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
new file mode 100644
index 0000000000000..f4f363b7e5c17
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/convert_contiguity.ll
@@ -0,0 +1,47 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k convert_contiguity -w 4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @convert_contiguity(float addrspace(1)* %m_ptr) {
+ %1 = call i64 @__mux_get_global_id(i32 0)
+ %2 = call spir_func i32 @_Z12convert_uintm(i64 %1)
+ %3 = icmp slt i32 %2, 100
+ %4 = select i1 %3, float 1.000000e+00, float 0.000000e+00
+ %5 = call spir_func i64 @_Z12convert_longi(i32 %2)
+ %6 = getelementptr inbounds float, float addrspace(1)* %m_ptr, i64 %5
+ store float %4, float addrspace(1)* %6, align 4
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare spir_func i32 @_Z12convert_uintm(i64)
+
+; Function Attrs: nounwind readnone
+declare spir_func i64 @_Z12convert_longi(i32)
+
+; Function Attrs: nounwind readonly
+declare i64 @__mux_get_global_id(i32)
+
+; This checks that the store address was identified as contiguous through the
+; OpenCL convert builtin functions (see the note after the CHECK lines)
+
+; CHECK: void @__vecz_v4_convert_contiguity
+; CHECK: store <4 x float>
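+; For illustration only (not checked by FileCheck): the address chain is
+;
+;   i64 gid -> _Z12convert_uintm -> i32 -> _Z12convert_longi -> i64 index
+;
+; Both converts are treated as preserving the unit stride between consecutive
+; global IDs, so consecutive lanes address consecutive floats and the scalar
+; stores combine into the single <4 x float> store checked above.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
new file mode 100644
index 0000000000000..48bfa3ad25429
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load.ll
@@ -0,0 +1,45 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.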
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(i64 %a, i64 %b, i64* %c) {
+entry:
+ %gid = call i64 @__mux_get_global_id(i32 0)
+ %cond = icmp eq i64 %a, %gid
+ %c0 = getelementptr i64, i64* %c, i64 %gid
+ store i64 %b, i64* %c0, align 4
+ %c1 = getelementptr i64, i64* %c, i64 0
+ store i64 0, i64* %c1, align 4
+ %c2 = select i1 %cond, i64* %c0, i64* %c1
+ %c3 = getelementptr i64, i64* %c2, i64 %gid
+ %c3.load = load i64, i64* %c3, align 4
+ %c4 = getelementptr i64, i64* %c3, i64 %gid
+ store i64 %c3.load, i64* %c4, align 4
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; Test if the gather load is defined correctly (see the note after the CHECK
+; lines)
+; CHECK: define <4 x i64> @__vecz_b_gather_load4_Dv4_mDv4_u3ptr(<4 x ptr>{{( %0)?}}) [[ATTRS:#[0-9]+]] {
+; CHECK: %[[V1:[0-9]+]] = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %0, i32{{( immarg)?}} 4, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}},
+; CHECK: ret <4 x i64> %[[V1]]
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
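+; For illustration only: %c2 selects between a lane-dependent pointer (%c0)
+; and a uniform one (%c1) under a lane-dependent condition, so the vectorized
+; load sees a divergent <4 x ptr> address vector; that is why it becomes a
+; gather rather than a contiguous or interleaved load.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
new file mode 100644
index 0000000000000..bcbf179616d32
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_gather_load_as_masked.ll
@@ -0,0 +1,45 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.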
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test(i64 %a, i64 %b, i64* %c) {
+entry:
+ %gid = call i64 @__mux_get_global_id(i32 0)
+ %cond = icmp eq i64 %a, %gid
+ %c0 = getelementptr i64, i64* %c, i64 %gid
+ store i64 %b, i64* %c0, align 4
+ %c1 = getelementptr i64, i64* %c, i64 0
+ store i64 0, i64* %c1, align 4
+ %c2 = select i1 %cond, i64* %c0, i64* %c1
+ %c3 = getelementptr i64, i64* %c2, i64 %gid
+ %c3.load = load i64, i64* %c3, align 4
+ %c4 = getelementptr i64, i64* %c3, i64 %gid
+ store i64 %c3.load, i64* %c4, align 4
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; Test if the gather load is defined correctly
+; CHECK: define <4 x i64> @__vecz_b_gather_load4_Dv4_mDv4_u3ptr(<4 x ptr>{{( %0)?}}) [[ATTRS:#[0-9]+]] {
+; CHECK: call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %0, i32{{( immarg)?}} 4, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x i64> poison)
+; CHECK: ret <4 x i64>
+
+; CHECK: attributes [[ATTRS]] = { norecurse nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
new file mode 100644
index 0000000000000..d54d31595e7f8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load.ll
@@ -0,0 +1,62 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0) #3
+ %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+ %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
+ %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+ call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
+ store double 1.600000e+01, double addrspace(1)* %.cast, align 8
+ %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+ %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32>
+ %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32>
+ %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+ %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+ %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+ %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+ %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+ %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+ %div = fdiv <4 x double> %3, %4
+ %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div)
+ %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+ %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+ %sub = fsub <4 x double> %6, %5
+ store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+declare void @__mux_work_group_barrier(i32, i32, i32)
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>)
+
+; Test if the interleaved load is defined correctly (see the note after the
+; CHECK lines)
+; CHECK: define <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1(ptr addrspace(1){{( %0)?}})
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %0, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
+; CHECK: %[[TMP1:.*]] = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64>
+; CHECK: %[[TMP2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %[[TMP1]], i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x double> poison)
+; CHECK: ret <4 x double> %[[TMP2]]
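+; For illustration only: each work-item loads a whole <4 x double>, so after
+; scalarization the per-lane element loads of adjacent work-items are spaced
+; one vector apart; this strided ("interleaved") access is what the builtin
+; implements via the masked gather checked above.
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
new file mode 100644
index 0000000000000..ca5b39de6e149
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_load_as_masked.ll
@@ -0,0 +1,79 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM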
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0) #3
+ %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+ %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0
+ %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+ call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3
+ store double 1.600000e+01, double addrspace(1)* %.cast, align 8
+ %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32
+ %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32>
+ %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32>
+ %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+ %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+ %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+ %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32
+ %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+ %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32
+ %div = fdiv <4 x double> %3, %4
+ %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div)
+ %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+ %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32
+ %sub = fsub <4 x double> %6, %5
+ store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32
+ ret void
+}
+
+declare i64 @__mux_get_global_id(i32) #1
+
+declare void @__mux_work_group_barrier(i32, i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nobuiltin nounwind }
+
+!opencl.kernels = !{!0}
+!llvm.ident = !{!6}
+
+!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5}
+!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1}
+!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"}
+!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"}
+!4 = !{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"}
+!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""}
+!6 = !{!"clang version 3.8.1 "}
+
+; Test if the interleaved load is defined correctly
+; CHECK: define <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1(ptr addrspace(1){{( %0)?}})
+; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %0, {{i32|i64}} 0
+; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer
+; CHECK: %1 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64>
+; CHECK: %2 = call <4 x double> @llvm.masked.gather.v4f64.v4p1(<4 x ptr addrspace(1)> %1, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}, <4 x double> poison)
+; CHECK: ret <4 x double> %2
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
new file mode 100644
index 0000000000000..1e8c1c3f67979
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store.ll
@@ -0,0 +1,63 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare void @__mux_work_group_barrier(i32, i32, i32) + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) + +; Test if the interleaved store is defined correctly +; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}}) +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer +; CHECK: %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12> +; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll new file mode 100644 index 0000000000000..5fd7ad27aa856 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_interleaved_store_as_masked.ll @@ -0,0 +1,80 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with 
LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> <i32 4, i32 1, i32 2, i32 3> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 3> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (<4 x 
double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"} +!4 = !{!"kernel_arg_base_type", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"} +!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""} +!6 = !{!"clang version 3.8.1 "} + +; Test if the interleaved store is defined correctly +; CHECK: define void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double>{{( %0)?}}, ptr addrspace(1){{( %1)?}}) +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer +; CHECK: %2 = getelementptr double, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> <i64 0, i64 4, i64 8, i64 12> +; CHECK: call void @llvm.masked.scatter.v4f64.v4p1(<4 x double> %0, <4 x ptr addrspace(1)> %2, i32{{( immarg)?}} 8, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll new file mode 100644 index 0000000000000..8de9ec81b534c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_internal_builtins.ll @@ -0,0 +1,32 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { + %b = bitcast i32 addrspace(2)* %in to <4 x i32> addrspace(2)* + %v = call <4 x i32> @__vecz_b_masked_load4_Dv4_jPU3AS2Dv4_jDv4_b(<4 x i32> addrspace(2)* %b, <4 x i1> zeroinitializer) + ret void +} + +declare <4 x i32> @__vecz_b_masked_load4_Dv4_jPU3AS2Dv4_jDv4_b(<4 x i32> addrspace(2)*, <4 x i1>) +; CHECK-LABEL: define <4 x i32> @__vecz_b_masked_load4_Dv4_jPU3AS2Dv4_jDv4_b(ptr addrspace(2){{.*}}, <4 x i1>{{.*}}) { +; CHECK: %2 = call <4 x i32> @llvm.masked.load.v4i32.p2(ptr addrspace(2) %0, i32 4, <4 x i1> %1, <4 x i32> poison) +; CHECK: ret <4 x i32> %2 +; CHECK: } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll new file mode 100644 index 0000000000000..394eb61e4aaff --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_gather_load.ll @@ -0,0 +1,83 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k masked_gather -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %rem = urem i64 %call, 3 + %cmp = icmp eq i64 %rem, 0 + br i1 %cmp, label %if.else, label %if.then + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %idxprom = sext i32 %1 to i64 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call + %2 = load i32, i32 addrspace(1)* %arrayidx3, align 4 + %idxprom4 = sext i32 %2 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom4 + store i32 42, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %rem = urem i64 %call, 3 + %cmp = icmp eq i64 %rem, 0 + br i1 %cmp, label %if.else, label %if.then + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a_index, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + store i32 42, i32 addrspace(1)* %arrayidx3, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; Test if the masked gather load is defined correctly +; CHECK: define <4 x i32> @__vecz_b_masked_gather_load4_Dv4_jDv4_u3ptrU3AS1Dv4_b(<4 x ptr addrspace(1)>{{( %0)?}}, <4 x i1>{{( %1)?}}) +; CHECK: entry: +; CHECK: %2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p1(<4 x ptr addrspace(1)> %0, i32{{( immarg)?}} 4, <4 x i1> %1, <4 x i32> poison) +; CHECK: ret <4 x i32> %2 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll new file mode 100644 index 0000000000000..1b7e191cce0a3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_load.ll @@ -0,0 +1,90 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_local_id(i32 0) #5 + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %conv, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %call2 = call i64 @__mux_get_global_id(i32 0) #5 + %conv3 = trunc i64 %call2 to i32 + %idxprom = sext i32 %conv3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 + %idxprom4 = sext i32 %conv3 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom4 + store i32 %0, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.else: ; preds = %entry + %call8 = call i64 @__mux_get_local_size(i32 0) #5 + %call9 = call i64 @__mux_get_group_id(i32 0) #5 + %mul = mul i64 %call9, %call8 + %add = add i64 %mul, %call + %sext = shl i64 %add, 32 + %idxprom11 = ashr exact i64 %sext, 32 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 42, i32 addrspace(1)* %arrayidx12, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_local_id(i32) #1 + +declare i64 @__mux_get_global_id(i32) #1 + +declare i64 @__mux_get_local_size(i32) #1 + +declare i64 @__mux_get_group_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { noinline } +attributes #3 = { argmemonly nounwind } +attributes #4 = { argmemonly nounwind readonly } +attributes #5 = { nobuiltin nounwind } +attributes #6 = { nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(2)*, i32 addrspace(1)*)* @dont_mask_workitem_builtins, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 2, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"int*", !"int*"} +!4 = !{!"kernel_arg_base_type", !"int*", !"int*"} +!5 = !{!"kernel_arg_type_qual", !"const", !""} +!6 = !{!"clang version 3.8.1 "} + + + +; Test if the masked load is defined correctly +; CHECK: define <4 x i32> @__vecz_b_masked_load4_Dv4_ju3ptrU3AS2Dv4_b(ptr addrspace(2){{( %0)?}}, <4 x i1>{{( %1)?}}) +; CHECK: entry: +; CHECK: %2 = call <4 x i32> 
@llvm.masked.load.v4i32.p2(ptr addrspace(2) %0, i32{{( immarg)?}} 4, <4 x i1> %1, <4 x i32> poison) +; CHECK: ret <4 x i32> %2 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll new file mode 100644 index 0000000000000..bc33844fafee2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_scatter_store.ll @@ -0,0 +1,85 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k masked_scatter -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @masked_scatter(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %b_index) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %rem = urem i64 %call, 3 + %cmp = icmp eq i64 %rem, 0 + br i1 %cmp, label %if.else, label %if.then + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %idxprom = sext i32 %1 to i64 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b_index, i64 %call + %2 = load i32, i32 addrspace(1)* %arrayidx3, align 4 + %idxprom4 = sext i32 %2 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %idxprom4 + store i32 42, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +define spir_kernel void @masked_gather(i32 addrspace(1)* %a, i32 addrspace(1)* %a_index, i32 addrspace(1)* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %rem = urem i64 %call, 3 + %cmp = icmp eq i64 %rem, 0 + br i1 %cmp, label %if.else, label %if.then + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a_index, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.else: ; preds = %entry + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %call + store i32 42, i32 addrspace(1)* %arrayidx3, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + 
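+; A note on why this test expects a scatter builtin: inside %if.then and
+; %if.else the stores execute under a divergent condition, so the packetizer
+; cannot emit a plain vector store and instead calls a vecz-generated builtin
+; taking the packetized values, addresses, and a lane mask. A minimal sketch
+; of the expected call site (the %vals/%ptrs/%mask names are illustrative,
+; not part of what this test checks):
+;   %vals = ... ; <4 x i32> of per-lane stored values
+;   %ptrs = ... ; <4 x ptr addrspace(1)> of per-lane addresses
+;   %mask = ... ; <4 x i1> lane mask derived from %cmp
+;   call void @__vecz_b_masked_scatter_store4_Dv4_jDv4_u3ptrU3AS1Dv4_b(<4 x i32> %vals, <4 x ptr addrspace(1)> %ptrs, <4 x i1> %mask)
+; The CHECK lines below then verify that the builtin's generated body lowers
+; to llvm.masked.scatter.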
+declare i64 @__mux_get_global_id(i32) + +; Test if the masked scatter store is defined correctly +; CHECK: define void @__vecz_b_masked_scatter_store4_Dv4_jDv4_u3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, <4 x ptr addrspace(1)>{{( %1)?}}, <4 x i1>{{( %2)?}}) +; CHECK: entry: +; CHECK: call void @llvm.masked.scatter.v4i32.v4p1(<4 x i32> %0, <4 x ptr addrspace(1)> %1, i32{{( immarg)?}} 4, <4 x i1> %2) #[[ATTRS:[0-9]+]] +; CHECK: ret void + +; CHECK: attributes #[[ATTRS]] = { diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll new file mode 100644 index 0000000000000..21412fc239186 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_masked_store.ll @@ -0,0 +1,90 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dont_mask_workitem_builtins -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_local_id(i32 0) #5 + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %conv, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %call2 = call i64 @__mux_get_global_id(i32 0) #5 + %conv3 = trunc i64 %call2 to i32 + %idxprom = sext i32 %conv3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 + %idxprom4 = sext i32 %conv3 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom4 + store i32 %0, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.else: ; preds = %entry + %call8 = call i64 @__mux_get_local_size(i32 0) #5 + %call9 = call i64 @__mux_get_group_id(i32 0) #5 + %mul = mul i64 %call9, %call8 + %add = add i64 %mul, %call + %sext = shl i64 %add, 32 + %idxprom11 = ashr exact i64 %sext, 32 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 42, i32 addrspace(1)* %arrayidx12, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_local_id(i32) #1 + +declare i64 @__mux_get_global_id(i32) #1 + +declare i64 @__mux_get_local_size(i32) #1 + +declare i64 @__mux_get_group_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" 
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { noinline } +attributes #3 = { argmemonly nounwind } +attributes #4 = { argmemonly nounwind readonly } +attributes #5 = { nobuiltin nounwind } +attributes #6 = { nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(2)*, i32 addrspace(1)*)* @dont_mask_workitem_builtins, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 2, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"int*", !"int*"} +!4 = !{!"kernel_arg_base_type", !"int*", !"int*"} +!5 = !{!"kernel_arg_type_qual", !"const", !""} +!6 = !{!"clang version 3.8.1 "} + + + +; Test if the masked store is defined correctly +; CHECK: define void @__vecz_b_masked_store4_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}) +; CHECK: entry: +; CHECK: call void @llvm.masked.store.v4i32.p1(<4 x i32> %0, ptr addrspace(1) %1, i32 4, <4 x i1> %2) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll new file mode 100644 index 0000000000000..1f736694807fa --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store.ll @@ -0,0 +1,46 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 0 + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + %c3.load = load i64, i64* %c3, align 4 + %c4 = getelementptr i64, i64* %c3, i64 %gid + store i64 %c3.load, i64* %c4, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; Test if the scatter store is defined correctly +; CHECK: define void @__vecz_b_scatter_store4_Dv4_mDv4_u3ptr(<4 x i64>{{( %0)?}}, <4 x ptr>{{( %1)?}}) [[ATTRS:#[0-9]+]] { +; CHECK: entry +; CHECK: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %0, <4 x ptr> %1, i32{{( immarg)?}} 4, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) +; CHECK: ret void + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll new file mode 100644 index 0000000000000..326b7cf69d6a0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_scatter_store_as_masked.ll @@ -0,0 +1,46 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 0 + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + %c3.load = load i64, i64* %c3, align 4 + %c4 = getelementptr i64, i64* %c3, i64 %gid + store i64 %c3.load, i64* %c4, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; Test if the scatter store is defined correctly +; CHECK: define void @__vecz_b_scatter_store4_Dv4_mDv4_u3ptr(<4 x i64>{{( %0)?}}, <4 x ptr>{{( %1)?}}) [[ATTRS:#[0-9]+]] { +; CHECK: entry: +; CHECK: call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %0, <4 x ptr> %1, i32{{( immarg)?}} 4, <4 x i1> {{<(i1 true(, )?)+>|splat \(i1 true\)}}) +; CHECK: ret void + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll new file mode 100644 index 0000000000000..2d31999d37d37 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/define_subgroup_scans.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dummy -vecz-simd-width=4 -vecz-passes=define-builtins -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @dummy(i32 addrspace(2)* %in, i32 addrspace(1)* %out) { + ; Dummy uses of the builtins, as we don't define any with zero uses. 
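+ ; For reference, a fixed-width add scan is a log-step recurrence: shift the
+ ; vector down one lane and add, then down two lanes and add; the exclusive
+ ; form rotates the inclusive result by one lane and inserts the identity
+ ; (0 for add) into lane 0. A hedged sketch on some value %x (the exact lane
+ ; constants are pinned down by the CHECK lines below):
+ ;   %s1 = shufflevector <4 x i32> %x, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 1, i32 2>
+ ;   %x1 = add <4 x i32> %x, %s1
+ ;   %s2 = shufflevector <4 x i32> %x1, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
+ ;   %inc = add <4 x i32> %x1, %s2  ; inclusive scan of %x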
+ %a = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> zeroinitializer) + %b = call <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_Dv4_j(<4 x i32> zeroinitializer) + ret void +} + +declare <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32>) +; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> %0) { +; CHECK: entry: +; CHECK: %[[SHUF1:.+]] = shufflevector <4 x i32> %0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 4, i32 0, i32 1, i32 2> +; CHECK: %[[ADD1:.+]] = add <4 x i32> %0, %[[SHUF1]] +; CHECK: %[[SHUF2:.+]] = shufflevector <4 x i32> %[[ADD1]], <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 4, i32 5, i32 0, i32 1> +; CHECK: %[[RESULT:.+]] = add <4 x i32> %[[ADD1]], %[[SHUF2]] +; CHECK: ret <4 x i32> %[[RESULT]] +; CHECK: } + +declare <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_Dv4_j(<4 x i32>) +; CHECK-LABEL: define <4 x i32> @__vecz_b_sub_group_scan_exclusive_add_Dv4_j(<4 x i32> %0) { +; CHECK: entry: +; CHECK: %[[SHUF1:.+]] = shufflevector <4 x i32> %0, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 4, i32 0, i32 1, i32 2> +; CHECK: %[[ADD1:.+]] = add <4 x i32> %0, %[[SHUF1]] +; CHECK: %[[SHUF2:.+]] = shufflevector <4 x i32> %[[ADD1]], <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> <i32 4, i32 5, i32 0, i32 1> +; CHECK: %[[ADD2:.+]] = add <4 x i32> %[[ADD1]], %[[SHUF2]] +; CHECK: %[[ROTATE:.+]] = shufflevector <4 x i32> %[[ADD2]], <4 x i32> poison, <4 x i32> <i32 3, i32 0, i32 1, i32 2> +; CHECK: %[[RESULT:.+]] = insertelement <4 x i32> %[[ROTATE]], i32 0, i64 0 +; CHECK: ret <4 x i32> %[[RESULT]] +; CHECK: } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll new file mode 100644 index 0000000000000..1ce3ddc2368c6 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/delete_packetized_memop.ll @@ -0,0 +1,77 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k memop_loop_dep -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @memop_loop_dep(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %i, i32 %e) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.addr.0 = phi i32 [ %i, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.addr.0, %e + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %call1 = call spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64 %call, i32 addrspace(1)* %in) + call spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32> %call1, i64 %call, i32 addrspace(1)* %out) + %0 = extractelement <4 x i32> %call1, i64 0 + %tobool = icmp ne i32 %0, 0 + %tobool2 = icmp eq i64 %call, 0 + %or.cond = and i1 %tobool2, %tobool + br i1 %or.cond, label %while.cond, label %for.inc + +while.cond: ; preds = %while.cond, %for.body + %tobool3 = icmp eq i64 %call, 0 + br i1 %tobool3, label %for.inc, label %while.cond + +for.inc: ; preds = %for.body, %while.cond + %inc = add nsw i32 %i.addr.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64, i32 addrspace(1)*) + +declare spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32>, i64, i32 addrspace(1)*) + +; CHECK: define spir_kernel void @__vecz_v4_memop_loop_dep + +; Check if we have the packetized and only the packetized version of the memop. +; Vecz should assert if this test fails, as we will not define the interleaved +; op with width of 1. +; Interleaved Group Combine gets rid of all the interleaved loads created by +; the re-vectorization process +; CHECK: load <4 x i32> +; CHECK: load <4 x i32> +; CHECK: load <4 x i32> +; CHECK: load <4 x i32> +; CHECK-NOT: call {{.*}}i32 @__vecz_b_interleaved_load4_ju3ptrU3AS1 + +; CHECK: ret void + +; Check if the declaration is missing as well +; CHECK-NOT: @__vecz_b_interleaved_load4_ju3ptrU3AS1 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/device-sg-size-auto.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/device-sg-size-auto.ll new file mode 100644 index 0000000000000..8bfa6cd569ea9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/device-sg-size-auto.ll @@ -0,0 +1,58 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Let vecz pick the right vectorization factor for this kernel +; RUN: veczc --vecz-auto -k foo -k bar --device-sg-sizes 6,7,8,9 -S < %s | FileCheck %s +; RUN: veczc --vecz-auto -k foo:4 -k bar:4 --device-sg-sizes 6,7,8,9 -S < %s | FileCheck %s + +; Check we auto-vectorize to 8, despite any other options telling us a +; different vectorization factor. A factor of 8 is 'best' here because it's a +; power of two. +; CHECK: define void @__vecz_v8_foo( +define void @foo(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { + %id = call i64 @__mux_get_global_id(i32 0) + %in.addr = getelementptr i32, ptr addrspace(1) %in, i64 %id + %x = load i32, ptr addrspace(1) %in.addr + %sglid = call i32 @__mux_get_sub_group_local_id() +; CHECK: = add <8 x i32> + %y = add i32 %x, %sglid + %out.addr = getelementptr i32, ptr addrspace(1) %out, i64 %id + store i32 %y, ptr addrspace(1) %out.addr + ret void +} + +; Check we auto-vectorize to 7, despite any other options telling us a +; different vectorization factor. A factor of 8 is 'best' here because it's a +; power of two, but a factor of 7 works well because it won't need a tail. +; CHECK: define void @__vecz_v7_bar( +define void @bar(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 !reqd_work_group_size !0 { + %id = call i64 @__mux_get_global_id(i32 0) + %in.addr = getelementptr i32, ptr addrspace(1) %in, i64 %id + %x = load i32, ptr addrspace(1) %in.addr + %sglid = call i32 @__mux_get_sub_group_local_id() +; CHECK: = add <7 x i32> + %y = add i32 %x, %sglid + %out.addr = getelementptr i32, ptr addrspace(1) %out, i64 %id + store i32 %y, ptr addrspace(1) %out.addr + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare i32 @__mux_get_sub_group_local_id() + +attributes #0 = { "mux-kernel"="entry-point" } + +!0 = !{i64 14, i64 1, i64 1} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll new file mode 100644 index 0000000000000..cca59611013d3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/divergent_loop_bug.ll @@ -0,0 +1,186 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=cfg-convert -S < %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK: define spir_kernel void @__vecz_v4_uniform_if_then_in_divergent_block( +; CHECK-SAME: ptr addrspace(1) %accum_ptr, i32 %threshold, ptr addrspace(1) %out) +define spir_kernel void @uniform_if_then_in_divergent_block(ptr addrspace(1) %accum_ptr, i32 %threshold, ptr addrspace(1) %out) #4 !reqd_work_group_size !10 { +; CHECK: entry: +; CHECK: [[CMP_NOT:%.*]] = icmp slt i32 %0, %threshold +; CHECK: %cmp.not.ROSCC = icmp eq i1 [[CMP_NOT]], false +; CHECK: %cmp.not.ROSCC_any = call i1 @__vecz_b_divergence_any(i1 %cmp.not.ROSCC) +; CHECK: br i1 %cmp.not.ROSCC_any, label %entry.ROSCC, label %entry.if.end17_crit_edge +entry: + %cosa = alloca float, align 4 + %call = tail call i64 @__mux_get_global_id(i32 0) #5 + %sext = mul i64 %call, 51539607552 + %idx.ext = ashr exact i64 %sext, 32 + %add.ptr = getelementptr inbounds i32, ptr addrspace(1) %accum_ptr, i64 %idx.ext + %0 = load i32, ptr addrspace(1) %add.ptr, align 4 + %cmp.not = icmp slt i32 %0, %threshold + br i1 %cmp.not, label %entry.if.end17_crit_edge, label %if.then + +; CHECK: entry.ROSCC: +; CHECK: [[CMP_NOT_NOT:%.*]] = xor i1 [[CMP_NOT]], true +; CHECK: br label %if.then + +entry.if.end17_crit_edge: ; preds = %entry + br label %if.end17 + +; Ensure that only active lanes (masked by %cmp.not.not) contribute towards the +; %or.cond branch. +; CHECK: if.then: +; CHECK: call void @__vecz_b_masked_store4_fu3ptrb(float 0.000000e+00, ptr %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %1 = call spir_func float @__vecz_b_masked__Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %2 = call float @__vecz_b_masked_load4_fu3ptrb(ptr %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %mul7 = fmul float %2, -2.950000e+01 +; CHECK: %cmp11 = fcmp uge float %mul7, 0.000000e+00 +; CHECK: %cmp14 = fcmp ult float %mul7, 6.400000e+01 +; CHECK: %or.cond = and i1 %cmp11, %cmp14 +; CHECK: %or.cond_active = select i1 [[CMP_NOT_NOT]], i1 %or.cond, i1 false +; CHECK: %or.cond_active_any = call i1 @__vecz_b_divergence_any(i1 %or.cond_active) +; CHECK: br i1 %or.cond_active_any, label %if.then.if.end_crit_edge, label %if.then16 +if.then: ; preds = %entry + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cosa) #6 + store float 0.000000e+00, ptr %cosa, align 4 + %call4 = call spir_func float @_Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa) #7 + %1 = load float, ptr %cosa, align 4 + %mul7 = fmul float %1, -2.950000e+01 + %cmp11 = fcmp uge float %mul7, 0.000000e+00 + %cmp14 = fcmp ult float %mul7, 6.400000e+01 + %or.cond = and i1 %cmp11, %cmp14 + br i1 %or.cond, label %if.then.if.end_crit_edge, label %if.then16 + +if.then.if.end_crit_edge: ; preds = %if.then + br label %if.end + +if.then16: ; preds = %if.then + %sext2 = shl i64 %call, 32 + %idxprom = ashr exact i64 %sext2, 32 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %out, i64 %idxprom + store float %mul7, ptr addrspace(1) %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %if.then.if.end_crit_edge, %if.then16 + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cosa) #6 + br label %if.end17 + +if.end17: ; preds = %entry.if.end17_crit_edge, %if.end + ret void +} + +define spir_kernel void @uniform_if_else_in_divergent_block(ptr addrspace(1) %accum_ptr, i32 %threshold, ptr addrspace(1) %out) #4 
!reqd_work_group_size !10 { +; CHECK: entry: +; CHECK: [[CMP_NOT:%.*]] = icmp slt i32 %0, %threshold +; CHECK: %cmp.not.ROSCC = icmp eq i1 [[CMP_NOT]], false +; CHECK: %cmp.not.ROSCC_any = call i1 @__vecz_b_divergence_any(i1 %cmp.not.ROSCC) +; CHECK: br i1 %cmp.not.ROSCC_any, label %entry.ROSCC, label %entry.if.end17_crit_edge +entry: + %cosa = alloca float, align 4 + %call = tail call i64 @__mux_get_global_id(i32 0) #5 + %sext = mul i64 %call, 51539607552 + %idx.ext = ashr exact i64 %sext, 32 + %add.ptr = getelementptr inbounds i32, ptr addrspace(1) %accum_ptr, i64 %idx.ext + %0 = load i32, ptr addrspace(1) %add.ptr, align 4 + %cmp.not = icmp slt i32 %0, %threshold + br i1 %cmp.not, label %entry.if.end17_crit_edge, label %if.then + +; CHECK: entry.ROSCC: +; CHECK: [[CMP_NOT_NOT:%.*]] = xor i1 [[CMP_NOT]], true +; CHECK: br label %if.then + +entry.if.end17_crit_edge: ; preds = %entry + br label %if.end17 + +; Ensure that only active lanes (masked by %cmp.not.not) contribute towards the +; %or.cond branch. +; CHECK: if.then: +; CHECK: call void @__vecz_b_masked_store4_fu3ptrb(float 0.000000e+00, ptr %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %1 = call spir_func float @__vecz_b_masked__Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %2 = call float @__vecz_b_masked_load4_fu3ptrb(ptr %cosa, i1 [[CMP_NOT_NOT]]) +; CHECK: %mul7 = fmul float %2, -2.950000e+01 +; CHECK: %cmp11 = fcmp uge float %mul7, 0.000000e+00 +; CHECK: %cmp14 = fcmp ult float %mul7, 6.400000e+01 +; CHECK: %or.cond = and i1 %cmp11, %cmp14 +; CHECK: %or.cond_active = select i1 [[CMP_NOT_NOT]], i1 %or.cond, i1 false +; CHECK: %or.cond_active_any = call i1 @__vecz_b_divergence_any(i1 %or.cond_active) +; CHECK: br i1 %or.cond_active_any, label %if.else.crit_edge, label %if.then16 +if.then: ; preds = %entry + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cosa) #6 + store float 0.000000e+00, ptr %cosa, align 4 + %call4 = call spir_func float @_Z6sincosfPf(float 0.000000e+00, ptr nonnull %cosa) #7 + %1 = load float, ptr %cosa, align 4 + %mul7 = fmul float %1, -2.950000e+01 + %cmp11 = fcmp uge float %mul7, 0.000000e+00 + %cmp14 = fcmp ult float %mul7, 6.400000e+01 + %or.cond = and i1 %cmp11, %cmp14 + br i1 %or.cond, label %if.else.crit_edge, label %if.then16 + +if.else.crit_edge: ; preds = %if.then + br label %if.else + +if.then16: ; preds = %if.then + %sext2 = shl i64 %call, 32 + %idxprom = ashr exact i64 %sext2, 32 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %out, i64 %idxprom + store float %mul7, ptr addrspace(1) %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %idxprom + store float 1.0, ptr addrspace(1) %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then16 + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cosa) #6 + br label %if.end17 + +if.end17: ; preds = %entry.if.end17_crit_edge, %if.end + ret void +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind +declare spir_func float @_Z6sincosfPf(float, ptr) #2 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: alwaysinline norecurse nounwind memory(read) +declare i64 @__mux_get_global_id(i32) #3 + +attributes #0 = { norecurse nounwind 
"mux-kernel"="entry-point" "mux-local-mem-usage"="0" "mux-no-subgroups" "mux-orig-fn"="get_lines" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "vecz-mode"="auto" } +attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) "vecz-mode"="auto" } +attributes #2 = { nounwind "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="0" "stackrealign" "vecz-mode"="auto" } +attributes #3 = { alwaysinline norecurse nounwind memory(read) "vecz-mode"="auto" } +attributes #4 = { norecurse nounwind "mux-base-fn-name"="get_lines" "mux-kernel"="entry-point" "mux-local-mem-usage"="0" "mux-no-subgroups" "mux-orig-fn"="get_lines" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "vecz-mode"="auto" } +attributes #5 = { alwaysinline norecurse nounwind memory(read) } +attributes #6 = { nounwind } +attributes #7 = { nobuiltin nounwind "no-builtins" } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!2} +!opencl.spir.version = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!2 = !{i32 1, i32 2} +!10 = !{i32 2, i32 1, i32 1} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll new file mode 100644 index 0000000000000..f300fff8801f7 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_loop.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_local_id(i32); +declare i32 @__mux_get_local_size(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %id = call i32 @__mux_get_local_id(i32 0) + %size = call i32 @__mux_get_local_size(i32 0) + br label %loop + +loop: + %index = phi i32 [0, %entry], [%inc, %loop] + %load = load i32, i32 addrspace(1)* %in + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %index + store i32 %load, i32 addrspace(1)* %slot + %inc = add i32 %index, 1 + %cmp = icmp ne i32 %inc, %id + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: loop: +; CHECK: %[[BITCAST:[0-9]+]] = bitcast <4 x i1> %loop.entry_mask{{[0-9]*}} to i4 +; CHECK: %[[MASK:[^ ]+]] = icmp ne i4 %[[BITCAST]], 0 +; CHECK: %[[LOAD:.+]] = call i32 @__vecz_b_masked_load4_ju3ptrU3AS1b(ptr addrspace(1) %in, i1 %[[MASK]]) +; CHECK: call void @__vecz_b_masked_store4_ju3ptrU3AS1b(i32 %[[LOAD]], ptr addrspace(1) %{{.+}}, i1 %[[MASK]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll new file mode 100644 index 0000000000000..2f720c7a49ec0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/diverging_nested_loop.ll @@ -0,0 +1,64 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_local_id(i32); +declare i32 @__mux_get_local_size(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %id = call i32 @__mux_get_local_id(i32 0) + %size = call i32 @__mux_get_local_size(i32 0) + br label %loop + +loop: + %index = phi i32 [0, %entry], [%inc, %nested_merge] + br label %koop + +koop: + %kndex = phi i32 [%index, %loop], [%knc, %koop] + %load = load i32, i32 addrspace(1)* %in + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %index + store i32 %load, i32 addrspace(1)* %slot + %knc = add i32 %kndex, 1 + %kmp = icmp ne i32 %knc, %id + br i1 %kmp, label %koop, label %nested_merge + +nested_merge: + %old = atomicrmw add i32 addrspace(1)* %in, i32 42 acq_rel + %inc = add i32 %index, 1 + %cmp = icmp ne i32 %inc, %size + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: koop: +; CHECK: %[[BITCAST:[0-9]+]] = bitcast <4 x i1> %koop.entry_mask{{[0-9]*}} to i4 +; CHECK: %[[MASK:[^ ]+]] = icmp ne i4 %[[BITCAST]], 0 +; CHECK: %[[LOAD:.+]] = call i32 @__vecz_b_masked_load4_ju3ptrU3AS1b(ptr addrspace(1) %in, i1 %[[MASK]]) +; CHECK: call void @__vecz_b_masked_store4_ju3ptrU3AS1b(i32 %[[LOAD]], ptr addrspace(1) %{{.+}}, i1 %[[MASK]]) +; CHECK: nested_merge: +; CHECK: atomicrmw add ptr addrspace(1) %in, i32 42 acq_rel +; CHECK: atomicrmw add ptr addrspace(1) %in, i32 42 acq_rel +; CHECK: atomicrmw add ptr addrspace(1) %in, i32 42 acq_rel +; CHECK: atomicrmw add ptr addrspace(1) %in, i32 42 acq_rel diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll new file mode 100644 index 0000000000000..a20dc32f71b38 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/early-cse-mul-swap.ll @@ -0,0 +1,78 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k multiple_dimensions_0 -vecz-simd-width 4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +source_filename = "kernel.opencl" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_size(i32) #1 + +; Function Attrs: convergent nounwind +define spir_kernel void @multiple_dimensions_0(i32 addrspace(1)* %output) #2 { +entry: + %call.i = call i64 @__mux_get_global_id(i32 0) #3 + %call1.i = call i64 @__mux_get_global_size(i32 1) #3 + %mul.i = mul i64 %call1.i, %call.i + %call2.i = call i64 @__mux_get_global_size(i32 2) #3 + %mul3.i = mul i64 %mul.i, %call2.i + %call4.i = call i64 @__mux_get_global_id(i32 1) #3 + %mul6.i = mul i64 %call2.i, %call4.i + %add.i = add i64 %mul6.i, %mul3.i + %call7.i = call i64 @__mux_get_global_id(i32 2) #3 + %add8.i = add i64 %add.i, %call7.i + %conv = trunc i64 %add8.i to i32 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %add8.i + store i32 %conv, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!llvm.ident = !{!2} +!opencl.kernels = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{!"clang version 8.0.0 (https://github.com/llvm-mirror/clang.git bfbe338a893dde6ba65b2bed6ffea1652a592819) (https://github.com/llvm-mirror/llvm.git a99d6d2122ca2f208e1c4bcaf02ff5930f244f34)"} +!3 = !{void (i32 addrspace(1)*)* @multiple_dimensions_0, !4, !5, !6, !7, !8, !9} +!4 = !{!"kernel_arg_addr_space", i32 1} +!5 = !{!"kernel_arg_access_qual", !"none"} +!6 = !{!"kernel_arg_type", !"int*"} +!7 = !{!"kernel_arg_base_type", !"int*"} +!8 = !{!"kernel_arg_type_qual", !""} +!9 = !{!"kernel_arg_name", !"output"} + +; Function start +; CHECK: define spir_kernel void 
@__vecz_v4_multiple_dimensions_0 + +; make sure the stride calculation uses the correct operand of the multiply +; CHECK: %[[CALL1:.+]] = call i64 @__mux_get_global_size(i32 1) +; CHECK: %[[CALL2:.+]] = call i64 @__mux_get_global_size(i32 2) +; CHECK: %[[NEWMUL:.+]] = mul i64 %[[CALL1]], %[[CALL2]] +; CHECK: call void @__vecz_b_interleaved_store4_V_Dv4_ju3ptrU3AS1({{.+}} %[[NEWMUL]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll new file mode 100644 index 0000000000000..bc4270c9e2a8c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_memintrinsics.ll @@ -0,0 +1,200 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k entry -vecz-passes="builtin-inlining,function(instcombine,early-cse),cfg-convert,packetizer" -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Laid out, this struct is 80 bytes +%struct.S2 = type { i16, [7 x i32], i32, <16 x i8>, [4 x i32] } + +; Function Attrs: norecurse nounwind +define spir_kernel void @entry(i64 addrspace(1)* %result, %struct.S2* %result2) { +entry: + %gid = call i64 @__mux_get_local_id(i32 0) + %sa = alloca %struct.S2, align 16 + %sb = alloca %struct.S2, align 16 + %sa_i8 = bitcast %struct.S2* %sa to i8* + %sb_i8 = bitcast %struct.S2* %sb to i8* + %sb_i8as = addrspacecast i8* %sb_i8 to i8 addrspace(1)* + %rsi = ptrtoint i64 addrspace(1)* %result to i64 + %rsit = trunc i64 %rsi to i8 + call void @llvm.memset.p0i8.i64(i8* %sa_i8, i8 %rsit, i64 80, i32 16, i1 false) + call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %sb_i8as, i8 0, i64 80, i32 16, i1 false) + %lr = addrspacecast %struct.S2* %result2 to %struct.S2 addrspace(1)* + %lri = bitcast %struct.S2 addrspace(1)* %lr to i64 addrspace(1)* + %cond = icmp eq i64 addrspace(1)* %result, %lri + br i1 %cond, label %middle, label %end + +middle: + call void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* %sb_i8as, i8* %sa_i8, i64 80, i32 16, i1 false) + br label %end + +end: + %g_343 = getelementptr inbounds %struct.S2, %struct.S2* %sa, i64 0, i32 0 + %g_343_load = load i16, i16* %g_343 + %g_343_zext = zext i16 %g_343_load to i64 + %resp = getelementptr i64, i64 addrspace(1)* %result, i64 %gid + store i64 %g_343_zext, i64 addrspace(1)* %resp, align 8 + %result2_i8 = bitcast %struct.S2* %result2 to i8* + call void @llvm.memcpy.p0i8.p1i8.i64(i8* %result2_i8, i8 addrspace(1)* %sb_i8as, i64 80, i32 16, i1 false) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) +declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i32, i1) + +; Function Attrs: argmemonly nounwind +declare void 
@llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture readonly, i64, i32, i1) +declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) + +declare i64 @__mux_get_local_id(i32) + +; Note: Between LLVM 17 and LLVM 18, optimizations to alignments were moved to +; their own pass. We don't run that pass here, resulting in a difference in +; alignment values between LLVM versions. Because of that, we don't check +; alignment of any loads or stores + +; Sanity checks: Make sure the non-vecz entry function is still in place and +; contains memset and memcpy. This is done in order to prevent future bafflement +; in case some pass optimizes them out. +; CHECK: define spir_kernel void @entry +; CHECK: entry: +; CHECK: call void @llvm.memset +; CHECK: call void @llvm.memset +; CHECK: middle: +; CHECK: call void @llvm.memcpy +; CHECK: end: +; CHECK: call void @llvm.memcpy + +; And now for the actual checks + +; Check if the kernel was vectorized +; CHECK: define spir_kernel void @__vecz_v{{[0-9]+}}_entry +; CHECK: %[[SB_I8AS:.*]] = addrspacecast ptr %sb to ptr addrspace(1) + +; Check if the memset and memcpy calls have been removed +; CHECK-NOT: call void @llvm.memset +; CHECK-NOT: call void @llvm.memcpy + +; Check if the calculation of the stored value for the second memset is in place +; CHECK: %ms64val + +; Check if the generated loads and stores are in place +; Check the stores for the first memset +; CHECK: store i64 %ms64val, ptr %sa +; CHECK: %[[V14:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 8 +; CHECK: store i64 %ms64val, ptr %[[V14]] +; CHECK: %[[V15:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 16 +; CHECK: store i64 %ms64val, ptr %[[V15]] +; CHECK: %[[V16:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 24 +; CHECK: store i64 %ms64val, ptr %[[V16]] +; CHECK: %[[V17:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 32 +; CHECK: store i64 %ms64val, ptr %[[V17]] +; CHECK: %[[V18:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 40 +; CHECK: store i64 %ms64val, ptr %[[V18]] +; CHECK: %[[V19:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 48 +; CHECK: store i64 %ms64val, ptr %[[V19]] +; CHECK: %[[V20:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 56 +; CHECK-EQ14: %[[V20:[0-9]+]] = getelementptr inbounds {{(nuw )?}}%struct.S2, %struct.S2* %sa, i64 0, i32 3, i64 8 +; CHECK: %[[V21:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 64 +; CHECK: %[[V22:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %sa, i64 72 + +; Check the stores for the second memset +; CHECK: store i64 0, ptr addrspace(1) %[[SB_I8AS]] +; CHECK: %[[V24:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 8 +; CHECK: store i64 0, ptr addrspace(1) %[[V24]] +; CHECK: %[[V26:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 16 +; CHECK: store i64 0, ptr addrspace(1) %[[V26]] +; CHECK: %[[V28:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 24 +; CHECK: store i64 0, ptr addrspace(1) %[[V28]] +; CHECK: %[[V30:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 32 +; CHECK: store i64 0, ptr addrspace(1) %[[V30]] +; CHECK: %[[V32:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 40 +; CHECK: store i64 0, ptr addrspace(1) %[[V32]] +; CHECK: %[[V33:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, 
ptr addrspace(1) %[[SB_I8AS]], i64 48 +; CHECK: store i64 0, ptr addrspace(1) %[[V33]] +; CHECK: %[[V35T:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 56 +; CHECK-EQ14: %[[V35T:[0-9]+]] = getelementptr inbounds {{(nuw )?}}%struct.S2, %struct.S2* %sb, i64 0, i32 3, i64 8 +; CHECK-EQ14: %[[V35:[0-9]+]] = bitcast i8* %[[V35T]] to i64* +; CHECK-EQ14: %[[SB_I8AS18:.+]] = addrspacecast i64* %[[V35]] to i64 addrspace(1)* +; CHECK: store i64 0, ptr addrspace(1) %[[V35T]] +; CHECK: %[[V36:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 64 +; CHECK: store i64 0, ptr addrspace(1) %[[V36]] +; CHECK: %[[V38:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr addrspace(1) %[[SB_I8AS]], i64 72 +; CHECK: store i64 0, ptr addrspace(1) %[[V38]] + + +; Check the loads and stores for the first memcpy +; CHECK:middle: ; preds = %entry +; CHECK: %[[SA_I822:.+]] = load i64, ptr %sa +; CHECK: store i64 %[[SA_I822]], ptr addrspace(1) %[[SB_I8AS]] +; CHECK: %[[SA_I824:.+]] = load i64, ptr %[[V14]] +; CHECK: store i64 %[[SA_I824]], ptr addrspace(1) %[[V24]] +; CHECK: %[[SA_I826:.+]] = load i64, ptr %[[V15]] +; CHECK: store i64 %[[SA_I826]], ptr addrspace(1) %[[V26]] +; CHECK: %[[SA_I828:.+]] = load i64, ptr %[[V16]] +; CHECK: store i64 %[[SA_I828]], ptr addrspace(1) %[[V28]] +; CHECK: %[[SA_I830:.+]] = load i64, ptr %[[V17]] +; CHECK: store i64 %[[SA_I830]], ptr addrspace(1) %[[V30]] +; CHECK: %[[SA_I832:.+]] = load i64, ptr %[[V18]] +; CHECK: store i64 %[[SA_I832]], ptr addrspace(1) %[[V32]] +; CHECK: %[[SA_I834:.+]] = load i64, ptr %[[V19]] +; CHECK: store i64 %[[SA_I834]], ptr addrspace(1) %[[V33]] +; CHECK: %[[SA_I836:.+]] = load i64, ptr %[[V20]] +; CHECK: store i64 %[[SA_I836]], ptr addrspace(1) %[[V35T]] +; CHECK: %[[SA_I838:.+]] = load i64, ptr %[[V21]] +; CHECK: store i64 %[[SA_I838]], ptr addrspace(1) %[[V36]] +; CHECK: %[[SA_I840:.+]] = load i64, ptr %[[V22]] +; CHECK: store i64 %[[SA_I840]], ptr addrspace(1) %[[V38]] + +; Check the loads and stores for the second memcpy +; CHECK:end: ; preds = %middle, %entry +; CHECK: %[[SB_I8AS42:.+]] = load i64, ptr addrspace(1) %[[SB_I8AS]] +; CHECK: store i64 %[[SB_I8AS42]], ptr %result2 +; CHECK: %[[V42:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 8 +; CHECK: %[[SB_I8AS44:.+]] = load i64, ptr addrspace(1) %[[V24]] +; CHECK: store i64 %[[SB_I8AS44]], ptr %[[V42]] +; CHECK: %[[V43:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 16 +; CHECK: %[[SB_I8AS46:.+]] = load i64, ptr addrspace(1) %[[V26]] +; CHECK: store i64 %[[SB_I8AS46]], ptr %[[V43]] +; CHECK: %[[V44:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 24 +; CHECK: %[[SB_I8AS48:.+]] = load i64, ptr addrspace(1) %[[V28]] +; CHECK: store i64 %[[SB_I8AS48]], ptr %[[V44]] +; CHECK: %[[V45:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 32 +; CHECK: %[[SB_I8AS50:.+]] = load i64, ptr addrspace(1) %[[V30]] +; CHECK: store i64 %[[SB_I8AS50]], ptr %[[V45]] +; CHECK: %[[V46:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 40 +; CHECK: %[[SB_I8AS52:.+]] = load i64, ptr addrspace(1) %[[V32]] +; CHECK: store i64 %[[SB_I8AS52]], ptr %[[V46]] +; CHECK: %[[V47:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 48 +; CHECK: %[[SB_I8AS54:.+]] = load i64, ptr addrspace(1) %[[V33]] +; CHECK: store i64 %[[SB_I8AS54]], ptr %[[V47]] +; CHECK: %[[V48:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 56 +; CHECK-EQ14: %[[V48:[0-9]+]] = 
getelementptr inbounds {{(nuw )?}}%struct.S2, %struct.S2* %result2, i64 0, i32 3, i64 8 +; CHECK: %[[SB_I8AS56:.+]] = load i64, ptr addrspace(1) %[[V35T]] +; CHECK: store i64 %[[SB_I8AS56]], ptr %[[V48]] +; CHECK: %[[V49:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 64 +; CHECK: %[[SB_I8AS58:.+]] = load i64, ptr addrspace(1) %[[V36]] +; CHECK: store i64 %[[SB_I8AS58]], ptr %[[V49]] +; CHECK: %[[V50:[0-9]+]] = getelementptr inbounds {{(nuw )?}}i8, ptr %result2, i64 72 +; CHECK: %[[SB_I8AS60:.+]] = load i64, ptr addrspace(1) %[[V38]] +; CHECK: store i64 %[[SB_I8AS60]], ptr %[[V50]] + +; End of function +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll new file mode 100644 index 0000000000000..cf228937ec2bc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/emit_no_unaligned_memintrinsics.ll @@ -0,0 +1,91 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k entry -vecz-passes=cfg-convert,packetizer -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Laid out, this struct is 80 bytes +%struct.S2 = type { i16, [7 x i32], i32, <16 x i8>, [4 x i32] } + +; Function Attrs: norecurse nounwind +define spir_kernel void @entry(i64 addrspace(1)* %result, %struct.S2* %result2) { +entry: + %gid = call i64 @__mux_get_local_id(i32 0) + %sa = alloca %struct.S2, align 16 + %sb = alloca %struct.S2, align 16 + %sa_i8 = bitcast %struct.S2* %sa to i8* + %sb_i8 = bitcast %struct.S2* %sb to i8* + %sb_i8as = addrspacecast i8* %sb_i8 to i8 addrspace(1)* + %rsi = ptrtoint i64 addrspace(1)* %result to i64 + %rsit = trunc i64 %rsi to i8 + call void @llvm.memset.p0i8.i64(i8* %sa_i8, i8 %rsit, i64 80, i32 4, i1 false) + call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %sb_i8as, i8 0, i64 80, i32 4, i1 false) + %lr = addrspacecast %struct.S2* %result2 to %struct.S2 addrspace(1)* + %lri = bitcast %struct.S2 addrspace(1)* %lr to i64 addrspace(1)* + %cond = icmp eq i64 addrspace(1)* %result, %lri + br i1 %cond, label %middle, label %end + +middle: + call void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* %sb_i8as, i8* %sa_i8, i64 80, i32 4, i1 false) + br label %end + +end: + %g_343 = getelementptr inbounds %struct.S2, %struct.S2* %sa, i64 0, i32 0 + %g_343_load = load i16, i16* %g_343 + %g_343_zext = zext i16 %g_343_load to i64 + %resp = getelementptr i64, i64 addrspace(1)* %result, i64 %gid + store i64 %g_343_zext, i64 addrspace(1)* %resp, align 8 + %result2_i8 = bitcast %struct.S2* %result2 to i8* + call void @llvm.memcpy.p0i8.p1i8.i64(i8* %result2_i8, i8 addrspace(1)* %sb_i8as, i64 80, i32 4, i1 false) + call void @llvm.memcpy.p0i8.p1i8.i64(i8* 
%result2_i8, i8 addrspace(1)* %sb_i8as, i64 80, i32 4, i1 false) + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) +declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i32, i1) + +; Function Attrs: argmemonly nounwind +declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture readonly, i64, i32, i1) +declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) + +declare i64 @__mux_get_local_id(i32) + +; Sanity checks: Make sure the non-vecz entry function is still in place and +; contains memset and memcpy. This is done in order to prevent future bafflement +; in case some pass optimizes them out. +; CHECK: define spir_kernel void @entry +; CHECK: entry: +; CHECK: call void @llvm.memset +; CHECK: call void @llvm.memset +; CHECK: middle: +; CHECK: call void @llvm.memcpy +; CHECK: end: +; CHECK: call void @llvm.memcpy + +; And now for the actual checks + +; Check if the kernel was vectorized +; CHECK: define spir_kernel void @__vecz_v{{[0-9]+}}_entry + +; Check if the memset and memcpy calls are still there +; CHECK: call void @llvm.memset +; CHECK: call void @llvm.memcpy + +; End of function +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll new file mode 100644 index 0000000000000..ebf2ef88aa2c8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/expect_assume.ll @@ -0,0 +1,87 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +declare void @llvm.assume(i1) +declare i32 @llvm.expect.i32(i32, i32) + +; CHECK: define spir_kernel void @__vecz_v4_assume( +; CHECK: [[A:%.*]] = load <4 x i32>, ptr %arrayidxa, align 4 +; CHECK: [[B:%.*]] = load <4 x i32>, ptr %arrayidxb, align 4 +; CHECK: [[SUM:%.*]] = add <4 x i32> %0, %1 +; CHECK: [[CMP:%.*]] = icmp sgt <4 x i32> [[SUM]], zeroinitializer +; CHECK: [[E0:%.*]] = extractelement <4 x i1> [[CMP]], i64 0 +; CHECK: [[E1:%.*]] = extractelement <4 x i1> [[CMP]], i64 1 +; CHECK: [[E2:%.*]] = extractelement <4 x i1> [[CMP]], i64 2 +; CHECK: [[E3:%.*]] = extractelement <4 x i1> [[CMP]], i64 3 +; CHECK: call void @llvm.assume(i1 [[E0]]) +; CHECK: call void @llvm.assume(i1 [[E1]]) +; CHECK: call void @llvm.assume(i1 [[E2]]) +; CHECK: call void @llvm.assume(i1 [[E3]]) +; CHECK: store <4 x i32> [[SUM]], ptr %arrayidxz, align 4 +define spir_kernel void @assume(ptr %aptr, ptr %bptr, ptr %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, ptr %aptr, i64 %idx + %arrayidxb = getelementptr inbounds i32, ptr %bptr, i64 %idx + %arrayidxz = getelementptr inbounds i32, ptr %zptr, i64 %idx + %a = load i32, ptr %arrayidxa, align 4 + %b = load i32, ptr %arrayidxb, align 4 + %sum = add i32 %a, %b + %cond = icmp sgt i32 %sum, 0 + call void @llvm.assume(i1 %cond) + store i32 %sum, ptr %arrayidxz, align 4 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_expect( +; CHECK: [[A:%.*]] = load <4 x i32>, ptr %arrayidxa, align 4 +; CHECK: [[B:%.*]] = load <4 x i32>, ptr %arrayidxb, align 4 +; CHECK: [[SUM:%.*]] = add <4 x i32> %0, %1 +; CHECK: [[E0:%.*]] = extractelement <4 x i32> [[SUM]], i64 0 +; CHECK: [[E1:%.*]] = extractelement <4 x i32> [[SUM]], i64 1 +; CHECK: [[E2:%.*]] = extractelement <4 x i32> [[SUM]], i64 2 +; CHECK: [[E3:%.*]] = extractelement <4 x i32> [[SUM]], i64 3 +; CHECK: [[EX0:%.*]] = call i32 @llvm.expect.i32(i32 [[E0]], i32 42) +; CHECK: [[EX1:%.*]] = call i32 @llvm.expect.i32(i32 [[E1]], i32 42) +; CHECK: [[EX2:%.*]] = call i32 @llvm.expect.i32(i32 [[E2]], i32 42) +; CHECK: [[EX3:%.*]] = call i32 @llvm.expect.i32(i32 [[E3]], i32 42) +; CHECK: [[C0:%.*]] = insertelement <4 x i32> poison, i32 [[EX0]], i64 0 +; CHECK: [[C1:%.*]] = insertelement <4 x i32> [[C0]], i32 [[EX1]], i64 1 +; CHECK: [[C2:%.*]] = insertelement <4 x i32> [[C1]], i32 [[EX2]], i64 2 +; CHECK: [[C3:%.*]] = insertelement <4 x i32> [[C2]], i32 [[EX3]], i64 3 +; CHECK: store <4 x i32> [[C3]], ptr %arrayidxz, align 4 + +define spir_kernel void @expect(ptr %aptr, ptr %bptr, ptr %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, ptr %aptr, i64 %idx + %arrayidxb = getelementptr inbounds i32, ptr %bptr, i64 %idx + %arrayidxz = getelementptr inbounds i32, ptr %zptr, i64 %idx + %a = load i32, ptr %arrayidxa, align 4 + %b = load i32, ptr %arrayidxb, align 4 + %sum = add i32 %a, %b + %cond = icmp sgt i32 %sum, 0 + %v = call i32 @llvm.expect.i32(i32 %sum, i32 42) + store i32 %v, ptr %arrayidxz, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll new file mode 100644 index 
0000000000000..a9b0dbaad5388 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_constant_index.ll @@ -0,0 +1,40 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k extract_constant_index -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @extract_constant_index(<4 x float> addrspace(1)* %in, i32 %x, float addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 4 + %vecext = extractelement <4 x float> %0, i32 0; + %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %vecext, float addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +; CHECK: define spir_kernel void @__vecz_v4_extract_constant_index +; CHECK: call <4 x float> @__vecz_b_interleaved_load4_4_Dv4 +; CHECK: getelementptr float +; CHECK: store <4 x float> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll new file mode 100644 index 0000000000000..4512408948dc3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/extractelement_runtime_index.ll @@ -0,0 +1,50 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k extract_runtime_index -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: nounwind +define spir_kernel void @extract_runtime_index(<4 x float> addrspace(1)* %in, i32 %x, float addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 4 + %vecext = extractelement <4 x float> %0, i32 %x + %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %vecext, float addrspace(1)* %arrayidx1, align 4 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_extract_runtime_index +; CHECK: load float, {{(ptr|float)}} +; CHECK: load float, {{(ptr|float)}} +; CHECK: load float, {{(ptr|float)}} +; CHECK: load float, {{(ptr|float)}} +; CHECK: icmp eq i32 0, %x +; CHECK: select i1 +; CHECK: icmp eq i32 1, %x +; CHECK: select i1 +; CHECK: icmp eq i32 2, %x +; CHECK: select i1 +; CHECK: icmp eq i32 3, %x +; CHECK: select i1 +; CHECK: store float +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll new file mode 100644 index 0000000000000..55d15033ecfc2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_duplication.ll @@ -0,0 +1,76 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S -vecz-passes="function(mem2reg,instcombine),cfg-convert,gvn,packetizer" < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%struct.testStruct = type { [2 x i32] } + + +; Check that we de-duplicate the GEPs used across this kernel (using a +; combination of instcombine and GVN). 
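+; Roughly what happens, for reference (not verbatim pass output): instcombine
+; folds the zero-index GEPs, so %x, %x1, %x3, %x6 and the element-0 array
+; GEPs all collapse to %myStruct itself, and GVN then merges the remaining
+; element-1 GEPs (%arrayidx2 and %arrayidx7) into a single instruction,
+; which newer LLVM prints as a byte-offset GEP along the lines of
+;   %arrayidx2 = getelementptr inbounds i8, ptr %myStruct, i64 4
+; hence the {{\[2 x i32]|i8}} and {{i64 0, i64 1|i64 4}} alternations below.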
+; CHECK: spir_kernel void @__vecz_v{{[0-9]+}}_gep_duplication +; CHECK: entry: +; CHECK: getelementptr inbounds {{(nuw )?}}{{\[2 x i32]|i8}}, ptr %myStruct, {{i64 0, i64 1|i64 4}} +; CHECK-NOT: getelementptr {{.*}}%myStruct +define spir_kernel void @gep_duplication(ptr addrspace(1) align 4 %out) { +entry: + %out.addr = alloca ptr addrspace(1), align 8 + %global_id = alloca i64, align 8 + %myStruct = alloca %struct.testStruct, align 4 + store ptr addrspace(1) %out, ptr %out.addr, align 8 + %call = call i64 @__mux_get_global_id(i32 0) #2 + store i64 %call, ptr %global_id, align 8 + %x = getelementptr inbounds %struct.testStruct, ptr %myStruct, i32 0, i32 0 + %arrayidx = getelementptr inbounds [2 x i32], ptr %x, i64 0, i64 0 + store i32 0, ptr %arrayidx, align 4 + %x1 = getelementptr inbounds %struct.testStruct, ptr %myStruct, i32 0, i32 0 + %arrayidx2 = getelementptr inbounds [2 x i32], ptr %x1, i64 0, i64 1 + store i32 1, ptr %arrayidx2, align 4 + %0 = load i64, ptr %global_id, align 8 + %and = and i64 %0, 1 + %tobool = icmp ne i64 %and, 0 + br i1 %tobool, label %if.then, label %if.else + +if.then: ; preds = %entry + %x3 = getelementptr inbounds %struct.testStruct, ptr %myStruct, i32 0, i32 0 + %arrayidx4 = getelementptr inbounds [2 x i32], ptr %x3, i64 0, i64 0 + %1 = load i32, ptr %arrayidx4, align 4 + %2 = load ptr addrspace(1), ptr %out.addr, align 8 + %3 = load i32, ptr %global_id, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx5 = getelementptr inbounds i32, ptr addrspace(1) %2, i64 %idxprom + store i32 %1, ptr addrspace(1) %arrayidx5, align 4 + br label %if.end + +if.else: ; preds = %entry + %x6 = getelementptr inbounds %struct.testStruct, ptr %myStruct, i32 0, i32 0 + %arrayidx7 = getelementptr inbounds [2 x i32], ptr %x6, i64 0, i64 1 + %4 = load i32, ptr %arrayidx7, align 4 + %5 = load ptr addrspace(1), ptr %out.addr, align 8 + %6 = load i32, ptr %global_id, align 4 + %idxprom8 = sext i32 %6 to i64 + %arrayidx9 = getelementptr inbounds i32, ptr addrspace(1) %5, i64 %idxprom8 + store i32 %4, ptr addrspace(1) %arrayidx9, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll new file mode 100644 index 0000000000000..005dbcaa64966 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/gep_elim_opaque.ll @@ -0,0 +1,53 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -vecz-passes=gep-elim -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i32:32-f80:128-n8:16:32:64-S128" +target triple = "spir-unknown-unknown" + +%struct.mystruct = type { [2 x i32], ptr } + +; Function Attrs: norecurse nounwind +define spir_kernel void @test(ptr addrspace(1) nocapture writeonly align 4 %output) { +entry: + %foo = alloca [4 x %struct.mystruct], align 4 + %call = tail call spir_func i32 @__mux_get_global_id(i32 0) + store i32 20, ptr %foo, align 4 + %arrayidx4 = getelementptr inbounds [2 x i32], ptr %foo, i32 0, i32 1 + store i32 22, ptr %arrayidx4, align 4 + %y31 = getelementptr inbounds %struct.mystruct, ptr %foo, i32 0, i32 1 + store ptr %foo, ptr %y31, align 4 + %mul = shl nuw nsw i32 %call, 2 + store i32 1, ptr %foo, align 4 + %0 = load ptr, ptr %y31, align 4 + %1 = load i32, ptr %0, align 4 + %add98 = add nsw i32 %mul, %1 + %arrayidx117 = getelementptr inbounds i32, ptr addrspace(1) %output, i32 %mul + store i32 %add98, ptr addrspace(1) %arrayidx117, align 4 + ret void +} + +declare i32 @__mux_get_global_id(i32) + +; CHECK: define spir_kernel void @__vecz_v4_test( + +; Make sure all three GEPs are retained +; CHECK: %arrayidx4 = getelementptr inbounds [2 x i32], ptr %foo, i32 0, i32 1 +; CHECK: %y31 = getelementptr inbounds %struct.mystruct, ptr %foo, i32 0, i32 1 +; CHECK: %arrayidx117 = getelementptr inbounds i32, ptr addrspace(1) %output, i32 %mul +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll new file mode 100644 index 0000000000000..b45e215814d49 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/indirect_call.ll @@ -0,0 +1,30 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(void (i32)*, i32) { +entry: + call void %0 (i32 %1) + ret void +} + +; This is really a check to see if opt crashed or not +; CHECK: define spir_kernel void @test( diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll new file mode 100644 index 0000000000000..ce041960424b9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/inlined_function_debug_info.ll @@ -0,0 +1,141 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Check VECZ debug info for inlined DILocation metadata nodes + +; RUN: veczc -k functions_one -vecz-passes=builtin-inlining -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = '/tmp/inlined_function.ll' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: alwaysinline +define spir_func i32 @k_one(i32 %x, i32 %y) #0 !dbg !4 { +entry: + call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !9, metadata !38), !dbg !39 + call void @llvm.dbg.value(metadata i32 %y, i64 0, metadata !10, metadata !38), !dbg !39 + %mul = mul nsw i32 %x, %y, !dbg !40 + ret i32 %mul, !dbg !40 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +; Function Attrs: nounwind +define spir_kernel void @functions_one(i32 addrspace(1)* %in1i, i32 addrspace(1)* %in2i, float addrspace(1)* %in1f, float addrspace(1)* %in2f, i32 addrspace(1)* %out1i, float addrspace(1)* %out1f) #2 !dbg !11 { +entry: + call void @llvm.dbg.value(metadata i32 addrspace(1)* %in1i, i64 0, metadata !18, metadata !38), !dbg !41 + call void @llvm.dbg.value(metadata i32 addrspace(1)* %in2i, i64 0, metadata !19, metadata !38), !dbg !41 + call void @llvm.dbg.value(metadata float addrspace(1)* %in1f, i64 0, metadata !20, metadata !38), !dbg !41 + call void @llvm.dbg.value(metadata float addrspace(1)* %in2f, i64 0, metadata !21, metadata !38), !dbg !41 + call void @llvm.dbg.value(metadata i32 addrspace(1)* %out1i, i64 0, metadata !22, metadata !38), !dbg !41 + call void @llvm.dbg.value(metadata float addrspace(1)* %out1f, i64 0, metadata !23, metadata !38), !dbg !41 + %call = call i64 @__mux_get_global_id(i32 0) #4, !dbg !42 + call void @llvm.dbg.value(metadata i64 %call, i64 0, metadata !24, metadata !38), !dbg !42 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in1i, i64 %call, !dbg !43 + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !43 + %arrayidx1 = getelementptr inbounds 
i32, i32 addrspace(1)* %in2i, i64 %call, !dbg !43 + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4, !dbg !43 + call void @llvm.dbg.value(metadata i32 %0, i64 0, metadata !9, metadata !38), !dbg !44 + call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !10, metadata !38), !dbg !44 + %mul.i = mul nsw i32 %0, %1, !dbg !46 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out1i, i64 %call, !dbg !43 + store i32 %mul.i, i32 addrspace(1)* %arrayidx3, align 4, !dbg !43 + ret void, !dbg !47 +} + +declare i64 @__mux_get_global_id(i32) #3 + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 + +attributes #0 = { alwaysinline } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nobuiltin } + +!llvm.dbg.cu = !{!0} +!opencl.kernels = !{!29} +!llvm.module.flags = !{!36} +!llvm.ident = !{!37} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.1 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2) +!1 = !DIFile(filename: "kernel.opencl", directory: "Aorta/vecz_build") +!2 = !{} +!3 = !{!4, !11} +!4 = distinct !DISubprogram(name: "k_one", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !8) +!5 = !DISubroutineType(types: !6) +!6 = !{!7, !7, !7} +!7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!8 = !{!9, !10} +!9 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !1, line: 1, type: !7) +!10 = !DILocalVariable(name: "y", arg: 2, scope: !4, file: !1, line: 1, type: !7) +!11 = distinct !DISubprogram(name: "functions_one", scope: !1, file: !1, line: 6, type: !12, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !17) +!12 = !DISubroutineType(types: !13) +!13 = !{null, !14, !14, !15, !15, !14, !15} +!14 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64, align: 64) +!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64, align: 64) +!16 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float) +!17 = !{!18, !19, !20, !21, !22, !23, !24} +!18 = !DILocalVariable(name: "in1i", arg: 1, scope: !11, file: !1, line: 6, type: !14) +!19 = !DILocalVariable(name: "in2i", arg: 2, scope: !11, file: !1, line: 6, type: !14) +!20 = !DILocalVariable(name: "in1f", arg: 3, scope: !11, file: !1, line: 6, type: !15) +!21 = !DILocalVariable(name: "in2f", arg: 4, scope: !11, file: !1, line: 6, type: !15) +!22 = !DILocalVariable(name: "out1i", arg: 5, scope: !11, file: !1, line: 6, type: !14) +!23 = !DILocalVariable(name: "out1f", arg: 6, scope: !11, file: !1, line: 6, type: !15) +!24 = !DILocalVariable(name: "tid", scope: !11, file: !1, line: 7, type: !25) +!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !26, line: 33, baseType: !27) +!26 = !DIFile(filename: 
"Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "Aorta/vecz_build") +!27 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !26, line: 31, baseType: !28) +!28 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) +!29 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i32 addrspace(1)*, float addrspace(1)*)* @functions_one, !30, !31, !32, !33, !34, !35} +!30 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1} +!31 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"} +!32 = !{!"kernel_arg_type", !"int*", !"int*", !"float*", !"float*", !"int*", !"float*"} +!33 = !{!"kernel_arg_base_type", !"int*", !"int*", !"float*", !"float*", !"int*", !"float*"} +!34 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !"", !""} +!35 = !{!"reqd_work_group_size", i32 32, i32 1, i32 1} +!36 = !{i32 2, !"Debug Info Version", i32 3} +!37 = !{!"clang version 3.8.1 "} +!38 = !DIExpression() +!39 = !DILocation(line: 1, scope: !4) +!40 = !DILocation(line: 2, scope: !4) +!41 = !DILocation(line: 6, scope: !11) +!42 = !DILocation(line: 7, scope: !11) +!43 = !DILocation(line: 8, scope: !11) +!44 = !DILocation(line: 1, scope: !4, inlinedAt: !45) +!45 = distinct !DILocation(line: 8, scope: !11) +!46 = !DILocation(line: 2, scope: !4, inlinedAt: !45) +!47 = !DILocation(line: 9, scope: !11) + +; CHECK: spir_func i32 @k_one +; CHECK-SAME: !dbg [[HELPER_DI:![0-9]+]] + +; CHECK: define spir_kernel void @__vecz_v4_functions_one +; CHECK-SAME: !dbg [[KERN_DI:![0-9]+]] + +; CHECK: %[[LOAD1:[0-9]+]] = load i32, ptr addrspace(1) %{{.*}}, align 4 +; CHECK: %[[LOAD2:[0-9]+]] = load i32, ptr addrspace(1) %{{.*}}, align 4 +; CHECK: #dbg_value(i32 %[[LOAD1]], !{{[0-9]+}}, !DIExpression(), [[DI_LOC1:![0-9]+]] +; CHECK: #dbg_value(i32 %[[LOAD2]], !{{[0-9]+}}, !DIExpression(), [[DI_LOC1]] +; CHECK: %{{.*}} = mul nsw i32 %[[LOAD1]], %[[LOAD2]], !dbg [[DI_LOC2:![0-9]+]] + +; CHECK: [[HELPER_SUBPROGRAM:![0-9]+]] = distinct !DISubprogram(name: "k_one", + +; CHECK: [[DI_LOC1]] = !DILocation(line: 1, scope: [[HELPER_SUBPROGRAM]], inlinedAt: [[DI_INLINED_AT:![0-9]+]]) +; CHECK: [[DI_INLINED_AT]] = distinct !DILocation(line: 8, +; CHECK: [[DI_LOC2]] = !DILocation(line: 2, scope: [[HELPER_SUBPROGRAM]], inlinedAt: [[DI_INLINED_AT]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll new file mode 100644 index 0000000000000..24947313c290b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insert_element_debug_info.ll @@ -0,0 +1,148 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Regression test for debug info bug related to creating llvm.dbg.value +; intrinsics across all lanes even when scalarization masks disable some +; of the lanes. This occurs when we scalarize insertelement instructions. + +; RUN: veczc -k unaligned_load -vecz-passes="function(instcombine,adce),scalarize,packetizer,instcombine" -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK: define spir_kernel void @__vecz_v4_unaligned_load +define spir_kernel void @unaligned_load(i32 addrspace(1)* %in, i32 addrspace(1)* %offsets, i32 addrspace(1)* %out) #0 !dbg !7 { +entry: + %in.addr = alloca i32 addrspace(1)*, align 8 + %offsets.addr = alloca i32 addrspace(1)*, align 8 + %out.addr = alloca i32 addrspace(1)*, align 8 +; CHECK: %tmp = alloca <16 x i32>, align 16 + %tid = alloca i32, align 4 + %tmp = alloca <3 x i32>, align 16 + store i32 addrspace(1)* %in, i32 addrspace(1)** %in.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in.addr, metadata !11, metadata !29), !dbg !30 + store i32 addrspace(1)* %offsets, i32 addrspace(1)** %offsets.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %offsets.addr, metadata !12, metadata !29), !dbg !30 + store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30 + call void @llvm.dbg.declare(metadata i32* %tid, metadata !14, metadata !29), !dbg !31 + %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31 + %conv = trunc i64 %call to i32, !dbg !31 + store i32 %conv, i32* %tid, align 4, !dbg !31 + call void @llvm.dbg.declare(metadata <3 x i32>* %tmp, metadata !15, metadata !29), !dbg !32 + %0 = load i32 addrspace(1)*, i32 addrspace(1)** %in.addr, align 8, !dbg !32 +; CHECK: %[[TMP_LD:.+]] = call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr(ptr nonnull %tmp) +; FIXME: This llvm.dbg.value marks a 'kill location' and denotes the +; termination of the previous value assigned to %tmp - we could probably do +; better here by manifesting a vectorized value? 
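+; (Context: a #dbg_value whose operand is poison acts as a "kill" record; it
+; tells debug-info consumers that no location is available for the variable
+; from this point onwards. A vectorized alternative would look something like
+;   #dbg_value(<16 x i32> %packetized.tmp, !15, !DIExpression(), !32)
+; where %packetized.tmp is a hypothetical name for the widened value; vecz
+; does not currently emit such a record.)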
+; CHECK: #dbg_value(i32 poison, [[VAR:![0-9]+]],
+; CHECK-SAME: !DIExpression({{.*}}),
+; CHECK-SAME: !{{[0-9]+}}
+  %1 = load i32, i32* %tid, align 4, !dbg !32
+  %mul = mul nsw i32 3, %1, !dbg !32
+  %idx.ext = sext i32 %mul to i64, !dbg !32
+  %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 %idx.ext, !dbg !32
+  %call1 = call spir_func <3 x i32> @_Z6vload3mPKU3AS1i(i64 0, i32 addrspace(1)* %add.ptr) #3, !dbg !32
+  %extractVec = shufflevector <3 x i32> %call1, <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>, !dbg !32
+  %storetmp = bitcast <3 x i32>* %tmp to <4 x i32>*, !dbg !32
+  store <4 x i32> %extractVec, <4 x i32>* %storetmp, align 16, !dbg !32
+  %2 = load <3 x i32>, <3 x i32>* %tmp, align 16, !dbg !33
+  %3 = extractelement <3 x i32> %2, i64 0, !dbg !33
+  %4 = load i32, i32* %tid, align 4, !dbg !33
+  %mul2 = mul nsw i32 3, %4, !dbg !33
+  %idxprom = sext i32 %mul2 to i64, !dbg !33
+  %5 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !33
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %5, i64 %idxprom, !dbg !33
+  store i32 %3, i32 addrspace(1)* %arrayidx, align 4, !dbg !33
+  %6 = load <3 x i32>, <3 x i32>* %tmp, align 16, !dbg !34
+  %7 = extractelement <3 x i32> %6, i64 1, !dbg !34
+  %8 = load i32, i32* %tid, align 4, !dbg !34
+  %mul3 = mul nsw i32 3, %8, !dbg !34
+  %add = add nsw i32 %mul3, 1, !dbg !34
+  %idxprom4 = sext i32 %add to i64, !dbg !34
+  %9 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !34
+  %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %idxprom4, !dbg !34
+  store i32 %7, i32 addrspace(1)* %arrayidx5, align 4, !dbg !34
+  %10 = load <3 x i32>, <3 x i32>* %tmp, align 16, !dbg !35
+  %11 = extractelement <3 x i32> %10, i64 2, !dbg !35
+  %12 = load i32, i32* %tid, align 4, !dbg !35
+  %mul6 = mul nsw i32 3, %12, !dbg !35
+  %add7 = add nsw i32 %mul6, 2, !dbg !35
+  %idxprom8 = sext i32 %add7 to i64, !dbg !35
+  %13 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !35
+  %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %13, i64 %idxprom8, !dbg !35
+  store i32 %11, i32 addrspace(1)* %arrayidx9, align 4, !dbg !35
+  ret void, !dbg !36
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare i64 @__mux_get_global_id(i32) #2
+
+declare spir_func <3 x i32> @_Z6vload3mPKU3AS1i(i64, i32 addrspace(1)*) #2
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nobuiltin }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!21}
+!llvm.module.flags = !{!27}
+!llvm.ident = !{!28}
+
+; Now check we're actually looking at the right variable.
+; CHECK: [[VAR]] = !DILocalVariable(name: "tmp", + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.1 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !3) +!1 = !DIFile(filename: "kernel.opencl", directory: "/home/Aorta/vecz_build") +!2 = !{} +!3 = !{!4} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64, align: 64) +!5 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!6 = !{!7} +!7 = distinct !DISubprogram(name: "unaligned_load", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !10) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !4, !4, !4} +!10 = !{!11, !12, !13, !14, !15} +!11 = !DILocalVariable(name: "in", arg: 1, scope: !7, file: !1, line: 1, type: !4) +!12 = !DILocalVariable(name: "offsets", arg: 2, scope: !7, file: !1, line: 1, type: !4) +!13 = !DILocalVariable(name: "out", arg: 3, scope: !7, file: !1, line: 1, type: !4) +!14 = !DILocalVariable(name: "tid", scope: !7, file: !1, line: 2, type: !5) +!15 = !DILocalVariable(name: "tmp", scope: !7, file: !1, line: 3, type: !16) +!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "int3", file: !17, line: 64, baseType: !18) +!17 = !DIFile(filename: "/home//Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "/home/Aorta/vecz_build") +!18 = !DICompositeType(tag: DW_TAG_array_type, baseType: !5, size: 128, align: 128, flags: DIFlagVector, elements: !19) +!19 = !{!20} +!20 = !DISubrange(count: 3) +!21 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*)* @unaligned_load, !22, !23, !24, !25, !26} +!22 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1} +!23 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!24 = !{!"kernel_arg_type", !"int*", !"int*", !"int*"} +!25 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*"} +!26 = !{!"kernel_arg_type_qual", !"", !"", !""} +!27 = !{i32 2, !"Debug Info Version", i32 3} +!28 = !{!"clang version 3.8.1 "} +!29 = !DIExpression() +!30 = !DILocation(line: 1, scope: !7) +!31 = !DILocation(line: 2, scope: !7) +!32 = !DILocation(line: 3, scope: !7) +!33 = !DILocation(line: 4, scope: !7) +!34 = !DILocation(line: 5, scope: !7) +!35 = !DILocation(line: 6, scope: !7) +!36 = !DILocation(line: 7, scope: !7) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll new file mode 100644 index 0000000000000..0ecccdb14e767 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_constant_index.ll @@ -0,0 +1,48 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k constant_index -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @constant_index(<4 x i32>* %in, <4 x i32>* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call + %0 = load <4 x i32>, <4 x i32>* %arrayidx + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call + %vecins = insertelement <4 x i32> %0, i32 42, i32 2 + store <4 x i32> %vecins, <4 x i32>* %arrayidx2 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_constant_index + +; We should only have 3 loads since one of the elements will be replaced +; CHECK: call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr +; CHECK: call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr +; CHECK: call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr +; CHECK-NOT: call <4 x i32> @__vecz_b_interleaved_load4_4_Dv4_ju3ptr + +; We should have four stores, one of which would use the constant given +; CHECK: store <4 x i32> +; CHECK: store <4 x i32> +; CHECK: store <4 x i32> +; CHECK: store <4 x i32> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll new file mode 100644 index 0000000000000..146f7d15f0d0d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/insertelement_runtime_index.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k runtime_index -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @runtime_index(<4 x i32>* %in, <4 x i32>* %out, i32* %index) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32>* %in, i64 %call + %0 = load <4 x i32>, <4 x i32>* %arrayidx + %arrayidx1 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call + store <4 x i32> %0, <4 x i32>* %arrayidx1 + %arrayidx2 = getelementptr inbounds i32, i32* %index, i64 %call + %1 = load i32, i32* %arrayidx2 + %arrayidx3 = getelementptr inbounds <4 x i32>, <4 x i32>* %out, i64 %call + %vecins = insertelement <4 x i32> %0, i32 42, i32 %1 + store <4 x i32> %vecins, <4 x i32>* %arrayidx3 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_runtime_index + +; Four icmps and selects +; CHECK: icmp eq <4 x i32> %{{.+}}, zeroinitializer +; CHECK: select <4 x i1> %{{.+}}, <4 x i32> {{<(i32 42(, )?)+>|splat \(i32 42\)}} +; CHECK: icmp eq <4 x i32> %{{.+}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}} +; CHECK: select <4 x i1> %{{.+}}, <4 x i32> {{<(i32 42(, )?)+>|splat \(i32 42\)}} +; CHECK: icmp eq <4 x i32> %{{.+}}, {{<(i32 2(, )?)+>|splat \(i32 2\)}} +; CHECK: select <4 x i1> %{{.+}}, <4 x i32> {{<(i32 42(, )?)+>|splat \(i32 42\)}} +; CHECK: icmp eq <4 x i32> %{{.+}}, {{<(i32 3(, )?)+>|splat \(i32 3\)}} +; CHECK: select <4 x i1> %{{.+}}, <4 x i32> {{<(i32 42(, )?)+>|splat \(i32 42\)}} + +; Four stores +; CHECK: store <4 x i32> +; CHECK: store <4 x i32> +; CHECK: store <4 x i32> +; CHECK: store <4 x i32> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll new file mode 100644 index 0000000000000..990b7cdcec49f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/instantiate_constants.ll @@ -0,0 +1,95 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width 4 -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @test(half addrspace(1)* nocapture readonly %p, float addrspace(1)* nocapture %f) local_unnamed_addr #0 { +entry: + %data = alloca [1 x i16], align 2 + %0 = bitcast [1 x i16]* %data to i8* + %arraydecay = getelementptr inbounds [1 x i16], [1 x i16]* %data, i64 0, i64 0 + %1 = bitcast [1 x i16]* %data to half* + %call = tail call i64 @__mux_get_global_id(i32 0) #5 + %arrayidx7 = getelementptr inbounds half, half addrspace(1)* %p, i64 %call + %arrayidx = bitcast half addrspace(1)* %arrayidx7 to i16 addrspace(1)* + %2 = load i16, i16 addrspace(1)* %arrayidx, align 2, !tbaa !9 + store i16 %2, i16* %arraydecay, align 2, !tbaa !9 + %call2 = call spir_func float @_Z11vloada_halfmPKDh(i64 0, half* nonnull %1) #6 + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %f, i64 %call + store float %call2, float addrspace(1)* %arrayidx3, align 4, !tbaa !13 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) local_unnamed_addr #2 + +; Function Attrs: convergent nounwind +declare spir_func float @_Z11vloada_halfmPKDh(i64, half*) local_unnamed_addr #3 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } +attributes #5 = { convergent nobuiltin nounwind readonly } +attributes #6 = { convergent nobuiltin nounwind } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} +!host.build_options = !{!8} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (half addrspace(1)*, float addrspace(1)*)* @test, !3, !4, !5, !6, !7} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 1} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"half*", !"float*"} +!6 = !{!"kernel_arg_base_type", !"half*", !"float*"} +!7 = !{!"kernel_arg_type_qual", !"const", !""} +!8 
= !{!""} +!9 = !{!10, !10, i64 0} +!10 = !{!"short", !11, i64 0} +!11 = !{!"omnipotent char", !12, i64 0} +!12 = !{!"Simple C/C++ TBAA"} +!13 = !{!14, !14, i64 0} +!14 = !{!"float", !11, i64 0} + +; This test checks that an instantiated call with a constant operand gets +; that operand instantiated (packet-broadcast) correctly instead of causing the +; instantiation of the call to fail, thereby causing the packetization of the +; store to fail. +; CHECK: define spir_kernel void @__vecz_v4_test + +; CHECK: %[[C0:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}}) +; CHECK: %[[C1:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}}) +; CHECK: %[[C2:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}}) +; CHECK: %[[C3:.+]] = call spir_func float @_Z11vloada_halfmPKDh(i64 0, ptr nonnull %{{.+}}) +; CHECK: %[[G0:.+]] = insertelement <4 x float> poison, float %[[C0]], {{(i32|i64)}} 0 +; CHECK: %[[G1:.+]] = insertelement <4 x float> %[[G0]], float %[[C1]], {{(i32|i64)}} 1 +; CHECK: %[[G2:.+]] = insertelement <4 x float> %[[G1]], float %[[C2]], {{(i32|i64)}} 2 +; CHECK: %[[G3:.+]] = insertelement <4 x float> %[[G2]], float %[[C3]], {{(i32|i64)}} 3 +; CHECK: store <4 x float> %[[G3]], ptr addrspace(1) %{{.+}} +; CHECK-NOT: store float + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll new file mode 100644 index 0000000000000..5f95b1edde16f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_defuse_instantiated.ll @@ -0,0 +1,73 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -w 4 -vecz-passes=cfg-convert,packetizer -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK: @.str = private unnamed_addr addrspace(2) constant [8 x i8] c"blah %d\00", align 1
+@.str = private unnamed_addr addrspace(2) constant [8 x i8] c"blah %d\00", align 1
+@.strf = private unnamed_addr addrspace(2) constant [7 x i8] c"%#16A\0A\00", align 1
+
+; Function Attrs: nounwind
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_printf_kernel(
+; CHECK: if.then:
+; CHECK: [[ELT0:%.*]] = extractelement
+; CHECK: [[ELT1:%.*]] = extractelement
+; CHECK: [[ELT2:%.*]] = extractelement
+; CHECK: [[ELT3:%.*]] = extractelement
+; CHECK: = call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2jb(ptr addrspace(2) @.str, i32 [[ELT0]]
+; CHECK: = call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2jb(ptr addrspace(2) @.str, i32 [[ELT1]]
+; CHECK: = call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2jb(ptr addrspace(2) @.str, i32 [[ELT2]]
+; CHECK: = call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2jb(ptr addrspace(2) @.str, i32 [[ELT3]]
+; CHECK: ret void
+define spir_kernel void @printf_kernel(i32 addrspace(1)* %in, i32 addrspace(1)* %stridesX, i32 addrspace(1)* %dst, i32 %width, i32 %height) #0 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0) #3
+  %cmp = icmp eq i64 %call, 13
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(2)* @.str, i64 0, i64 0), i32 %0) #3
+  br label %if.end
+
+if.end: ; preds = %if.then, %entry
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32) #1
+
+declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) 
#1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*, i32, i32)* @printf_kernel, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 0, i32 0} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"int*", !"int*", !"int*", !"int", !"int"} +!4 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*", !"int", !"int"} +!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !""} +!6 = !{!"clang version 3.8.0 "} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll new file mode 100644 index 0000000000000..ec254f12ab85f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load16.ll @@ -0,0 +1,87 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k load16 -vecz-simd-width 4 -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-p:32:32-f64:64-i64:64-v128:64-v64:64-v32:32-v16:16-n8:16:32-S64" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) #0 !shave_original_kernel !10 { +entry: + %call = call i32 @__mux_get_global_id(i32 0) #2 + %call1 = call i32 @__mux_get_global_id(i32 1) #2 + %mul = mul nsw i32 %call1, %stride + %add = add nsw i32 %mul, %call + %mul2 = shl nsw i32 %add, 1 + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i32 %mul2 + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %mul3 = mul nsw i32 %call1, %stride + %add4 = add nsw i32 %mul3, %call + %mul5 = shl nsw i32 %add4, 1 + %add6 = add i32 %mul5, 3 + %arrayidx7 = getelementptr inbounds i8, i8 addrspace(1)* %in, i32 %add6 + %1 = load i8, i8 addrspace(1)* %arrayidx7, align 1 + %add9 = add i8 %1, %0 + %mul11 = mul nsw i32 %call1, %stride + %add12 = add nsw i32 %mul11, %call + %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i32 %add12 + store i8 %add9, i8 addrspace(1)* %arrayidx13, align 1 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i32 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!llvm.ident = !{!2} +!opencl.kernels = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{!"clang version 7.0.0 (tags/RELEASE_700/final) (based on LLVM 7.0.0)"} +!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i32)* @load16, !4, !5, !6, !7, !8, !9} +!4 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 0} +!5 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!6 = !{!"kernel_arg_type", !"uchar*", !"uchar*", !"int"} +!7 = !{!"kernel_arg_base_type", !"uchar*", !"uchar*", !"int"} +!8 = !{!"kernel_arg_type_qual", !"", !"", !""} +!9 = !{!"kernel_arg_name", !"out", !"in", !"stride"} +!10 = !{!"load16"} + +; Function start +; CHECK: define spir_kernel void @__vecz_v4_load16 + +; There should be exactly 2 interleaved loads in the code +; CHECK: call <4 x i8> @__vecz_b_interleaved_load1_2_Dv4_hu3ptrU3AS1 +; CHECK: call <4 x i8> @__vecz_b_interleaved_load1_2_Dv4_hu3ptrU3AS1 + +; There shouldn't be any more interleaved loads or stores left +; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load + +; There definitely shouldn't be any 
gather loads +; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load + +; Function end +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll new file mode 100644 index 0000000000000..d2fda25173763 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_load_ooo.ll @@ -0,0 +1,66 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc --vecz-passes=interleave-combine-loads -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; This test checks that we can optimize interleaved accesses out of order. + +define dso_local spir_kernel void @interleaved_load_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %conv, %mul + %mul3 = shl nsw i32 %add, 1 + ; LLVM will not generate an add, but the precise form of the or instruction + ; that gets generated depends on the LLVM version. + ; LLVM 17-: %add4 = or i32 %mul3, 1 + ; LLVM 18+: %add4 = or disjoint i32 %mul3, 1 + ; The LLVM 17 form is not recognized as an add by LLVM 18, and the LLVM 18 + ; form uses a flag which does not exist in LLVM 17. As this is not the + ; purpose of the test, use an add instruction here for now, and revisit this + ; once our minimum version of LLVM is LLVM 18. 
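+; (For reference: the `disjoint` flag asserts that the two operands have no
+; set bits in common, which is what makes the `or` equivalent to an `add`
+; here: %mul3 comes from a left shift by one, so its low bit is always zero.)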
+ %add4 = add nsw nuw i32 %mul3, 1 + %idxprom = sext i32 %add4 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %0 = call <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)* %arrayidx) + %idxprom8 = sext i32 %mul3 to i64 + %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom8 + %1 = call <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)* %arrayidx9) + %sub1 = sub nsw <4 x i32> %0, %1 + %idxprom12 = sext i32 %add to i64 + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom12 + %2 = bitcast i32 addrspace(1)* %arrayidx13 to <4 x i32> addrspace(1)* + store <4 x i32> %sub1, <4 x i32> addrspace(1)* %2, align 4 + ret void +} + +; CHECK: __vecz_v4_interleaved_load_4( +; CHECK: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[PTR:%.*]], align 4 +; CHECK: [[TMP2:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i32 4 +; CHECK: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP2]], align 4 +; CHECK: %deinterleave = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK: %deinterleave1 = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK: %sub1 = sub nsw <4 x i32> %deinterleave1, %deinterleave + + +declare i64 @__mux_get_global_id(i32) +declare <4 x i32> @__vecz_b_interleaved_load4_2_Dv4_jPU3AS1j(i32 addrspace(1)*) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll new file mode 100644 index 0000000000000..9af442b68e1a2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/interleaved_safety.ll @@ -0,0 +1,95 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k f -vecz-simd-width 4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @f(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e, i8 addrspace(1)* %flag) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %add.ptr = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call + %.cast = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %add.ptr, i64 0, i64 0 + %0 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + call void @__mux_work_group_barrier(i32 0, i32 2, i32 528) #3 + store double 1.600000e+01, double addrspace(1)* %.cast, align 8 + %1 = load <4 x double>, <4 x double> addrspace(1)* %add.ptr, align 32 + %vecins5 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %vecins7 = shufflevector <4 x double> %vecins5, <4 x double> %1, <4 x i32> + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call + %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %arrayidx8 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call + %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx8, align 32 + %arrayidx9 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call + %4 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx9, align 32 + %div = fdiv <4 x double> %3, %4 + %5 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %vecins7, <4 x double> %2, <4 x double> %div) + %arrayidx10 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call + %6 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx10, align 32 + %sub = fsub <4 x double> %6, %5 + store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx10, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +declare void @__mux_work_group_barrier(i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (<4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, <4 x double> addrspace(1)*, i8 addrspace(1)*)* @f, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 1, i32 1, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"double4*", !"double4*", !"double4*", !"double4*", !"double4*", !"char*"} +!4 = !{!"kernel_arg_base_type", !"double 
__attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"double __attribute__((ext_vector_type(4)))*", !"char*"}
+!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !""}
+!6 = !{!"clang version 3.8.1 "}
+
+; Function start
+; CHECK: define spir_kernel void @__vecz_v4_f
+; CHECK: call i64 @__mux_get_global_id(i32 0)
+
+; There should be exactly 4 interleaved loads and one store in the code
+; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
+; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
+
+; And in between them there should be a barrier call
+; CHECK: call void @__mux_work_group_barrier
+; CHECK: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> {{<(double 1.600000e\+01(, )?)+>|splat \(double 1.600000e\+01\)}}
+; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
+; CHECK: call <4 x double> @__vecz_b_interleaved_load8_4_Dv4_du3ptrU3AS1
+
+; There shouldn't be any more interleaved loads or stores left
+; CHECK-NOT: call <4 x double> @__vecz_b_interleaved_load4_Dv4_du3ptrU3AS1
+; CHECK-NOT: call void @__vecz_b_interleaved_store8_4_Dv4_du3ptrU3AS1(<4 x double> {{<(double 1.600000e\+01(, )?)+>|splat \(double 1.600000e\+01\)}}
+
+; There should be some shufflevector instructions after the simplification
+; CHECK: shufflevector
+
+; Function end
+; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll new file mode 100644 index 0000000000000..0b94abd180c31 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics-scalarize.ll @@ -0,0 +1,207 @@ +; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k ctpop -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix CTPOP +; RUN: veczc -k ctlz -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix CTLZ +; RUN: veczc -k cttz -vecz-simd-width=8 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix CTTZ +; RUN: veczc -k sadd_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix SADD_SAT +; RUN: veczc -k uadd_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix UADD_SAT +; RUN: veczc -k ssub_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix SSUB_SAT +; RUN: veczc -k usub_sat -vecz-simd-width=2 -vecz-choices=FullScalarization -S < %s | FileCheck %s --check-prefix USUB_SAT + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; It checks that the scalar intrinsics get vectorized, +; and the vector intrinsics get scalarized and then re-vectorized. + +define spir_kernel void @ctpop(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %ctpopi32 = call i32 @llvm.ctpop.i32(i32 %a) + %ctpopv2i8 = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %b) + store i32 %ctpopi32, i32* %arrayidxy, align 4 + store <2 x i8> %ctpopv2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @ctlz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %ctlzi32 = call i32 @llvm.ctlz.i32(i32 %a, i1 false) + %ctlzv2i8 = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %b, i1 false) + store i32 %ctlzi32, i32* %arrayidxy, align 4 + store <2 x i8> %ctlzv2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @cttz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %cttzi32 = call i32 @llvm.cttz.i32(i32 %a, i1 false) + %cttzv2i8 = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %b, i1 false) + store i32 %cttzi32, i32* %arrayidxy, align 4 + store <2 x i8> %cttzv2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @sadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + 
%arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @uadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @ssub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.ssub.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @usub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.usub.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +declare i32 @llvm.ctpop.i32(i32) +declare <2 x i8> @llvm.ctpop.v2i8(<2 x i8>) + +declare i32 @llvm.ctlz.i32(i32, i1) +declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) + +declare i32 @llvm.cttz.i32(i32, i1) +declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1) + +declare i32 
@llvm.sadd.sat.i32(i32, i32) +declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i32 @llvm.uadd.sat.i32(i32, i32) +declare <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i32 @llvm.ssub.sat.i32(i32, i32) +declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i32 @llvm.usub.sat.i32(i32, i32) +declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i64 @__mux_get_global_id(i32) + +; CTPOP: void @__vecz_v2_ctpop +; CTPOP: = call {{.*}}<2 x i32> @llvm.ctpop.v2i32(<2 x i32> %{{.*}}) +; CTPOP: = call {{.*}}<2 x i8> @llvm.ctpop.v2i8(<2 x i8> %{{.*}}) +; CTPOP: = call {{.*}}<2 x i8> @llvm.ctpop.v2i8(<2 x i8> %{{.*}}) + +; CTLZ: void @__vecz_v4_ctlz +; CTLZ: = call {{.*}}<4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false) +; CTLZ: = call {{.*}}<4 x i8> @llvm.ctlz.v4i8(<4 x i8> %{{.*}}, i1 false) +; CTLZ: = call {{.*}}<4 x i8> @llvm.ctlz.v4i8(<4 x i8> %{{.*}}, i1 false) + +; CTTZ: void @__vecz_v8_cttz +; CTTZ: = call {{.*}}<8 x i32> @llvm.cttz.v8i32(<8 x i32> %{{.*}}, i1 false) +; CTTZ: = call {{.*}}<8 x i8> @llvm.cttz.v8i8(<8 x i8> %{{.*}}, i1 false) +; CTTZ: = call {{.*}}<8 x i8> @llvm.cttz.v8i8(<8 x i8> %{{.*}}, i1 false) + +; SADD_SAT: void @__vecz_v2_sadd_sat +; SADD_SAT: = call <2 x i32> @llvm.sadd.sat.v2i32( +; SADD_SAT: = call <2 x i8> @llvm.sadd.sat.v2i8( +; SADD_SAT: = call <2 x i8> @llvm.sadd.sat.v2i8( + +; UADD_SAT: void @__vecz_v2_uadd_sat +; UADD_SAT: = call <2 x i32> @llvm.uadd.sat.v2i32( +; UADD_SAT: = call <2 x i8> @llvm.uadd.sat.v2i8( +; UADD_SAT: = call <2 x i8> @llvm.uadd.sat.v2i8( + +; SSUB_SAT: void @__vecz_v2_ssub_sat +; SSUB_SAT: = call <2 x i32> @llvm.ssub.sat.v2i32( +; SSUB_SAT: = call <2 x i8> @llvm.ssub.sat.v2i8( +; SSUB_SAT: = call <2 x i8> @llvm.ssub.sat.v2i8( + +; USUB_SAT: void @__vecz_v2_usub_sat +; USUB_SAT: = call <2 x i32> @llvm.usub.sat.v2i32( +; USUB_SAT: = call <2 x i8> @llvm.usub.sat.v2i8( +; USUB_SAT: = call <2 x i8> @llvm.usub.sat.v2i8( diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll new file mode 100644 index 0000000000000..d74607eea657e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/intrinsics.ll @@ -0,0 +1,200 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k ctpop -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix CTPOP +; RUN: veczc -k ctlz -vecz-simd-width=4 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s --check-prefix CTLZ +; RUN: veczc -k cttz -vecz-simd-width=8 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s --check-prefix CTTZ +; RUN: veczc -k sadd_sat -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix SADD_SAT +; RUN: veczc -k uadd_sat -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix UADD_SAT +; RUN: veczc -k ssub_sat -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix SSUB_SAT +; RUN: veczc -k usub_sat -vecz-simd-width=2 -S < %s | FileCheck %s --check-prefix USUB_SAT + +; It checks that the scalar intrinsics get vectorized, +; and the vector intrinsics get widened. + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @ctpop(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %ctpopi32 = call i32 @llvm.ctpop.i32(i32 %a) + %ctpopv2i8 = call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %b) + store i32 %ctpopi32, i32* %arrayidxy, align 4 + store <2 x i8> %ctpopv2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @ctlz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %ctlzi32 = call i32 @llvm.ctlz.i32(i32 %a, i1 false) + %ctlzv2i8 = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %b, i1 false) + store i32 %ctlzi32, i32* %arrayidxy, align 4 + store <2 x i8> %ctlzv2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @cttz(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %cttzi32 = call i32 @llvm.cttz.i32(i32 %a, i1 false) + %cttzv2i8 = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %b, i1 false) + store i32 %cttzi32, i32* %arrayidxy, align 4 + store <2 x i8> %cttzv2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @sadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, 
align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @uadd_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @ssub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.ssub.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +define spir_kernel void @usub_sat(i32* %aptr, <2 x i8>* %bptr, i32* %yptr, <2 x i8>* %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, i32* %aptr, i64 %idx + %arrayidxy = getelementptr inbounds i32, i32* %yptr, i64 %idx + %a = load i32, i32* %arrayidxa, align 4 + %y = load i32, i32* %arrayidxy, align 4 + %v_i32 = call i32 @llvm.usub.sat.i32(i32 %a, i32 %y) + %arrayidxb = getelementptr inbounds <2 x i8>, <2 x i8>* %bptr, i64 %idx + %arrayidxz = getelementptr inbounds <2 x i8>, <2 x i8>* %zptr, i64 %idx + %b = load <2 x i8>, <2 x i8>* %arrayidxb, align 2 + %z = load <2 x i8>, <2 x i8>* %arrayidxz, align 2 + %v_v2i8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %b, <2 x i8> %z) + store i32 %v_i32, i32* %arrayidxy, align 4 + store <2 x i8> %v_v2i8, <2 x i8>* %arrayidxz, align 2 + ret void +} + +declare i32 @llvm.ctpop.i32(i32) +declare <2 x i8> @llvm.ctpop.v2i8(<2 x i8>) + +declare i32 @llvm.ctlz.i32(i32, i1) +declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) + +declare i32 @llvm.cttz.i32(i32, i1) +declare <2 x i8> @llvm.cttz.v2i8(<2 x i8>, i1) + +declare i32 @llvm.sadd.sat.i32(i32, i32) +declare <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i32 @llvm.uadd.sat.i32(i32, i32) +declare <2 x i8> 
@llvm.uadd.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i32 @llvm.ssub.sat.i32(i32, i32) +declare <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i32 @llvm.usub.sat.i32(i32, i32) +declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>) + +declare i64 @__mux_get_global_id(i32) + +; CTPOP: void @__vecz_v2_ctpop +; CTPOP: = call {{.*}}<2 x i32> @llvm.ctpop.v2i32(<2 x i32> %{{.*}}) +; CTPOP: = call {{.*}}<4 x i8> @llvm.ctpop.v4i8(<4 x i8> %{{.*}}) + +; CTLZ: void @__vecz_v4_ctlz +; CTLZ: = call {{.*}}<4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false) +; CTLZ: = call {{.*}}<8 x i8> @llvm.ctlz.v8i8(<8 x i8> %{{.*}}, i1 false) + +; CTTZ: void @__vecz_v8_cttz +; CTTZ: = call {{.*}}<8 x i32> @llvm.cttz.v8i32(<8 x i32> %{{.*}}, i1 false) +; CTTZ: = call {{.*}}<16 x i8> @llvm.cttz.v16i8(<16 x i8> %{{.*}}, i1 false) + +; SADD_SAT: void @__vecz_v2_sadd_sat +; SADD_SAT: = call <2 x i32> @llvm.sadd.sat.v2i32( +; SADD_SAT: = call <4 x i8> @llvm.sadd.sat.v4i8( + +; UADD_SAT: void @__vecz_v2_uadd_sat +; UADD_SAT: = call <2 x i32> @llvm.uadd.sat.v2i32( +; UADD_SAT: = call <4 x i8> @llvm.uadd.sat.v4i8( + +; SSUB_SAT: void @__vecz_v2_ssub_sat +; SSUB_SAT: = call <2 x i32> @llvm.ssub.sat.v2i32( +; SSUB_SAT: = call <4 x i8> @llvm.ssub.sat.v4i8( + +; USUB_SAT: void @__vecz_v2_usub_sat +; USUB_SAT: = call <2 x i32> @llvm.usub.sat.v2i32( +; USUB_SAT: = call <4 x i8> @llvm.usub.sat.v4i8( diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_assumption_regression.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_assumption_regression.ll new file mode 100644 index 0000000000000..5f68305bfc205 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_assumption_regression.ll @@ -0,0 +1,44 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Just check that we correctly clean up the assumption cache when vectorizing
+; this function:
+; RUN: veczc -k foo -w 2 -S < %s
+; RUN: not veczc -k foo -w 2 -vecz-scalable -S < %s
+
+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define spir_kernel void @foo(ptr addrspace(1) nocapture readonly %_arg_v_acc) #0 {
+entry:
+  %v4 = tail call i64 @__mux_get_global_id(i32 0) #2
+  %v5 = tail call i64 @__mux_get_global_offset(i32 0) #2
+  %v6 = sub i64 %v4, %v5
+  %v7 = icmp ult i64 %v6, 2147483648
+  tail call void @llvm.assume(i1 %v7)
+  %arrayidx.i.i = getelementptr inbounds i32, ptr addrspace(1) %_arg_v_acc, i64 %v6
+  %v8 = load i32, ptr addrspace(1) %arrayidx.i.i, align 4
+  ret void
+}
+
+declare void @llvm.assume(i1 noundef) #1
+
+declare i64 @__mux_get_global_id(i32) #2
+declare i64 @__mux_get_global_offset(i32) #2
+
+attributes #0 = { convergent nounwind "mux-kernel"="entry-point" "mux-orig-fn"="foo" }
+attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn inaccessiblememonly }
+attributes #2 = { alwaysinline norecurse nounwind readonly } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_vu_regression.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_vu_regression.ll new file mode 100644 index 0000000000000..6b42e5fe4ca62 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/invalid_cached_vu_regression.ll @@ -0,0 +1,36 @@ +; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: not veczc -k noduplicate:4,8 -S < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @noduplicate(i32 addrspace(1)* %in1, i32 addrspace(1)* %out) { +entry: + %tid = call i64 @__mux_get_global_id(i32 0) #3 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid + %i1 = load i32, i32 addrspace(1)* %arrayidx, align 16 + %dec = call i32 @llvm.loop.decrement.reg.i32(i32 %i1, i32 4) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid + store i32 %dec, i32 addrspace(1)* %arrayidx2, align 16 + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare i32 @llvm.loop.decrement.reg.i32(i32, i32) + +;CHECK: Failed to vectorize function 'noduplicate' diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll new file mode 100644 index 0000000000000..770a31740a8b2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/irreducible_loop.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k irreducible_loop -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @irreducible_loop(i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %call + %ld = load i32, i32 addrspace(1)* %arrayidx4, align 4 + %cmp = icmp sgt i32 %ld, -1 + br i1 %cmp, label %label, label %do.body + +do.body: ; preds = %entry, %label + %id.0 = phi i64 [ %conv10, %label ], [ %call, %entry ] + br label %label + +label: ; preds = %entry, %do.body + %id.1 = phi i64 [ %id.0, %do.body ], [ %call, %entry ] + %conv10 = add i64 %id.1, 1 + %cmp11 = icmp slt i64 %conv10, 16 + br i1 %cmp11, label %do.body, label %do.end + +do.end: ; preds = %label + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) + +; CHECK: define spir_kernel void @__vecz_v4_irreducible_loop +; CHECK: entry: +; CHECK: br label %irr.guard + +; CHECK: irr.guard: +; CHECK: br i1 %{{.+}}, label %irr.guard.pure_exit, label %irr.guard + +; CHECK: irr.guard.pure_exit: ; preds = %irr.guard +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll new file mode 100644 index 0000000000000..4ffad2c31b104 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/load_add_store.ll @@ -0,0 +1,45 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @load_add_store(ptr %aptr, ptr %bptr, ptr %zptr) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidxa = getelementptr inbounds i32, ptr %aptr, i64 %idx + %arrayidxb = getelementptr inbounds i32, ptr %bptr, i64 %idx + %arrayidxz = getelementptr inbounds i32, ptr %zptr, i64 %idx + %a = load i32, ptr %arrayidxa, align 4 + %b = load i32, ptr %arrayidxb, align 4 + %sum = add i32 %a, %b + store i32 %sum, ptr %arrayidxz, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_load_add_store(ptr %aptr, ptr %bptr, ptr %zptr) +; CHECK: %idx = call i64 @__mux_get_global_id(i32 0) +; CHECK: %arrayidxa = getelementptr i32, ptr %aptr, i64 %idx +; CHECK: %arrayidxb = getelementptr i32, ptr %bptr, i64 %idx +; CHECK: %arrayidxz = getelementptr i32, ptr %zptr, i64 %idx +; CHECK: %[[TMP0:.*]] = load <4 x i32>, ptr %arrayidxa, align 4 +; CHECK: %[[TMP1:.*]] = load <4 x i32>, ptr %arrayidxb, align 4 +; CHECK: %sum1 = add <4 x i32> %[[TMP0]], %[[TMP1]] +; CHECK: store <4 x i32> %sum1, ptr %arrayidxz, align 4 +; CHECK: ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll new file mode 100644 index 0000000000000..5f661497b794b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/loop_call_instantiation.ll @@ -0,0 +1,63 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-choices=InstantiateCallsInLoops -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [23 x i8] c"Hello from %d with %d\0A\00", align 1 +@.str.1 = private unnamed_addr addrspace(2) constant [14 x i8] c"Hello from %d\00", align 1 + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([23 x i8], [23 x i8] addrspace(2)* @.str, i64 0, i64 0), i64 %call, i32 %0) + %call2 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([14 x i8], [14 x i8] addrspace(2)* @.str.1, i64 0, i64 0), i64 %call) + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) 
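+
+; With InstantiateCallsInLoops, vecz does not widen the two printf call
+; sites. Instead, each call site is wrapped in a loop over the four lanes
+; that extracts that lane's operands and calls the scalar printf once per
+; lane, which is what the two instloop header/body pairs below check for.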
+ +; CHECK: define spir_kernel void @__vecz_v4_test(ptr addrspace(1) %in) + +; CHECK: [[LOOPHEADER1:instloop.header.*]]: +; CHECK: %[[INSTANCE1:instance.*]] = phi i32 [ 0, {{.+}} ], [ %[[V7:[0-9]+]], %[[LOOPBODY1:instloop.body.*]] ] +; CHECK: %[[V3:[0-9]+]] = icmp {{(samesign )?}}ult i32 %[[INSTANCE1]], 4 +; CHECK: br i1 %[[V3]], label %[[LOOPBODY1]], label {{.+}} + +; CHECK: [[LOOPBODY1]]: +; CHECK: %[[V4:[0-9]+]] = extractelement <4 x i64> %0, i32 %[[INSTANCE1]] +; CHECK: %[[V5:[0-9]+]] = extractelement <4 x i32> %{{.+}}, i32 %[[INSTANCE1]] +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @{{.+}}, i64 %[[V4]], i32 %[[V5]]) +; CHECK: %[[V7]] = add {{(nuw |nsw )*}}i32 %[[INSTANCE1]], 1 +; CHECK: br label %[[LOOPHEADER1]] + +; CHECK: [[LOOPHEADER2:instloop.header.*]]: +; CHECK: %[[INSTANCE3:.+]] = phi i32 [ %[[V11:[0-9]+]], %[[LOOPBODY2:instloop.body.*]] ], [ 0, {{.+}} ] +; CHECK: %[[V8:[0-9]+]] = icmp {{(samesign )?}}ult i32 %[[INSTANCE3]], 4 +; CHECK: br i1 %[[V8]], label %[[LOOPBODY2]], label {{.+}} + +; CHECK: [[LOOPBODY2]]: +; CHECK: %[[V9:[0-9]+]] = extractelement <4 x i64> %0, i32 %[[INSTANCE3]] +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @{{.+}}, i64 %[[V9]]) +; CHECK: %[[V11]] = add {{(nuw |nsw )*}}i32 %[[INSTANCE3]], 1 +; CHECK: br label %[[LOOPHEADER2]] + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll new file mode 100644 index 0000000000000..deef39666e8a1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics.ll @@ -0,0 +1,89 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=cfg-convert,verify,packetizer,define-builtins,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p) +define spir_kernel void @test_fn(ptr %p) { +entry: +; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0 +; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> {{<(i64 3(, )?)+>|splat \(i64 3\)}}, + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp sgt i64 3, %call +; CHECK: [[VEC_PTR:%.*]] = getelementptr i32, ptr %p, <4 x i64> + %wi_p_i32 = getelementptr i32, ptr %p, i64 %call + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry +; CHECK: = call <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> {{<(i32 1(, )?)+>|splat \(i32 1\)}}, <4 x i1> [[CMP]] + %old0 = atomicrmw add ptr %p, i32 1 acquire +; CHECK: = call <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x i32> {{<(i32 1(, )?)+>|splat \(i32 1\)}}, <4 x i1> [[CMP]] + %old1 = atomicrmw add ptr %wi_p_i32, i32 1 acquire +; CHECK: = call <4 x i32> @__vecz_b_v4_masked_atomicrmw_umin_align2_monotonic_1_Dv4_u3ptrDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x i32> {{<(i32 1(, )?)+>|splat \(i32 1\)}}, <4 x i1> [[CMP]] + %old2 = atomicrmw umin ptr %wi_p_i32, i32 1 monotonic, align 2 +; CHECK: = call <4 x float> @__vecz_b_v4_masked_atomicrmw_volatile_fmax_align4_seqcst_0_Dv4_u3ptrDv4_fDv4_b( +; CHECK-SAME: <4 x ptr> [[VEC_PTR]], <4 x float> {{<(float 1.000000e\+00(, )?)+>|splat \(float 1.000000e\+00\)}}, <4 x i1> [[CMP]] + %old3 = atomicrmw volatile fmax ptr %wi_p_i32, float 1.0 syncscope("singlethread") seq_cst + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; CHECK: define <4 x i32> @__vecz_b_v4_masked_atomicrmw_add_align4_acquire_1_Dv4_u3ptrDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[VALS:%1]], <4 x i1> [[MASK:%2]]) [[ATTRS:#[0-9]+]] { +; CHECK: entry: +; CHECK: br label %loopIR + +; CHECK: loopIR: +; CHECK: [[IDX:%.*]] = phi i32 [ 0, %entry ], [ [[IDX_NEXT:%.*]], %if.else ] +; CHECK: [[PREV:%.*]] = phi <4 x i32> [ poison, %entry ], [ [[MERGE:%.*]], %if.else ] +; CHECK: [[MASKELT:%.*]] = extractelement <4 x i1> [[MASK]], i32 [[IDX]] +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[PTR:%.*]] = extractelement <4 x ptr> [[PTRS]], i32 [[IDX]] +; CHECK: [[VAL:%.*]] = extractelement <4 x i32> [[VALS]], i32 [[IDX]] +; CHECK: [[ATOM:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VAL]] acquire, align 4 +; CHECK: [[RET:%.*]] = insertelement <4 x i32> [[PREV]], i32 [[ATOM]], i32 [[IDX]] +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[MERGE]] = phi <4 x i32> [ [[PREV]], %loopIR ], [ [[RET]], %if.then ] +; CHECK: [[IDX_NEXT]] = add i32 [[IDX]], 1 + +; CHECK: exit: +; CHECK: ret <4 x i32> [[MERGE]] + +; Assume that all masked atomicrmw operations follow the logic above. Just +; check that the right atomicrmw instruction is being generated. 
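+; (The mangled builtin name encodes the operation, the alignment and the
+; memory ordering; the trailing digit appears to encode the syncscope, with
+; 1 for the default scope and 0 for syncscope("singlethread").)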
+; CHECK: define <4 x i32> @__vecz_b_v4_masked_atomicrmw_umin_align2_monotonic_1_Dv4_u3ptrDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[VALS:%1]], <4 x i1> [[MASK:%2]]) [[ATTRS]] { +; CHECK: atomicrmw umin ptr {{%.*}}, i32 {{%.*}} monotonic, align 2 + + +; CHECK: define <4 x float> @__vecz_b_v4_masked_atomicrmw_volatile_fmax_align4_seqcst_0_Dv4_u3ptrDv4_fDv4_b(<4 x ptr> [[PTRS:%0]], <4 x float> [[VALS:%1]], <4 x i1> [[MASK:%2]]) [[ATTRS]] { +; CHECK: atomicrmw volatile fmax ptr {{%.*}}, float {{%.*}} syncscope("singlethread") seq_cst, align 4 + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll new file mode 100644 index 0000000000000..5c061dadd28fc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_atomics_scalar.ll @@ -0,0 +1,48 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=define-builtins,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_fn(ptr %p) { + %ret = call i32 @__vecz_b_v1_masked_atomicrmw_add_align4_acquire_1_u3ptrjb(ptr %p, i32 1, i1 true) + ret void +} + +declare i32 @__vecz_b_v1_masked_atomicrmw_add_align4_acquire_1_u3ptrjb(ptr %p, i32 %val, i1 %mask) + +; CHECK: define i32 @__vecz_b_v1_masked_atomicrmw_add_align4_acquire_1_u3ptrjb(ptr %p, i32 %val, i1 %mask) { +; CHECK: entry: +; CHECK: br label %loopIR + +; CHECK: loopIR: +; CHECK: [[RET_PREV:%.*]] = phi i32 [ poison, %entry ], [ [[RET:%.*]], %if.else ] +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 %mask, false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[ATOM:%.*]] = atomicrmw add ptr %p, i32 %val acquire, align 4 +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[RET]] = phi i32 [ [[RET_PREV]], %loopIR ], [ [[ATOM]], %if.then ] +; CHECK: [[CMP:%.*]] = icmp ult i32 %{{.*}}, 1 +; CHECK: br i1 [[CMP]], label %loopIR, label %exit + +; CHECK: exit: +; CHECK: ret i32 [[RET]] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll new file mode 100644 index 0000000000000..65811dcc45ff2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_calls_max_builtin.ll @@ -0,0 +1,81 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Check if the call to max in the if block has been replaced with its vector +; equivalent +; CHECK: call spir_func <[[WIDTH:[0-9]+]] x i32> @_Z3maxDv[[WIDTH]]_iS_(<[[WIDTH]] x i32> {{.+}}, <[[WIDTH]] x i32> {{.+}}) +; CHECK: call spir_func <[[WIDTH]] x i32> @_Z3maxDv[[WIDTH]]_iS_(<[[WIDTH]] x i32> {{.+}}, <[[WIDTH]] x i32> {{.+}}) + +; There shouldn't be any masked versions of max +; CHECK-NOT: masked_Z3max + +define spir_kernel void @entry(ptr addrspace(1) %input, ptr addrspace(1) %output) { +entry: + %call = tail call i64 @__mux_get_local_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %output, i64 %call + %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %add = add nsw i32 %0, 1 + %add3 = add nsw i32 %1, 1 + %call4 = tail call spir_func i32 @_Z3maxii(i32 %add, i32 %add3) + %add.i = shl nsw i32 %call4, 1 + %idxprom.i = sext i32 %add.i to i64 + %arrayidx.i = getelementptr inbounds i32, ptr addrspace(1) %output, i64 %idxprom.i + store i32 %add.i, ptr addrspace(1) %arrayidx.i, align 4 + %2 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %3 = load i32, ptr addrspace(1) %arrayidx, align 4 + %4 = icmp eq i32 %2, -2147483648 + %5 = icmp eq i32 %3, -1 + %6 = and i1 %4, %5 + %7 = icmp eq i32 %3, 0 + %8 = or i1 %7, %6 + %9 = select i1 %8, i32 1, i32 %3 + %10 = icmp eq i32 %9, -1 + %11 = and i1 %4, %10 + %12 = select i1 %11, i32 1, i32 %9 + %rem = srem i32 %2, %12 + %tobool.not = icmp eq i32 %rem, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: + %call9 = tail call spir_func i32 @_Z3maxii(i32 %0, i32 %1) + %add.i27 = shl nsw i32 %call9, 1 + %idxprom.i28 = sext i32 %add.i27 to i64 + %arrayidx.i29 = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %idxprom.i28 + store i32 %add.i27, ptr addrspace(1) %arrayidx.i29, align 4 + br label %if.end + +if.end: + %idxprom.i31.pre-phi = phi i64 [ %idxprom.i28, %if.then ], [ %idxprom.i, %entry ] + %add.i30.pre-phi = phi i32 [ %add.i27, %if.then ], [ %add.i, %entry ] + %r.0 = phi i32 [ %call9, %if.then ], [ %call4, %entry ] + %arrayidx.i32 = getelementptr inbounds i32, ptr addrspace(1) %output, i64 %idxprom.i31.pre-phi + store i32 %add.i30.pre-phi, ptr addrspace(1) %arrayidx.i32, align 4 + store i32 %r.0, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +declare i64 @__mux_get_local_id(i32) + +declare spir_func i32 @_Z3maxii(i32, i32) + +declare spir_func <4 x i32> @_Z3maxDv4_iS_(<4 x i32>, <4 x i32>) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll new file mode 100644 index 0000000000000..bc6d2bf2b7ab7 --- /dev/null +++ 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg.ll @@ -0,0 +1,107 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=cfg-convert,verify,packetizer,define-builtins,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK: define spir_kernel void @__vecz_v4_test_fn(ptr %p, ptr %q, ptr %r) +define spir_kernel void @test_fn(ptr %p, ptr %q, ptr %r) { +entry: +; CHECK: [[SPLAT_PTR_INS:%.*]] = insertelement <4 x ptr> poison, ptr %p, i64 0 +; CHECK: [[SPLAT_PTR:%.*]] = shufflevector <4 x ptr> [[SPLAT_PTR_INS]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK: [[CMP:%.*]] = icmp sgt <4 x i64> {{<(i64 3(, )?)+>|splat \(i64 3\)}}, + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp sgt i64 3, %call +; CHECK: [[VEC_PTR:%.*]] = getelementptr i32, ptr %p, <4 x i64> + %wi_p_i32 = getelementptr i32, ptr %p, i64 %call + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry +; CHECK: [[CALL:%.*]] = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b( +; CHECK-SAME: <4 x ptr> [[SPLAT_PTR]], <4 x i32> {{<(i32 1(, )?)+>|splat \(i32 1\)}}, +; CHECK-SAME: <4 x i32> {{<(i32 2(, )?)+>|splat \(i32 2\)}}, <4 x i1> [[CMP]] + %old0 = cmpxchg ptr %p, i32 1, i32 2 acquire monotonic + %val0 = extractvalue { i32, i1 } %old0, 0 + %success0 = extractvalue { i32, i1 } %old0, 1 + + %out = getelementptr i32, ptr %q, i64 %call + store i32 %val0, ptr %out, align 4 + + %outsuccess = getelementptr i8, ptr %r, i64 %call + %outbyte = zext i1 %success0 to i8 + store i8 %outbyte, ptr %outsuccess, align 1 + + ; Test a couple of insert/extract patterns +; CHECK: [[INS:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[CALL]], <4 x i1> [[CMP]], 1 +; CHECK: [[EXT:%.*]] = extractvalue { <4 x i32>, <4 x i1> } [[INS]], 1 + %testinsert = insertvalue { i32, i1 } %old0, i1 %cmp, 1 + %testextract = extractvalue { i32, i1 } %testinsert, 1 + + %outbyte0 = zext i1 %testextract to i8 + store i8 %outbyte0, ptr %outsuccess, align 1 + +; CHECK: = call { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_weak_volatile_align8_monotonic_seqcst_0_Dv4_u3ptrDv4_jDv4_jDv4_b( + %old1 = cmpxchg weak volatile ptr %wi_p_i32, i32 1, i32 2 syncscope("singlethread") monotonic seq_cst, align 8 + + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +; CHECK: define { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align4_acquire_monotonic_1_Dv4_u3ptrDv4_jDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[CMPS:%1]], <4 x i32> [[NEWS:%2]], <4 x i1> [[MASK:%3]]) [[ATTRS:#[0-9]+]] { +; CHECK: entry: +; CHECK: br label %loopIR + +; CHECK: loopIR: +; CHECK: [[IDX:%.*]] = phi i32 [ 0, %entry ], [ [[IDX_NEXT:%.*]], %if.else ] +; CHECK: [[PREV:%.*]] = phi <4 x i32> [ 
poison, %entry ], [ [[MERGE:%.*]], %if.else ] +; CHECK: [[PREVSUCCESS:%.*]] = phi <4 x i1> [ poison, %entry ], [ [[MERGESUCCESS:%.*]], %if.else ] +; CHECK: [[MASKELT:%.*]] = extractelement <4 x i1> [[MASK]], i32 [[IDX]] +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 [[MASKELT]], false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[PTR:%.*]] = extractelement <4 x ptr> [[PTRS]], i32 [[IDX]] +; CHECK: [[CMP:%.*]] = extractelement <4 x i32> [[CMPS]], i32 [[IDX]] +; CHECK: [[NEW:%.*]] = extractelement <4 x i32> [[NEWS]], i32 [[IDX]] +; CHECK: [[ATOM:%.*]] = cmpxchg ptr [[PTR]], i32 [[CMP]], i32 [[NEW]] acquire monotonic, align 4 +; CHECK: [[VAL:%.*]] = extractvalue { i32, i1 } [[ATOM]], 0 +; CHECK: [[RET:%.*]] = insertelement <4 x i32> [[PREV]], i32 [[VAL]], i32 [[IDX]] +; CHECK: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[ATOM]], 1 +; CHECK: [[RETSUCCESS:%.*]] = insertelement <4 x i1> [[PREVSUCCESS]], i1 [[SUCCESS]], i32 [[IDX]] +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[MERGE]] = phi <4 x i32> [ [[PREV]], %loopIR ], [ [[RET]], %if.then ] +; CHECK: [[MERGESUCCESS]] = phi <4 x i1> [ [[PREVSUCCESS]], %loopIR ], [ [[RETSUCCESS]], %if.then ] +; CHECK: [[IDX_NEXT]] = add i32 [[IDX]], 1 + +; CHECK: exit: +; CHECK: [[INS0:%.*]] = insertvalue { <4 x i32>, <4 x i1> } poison, <4 x i32> [[MERGE]], 0 +; CHECK: [[INS1:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[INS0]], <4 x i1> [[MERGESUCCESS]], 1 +; CHECK: ret { <4 x i32>, <4 x i1> } [[INS1]] + +; Assume that all masked cmpxchg operations follow the logic above. Just +; check that the right cmpxchg instruction is being generated. +; CHECK: define { <4 x i32>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_weak_volatile_align8_monotonic_seqcst_0_Dv4_u3ptrDv4_jDv4_jDv4_b(<4 x ptr> [[PTRS:%0]], <4 x i32> [[CMPS:%1]], <4 x i32> [[NEWS:%2]], <4 x i1> [[MASK:%3]]) [[ATTRS]] { +; CHECK: cmpxchg weak volatile ptr {{%.*}}, i32 {{%.*}}, i32 {{%.*}} syncscope("singlethread") monotonic seq_cst, align 8 + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll new file mode 100644 index 0000000000000..6340be83b9f66 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_cmpxchg_scalar.ll @@ -0,0 +1,54 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=define-builtins,verify -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_fn(ptr %p) { + %ret = call { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 1, i32 2, i1 true) + ret void +} + +declare { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 %cmp, i32 %newval, i1 %mask) + +; CHECK: define { i32, i1 } @__vecz_b_v1_masked_cmpxchg_align4_acquire_monotonic_1_u3ptrjjb(ptr %p, i32 %cmp, i32 %newval, i1 %mask) { +; CHECK: entry: +; CHECK: br label %loopIR + +; CHECK: loopIR: +; CHECK: [[RETVAL_PREV:%.*]] = phi i32 [ poison, %entry ], [ [[RETVAL:%.*]], %if.else ] +; CHECK: [[RETSUCC_PREV:%.*]] = phi i1 [ poison, %entry ], [ [[RETSUCC:%.*]], %if.else ] +; CHECK: [[MASKCMP:%.*]] = icmp ne i1 %mask, false +; CHECK: br i1 [[MASKCMP]], label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: [[ATOM:%.*]] = cmpxchg ptr %p, i32 %cmp, i32 %newval acquire monotonic, align 4 +; CHECK: [[EXT0:%.*]] = extractvalue { i32, i1 } [[ATOM]], 0 +; CHECK: [[EXT1:%.*]] = extractvalue { i32, i1 } [[ATOM]], 1 +; CHECK: br label %if.else + +; CHECK: if.else: +; CHECK: [[RETVAL]] = phi i32 [ [[RETVAL_PREV]], %loopIR ], [ [[EXT0]], %if.then ] +; CHECK: [[RETSUCC]] = phi i1 [ [[RETSUCC_PREV]], %loopIR ], [ [[EXT1]], %if.then ] +; CHECK: [[CMP:%.*]] = icmp ult i32 %{{.*}}, 1 +; CHECK: br i1 [[CMP]], label %loopIR, label %exit + +; CHECK: exit: +; CHECK: [[INS0:%.*]] = insertvalue { i32, i1 } poison, i32 [[RETVAL]], 0 +; CHECK: [[INS1:%.*]] = insertvalue { i32, i1 } [[INS0]], i1 [[RETSUCC]], 1 +; CHECK: ret { i32, i1 } [[INS1]] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_group_collective.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_group_collective.ll new file mode 100644 index 0000000000000..464c6b89db6d9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_group_collective.ll @@ -0,0 +1,45 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes="cfg-convert" -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id() +declare i32 @__mux_work_group_scan_inclusive_smax_i32(i32, i32) + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_foo() +; CHECK-NOT: @__vecz_b_masked___mux_work_group_scan_inclusive_smax_i32 +define spir_kernel void @foo() { +entry: + %0 = call i64 @__mux_get_local_id() + br i1 false, label %for.body.i11, label %if.end.i105.i + +for.body.i11: + %1 = icmp slt i64 %0, 0 + br i1 %1, label %if.end.i13, label %if.end.i13 + +if.end.i13: + br i1 false, label %exit, label %if.end.i105.i + +if.end.i105.i: + %2 = call i32 @__mux_work_group_scan_inclusive_smax_i32(i32 0, i32 0) + br label %exit + +exit: + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll new file mode 100644 index 0000000000000..d88ad53d87e01 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved.ll @@ -0,0 +1,76 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_fn -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @test_fn(i32 addrspace(1)* %results) #0 { +entry: + %results.addr = alloca i32 addrspace(1)*, align 8 + %tid = alloca i32, align 4 + store i32 addrspace(1)* %results, i32 addrspace(1)** %results.addr, align 8 + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + store i32 %conv, i32* %tid, align 4 + %0 = load i32, i32* %tid, align 4 + %cmp = icmp sgt i32 3, %0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load i32, i32* %tid, align 4 + %mul = mul nsw i32 2, %1 + %add = add nsw i32 %mul, 2 + %idxprom = sext i32 %add to i64 + %2 = load i32 addrspace(1)*, i32 addrspace(1)** %results.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 %idxprom + store i32 5, i32 addrspace(1)* %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(1)*)* @test_fn, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1} +!2 = !{!"kernel_arg_access_qual", !"none"} +!3 = !{!"kernel_arg_type", !"int*"} +!4 = !{!"kernel_arg_base_type", !"int*"} +!5 = !{!"kernel_arg_type_qual", !""} +!6 = !{!"clang version 3.8.0 "} + + +; CHECK: define void @__vecz_b_masked_interleaved_store4_2_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}) [[ATTRS:#[0-9]+]] { +; CHECK: entry: +; CHECK: %BroadcastAddr.splatinsert = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %1, {{i32|i64}} 0 +; CHECK: %BroadcastAddr.splat = shufflevector <4 x ptr addrspace(1)> %BroadcastAddr.splatinsert, <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer +; CHECK: %3 = getelementptr i32, <4 x ptr addrspace(1)> %BroadcastAddr.splat, <4 x i64> +; CHECK: call void @llvm.masked.scatter.v4i32.v4p1(<4 x i32> %0, <4 x ptr addrspace(1)> %3, i32{{( immarg)?}} 4, <4 x i1> %2) # +; CHECK: ret void + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll new file mode 100644 index 0000000000000..3999f2cf44a80 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_as_scatter.ll @@ -0,0 +1,77 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_fn -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @test_fn(i32 addrspace(1)* %results) #0 { +entry: + %results.addr = alloca i32 addrspace(1)*, align 8 + %tid = alloca i32, align 4 + store i32 addrspace(1)* %results, i32 addrspace(1)** %results.addr, align 8 + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + store i32 %conv, i32* %tid, align 4 + %0 = load i32, i32* %tid, align 4 + %cmp = icmp sgt i32 3, %0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %1 = load i32, i32* %tid, align 4 + %mul = mul nsw i32 2, %1 + %add = add nsw i32 %mul, 2 + %idxprom = sext i32 %add to i64 + %2 = load i32 addrspace(1)*, i32 addrspace(1)** %results.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 %idxprom + store i32 5, i32 addrspace(1)* %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(1)*)* @test_fn, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1} +!2 = !{!"kernel_arg_access_qual", !"none"} +!3 = !{!"kernel_arg_type", !"int*"} +!4 = !{!"kernel_arg_base_type", !"int*"} +!5 = !{!"kernel_arg_type_qual", !""} +!6 = !{!"clang version 3.8.0 "} + + +; CHECK: define void @__vecz_b_masked_interleaved_store4_2_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32>{{( %0)?}}, ptr addrspace(1){{( %1)?}}, <4 x i1>{{( %2)?}}) [[ATTRS:#[0-9]+]] { + +; Check for the address splat +; CHECK: %[[BROADCASTADDRSPLATINSERT:.+]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %{{.+}}, {{i32|i64}} 0 +; CHECK: %[[BROADCASTADDRSPLAT:.+]] = shufflevector <4 x ptr addrspace(1)> %[[BROADCASTADDRSPLATINSERT]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer +; CHECK: getelementptr i32, <4 x ptr addrspace(1)> %[[BROADCASTADDRSPLAT]], <4 x i64> + +; CHECK: ret void + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll new file mode 100644 index 0000000000000..0e2d567fd426c --- /dev/null +++ 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group.ll @@ -0,0 +1,99 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k mask -vecz-simd-width=16 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +source_filename = "kernel.opencl" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @mask(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %call.tr = trunc i64 %call to i32 + %conv = shl i32 %call.tr, 1 + %idx.ext = sext i32 %conv to i64 + %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idx.ext + %0 = load i8, i8 addrspace(1)* %add.ptr, align 1 + %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr, i64 1 + %1 = load i8, i8 addrspace(1)* %arrayidx1, align 1 + %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idx.ext + %conv4 = sext i8 %0 to i32 + %conv5 = sext i8 %1 to i32 + %add = add nsw i32 %conv5, %conv4 + %cmp = icmp slt i32 %add, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %arrayidx7 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr3, i64 1 + store i8 %0, i8 addrspace(1)* %arrayidx7, align 1 + br label %if.end + +if.else: ; preds = %entry + store i8 %1, i8 addrspace(1)* %add.ptr3, align 1 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!llvm.ident = !{!2} +!opencl.kernels = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = 
!{!"clang version 8.0.0 (https://github.com/llvm-mirror/clang.git bfbe338a893dde6ba65b2bed6ffea1652a592819) (https://github.com/llvm-mirror/llvm.git a99d6d2122ca2f208e1c4bcaf02ff5930f244f34)"} +!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*)* @mask, !4, !5, !6, !7, !8, !9} +!4 = !{!"kernel_arg_addr_space", i32 1, i32 1} +!5 = !{!"kernel_arg_access_qual", !"none", !"none"} +!6 = !{!"kernel_arg_type", !"char*", !"char*"} +!7 = !{!"kernel_arg_base_type", !"char*", !"char*"} +!8 = !{!"kernel_arg_type_qual", !"", !""} +!9 = !{!"kernel_arg_name", !"out", !"in"} + +; This test makes sure we combine a group of masked interleaved stores +; into a single masked interleaved store using interleave operations. +; We expect the interleaved stores to come out unaltered. + +; CHECK: entry: + +; The data to store gets interleaved: +; CHECK: %interleave{{.*}} = shufflevector <16 x i8> +; CHECK: %interleave{{.*}} = shufflevector <16 x i8> + +; The masks get interleaved: +; CHECK: %interleave{{.*}} = shufflevector <16 x i1> +; CHECK: %interleave{{.*}} = shufflevector <16 x i1> + +; The stores are masked stores: +; CHECK: call void @llvm.masked.store.v16i8.p1(<16 x i8> +; CHECK: call void @llvm.masked.store.v16i8.p1(<16 x i8> + +; Definitely no unmasked stores: +; CHECK-NOT: store <16 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll new file mode 100644 index 0000000000000..5b7492f8c1761 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_interleaved_group2.ll @@ -0,0 +1,118 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k mask -vecz-simd-width=16 -S -vecz-choices=TargetIndependentPacketization < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +source_filename = "kernel.opencl" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @mask(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(1)* %doit) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %call.tr = trunc i64 %call to i32 + %conv = shl i32 %call.tr, 1 + %idx.ext = sext i32 %conv to i64 + %doit.ptr = getelementptr inbounds i8, i8 addrspace(1)* %doit, i64 %idx.ext + %ldbool = load i8, i8 addrspace(1)* %doit.ptr, align 1 + %skip = icmp slt i8 %ldbool, 0 + br i1 %skip, label %if.end, label %yes + +yes: ; preds = %entry + %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idx.ext + %0 = load i8, i8 addrspace(1)* %add.ptr, align 1 + %arrayidx1 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr, i64 1 + %1 = load i8, i8 addrspace(1)* %arrayidx1, align 1 + %add.ptr3 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idx.ext + %conv4 = sext i8 %0 to i32 + %conv5 = sext i8 %1 to i32 + %add = add nsw i32 %conv5, %conv4 + %cmp = icmp slt i32 %add, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %yes + %arrayidx7 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr3, i64 1 + store i8 %0, i8 addrspace(1)* %arrayidx7, align 1 + br label %if.end + +if.else: ; preds = %yes + store i8 %1, i8 addrspace(1)* %add.ptr3, align 1 + br label %if.end + +if.end: ; preds = %if.else, %if.then, %entry + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!llvm.ident = !{!2} +!opencl.kernels = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{!"clang version 8.0.0 (https://github.com/llvm-mirror/clang.git bfbe338a893dde6ba65b2bed6ffea1652a592819) (https://github.com/llvm-mirror/llvm.git a99d6d2122ca2f208e1c4bcaf02ff5930f244f34)"} +!3 = !{void (i8 addrspace(1)*, i8 addrspace(1)*, i8 addrspace(1)*)* @mask, !4, !5, !6, !7, !8, !9} +!4 = !{!"kernel_arg_addr_space", i32 1, i32 1} +!5 = !{!"kernel_arg_access_qual", !"none", !"none"} +!6 = !{!"kernel_arg_type", !"char*", !"char*"} +!7 = !{!"kernel_arg_base_type", !"char*", !"char*"} +!8 = 
!{!"kernel_arg_type_qual", !"", !""} +!9 = !{!"kernel_arg_name", !"out", !"in"} + +; This test makes sure we combine a group of masked interleaved stores +; into a single masked interleaved store using interleave operations. +; We expect the interleaved stores to come out unaltered. + +; CHECK: entry: +; CHECK: yes: + +; The masks get interleaved: +; CHECK: %interleave{{.*}} = shufflevector <16 x i1> +; CHECK: %interleave{{.*}} = shufflevector <16 x i1> + +; The loads are masked loads: +; CHECK: call <16 x i8> @llvm.masked.load.v16i8.p1(ptr +; CHECK: call <16 x i8> @llvm.masked.load.v16i8.p1(ptr + +; The loaded data gets deinterleaved: +; CHECK: %deinterleave{{.*}} = shufflevector <16 x i8> +; CHECK: %deinterleave{{.*}} = shufflevector <16 x i8> + +; The data to store gets interleaved: +; CHECK: %interleave{{.*}} = shufflevector <16 x i8> +; CHECK: %interleave{{.*}} = shufflevector <16 x i8> + +; The masks get interleaved: +; CHECK: %interleave{{.*}} = shufflevector <16 x i1> +; CHECK: %interleave{{.*}} = shufflevector <16 x i1> + +; The stores are masked stores: +; CHECK: call void @llvm.masked.store.v16i8.p1(<16 x i8> +; CHECK: call void @llvm.masked.store.v16i8.p1(<16 x i8> + +; Definitely no unmasked stores: +; CHECK-NOT: store <16 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_store.ll new file mode 100644 index 0000000000000..5be3ef46596f0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masked_store.ll @@ -0,0 +1,84 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=cfg-convert,define-builtins -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_varying_if(i32 %a, ptr %b, float %on_true, float %on_false) { +entry: + %conv = sext i32 %a to i64 + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %conv, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds ptr, ptr %b, i64 %idxprom + store float %on_true, ptr %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds ptr, ptr %b, i64 42 + store float %on_false, ptr %arrayidx2, align 4 + br label %if.end + +if.end: + ret void +} + +define spir_kernel void @test_varying_if_as3(i32 %a, ptr addrspace(3) %b, float %on_true, float %on_false) { +entry: + %conv = sext i32 %a to i64 + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %conv, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds ptr, ptr addrspace(3) %b, i64 %idxprom + store float %on_true, ptr addrspace(3) %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds ptr, ptr addrspace(3) %b, i64 42 + store float %on_false, ptr addrspace(3) %arrayidx2, align 4 + br label %if.end + +if.end: + ret void +} + +; CHECK: define void @__vecz_b_masked_store4_fu3ptrb(float [[A:%.*]], ptr [[B:%.*]], i1 [[MASK:%.*]]) [[ATTRS:#[0-9]+]] { +; CHECK: br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: store float [[A]], ptr [[B]], align 4 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void + +; CHECK: define void @__vecz_b_masked_store4_fu3ptrU3AS3b(float [[A:%.*]], ptr addrspace(3) [[B:%.*]], i1 [[MASK:%.*]]) [[ATTRS]] { +; CHECK: br i1 [[MASK]], label %[[IF:.*]], label %[[EXIT:.*]] +; CHECK: [[IF]]: +; CHECK-NEXT: store float [[A]], ptr addrspace(3) [[B]], align 4 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void + +; CHECK: attributes [[ATTRS]] = { norecurse nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll new file mode 100644 index 0000000000000..19fb3bda34b03 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/masking_exit_blocks.ll @@ -0,0 +1,70 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -vecz-passes=cfg-convert,packetizer -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [18 x i8] c"Doing stuff, yay!\00", align 1 + +define spir_kernel void @test(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %add = add i64 %call, 1 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %add + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + br label %entry.1 + +entry.1: ; preds = %entry + %add1 = add i64 %call, 1 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %add1 + store i32 %0, i32 addrspace(1)* %arrayidx2, align 4 + %cmp = icmp eq i64 %call, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry.1 + %call3 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([18 x i8], [18 x i8] addrspace(2)* @.str, i64 0, i64 0)) + br label %if.end + +if.end: ; preds = %if.then, %entry.1 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx4, align 4 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + br label %if.end1 + +if.end1: ; preds = %if.end + store i32 %1, i32 addrspace(1)* %arrayidx5, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) + +; CHECK: define spir_kernel void @__vecz_v4_test + +; Check if the divergent block is masked correctly +; CHECK: @__vecz_b_masked_printf_u3ptrU3AS2b +; CHECK: @__vecz_b_masked_printf_u3ptrU3AS2b +; CHECK: @__vecz_b_masked_printf_u3ptrU3AS2b +; CHECK: @__vecz_b_masked_printf_u3ptrU3AS2b + +; Check if the exit block is not masked +; CHECK: load <4 x i32> +; CHECK: store <4 x i32> + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll new file mode 100644 index 0000000000000..ac1d5fc674484 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer1.ll @@ -0,0 +1,72 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k entry -vecz-passes="function(mem2reg),vecz-mem2reg" -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +%struct.S2 = type { i16, [7 x i32], i32, <16 x i8>, [4 x i32] } + +; Function Attrs: norecurse nounwind +define spir_kernel void @entry(%struct.S2** %result) #0 { +entry: + %c_640 = alloca %struct.S2, align 16 + %p_639 = alloca %struct.S2*, align 8 + store %struct.S2* %c_640, %struct.S2** %p_639, align 8 + %0 = load %struct.S2*, %struct.S2** %p_639, align 8 + store %struct.S2* %0, %struct.S2** %result, align 8 + ret void +} + +define spir_func void @func_10(%struct.S2* %p_484, i64** %ret) { +entry: + %l_462 = alloca i64, align 8 + %l_461 = alloca i64*, align 8 + %.cast = ptrtoint %struct.S2* %p_484 to i64 + store i64 %.cast, i64* %l_462, align 8 + store i64* %l_462, i64** %l_461, align 8 + store i64* %l_462, i64** %ret, align 8 + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1 + +attributes #0 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } + +!llvm.ident = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0} +!opencl.kernels = !{!1} + +!0 = !{!"clang version 3.8.1 "} +!1 = !{void (%struct.S2**)* @entry, !2, !3, !4, !5, !6} +!2 = !{!"kernel_arg_addr_space", i32 1} +!3 = !{!"kernel_arg_access_qual", !"none"} +!4 = !{!"kernel_arg_type", !"ulong*"} +!5 = !{!"kernel_arg_base_type", !"ulong*"} +!6 = !{!"kernel_arg_type_qual", !""} + +; CHECK: @__vecz_v4_entry + +; Check if the alloca with no value (c_640) is still here +; CHECK: %c_640 = alloca %struct.S2, align 16 + +; Check if the alloca with value (p_639) has been promoted +; CHECK-NOT: %p_639 = alloca %struct.S2*, align 8 +; CHECK-NOT: store %struct.S2* %c_640, %struct.S2** %p_639, align 8 +; CHECK: ret diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll new file mode 100644 index 0000000000000..060ca2bc249fd --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/mem2reg_alloca_pointer2.ll @@ -0,0 +1,72 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k func_10 -vecz-passes="function(mem2reg),vecz-mem2reg" -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +%struct.S2 = type { i16, [7 x i32], i32, <16 x i8>, [4 x i32] } + +; Function Attrs: norecurse nounwind +define spir_kernel void @entry(%struct.S2** %result) #0 { +entry: + %c_640 = alloca %struct.S2, align 16 + %p_639 = alloca %struct.S2*, align 8 + store %struct.S2* %c_640, %struct.S2** %p_639, align 8 + %0 = load %struct.S2*, %struct.S2** %p_639, align 8 + store %struct.S2* %0, %struct.S2** %result, align 8 + ret void +} + +define spir_func void @func_10(%struct.S2* %p_484, i64** %ret) { +entry: + %l_462 = alloca i64, align 8 + %l_461 = alloca i64*, align 8 + %.cast = ptrtoint %struct.S2* %p_484 to i64 + store i64 %.cast, i64* %l_462, align 8 + store i64* %l_462, i64** %l_461, align 8 + store i64* %l_462, i64** %ret, align 8 + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1 + +attributes #0 = { norecurse nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } + +!llvm.ident = !{!0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0, !0} +!opencl.kernels = !{!1} + +!0 = !{!"clang version 3.8.1 "} +!1 = !{void (%struct.S2**)* @entry, !2, !3, !4, !5, !6} +!2 = !{!"kernel_arg_addr_space", i32 1} +!3 = !{!"kernel_arg_access_qual", !"none"} +!4 = !{!"kernel_arg_type", !"ulong*"} +!5 = !{!"kernel_arg_base_type", !"ulong*"} +!6 = !{!"kernel_arg_type_qual", !""} + +; Check if the alloca used for its pointer is still here +; CHECK: @__vecz_v4_func_10 +; CHECK: %l_462 = alloca i64, align 8 + +; Check that the other alloca(s) have been promoted +; CHECK-NOT: alloca + +; Check if the store using the alloca is still here +; CHECK: store i64 %.cast, ptr %l_462, align 8 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll new file mode 100644 index 0000000000000..f3875519a10e8 --- /dev/null +++ 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride.ll @@ -0,0 +1,34 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %call + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: store < diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll new file mode 100644 index 0000000000000..aa872c84a60b3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride10.ll @@ -0,0 +1,38 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = add nsw i32 %conv, %n + %mul = mul nsw i32 %add, %conv + %idxprom = sext i32 %mul to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _scatter_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll new file mode 100644 index 0000000000000..54d11670a365e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride11.ll @@ -0,0 +1,38 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = add nsw i32 %conv, %n + %mul = mul nsw i32 %add, 9 + %idxprom = sext i32 %mul to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll new file mode 100644 index 0000000000000..a9ed4f24f16ab --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride12.ll @@ -0,0 +1,38 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = add nsw i32 %conv, %n + %mul = mul nsw i32 %add, %n + %idxprom = sext i32 %mul to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll new file mode 100644 index 0000000000000..aa872c84a60b3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride13.ll @@ -0,0 +1,38 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = add nsw i32 %conv, %n + %mul = mul nsw i32 %add, %conv + %idxprom = sext i32 %mul to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _scatter_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll new file mode 100644 index 0000000000000..ba49af776ff08 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride14.ll @@ -0,0 +1,35 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %mul = mul nuw nsw i64 %call, 18 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll new file mode 100644 index 0000000000000..0281dad79916b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride15.ll @@ -0,0 +1,38 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = shl i32 %n, 1 + %mul = mul i32 %add, %conv + %idxprom = sext i32 %mul to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll new file mode 100644 index 0000000000000..d99f4a812a6ed --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride16.ll @@ -0,0 +1,36 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = shl nuw nsw i64 %call, 1 + %mul = mul nuw nsw i64 %add, %call + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _scatter_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll new file mode 100644 index 0000000000000..767d17bb96b86 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride17.ll @@ -0,0 +1,40 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %mul = mul nsw i32 %conv2, %n + %add = add nsw i32 %mul, %conv + %idxprom = sext i32 %add to i64 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: store < diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll new file mode 100644 index 0000000000000..a2cf76be8fab5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride18.ll @@ -0,0 +1,37 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 addrspace(1)* readnone %r) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = add nuw nsw i64 %call, 255 + %idxprom = and i64 %conv, 255 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %src, i64 %idxprom + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %call + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _gather_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll new file mode 100644 index 0000000000000..b6c74f6bfed51 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride2.ll @@ -0,0 +1,35 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = add nuw nsw i64 %call, 9 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %add + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: store < diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll new file mode 100644 index 0000000000000..ab533910ee8b2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride3.ll @@ -0,0 +1,35 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %mul = mul nuw nsw i64 %call, 9 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll new file mode 100644 index 0000000000000..d1eb22ce6c643 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride4.ll @@ -0,0 +1,37 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = add nsw i32 %conv, %n + %idxprom = sext i32 %add to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: store < diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll new file mode 100644 index 0000000000000..92dd028dc1ee0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride5.ll @@ -0,0 +1,35 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %mul = mul nuw nsw i64 %call, 5 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll new file mode 100644 index 0000000000000..1a0b92bfb652f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride6.ll @@ -0,0 +1,35 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = shl nuw nsw i64 %call, 1 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %add + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll new file mode 100644 index 0000000000000..4dc7b34841204 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride7.ll @@ -0,0 +1,35 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %mul = mul nuw nsw i64 %call, %call + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _scatter_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll new file mode 100644 index 0000000000000..549b3a30626dc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride8.ll @@ -0,0 +1,36 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i32, i32 addrspace(1)* %src, align 4 + %1 = mul nuw nsw i64 %call, 9 + %mul = add nuw nsw i64 %1, 81 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %mul + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll new file mode 100644 index 0000000000000..744df39852de9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/memop_stride9.ll @@ -0,0 +1,38 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @test(i32 addrspace(1)* %src, i32 addrspace(1)* %dst, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %0 = load i32, i32 addrspace(1)* %src, align 4 + %add = add nuw nsw i32 %conv, 9 + %mul = mul nsw i32 %add, %n + %idxprom = sext i32 %mul to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %dst, i64 %idxprom + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @test +; CHECK: _interleaved_ diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll new file mode 100644 index 0000000000000..65031454b1470 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_exit_blocks.ll @@ -0,0 +1,63 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k multiple_exit_blocks -vecz-passes="function(simplifycfg,dce),mergereturn,cfg-convert" -S < %s | FileCheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind readnone
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+
+define spir_kernel void @multiple_exit_blocks(i64 %n) {
+entry:
+  %gid = tail call i64 @__mux_get_global_id(i32 0)
+  %lid = tail call i64 @__mux_get_local_id(i32 0)
+  %cmp1 = icmp slt i64 %lid, %n
+  %cmp2 = icmp slt i64 %gid, %n
+  br i1 %cmp2, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+  %cmp3 = and i1 %cmp1, %cmp2
+  br i1 %cmp3, label %if.then2, label %if.else2
+
+if.then2: ; preds = %if.then
+  br label %if.else2
+
+if.else2: ; preds = %if.then, %if.then2
+  br i1 %cmp1, label %if.then3, label %if.end
+
+if.then3: ; preds = %if.else2
+  br label %if.end
+
+if.end: ; preds = %entry, %if.else2, %if.then3
+  ret void
+}
+
+; The purpose of this test is to make sure we do not have a kernel that has more
+; than one exit block after running the preparation pass.
+
+; CHECK: define spir_kernel void @__vecz_v4_multiple_exit_blocks
+
+; We don't want to generate any ROSCC branches:
+; CHECK-NOT: entry.ROSCC:
+
+; Only one return statement:
+; CHECK: ret void
+; CHECK-NOT: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
new file mode 100644
index 0000000000000..dfb67303ad8ed
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_kernels_inlining.ll
@@ -0,0 +1,51 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k foo3 -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define void @foo1(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %0, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+define void @foo2(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  call void @foo1(i32 addrspace(1)* %in, i32 addrspace(1)* %out)
+  ret void
+}
+
+define spir_kernel void @foo3(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  call void @foo2(i32 addrspace(1)* %in, i32 addrspace(1)* %out)
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_v4_foo3(ptr addrspace(1) %in, ptr addrspace(1) %out)
+; CHECK-NOT: call spir_kernel
+; CHECK: call i64 @__mux_get_global_id(i32 0)
+; CHECK: load <4 x i32>, ptr addrspace(1) %{{.+}}, align 4
+; CHECK: store <4 x i32> %{{.+}}, ptr addrspace(1) %{{.+}}, align 4
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
new file mode 100644
index 0000000000000..6adc22cf4efe8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorization_flags.ll
@@ -0,0 +1,53 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that some basic properties of the veczc command line interface for
+; multiple vectorizations work in various configurations. The kernel outputs
+; here are not interesting, only their names.
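+; As a rough guide to the width-spec syntax exercised below, inferred only
+; from this test's RUN and CHECK lines rather than from any separate veczc
+; documentation: each -k entry takes a comma-separated list of widths, an
+; empty entry falls back to the -w default (8 here), and a trailing 's'
+; requests a scalable width, which appears as an 'nxv' prefix in the name
+; of the vectorized kernel (the extra '.2@32' qualifier is not covered here):
+;
+;   foo:4,8,16.2@32s  ->  __vecz_v4_foo, __vecz_v8_foo, __vecz_nxv16_foo
+;   bar:,64s          ->  __vecz_v8_bar, __vecz_nxv64_bar
+;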
+; RUN: veczc -w 8 -k foo:4,8,16.2@32s -k bar:,64s -S < %s | FileCheck %s + +; CHECK-DAG: define spir_kernel void @foo +; CHECK-DAG: define spir_kernel void @bar +; CHECK-DAG: define spir_kernel void @__vecz_v4_foo +; CHECK-DAG: define spir_kernel void @__vecz_v8_foo +; CHECK-DAG: define spir_kernel void @__vecz_nxv16_foo +; CHECK-DAG: define spir_kernel void @__vecz_v8_bar +; CHECK-DAG: define spir_kernel void @__vecz_nxv64_bar + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @foo(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) { +entry: + ret void +} + +define spir_kernel void @bar(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) { +entry: + ret void +} + + diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll new file mode 100644 index 0000000000000..a30fafd7a5b56 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations.ll @@ -0,0 +1,133 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Check that veczc can vectorize a kernel multiple times in one go, with a +; correct mapping between the vectorized versions of the kernels and their +; scalar base +; RUN: veczc -k add:4,8,16 -S < %s | FileCheck %s + +; CHECK: define spir_kernel void @add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {{.*}} !codeplay_ca_vecz.base ![[BASE_1:[0-9]+]] !codeplay_ca_vecz.base ![[BASE_2:[0-9]+]] !codeplay_ca_vecz.base ![[BASE_3:[0-9]+]] { +; CHECK: define spir_kernel void @__vecz_v[[DERIVED_1_VF:[0-9]+]]_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {{.*}} !codeplay_ca_vecz.derived ![[DERIVED_1:[0-9]+]] { +; CHECK: define spir_kernel void @__vecz_v[[DERIVED_2_VF:[0-9]+]]_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {{.*}} !codeplay_ca_vecz.derived ![[DERIVED_2:[0-9]+]] { +; CHECK: define spir_kernel void @__vecz_v[[DERIVED_3_VF:[0-9]+]]_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {{.*}} !codeplay_ca_vecz.derived ![[DERIVED_3:[0-9]+]] { + +; CHECK: ![[BASE_1]] = !{![[VFMD_1:[0-9]+]], {{.*}} @__vecz_v[[DERIVED_1_VF]]_add +; CHECK: ![[VFMD_1]] = !{i32 [[DERIVED_1_VF]], i32 0, i32 0, i32 0} +; CHECK: ![[BASE_2]] = !{![[VFMD_2:[0-9]+]], {{.*}} @__vecz_v[[DERIVED_2_VF]]_add +; CHECK: ![[VFMD_2]] = !{i32 [[DERIVED_2_VF]], i32 0, i32 0, i32 0} +; CHECK: ![[BASE_3]] = !{![[VFMD_3:[0-9]+]], {{.*}} @__vecz_v[[DERIVED_3_VF]]_add +; CHECK: ![[VFMD_3]] = !{i32 [[DERIVED_3_VF]], i32 0, i32 0, i32 0} + +; CHECK: ![[DERIVED_1]] = !{![[VFMD_1]], {{.*}} @add +; CHECK: ![[DERIVED_2]] = !{![[VFMD_2]], {{.*}} @add +; CHECK: ![[DERIVED_3]] = !{![[VFMD_3]], {{.*}} @add + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) #0 !dbg !4 { +entry: + %in1.addr = alloca i32 addrspace(1)*, align 8 + %in2.addr = alloca i32 addrspace(1)*, align 8 + %out.addr = alloca i32 addrspace(1)*, align 8 + %tid = alloca i64, align 8 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + store i32 addrspace(1)* %in1, i32 addrspace(1)** %in1.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in1.addr, metadata !11, metadata !29), !dbg !30 + store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in2.addr, metadata !12, metadata !29), !dbg !30 + store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30 + call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31 + %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31 + store i64 %call, i64* %tid, align 8, !dbg !31 + call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32 + %0 = load i64, i64* %tid, align 8, !dbg !32 + %1 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8, !dbg !32 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 %0, !dbg !32 + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !32 + store i32 %2, i32* %a, align 4, !dbg !32 + call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !29), !dbg !33 + %3 = load i64, i64* %tid, align 8, !dbg !33 + %4 = load i32 addrspace(1)*, i32 
addrspace(1)** %in2.addr, align 8, !dbg !33 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %4, i64 %3, !dbg !33 + %5 = load i32, i32 addrspace(1)* %arrayidx1, align 4, !dbg !33 + store i32 %5, i32* %b, align 4, !dbg !33 + %6 = load i32, i32* %a, align 4, !dbg !34 + %7 = load i32, i32* %b, align 4, !dbg !34 + %add = add nsw i32 %6, %7, !dbg !34 + %8 = load i64, i64* %tid, align 8, !dbg !34 + %9 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !34 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %8, !dbg !34 + store i32 %add, i32 addrspace(1)* %arrayidx2, align 4, !dbg !34 + ret void, !dbg !35 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +declare i64 @__mux_get_global_id(i32) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nobuiltin } + +!llvm.dbg.cu = !{!0} +!opencl.kernels = !{!21} +!llvm.module.flags = !{!27} +!llvm.ident = !{!28} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2) +!1 = !DIFile(filename: "", directory: "/tmp") +!2 = !{} +!3 = !{!4} +!4 = distinct !DISubprogram(name: "add", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !10) +!5 = !DIFile(filename: "kernel.opencl", directory: "/tmp") +!6 = !DISubroutineType(types: !7) +!7 = !{null, !8, !8, !8} +!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, align: 64) +!9 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!10 = !{!11, !12, !13, !14, !19, !20} +!11 = !DILocalVariable(name: "in1", arg: 1, scope: !4, file: !5, line: 1, type: !8) +!12 = !DILocalVariable(name: "in2", arg: 2, scope: !4, file: !5, line: 1, type: !8) +!13 = !DILocalVariable(name: "out", arg: 3, scope: !4, file: !5, line: 1, type: !8) +!14 = !DILocalVariable(name: "tid", scope: !4, file: !5, line: 3, type: !15) +!15 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !16, line: 33, baseType: !17) +!16 = !DIFile(filename: "/Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "/tmp") +!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !16, line: 31, baseType: !18) +!18 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) +!19 = !DILocalVariable(name: "a", scope: !4, file: !5, line: 5, type: !9) +!20 = !DILocalVariable(name: "b", scope: !4, file: !5, line: 6, type: !9) +!21 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*)* @add, !22, !23, !24, !25, !26} +!22 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1} +!23 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!24 = !{!"kernel_arg_type", !"int*", !"int*", !"int*"} +!25 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*"} +!26 = !{!"kernel_arg_type_qual", !"", !"", !""} +!27 = !{i32 2, !"Debug 
Info Version", i32 3}
+!28 = !{!"clang version 3.8.0 "}
+!29 = !DIExpression()
+!30 = !DILocation(line: 1, scope: !4)
+!31 = !DILocation(line: 3, scope: !4)
+!32 = !DILocation(line: 5, scope: !4)
+!33 = !DILocation(line: 6, scope: !4)
+!34 = !DILocation(line: 7, scope: !4)
+!35 = !DILocation(line: 8, scope: !4)
+
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
new file mode 100644
index 0000000000000..3aa408292b16e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_nested.ll
@@ -0,0 +1,50 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; Check that veczc can vectorize a kernel then vectorize the vectorized kernel,
+; with base mappings from 1->2 and 2->3 and derived mappings back from 2->1 and
+; 3->2.
+; RUN: veczc -k add:2 -S < %s > %t2
+; RUN: veczc -k __vecz_v2_add:4 -S < %t2 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) {
+entry:
+  %tid = call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid
+  %i1 = load i32, i32 addrspace(1)* %arrayidx, align 16
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in2, i64 %tid
+  %i2 = load i32, i32 addrspace(1)* %arrayidx1, align 16
+  %add = add nsw i32 %i1, %i2
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid
+  store i32 %add, i32 addrspace(1)* %arrayidx2, align 16
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK: define spir_kernel void @add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out){{.*}} !codeplay_ca_vecz.base ![[BASE_1:[0-9]+]]
+; CHECK: define spir_kernel void @__vecz_v2_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out){{.*}} !codeplay_ca_vecz.base ![[BASE_2:[0-9]+]] !codeplay_ca_vecz.derived ![[DERIVED_1:[0-9]+]] {
+; CHECK: define spir_kernel void @__vecz_v4___vecz_v2_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out){{.*}} !codeplay_ca_vecz.derived ![[DERIVED_2:[0-9]+]] {
+
+; CHECK: ![[BASE_1]] = !{![[VMD_1:[0-9]+]], {{.*}} @__vecz_v2_add}
+; CHECK: ![[VMD_1]] = !{i32 2, i32 0, i32 0, i32 0}
+; CHECK: ![[BASE_2]] = !{![[VMD_2:[0-9]+]], {{.*}} @__vecz_v4___vecz_v2_add}
+; CHECK: ![[VMD_2]] = !{i32 4, i32 0, i32 0, i32 0}
+; CHECK: ![[DERIVED_1]] = !{![[VMD_1]], {{.*}} @add}
+; CHECK: ![[DERIVED_2]] = !{![[VMD_2]], {{.*}} @__vecz_v2_add}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
new file mode 100644
index 0000000000000..ed574554f9426
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/multiple_vectorizations_vp.ll
@@ -0,0 +1,39 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; Check that veczc can vectorize a kernel multiple times in one go, at the
+; same width, but with one version enabling vector predication.
+; RUN: veczc -k add:1s,1sp -S < %s | FileCheck %s
+
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK: define spir_kernel void @add(
+define spir_kernel void @add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) {
+entry:
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %arrayidx.in1 = getelementptr inbounds i32, ptr addrspace(1) %in1, i64 %idx
+  %arrayidx.in2 = getelementptr inbounds i32, ptr addrspace(1) %in2, i64 %idx
+  %in1.v = load i32, ptr addrspace(1) %arrayidx.in1, align 4
+  %in2.v = load i32, ptr addrspace(1) %arrayidx.in2, align 4
+  %add.v = add i32 %in1.v, %in2.v
+  %arrayidx.out = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %idx
+  store i32 %add.v, ptr addrspace(1) %arrayidx.out
+  ret void
+}
+
+; CHECK: define spir_kernel void @__vecz_nxv1_add
+
+; CHECK: define spir_kernel void @__vecz_nxv1_vp_add
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
new file mode 100644
index 0000000000000..3656854643217
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_instantiate_memop.ll
@@ -0,0 +1,66 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k priv -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @priv(i32 addrspace(3)* %a) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp ult i32 %storemerge, %conv + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %a, i64 %idxprom + store i32 %conv, i32 addrspace(3)* %arrayidx, align 4 + %inc = add i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(3)*)* @priv, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 3} +!2 = !{!"kernel_arg_access_qual", !"none"} +!3 = !{!"kernel_arg_type", !"int*"} +!4 = !{!"kernel_arg_base_type", !"int*"} +!5 = !{!"kernel_arg_type_qual", !""} +!6 = !{!"clang version 3.8.0 "} + + +; Test if the masked store is defined correctly +; CHECK: call void @__vecz_b_masked_scatter_store4_Dv4_jDv4_u3ptrU3AS3Dv4_b +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll new file mode 100644 index 0000000000000..4ebcd9ec22693 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_over_scalarization.ll @@ -0,0 +1,68 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k memop_loop_dep -vecz-passes=builtin-inlining,scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind
+define spir_kernel void @memop_loop_dep(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %i, i32 %e) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+  %i.addr.0 = phi i32 [ %i, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.addr.0, %e
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+  %call1 = call spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64 %call, i32 addrspace(1)* %in)
+  call spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32> %call1, i64 %call, i32 addrspace(1)* %out)
+  %0 = extractelement <4 x i32> %call1, i64 0
+  %tobool = icmp ne i32 %0, 0
+  %tobool2 = icmp eq i64 %call, 0
+  %or.cond = and i1 %tobool2, %tobool
+  br i1 %or.cond, label %while.cond, label %for.inc
+
+while.cond: ; preds = %while.cond, %for.body
+  %tobool3 = icmp eq i64 %call, 0
+  br i1 %tobool3, label %for.inc, label %while.cond
+
+for.inc: ; preds = %for.body, %while.cond
+  %inc = add nsw i32 %i.addr.0, 1
+  br label %for.cond
+
+for.end: ; preds = %for.cond
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+declare spir_func <4 x i32> @_Z6vload4mPKU3AS1i(i64, i32 addrspace(1)*)
+
+declare spir_func void @_Z7vstore4Dv4_imPU3AS1i(<4 x i32>, i64, i32 addrspace(1)*)
+
+; CHECK: define spir_kernel void @__vecz_v4_memop_loop_dep
+
+; Make sure Scalarization only results in four loads, NOT FIVE
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+; CHECK: load i32
+; CHECK-NOT: load i32
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
new file mode 100644
index 0000000000000..d61a641d5251e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz1.ll
@@ -0,0 +1,43 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s -vecz-auto | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @no_vecz1(i32 addrspace(1)* %out, i32 %n) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %call, 0 + br i1 %cmp, label %for.cond.preheader, label %if.end + +for.cond.preheader: ; preds = %entry + %cmp19 = icmp sgt i32 %n, 0 + %spec.select = select i1 %cmp19, i32 %n, i32 0 + store i32 %spec.select, i32 addrspace(1)* %out, align 4 + br label %if.end + +if.end: ; preds = %for.cond.preheader, %entry + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK-NOT: insertelement +; CHECK-NOT: shufflevector +; CHECK-NOT: extractelement +; CHECK-NOT: define void @__vecz_b_masked_store diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll new file mode 100644 index 0000000000000..709ae760784a9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/no_vecz2.ll @@ -0,0 +1,57 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s -vecz-auto | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @no_vecz2(i32 addrspace(1)* %out, i32 %n, i32 addrspace(1)* %m) { +entry: + %0 = load i32, i32 addrspace(1)* %m, align 4 + %call = tail call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %call, 0 + br i1 %cmp, label %for.cond.preheader, label %if.end + +for.cond.preheader: ; preds = %entry + %cmp167 = icmp sgt i32 %n, 0 + br i1 %cmp167, label %for.body29.lr.ph, label %for.cond.cleanup28 + +for.body29.lr.ph: ; preds = %for.cond.preheader + %add = add i32 %0, 1 + %factor = shl i32 %0, 2 + %1 = shl i32 %n, 2 + %2 = add i32 %1, -4 + %reass.mul = mul i32 %2, %add + %3 = add i32 %factor, 4 + %4 = add i32 %3, %reass.mul + br label %for.cond.cleanup28 + +for.cond.cleanup28: ; preds = %for.body29.lr.ph, %for.cond.preheader + %ret.3.lcssa = phi i32 [ %4, %for.body29.lr.ph ], [ 0, %for.cond.preheader ] + store i32 %ret.3.lcssa, i32 addrspace(1)* %out, align 4 + br label %if.end + +if.end: ; preds = %for.cond.cleanup28, %entry + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @{{(__vecz_v16_)?}}no_vecz2 +; CHECK-NOT: extractelement +; CHECK-NOT: define void @__vecz_b_masked_store +; CHECK: store i32 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll new file mode 100644 index 0000000000000..b455570f66c49 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/offset_info_analysis.ll @@ -0,0 +1,55 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k offset_info_analysis -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'Unknown buffer'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @offset_info_analysis(i8 addrspace(1)* noalias %in, i8 addrspace(1)* noalias %out, i32 %width) #0 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0) #2
+  %conv = trunc i64 %call to i32
+  %call1 = call i64 @__mux_get_global_id(i32 1) #2
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %width
+  %0 = xor i32 %width, -1
+  %add = add i32 %conv, %0
+  %add5 = add i32 %add, %mul
+  %idxprom = sext i32 %add5 to i64
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom
+  %1 = load i8, i8 addrspace(1)* %arrayidx, align 1
+  %mul10 = mul nsw i32 %conv2, %width
+  %add11 = add nsw i32 %mul10, %conv
+  %idxprom15 = sext i32 %add11 to i64
+  %arrayidx16 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom15
+  store i8 %1, i8 addrspace(1)* %arrayidx16, align 1
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; This test checks that a 'xor' as a binop operand is correctly analyzed
+; and properly masked.
+; CHECK: define spir_kernel void @__vecz_v4_offset_info_analysis
+; CHECK: load <4 x i8>, ptr addrspace(1)
+; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load_Dv4_hDv4_u3ptrU3AS1
+; CHECK: ret void
+
+; Check the gather load definition is not generated.
+; CHECK-NOT: declare <4 x i8> @__vecz_b_gather_load_Dv4_hDv4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
new file mode 100644
index 0000000000000..fc1d4b9eac4b0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_d.ll
@@ -0,0 +1,50 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isfiniteDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isfiniteDv4_d +; CHECK: call i64 @__mux_get_global_id(i32 0) +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll new file mode 100644 index 0000000000000..1cff6e5415803 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfiniteDv4_f.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isfiniteDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isfiniteDv4_f +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll new file mode 100644 index 0000000000000..0ce5bfdfdd701 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinited.ll @@ -0,0 +1,268 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isfinited -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call 
i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + 
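+; Only the kernel selected by -k in the RUN line above (test_isfinited) is
+; vectorized and matched by the CHECK lines at the end of this file; the
+; other kernels are presumably included as module context covering the
+; remaining relational builtins.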
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds 
i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isfinited +; CHECK: and <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll new file mode 100644 index 0000000000000..168bf625a4c37 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isfinitef.ll @@ -0,0 +1,268 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isfinitef -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call 
i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + 
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds 
i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isfinitef +; CHECK: and <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll new file mode 100644 index 0000000000000..c11210f1097ea --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_d.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isinfDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isinfDv4_d +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll new file mode 100644 index 0000000000000..67b641587a6af --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfDv4_f.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isinfDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isinfDv4_f +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll new file mode 100644 index 0000000000000..56129f29e5ddd --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinfd.ll @@ -0,0 +1,268 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isinfd -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 
@__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define 
spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 
addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isinfd +; CHECK: and <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll new file mode 100644 index 0000000000000..ef9cadee9528c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isinff.ll @@ -0,0 +1,268 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isinff -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 
@__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define 
spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 
addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isinff +; CHECK: and <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll new file mode 100644 index 0000000000000..75862737a2c86 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_d.ll @@ -0,0 +1,61 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnanDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnanDv4_d +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: and <4 x i1> +; CHECK: and <4 x i1> +; CHECK: and <4 x i1> +; CHECK: and <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll new file mode 100644 index 0000000000000..0d2c7e0073757 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanDv4_f.ll @@ -0,0 +1,61 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnanDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnanDv4_f +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: and <4 x i1> +; CHECK: and <4 x i1> +; CHECK: and <4 x i1> +; CHECK: and <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll new file mode 100644 index 0000000000000..3b885da041f3f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnand.ll @@ -0,0 +1,271 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnand -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 
@__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define 
spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 
addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnand +; CHECK: and <4 x i64> +; CHECK: icmp eq <4 x i64> +; CHECK: and <4 x i64> +; CHECK: icmp ne <4 x i64> +; CHECK: and <4 x i1> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll new file mode 100644 index 0000000000000..1a5b038b5489d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnanf.ll @@ -0,0 +1,271 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnanf -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 
@__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define 
spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 
addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnanf +; CHECK: and <4 x i32> +; CHECK: icmp eq <4 x i32> +; CHECK: and <4 x i32> +; CHECK: icmp ne <4 x i32> +; CHECK: and <4 x i1> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll new file mode 100644 index 0000000000000..6dee2711d597c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_d.ll @@ -0,0 +1,53 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnormalDv4_d -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnormalDv4_d +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: and <4 x i64> +; CHECK: add nsw <4 x i64> +; CHECK: add nsw <4 x i64> +; CHECK: add nsw <4 x i64> +; CHECK: add nsw <4 x i64> +; CHECK: icmp ult <4 x i64> +; CHECK: icmp ult <4 x i64> +; CHECK: icmp ult <4 x i64> +; CHECK: icmp ult <4 x i64> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll new file mode 100644 index 0000000000000..6ffb049b982e0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalDv4_f.ll @@ -0,0 +1,53 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnormalDv4_f -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnormalDv4_f +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: and <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: icmp ult <4 x i32> +; CHECK: icmp ult <4 x i32> +; CHECK: icmp ult <4 x i32> +; CHECK: icmp ult <4 x i32> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: sext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll new file mode 100644 index 0000000000000..880bb8d621d10 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormald.ll @@ -0,0 +1,269 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnormald -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call 
i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + 
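+; Note: only @test_isnormald is vectorized by the RUN line above; the CHECK +; lines at the end of this file expect vecz to lower the scalar isnormal call +; via integer bit tests: clear the sign bit (and), re-bias the exponent +; (add nsw), then range-check it with an unsigned compare (icmp ult) before +; widening the i1 result (zext).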
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds 
i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnormald +; CHECK: and <4 x i64> +; CHECK: add nsw <4 x i64> +; CHECK: icmp ult <4 x i64> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll new file mode 100644 index 0000000000000..0e0c0a7574e83 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/onearg_relationals_isnormalf.ll @@ -0,0 +1,269 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_isnormalf -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) +declare spir_func i32 @_Z5isinfd(double) +declare spir_func i32 @_Z5isinff(float) +declare spir_func i32 @_Z5isnand(double) +declare spir_func i32 @_Z5isnanf(float) +declare spir_func i32 @_Z7signbitd(double) +declare spir_func i32 @_Z7signbitf(float) +declare spir_func i32 @_Z8isfinited(double) +declare spir_func i32 @_Z8isfinitef(float) +declare spir_func i32 @_Z8isnormald(double) +declare spir_func i32 @_Z8isnormalf(float) +declare spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float>) +declare spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float>) +declare spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double>) +declare spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double>) + +define spir_kernel void @test_isfinitef(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isfinitef(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfinited(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isfinited(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isfiniteDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isfiniteDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isfiniteDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isfiniteDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isinff(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call 
i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isinff(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isinfd(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isinfDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isinfDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isinfDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isinfDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnormalf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z8isnormalf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormald(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z8isnormald(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnormalDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z8isnormalDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + 
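+; Note: only @test_isnormalf is vectorized by the RUN line above; the trailing +; CHECK lines expect the same bit-test lowering as the double variant, but on +; <4 x i32> values: and (clear the sign bit), add nsw (re-bias the exponent), +; icmp ult (range check), zext (widen the i1 result).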
+define spir_kernel void @test_isnormalDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z8isnormalDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_isnanf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z5isnanf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnand(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z5isnand(double %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_isnanDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z5isnanDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_isnanDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z5isnanDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +define spir_kernel void @test_signbitf(float addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = call spir_func i32 @_Z7signbitf(float %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitd(double addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds double, double addrspace(1)* %in, i64 %call + %0 = load double, double addrspace(1)* %arrayidx, align 8 + %call1 = call spir_func i32 @_Z7signbitd(double %0) + %arrayidx2 = getelementptr inbounds 
i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +define spir_kernel void @test_signbitDv4_f(<4 x float> addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %call + %0 = load <4 x float>, <4 x float> addrspace(1)* %arrayidx, align 16 + %call1 = call spir_func <4 x i32> @_Z7signbitDv4_f(<4 x float> %0) + %arrayidx2 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %out, i64 %call + store <4 x i32> %call1, <4 x i32> addrspace(1)* %arrayidx2, align 16 + ret void +} + +define spir_kernel void @test_signbitDv4_d(<4 x double> addrspace(1)* %in, <4 x i64> addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %in, i64 %call + %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32 + %call1 = call spir_func <4 x i64> @_Z7signbitDv4_d(<4 x double> %0) + %arrayidx2 = getelementptr inbounds <4 x i64>, <4 x i64> addrspace(1)* %out, i64 %call + store <4 x i64> %call1, <4 x i64> addrspace(1)* %arrayidx2, align 32 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_isnormalf +; CHECK: and <4 x i32> +; CHECK: add nsw <4 x i32> +; CHECK: icmp ult <4 x i32> +; CHECK: zext <4 x i1> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll new file mode 100644 index 0000000000000..eaf7917c6dfa0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata1.ll @@ -0,0 +1,77 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(2)* %in, i32 addrspace(1)* %out, i8 addrspace(2)* %text, double %f) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %call + %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +define spir_kernel void @second_test(i32 %a, i32 %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + ret void +} + +declare i64 @__mux_get_global_id(i32) + +!opencl.kernels = !{!0, !6} +!opencl.kernel_wg_size_info = !{!12} +!llvm.ident = !{!13} + +!0 = !{void (i32 addrspace(2)*, i32 addrspace(1)*, i8 addrspace(2)*, double)* @test, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 2, i32 1, i32 2, i32 0} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"int*", !"int*", !"char*", !"double"} +!4 = !{!"kernel_arg_base_type", !"int*", !"int*", !"char*", !"double"} +!5 = !{!"kernel_arg_type_qual", !"const", !"", !"const", !""} +!6 = !{void (i32, i32)* @second_test, !7, !8, !9, !10, !11} +!7 = !{!"kernel_arg_addr_space", i32 0, i32 0} +!8 = !{!"kernel_arg_access_qual", !"none", !"none"} +!9 = !{!"kernel_arg_type", !"int", !"int"} +!10 = !{!"kernel_arg_base_type", !"int", !"int"} +!11 = !{!"kernel_arg_type_qual", !"", !""} +!12 = !{void (i32 addrspace(2)*, i32 addrspace(1)*, i8 addrspace(2)*, double)* @test, i32 16, i32 1, i32 1, i1 true} +!13 = !{!"clang version 3.8.1 "} + +; Sanity checking +;CHECK-DAG: define spir_kernel void @test(ptr addrspace(2) %in, ptr addrspace(1) %out, ptr addrspace(2) %text, double %f) +;CHECK-DAG: define spir_kernel void @__vecz_v4_test(ptr addrspace(2) %in, ptr addrspace(1) %out, ptr addrspace(2) %text, double %f) + +; Check if we have the metadata for the kernels +; CHECK: !opencl.kernels = !{![[MD0:[0-9]+]], ![[MD6:[0-9]+]], ![[MD12:[0-9]+]]} +; CHECK: !opencl.kernel_wg_size_info = !{![[MD13:[0-9]+]], ![[MD14:[0-9]+]]} +; CHECK: !llvm.ident = !{![[MD15:[0-9]+]]} + +; Check the actual metadata +; CHECK: ![[MD0]] = !{ptr @test, ![[MD1:[0-9]+]], ![[MD2:[0-9]+]], ![[MD3:[0-9]+]], ![[MD4:[0-9]+]], ![[MD5:[0-9]+]]} +; CHECK: ![[MD1]] = !{!"kernel_arg_addr_space", i32 2, i32 1, i32 2, i32 0} +; CHECK: ![[MD2]] = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"} +; CHECK: ![[MD3]] = !{!"kernel_arg_type", !"int*", !"int*", !"char*", !"double"} +; CHECK: ![[MD4]] = !{!"kernel_arg_base_type", !"int*", !"int*", !"char*", !"double"} +; CHECK: ![[MD5]] = !{!"kernel_arg_type_qual", !"const", !"", !"const", !""} +; CHECK: ![[MD12]] = !{ptr @__vecz_v4_test, ![[MD1]], ![[MD2]], ![[MD3]], ![[MD4]], ![[MD5]]} +; CHECK: ![[MD13]] = !{ptr @test, i32 16, i32 1, i32 1, i1 true} +; CHECK: ![[MD14]] = !{ptr @__vecz_v4_test, i32 16, i32 1, i32 1, i1 true} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll new file mode 100644 index 0000000000000..0438341148fdc --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/opencl_metadata2.ll @@ -0,0 +1,76 @@ +; Copyright (C) Codeplay Software 
Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k second_test -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(2)* %in, i32 addrspace(1)* %out, i8 addrspace(2)* %text, double %f) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %call + %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %0, i32 addrspace(1)* %arrayidx1, align 4 + ret void +} + +define spir_kernel void @second_test(i32 %a, i32 %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + ret void +} + +declare i64 @__mux_get_global_id(i32) + +!opencl.kernels = !{!0, !6} +!opencl.kernel_wg_size_info = !{!12} +!llvm.ident = !{!13} + +!0 = !{void (i32 addrspace(2)*, i32 addrspace(1)*, i8 addrspace(2)*, double)* @test, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 2, i32 1, i32 2, i32 0} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"int*", !"int*", !"char*", !"double"} +!4 = !{!"kernel_arg_base_type", !"int*", !"int*", !"char*", !"double"} +!5 = !{!"kernel_arg_type_qual", !"const", !"", !"const", !""} +!6 = !{void (i32, i32)* @second_test, !7, !8, !9, !10, !11} +!7 = !{!"kernel_arg_addr_space", i32 0, i32 0} +!8 = !{!"kernel_arg_access_qual", !"none", !"none"} +!9 = !{!"kernel_arg_type", !"int", !"int"} +!10 = !{!"kernel_arg_base_type", !"int", !"int"} +!11 = !{!"kernel_arg_type_qual", !"", !""} +!12 = !{void (i32 addrspace(2)*, i32 addrspace(1)*, i8 addrspace(2)*, double)* @test, i32 16, i32 1, i32 1, i1 true} +!13 = !{!"clang version 3.8.1 "} + +; Sanity checking +; CHECK: define spir_kernel void @second_test(i32 %a, i32 %b) +; CHECK: define spir_kernel void @__vecz_v4_second_test(i32 %a, i32 %b) + +; Check if we have the metadata for the kernels +; CHECK: !opencl.kernels = !{![[MD0:[0-9]+]], ![[MD6:[0-9]+]], ![[MD12:[0-9]+]]} +; CHECK: !opencl.kernel_wg_size_info = !{![[MD13:[0-9]+]]} +; CHECK: !llvm.ident = !{![[MD14:[0-9]+]]} + +; Check the actual metadata +; CHECK: ![[MD6]] = !{ptr @second_test, ![[MD7:[0-9]+]], ![[MD8:[0-9]+]], ![[MD9:[0-9]+]], ![[MD10:[0-9]+]], ![[MD11:[0-9]+]]} +; CHECK: ![[MD7]] = !{!"kernel_arg_addr_space", i32 0, i32 0} +; CHECK: ![[MD8]] = !{!"kernel_arg_access_qual", !"none", !"none"} +; CHECK: ![[MD9]] = !{!"kernel_arg_type", !"int", !"int"} +; CHECK: ![[MD10]] = !{!"kernel_arg_base_type", !"int", !"int"} +; CHECK: ![[MD11]] = !{!"kernel_arg_type_qual", !"", !""} +; CHECK: ![[MD12]] = !{ptr @__vecz_v4_second_test, ![[MD7]], ![[MD8]], ![[MD9]], ![[MD10]], ![[MD11]]} +; CHECK: ![[MD13]] = !{ptr @test, i32 16, i32 1, i32 1, i1 true} diff --git 
a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll new file mode 100644 index 0000000000000..ae11c9692391e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/overaligned_allocas.ll @@ -0,0 +1,80 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -vecz-auto -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@entry_test_alloca.lm = external unnamed_addr addrspace(3) constant [16 x <2 x float>], align 8 + +define spir_kernel void @test(<2 x float> addrspace(1)* nocapture readonly %in, <2 x float> addrspace(1)* nocapture %out, i32 %offset) local_unnamed_addr { +entry: + %a.sroa.0 = alloca <2 x float>, align 16 + %b.sroa.2 = alloca <2 x float>, align 16 + %call = tail call i64 @__mux_get_global_id(i32 0) + %call1 = tail call i64 @__mux_get_local_id(i32 0) + %a.sroa.0.0..sroa_cast = bitcast <2 x float>* %a.sroa.0 to i8* + %b.sroa.2.0..sroa_cast = bitcast <2 x float>* %b.sroa.2 to i8* + %arrayidx2 = getelementptr inbounds [16 x <2 x float>], [16 x <2 x float>] addrspace(3)* @entry_test_alloca.lm, i64 0, i64 %call1 + %0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx2, align 8 + %conv = sext i32 %offset to i64 + %add = add i64 %call1, %conv + %arrayidx4 = getelementptr inbounds [16 x <2 x float>], [16 x <2 x float>] addrspace(3)* @entry_test_alloca.lm, i64 0, i64 %add + %1 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx4, align 8 + br label %for.body + +for.cond.cleanup: ; preds = %for.cond.cleanup10 + %mul.le.le = fmul <2 x float> %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0., %b.sroa.2.0.b.sroa.2.0.b.sroa.2.8. + %arrayidx17 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %out, i64 %call + store <2 x float> %mul.le.le, <2 x float> addrspace(1)* %arrayidx17, align 8 + ret void + +for.body: ; preds = %for.cond.cleanup10, %entry + %i.038 = phi i32 [ 0, %entry ], [ %inc15, %for.cond.cleanup10 ] + store volatile <2 x float> %0, <2 x float>* %a.sroa.0, align 8 + store volatile <2 x float> %1, <2 x float>* %b.sroa.2, align 8 + br label %for.body11 + +for.cond.cleanup10: ; preds = %for.body11 + %inc15 = add nuw nsw i32 %i.038, 1 + %cmp = icmp ult i32 %inc15, 16 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body11: ; preds = %for.body11, %for.body + %i6.037 = phi i32 [ 0, %for.body ], [ %inc, %for.body11 ] + %a.sroa.0.0.a.sroa.0.0.a.sroa.0.0. = load volatile <2 x float>, <2 x float>* %a.sroa.0, align 8 + %b.sroa.2.0.b.sroa.2.0.b.sroa.2.8. 
= load volatile <2 x float>, <2 x float>* %b.sroa.2, align 8 + %inc = add nuw nsw i32 %i6.037, 1 + %cmp8 = icmp ult i32 %inc, 16 + br i1 %cmp8, label %for.body11, label %for.cond.cleanup10 +} + +declare i64 @__mux_get_global_id(i32) local_unnamed_addr +declare i64 @__mux_get_local_id(i32) local_unnamed_addr + +; Check that all the allocas come before anything else +; CHECK: define spir_kernel void @__vecz_v4_test( +; CHECK-NEXT: entry: +; CHECK-NEXT: %a.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 +; CHECK-NEXT: %a.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 +; CHECK-NEXT: %a.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 +; CHECK-NEXT: %a.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 +; CHECK-NEXT: %b.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 +; CHECK-NEXT: %b.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 +; CHECK-NEXT: %b.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 +; CHECK-NEXT: %b.sroa.{{[0-9]+}} = alloca <2 x float>, align 16 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll new file mode 100644 index 0000000000000..16b63d1e1c451 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_branch.ll @@ -0,0 +1,66 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_branch -vecz-passes=cfg-convert,packetizer -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_branch(i32 %a, i32* %b) { +entry: + %conv = sext i32 %a to i64 + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %conv, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42 + store i32 13, i32* %arrayidx2, align 4 + br label %if.end + +if.end: + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This test checks if the branch conditions and the branch BBs are vectorized +; and masked properly +; CHECK: define spir_kernel void @__vecz_v4_test_branch(i32 %a, ptr %b) +; CHECK: %conv = sext i32 %a to i64 +; CHECK: %[[A_SPLATINSERT:.+]] = insertelement <4 x i64> poison, i64 %conv, {{i32|i64}} 0 +; CHECK: %[[A_SPLAT:.+]] = shufflevector <4 x i64> %[[A_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: %call = call i64 @__mux_get_global_id(i32 0) +; CHECK: %[[GID_SPLATINSERT:.+]] = insertelement <4 x i64> poison, i64 %call, {{i32|i64}} 0 +; CHECK: %[[GID_SPLAT:.+]] = shufflevector <4 x i64> %[[GID_SPLATINSERT:.+]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: %[[GID:.+]] = add <4 x i64> %[[GID_SPLAT]], +; CHECK: %[[CMP3:.+]] = icmp eq <4 x i64> %[[A_SPLAT]], %[[GID]] +; CHECK: %[[NOT_CMP4:.+]] = xor <4 x i1> %[[CMP3]], {{<(i1 true(, )?)+>|splat \(i1 true\)}} + +; CHECK: %[[IDX:.+]] = sext i32 %a to i64 +; CHECK: %[[GEP1:.+]] = getelementptr inbounds i32, ptr %b, i64 %[[IDX]] +; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 11, ptr %[[GEP1]], i1 %{{any_of_mask[0-9]*}}) + +; CHECK: %[[GEP2:.+]] = getelementptr inbounds i32, ptr %b, i64 42 +; CHECK: call void @__vecz_b_masked_store4_ju3ptrb(i32 13, ptr %[[GEP2]], i1 %{{any_of_mask[0-9]*}}) + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll new file mode 100644 index 0000000000000..2d953c5daa499 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_debug_info.ll @@ -0,0 +1,149 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Check that debug info is preserved in the vectorized kernel. +; Specifically that the packetization pass creates vector types +; in the DI for the variables. 
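+; As an illustrative sketch (assumed shapes, not checked verbatim): a scalar
+; local such as
+;   !19 = !DILocalVariable(name: "a", scope: !4, file: !5, line: 5, type: !9)
+; attached to @add should reappear re-scoped to the cloned subprogram of
+; @__vecz_v4_add, with its #dbg_value locations rewritten by the packetizer;
+; the CHECK lines below pin down exactly this re-scoping for in1/in2/out/tid/a/b.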
+; RUN: veczc -k add -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Vectorized kernel function +; CHECK: @__vecz_v[[WIDTH:[0-9]+]]_add({{.*}} !dbg [[VECZ_SUBPROG:![0-9]+]] +; Check that intrinsics for user variable locations are still present +define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) #0 !dbg !4 { +entry: + %in1.addr = alloca i32 addrspace(1)*, align 8 + %in2.addr = alloca i32 addrspace(1)*, align 8 + %out.addr = alloca i32 addrspace(1)*, align 8 + %tid = alloca i64, align 8 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + store i32 addrspace(1)* %in1, i32 addrspace(1)** %in1.addr, align 8 +; CHECK: #dbg_value(ptr addrspace(1) %in1, [[DI_IN1:![0-9]+]], [[EXPR:!DIExpression()]] +; CHECK-SAME: [[PARAM_LOC:![0-9]+]] + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in1.addr, metadata !11, metadata !29), !dbg !30 + store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8 +; CHECK: #dbg_value(ptr addrspace(1) %in2, [[DI_IN2:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[PARAM_LOC]] + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in2.addr, metadata !12, metadata !29), !dbg !30 + store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8 +; CHECK: #dbg_value(ptr addrspace(1) %out, [[DI_OUT:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[PARAM_LOC]] + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30 +; CHECK: #dbg_value(i64 %call, [[DI_TID:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[TID_LOC:![0-9]+]] + call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31 + %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31 + store i64 %call, i64* %tid, align 8, !dbg !31 +; CHECK: #dbg_value(i32 poison, [[DI_A:![0-9]+]], !DIExpression(), +; CHECK-SAME: [[A_LOC:![0-9]+]] + call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32 + %0 = load i64, i64* %tid, align 8, !dbg !32 + %1 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8, !dbg !32 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 %0, !dbg !32 + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !32 + store i32 %2, i32* %a, align 4, !dbg !32 +; CHECK: #dbg_value(i32 poison, [[DI_B:![0-9]+]], !DIExpression(), +; CHECK-SAME: [[B_LOC:![0-9]+]] + call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !29), !dbg !33 + %3 = load i64, i64* %tid, align 8, !dbg !33 + %4 = load i32 addrspace(1)*, i32 addrspace(1)** %in2.addr, align 8, !dbg !33 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %4, i64 %3, !dbg !33 + %5 = load i32, i32 addrspace(1)* %arrayidx1, align 4, !dbg !33 + store i32 %5, i32* %b, align 4, !dbg !33 + %6 = load i32, i32* %a, align 4, !dbg !34 + %7 = load i32, i32* %b, align 4, !dbg !34 + %add = add nsw i32 %6, %7, !dbg !34 + %8 = load i64, i64* %tid, align 8, !dbg !34 + %9 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !34 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %8, !dbg !34 + store i32 %add, i32 addrspace(1)* %arrayidx2, align 4, !dbg !34 + ret void, !dbg !35 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +declare i64 @__mux_get_global_id(i32) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" 
"no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nobuiltin } + +!llvm.dbg.cu = !{!0} +!opencl.kernels = !{!21} +!llvm.module.flags = !{!27} +!llvm.ident = !{!28} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2) +!1 = !DIFile(filename: "", directory: "/tmp") +!2 = !{} +!3 = !{!4} +!4 = distinct !DISubprogram(name: "add", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !10) +!5 = !DIFile(filename: "kernel.opencl", directory: "/tmp") +!6 = !DISubroutineType(types: !7) +!7 = !{null, !8, !8, !8} +!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, align: 64) +!9 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!10 = !{!11, !12, !13, !14, !19, !20} +!11 = !DILocalVariable(name: "in1", arg: 1, scope: !4, file: !5, line: 1, type: !8) +!12 = !DILocalVariable(name: "in2", arg: 2, scope: !4, file: !5, line: 1, type: !8) +!13 = !DILocalVariable(name: "out", arg: 3, scope: !4, file: !5, line: 1, type: !8) +!14 = !DILocalVariable(name: "tid", scope: !4, file: !5, line: 3, type: !15) +!15 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !16, line: 33, baseType: !17) +!16 = !DIFile(filename: "/Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "/tmp") +!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !16, line: 31, baseType: !18) +!18 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) +!19 = !DILocalVariable(name: "a", scope: !4, file: !5, line: 5, type: !9) +!20 = !DILocalVariable(name: "b", scope: !4, file: !5, line: 6, type: !9) +!21 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*)* @add, !22, !23, !24, !25, !26} +!22 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1} +!23 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!24 = !{!"kernel_arg_type", !"int*", !"int*", !"int*"} +!25 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*"} +!26 = !{!"kernel_arg_type_qual", !"", !"", !""} +!27 = !{i32 2, !"Debug Info Version", i32 3} +!28 = !{!"clang version 3.8.0 "} +!29 = !DIExpression() +!30 = !DILocation(line: 1, scope: !4) +!31 = !DILocation(line: 3, scope: !4) +!32 = !DILocation(line: 5, scope: !4) +!33 = !DILocation(line: 6, scope: !4) +!34 = !DILocation(line: 7, scope: !4) +!35 = !DILocation(line: 8, scope: !4) + + +; Debug info metadata entries +; CHECK:[[PTR_TYPE:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[DI_BASE:![0-9]+]], size: 64, align: 64) +; CHECK:[[DI_BASE]] = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) + +; CHECK: [[VECZ_SUBPROG]] = distinct !DISubprogram(name: "add", +; CHECK-SAME: retainedNodes: [[VECZ_VARS:![0-9]+]] + +; CHECK: [[VECZ_VARS]] = !{[[DI_IN1]], [[DI_IN2]], [[DI_OUT]], [[DI_TID]], [[DI_A:![0-9]+]], [[DI_B:![0-9]+]]} +; CHECK: [[DI_IN1]] = !DILocalVariable(name: "in1", arg: 1, scope: [[VECZ_SUBPROG]], +; CHECK-SAME:line: 1, type: 
[[PTR_TYPE]] +; CHECK: [[DI_IN2]] = !DILocalVariable(name: "in2", arg: 2, scope: [[VECZ_SUBPROG]], +; CHECK-SAME:line: 1, type: [[PTR_TYPE]] +; CHECK: [[DI_OUT]] = !DILocalVariable(name: "out", arg: 3, scope: [[VECZ_SUBPROG]], +; CHECK-SAME: line: 1, type: [[PTR_TYPE]] + +; CHECK: [[DI_TID]] = !DILocalVariable(name: "tid", scope: [[VECZ_SUBPROG]] +; CHECK: [[DI_A]] = !DILocalVariable(name: "a", scope: [[VECZ_SUBPROG]], diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll new file mode 100644 index 0000000000000..9750f6bae94d0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_nonvarying.ll @@ -0,0 +1,93 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_nonvarying_loadstore -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_branch(i32 %a, i32* %b) { +entry: + %conv = sext i32 %a to i64 + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %conv, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42 + store i32 13, i32* %arrayidx2, align 4 + br label %if.end + +if.end: + ret void +} + +define spir_kernel void @test_uniform_branch(i32 %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i32 %a, 42 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %idxprom = sext i32 %a to i64 + %idxadd = add i64 %idxprom, %call + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxadd + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %call + store i32 13, i32* %arrayidx2, align 4 + br label %if.end + +if.end: + %ptr = phi i32* [ %arrayidx, %if.then ], [ %arrayidx2, %if.else ] + %ptrplus = getelementptr inbounds i32, i32* %ptr, i64 %call + store i32 17, i32* %ptrplus, align 4 + ret void +} + +define spir_func void @test_nonvarying_loadstore(i32* %a, i32* %b, i32* %c) { + %index = call i64 @__mux_get_global_id(i32 0) + %a.i = getelementptr i32, i32* %a, i64 %index + %b.i = getelementptr i32, i32* %b, i64 %index + %c.i = getelementptr i32, i32* %c, i64 %index + %a.load = load i32, i32* %a.i, align 4 + %b.load = load i32, i32* %b.i, align 4 + %add = add i32 %a.load, %b.load + store i32 %add, i32* %c.i + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This test checks if a simple kernel is vectorized without any masks +; 
CHECK: define spir_func void @__vecz_v4_test_nonvarying_loadstore(ptr %a, ptr %b, ptr %c) +; CHECK: %index = call i64 @__mux_get_global_id(i32 0) +; CHECK: %a.i = getelementptr i32, ptr %a, i64 %index +; CHECK: %b.i = getelementptr i32, ptr %b, i64 %index +; CHECK: %c.i = getelementptr i32, ptr %c, i64 %index +; CHECK: %[[LAV:.+]] = load <4 x i32>, ptr %a.i{{(, align 4)?}} +; CHECK: %[[LBV:.+]] = load <4 x i32>, ptr %b.i{{(, align 4)?}} +; CHECK: %[[ADD1:.+]] = add <4 x i32> %[[LAV]], %[[LBV]] +; CHECK: store <4 x i32> %[[ADD1]], ptr %c.i{{(, align 4)?}} +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll new file mode 100644 index 0000000000000..9f82652ea8a23 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetization_uniform_branch.ll @@ -0,0 +1,105 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_uniform_branch -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_branch(i32 %a, i32* %b) { +entry: + %conv = sext i32 %a to i64 + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %conv, %call + br i1 %cmp, label %if.then, label %if.else + +if.then: + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxprom + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 42 + store i32 13, i32* %arrayidx2, align 4 + br label %if.end + +if.end: + ret void +} + +define spir_kernel void @test_uniform_branch(i32 %a, i32* %b) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i32 %a, 42 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %idxprom = sext i32 %a to i64 + %idxadd = add i64 %idxprom, %call + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %idxadd + store i32 11, i32* %arrayidx, align 4 + br label %if.end + +if.else: + %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %call + store i32 13, i32* %arrayidx2, align 4 + br label %if.end + +if.end: + %ptr = phi i32* [ %arrayidx, %if.then ], [ %arrayidx2, %if.else ] + %ptrplus = getelementptr inbounds i32, i32* %ptr, i64 %call + store i32 17, i32* %ptrplus, align 4 + ret void +} + +define spir_func void @test_nonvarying_loadstore(i32* %a, i32* %b, i32* %c) { + %index = call i64 @__mux_get_global_id(i32 0) + %a.i = getelementptr i32, i32* %a, i64 %index + %b.i = getelementptr i32, i32* %b, i64 %index + %c.i = getelementptr i32, i32* %c, i64 %index + %a.load = load i32, i32* %a.i, align 4 + %b.load = load i32, i32* %b.i, align 4 + %add = add i32 %a.load, 
%b.load + store i32 %add, i32* %c.i + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This test checks if the if blocks are vectorized without masks and if the phi +; node is also vectorized properly +; CHECK: define spir_kernel void @__vecz_v4_test_uniform_branch(i32 %a, ptr %b) +; CHECK: %call = call i64 @__mux_get_global_id(i32 0) +; CHECK: %[[SPLATINSERT:.+]] = insertelement <4 x i64> poison, i64 %call, {{i32|i64}} 0 +; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i64> %[[SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: %[[GID:.+]] = add <4 x i64> %[[SPLAT]], +; CHECK: %cmp = icmp eq i32 %a, 42 +; CHECK: br i1 %cmp, label %if.then, label %if.else + +; CHECK: if.then: +; CHECK: %[[GEP1:.+]] = getelementptr i32, ptr %b, <4 x i64> +; CHECK: store <4 x i32> {{<(i32 11(, )?)+>|splat \(i32 11\)}}, ptr %{{.+}}, align 4 +; CHECK: br label %if.end + +; CHECK: if.else: +; CHECK: %[[GEP2:.+]] = getelementptr i32, ptr %b, <4 x i64> +; CHECK: store <4 x i32> {{<(i32 13(, )?)+>|splat \(i32 13\)}}, ptr %{{.+}}, align 4 +; CHECK: br label %if.end + +; CHECK: if.end: +; CHECK: %[[PTR:.+]] = phi <4 x ptr> [ %[[GEP1]], %if.then ], [ %[[GEP2]], %if.else ] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_i48.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_i48.ll new file mode 100644 index 0000000000000..7b27991e6740f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_i48.ll @@ -0,0 +1,50 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) + +define spir_kernel void @test(ptr %0, ptr %1) { +entry: + %lid = tail call i64 @__mux_get_local_id(i32 0) + %ptr.0 = getelementptr i32, ptr %0, i64 %lid + %ptr.1 = getelementptr i32, ptr %1, i64 %lid + %val = load i48, ptr %ptr.0 + store i48 %val, ptr %ptr.1 + ret void +} + +; CHECK-LABEL: define spir_kernel void @test +; CHECK: load i48 +; CHECK-NOT: load i48 +; CHECK: store i48 +; CHECK-NOT: store i48 + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_test +; CHECK: load i48 +; CHECK: load i48 +; CHECK: load i48 +; CHECK: load i48 +; CHECK-NOT: load i48 +; CHECK: store i48 +; CHECK: store i48 +; CHECK: store i48 +; CHECK: store i48 +; CHECK-NOT: store i48 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_phi_struct.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_phi_struct.ll new file mode 100644 index 0000000000000..4723d16da1af5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_phi_struct.ll @@ -0,0 +1,41 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes=packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_foo()
+define spir_kernel void @foo() {
+; CHECK-LABEL: entry:
+entry:
+  ; CHECK: %0 = call { <4 x i64>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align8_monotonic_monotonic_1_Dv4_u3ptrDv4_mDv4_mDv4_b(
+  %0 = cmpxchg ptr null, i64 0, i64 0 monotonic monotonic, align 8
+  ; CHECK: br label %bb.1
+  br label %bb.1
+
+; CHECK-LABEL: bb.1:
+bb.1:
+  ; CHECK: %1 = phi { <4 x i64>, <4 x i1> } [ %0, %bb.1 ], [ %0, %entry ]
+  %1 = phi { i64, i1 } [ %0, %bb.1 ], [ %0, %entry ]
+  ; CHECK: %2 = extractvalue { <4 x i64>, <4 x i1> } %1, 0
+  %2 = extractvalue { i64, i1 } %1, 0
+  ; CHECK: %3 = call { <4 x i64>, <4 x i1> } @__vecz_b_v4_masked_cmpxchg_align8_monotonic_monotonic_1_Dv4_u3ptrDv4_mDv4_mDv4_b(
+  %3 = cmpxchg ptr null, i64 0, i64 %2 monotonic monotonic, align 8
+  ; CHECK: br label %bb.1
+  br label %bb.1
+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
new file mode 100644
index 0000000000000..c23396643f9d1
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_struct_gep.ll
@@ -0,0 +1,46 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+%struct.T = type { i32, i8, float, i64 }
+
+; Function Attrs: nounwind
+define spir_kernel void @test(%struct.T addrspace(1)* %in, %struct.T addrspace(1)* %out, i32 addrspace(1)* %offsets) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %offsets, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %conv = sext i32 %0 to i64
+  %add = add i64 %conv, %call
+  %c = getelementptr inbounds %struct.T, %struct.T addrspace(1)* %in, i64 %add, i32 2
+  %1 = load float, float addrspace(1)* %c, align 8
+  %c3 = getelementptr inbounds %struct.T, %struct.T addrspace(1)* %out, i64 %add, i32 2
+  store float %1, float addrspace(1)* %c3, align 8
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; Check if we can packetize GEPs on structs
+; Note that we only need to packetize the non-uniform operands.
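+; As an illustrative sketch (not checked verbatim; %c.v and %add.v are
+; hypothetical names), the scalar GEP
+;   %c = getelementptr inbounds %struct.T, %struct.T addrspace(1)* %in, i64 %add, i32 2
+; is expected to widen to something like
+;   %c.v = getelementptr %struct.T, ptr addrspace(1) %in, <4 x i64> %add.v, i32 2
+; where only the varying index %add becomes a <4 x i64> vector, while the
+; uniform base pointer and the constant field index (i32 2) stay scalar.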
+; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: getelementptr %struct.T, ptr addrspace(1) %{{.+}}, <4 x i64> %{{.+}}, i32 2 +; CHECK: getelementptr %struct.T, ptr addrspace(1) %{{.+}}, <4 x i64> %{{.+}}, i32 2 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll new file mode 100644 index 0000000000000..4c5d2b32da7f4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_conditional.ll @@ -0,0 +1,159 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k conditional -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* 
%arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %0 = load i32, i32 addrspace(1)* %in, align 4 + %rem1 = and i32 %0, 1 + %tobool = icmp eq i32 %rem1, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + +; This test checks if the "packetize uniform" Vecz choice works on uniform +; values used by varying values, but not on uniform values used by other uniform +; values only. + +; CHECK: define spir_kernel void @__vecz_v4_conditional(ptr addrspace(1) %in, ptr addrspace(1) %out) +; CHECK: insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) %in, {{(i32|i64)}} 0 +; CHECK: shufflevector <4 x ptr addrspace(1)> +; CHECK: call <4 x i32> @__vecz_b_gather_load4_Dv4_jDv4_u3ptrU3AS1 +; CHECK: store <4 x i32> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll new file mode 100644 index 0000000000000..89d5118c8fbc1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_conditional.ll @@ -0,0 +1,159 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k conditional -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 
addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %0 = load i32, i32 addrspace(1)* %in, align 4 + %rem1 = and i32 %0, 1 + %tobool = icmp eq i32 %rem1, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + +; This test checks the kernel when the "packetize uniform" Vecz choice is not +; explicitly set. Currently, this means that the uniform values should not be +; packetized. + +; CHECK: define spir_kernel void @__vecz_v4_conditional(ptr addrspace(1) %in, ptr addrspace(1) %out) +; CHECK: load i32, ptr +; CHECK: insertelement <4 x i32> poison +; CHECK: shufflevector <4 x i32> +; CHECK: store <4 x i32> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll new file mode 100644 index 0000000000000..982b2352ced3a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce.ll @@ -0,0 +1,159 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k noreduce -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 
addrspace(1)* %out) #0 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0) #3
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %rem1 = and i32 %0, 1
+  %tobool = icmp eq i32 %rem1, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4
+  br label %if.end
+
+if.end: ; preds = %entry, %if.then
+  ret void
+}
+
+; This test checks how the kernel is vectorized when the "packetize uniform"
+; Vecz choice is not explicitly set. Currently this means that uniform values
+; should not be packetized.
+
+; CHECK: define spir_kernel void @__vecz_v4_noreduce(ptr addrspace(3) %in, ptr addrspace(3) %out)
+; CHECK: icmp ugt i64
+; CHECK: and i32{{.*}}, 3
+; CHECK: icmp eq i32
+; CHECK: shl i32
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
new file mode 100644
index 0000000000000..2e5a7b31a1665
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_noreduce2.ll
@@ -0,0 +1,73 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
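+;
+; An approximate OpenCL C equivalent of the noreduce2 kernel tested below (an
+; editorial reconstruction from the IR, not the original source):
+;
+;   __kernel void noreduce2(__local int *in, __local int *out) {
+;     size_t lid = get_local_id(0); // computed but unused in this kernel
+;     for (unsigned i = 1; i < get_local_size(0); i *= 2)
+;       if (37 % (i == 8 ? 17 : i) == 0) out[i] = 5;
+;   }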
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k noreduce2 -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+declare i64 @__mux_get_local_id(i32)
+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_local_size(i32)
+
+; Function Attrs: nounwind
+define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
+entry:
+  %call = call i64 @__mux_get_local_id(i32 0)
+  br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+  %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ]
+  %conv = zext i32 %storemerge to i64
+  %call1 = call i64 @__mux_get_local_size(i32 0)
+  %cmp = icmp ult i64 %conv, %call1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+  %0 = icmp eq i32 %storemerge, 8
+  %1 = select i1 %0, i32 17, i32 %storemerge
+  %rem = urem i32 37, %1
+  %cmp3 = icmp eq i32 %rem, 0
+  br i1 %cmp3, label %if.then, label %for.inc
+
+if.then: ; preds = %for.body
+  %idxprom = zext i32 %storemerge to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom
+  store i32 5, i32 addrspace(3)* %arrayidx, align 4
+  br label %for.inc
+
+for.inc: ; preds = %for.body, %if.then
+  %mul = shl i32 %storemerge, 1
+  br label %for.cond
+
+for.end: ; preds = %for.cond
+  ret void
+}
+
+; This test checks how the kernel is vectorized when the "packetize uniform"
+; Vecz choice is not explicitly set. Currently this means that uniform values
+; should not be packetized.
+
+; CHECK: define spir_kernel void @__vecz_v4_noreduce2(ptr addrspace(3) %in, ptr addrspace(3) %out)
+; CHECK: icmp ugt i64 %{{.+}}, 1
+; CHECK: phi i32
+; CHECK: icmp eq i32 %{{.+}}, 8
+; CHECK: urem i32 37
+; CHECK: icmp eq i32 %{{.+}}, 0
+; CHECK: store i32 5
+; CHECK: shl i32 %{{.+}}, 1
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
new file mode 100644
index 0000000000000..6fc47a670a781
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_default_reduce.ll
@@ -0,0 +1,165 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
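+;
+; An approximate OpenCL C equivalent of the reduce kernel tested below (an
+; editorial reconstruction from the IR, not the original source; the select
+; guarding the divisor in the IR protects against i * 3 being zero):
+;
+;   __kernel void reduce(__local int *in, __local int *out) {
+;     size_t lid = get_local_id(0);
+;     for (unsigned i = 1; i < get_local_size(0); i *= 2)
+;       if (lid % (i * 3) == 0) out[lid] = 5;
+;   }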
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k reduce -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @conditional(i32 addrspace(1)* %in, i32 
addrspace(1)* %out) #0 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0) #3
+  %0 = load i32, i32 addrspace(1)* %in, align 4
+  %rem1 = and i32 %0, 1
+  %tobool = icmp eq i32 %rem1, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+  %idxprom = sext i32 %0 to i64
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %1, i32 addrspace(1)* %arrayidx2, align 4
+  br label %if.end
+
+if.end: ; preds = %entry, %if.then
+  ret void
+}
+
+; This test checks how the kernel is vectorized when the "packetize uniform"
+; Vecz choice is not explicitly set. Currently this means that uniform values
+; should not be packetized.
+
+; CHECK: define spir_kernel void @__vecz_v4_reduce(ptr addrspace(3) %in, ptr addrspace(3) %out)
+; CHECK: insertelement <4 x i64> poison, i64
+; CHECK: shufflevector <4 x i64>
+; CHECK: %[[LOCAL_SIZE:[^ ]+]] = call i64 @__mux_get_local_size(i32 0)
+; CHECK: icmp {{(ugt|ult)}} i64 %[[LOCAL_SIZE]], {{(1|2)}}
+; CHECK-NEXT: br
+; CHECK: phi i32
+; CHECK: mul i32 %{{.+}}, 3
+; CHECK: icmp eq <4 x i64> %{{.+}}, zeroinitializer
+; CHECK: call void @__vecz_b_masked_store4_Dv4_ju3ptrU3AS3Dv4_b(<4 x i32> {{<(i32 5(, )?)+>|splat \(i32 5\)}}
+; CHECK: shl i32 %{{.+}}, 1
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
new file mode 100644
index 0000000000000..1a4a89972205f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_conditional.ll
@@ -0,0 +1,160 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
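+;
+; An approximate OpenCL C equivalent of the conditional kernel tested below
+; (an editorial reconstruction from the IR, not the original source):
+;
+;   __kernel void conditional(__global int *in, __global int *out) {
+;     size_t gid = get_global_id(0);
+;     int c = in[0];
+;     if (c & 1) out[gid] = in[c];
+;   }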
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k conditional -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void 
@conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %0 = load i32, i32 addrspace(1)* %in, align 4 + %rem1 = and i32 %0, 1 + %tobool = icmp eq i32 %rem1, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + +; This test checks if the "packetize uniform in loops" Vecz choice works on +; uniform values used by varying values in loops, but not on uniform values used +; by other uniform values only. + +; CHECK: define spir_kernel void @__vecz_v4_conditional(ptr addrspace(1) %in, ptr addrspace(1) %out) +; CHECK: load i32, {{(ptr|i32)}} +; CHECK: load i32, {{(ptr|i32)}} +; CHECK: insertelement <4 x i32> poison +; CHECK: shufflevector <4 x i32> +; CHECK: store <4 x i32> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll new file mode 100644 index 0000000000000..fd65118718f99 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce.ll @@ -0,0 +1,159 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
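+;
+; Note: these are the same kernels as in the packetize_uniform_default tests;
+; the only intended difference is -vecz-choices=PacketizeUniformInLoops on the
+; RUN line. For noreduce the uniform loop values only feed other uniform
+; values, so the CHECKs below still expect scalar code.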
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k noreduce -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void 
@conditional(i32 addrspace(1)* %in, i32 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %0 = load i32, i32 addrspace(1)* %in, align 4 + %rem1 = and i32 %0, 1 + %tobool = icmp eq i32 %rem1, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + +; This test checks if the "packetize uniform in loops" Vecz choice works on +; uniform values used by varying values in loops, but not on uniform values used +; by other uniform values only. + +; CHECK: define spir_kernel void @__vecz_v4_noreduce(ptr addrspace(3) %in, ptr addrspace(3) %out) +; CHECK: icmp ugt i64 +; CHECK: and i32{{.*}}, 3 +; CHECK: icmp eq i32 +; CHECK: shl i32 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll new file mode 100644 index 0000000000000..9e0a24b6879e6 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_noreduce2.ll @@ -0,0 +1,73 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
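+;
+; As in the other noreduce2 variants, the whole urem chain here is uniform
+; (the divisor select (i == 8 ? 17 : i) has no varying uses), so even with
+; PacketizeUniformInLoops the CHECKs below expect a scalar phi and urem.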
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k noreduce2 -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 8 + %1 = select i1 %0, i32 17, i32 %storemerge + %rem = urem i32 37, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; This test checks if the "packetize uniform in loops" Vecz choice works on +; uniform values used by varying values in loops, but not on uniform values used +; by other uniform values only. + +; CHECK: define spir_kernel void @__vecz_v4_noreduce2(ptr addrspace(3) %in, ptr addrspace(3) %out) +; CHECK: icmp ugt i64 +; CHECK: phi i32 +; CHECK: icmp eq i32 +; CHECK: urem i32 37 +; CHECK: icmp eq i32 +; CHECK: store i32 5 +; CHECK: shl i32 %{{.+}}, 1 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll new file mode 100644 index 0000000000000..e251cc4bd07e1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_loops_reduce.ll @@ -0,0 +1,76 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
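+;
+; In the reduce kernel the uniform loop counter feeds the varying computation
+; lid % (3 * i), so with PacketizeUniformInLoops the CHECKs below expect the
+; counter phi and the multiply to be packetized into <4 x i32> vectors.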
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k reduce -vecz-choices=PacketizeUniformInLoops -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; This test checks if the "packetize uniform in loops" Vecz choice works on +; uniform values used by varying values in loops, but not on uniform values used +; by other uniform values only. + +; CHECK: define spir_kernel void @__vecz_v4_reduce(ptr addrspace(3) %in, ptr addrspace(3) %out) +; CHECK: insertelement <4 x i64> poison, i64 %{{.+}}, {{(i32|i64)}} 0 +; CHECK: shufflevector <4 x i64> %{{.+}}, <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: phi <4 x i32> +; CHECK: mul <4 x i32> %{{.+}}, {{<(i32 3(, )?)+>|splat \(i32 3\)}} +; CHECK: urem <4 x i64> +; CHECK: icmp eq <4 x i64> %{{.+}}, zeroinitializer + +; The branch condition is actually Uniform, despite the divergence analysis +; CHECK: icmp ugt i64 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll new file mode 100644 index 0000000000000..93634442feb66 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce.ll @@ -0,0 +1,159 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
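+;
+; Compared with PacketizeUniformInLoops, the plain PacketizeUniform choice (as
+; exercised by these tests) packetizes uniform values with any varying use,
+; not only uses inside loops. Nothing in noreduce qualifies, so the CHECKs
+; below still expect scalar code.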
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k noreduce -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 0 + %1 = select i1 %0, i32 1, i32 %storemerge + %rem = urem i32 3, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: nounwind +define spir_kernel void @conditional(i32 
addrspace(1)* %in, i32 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #3 + %0 = load i32, i32 addrspace(1)* %in, align 4 + %rem1 = and i32 %0, 1 + %tobool = icmp eq i32 %rem1, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = sext i32 %0 to i64 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %1, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %entry, %if.then + ret void +} + +; This test checks if the "packetize uniform" Vecz choice works on uniform +; values used by varying values, but not on uniform values used by other uniform +; values only. + +; CHECK: define spir_kernel void @__vecz_v4_noreduce(ptr addrspace(3) %in, ptr addrspace(3) %out) +; CHECK: icmp ugt i64 +; CHECK: and i32{{.*}}, 3 +; CHECK: icmp eq i32 +; CHECK: shl i32 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll new file mode 100644 index 0000000000000..716ee2540db66 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_noreduce2.ll @@ -0,0 +1,73 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
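+;
+; Same noreduce2 kernel again: its uniform urem chain has no varying uses, so
+; the CHECKs below expect it to stay scalar under PacketizeUniform as well.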
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k noreduce2 -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @noreduce2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = icmp eq i32 %storemerge, 8 + %1 = select i1 %0, i32 17, i32 %storemerge + %rem = urem i32 37, %1 + %cmp3 = icmp eq i32 %rem, 0 + br i1 %cmp3, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %idxprom = zext i32 %storemerge to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %idxprom + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; This test checks if the "packetize uniform" Vecz choice works on uniform +; values used by varying values, but not on uniform values used by other uniform +; values only. + +; CHECK: define spir_kernel void @__vecz_v4_noreduce2(ptr addrspace(3) %in, ptr addrspace(3) %out) +; CHECK: icmp ugt i64 +; CHECK: phi i32 +; CHECK: icmp eq i32 +; CHECK: urem i32 37 +; CHECK: icmp eq i32 +; CHECK: store i32 5 +; CHECK: shl i32 %{{.+}}, 1 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll new file mode 100644 index 0000000000000..3815fce8e6637 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/packetize_uniform_reduce.ll @@ -0,0 +1,76 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
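+;
+; As with the PacketizeUniformInLoops variant, the loop counter in reduce
+; feeds the varying lid % (3 * i) computation, so the CHECKs below expect the
+; counter phi and multiply in packetized <4 x i32> form.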
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k reduce -vecz-choices=PacketizeUniform -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_local_size(i32) + +; Function Attrs: nounwind +define spir_kernel void @reduce(i32 addrspace(3)* %in, i32 addrspace(3)* %out) { +entry: + %call = call i64 @__mux_get_local_id(i32 0) + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i32 [ 1, %entry ], [ %mul6, %for.inc ] + %conv = zext i32 %storemerge to i64 + %call1 = call i64 @__mux_get_local_size(i32 0) + %cmp = icmp ult i64 %conv, %call1 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %mul = mul i32 %storemerge, 3 + %conv3 = zext i32 %mul to i64 + %0 = icmp eq i32 %mul, 0 + %1 = select i1 %0, i64 1, i64 %conv3 + %rem = urem i64 %call, %1 + %cmp4 = icmp eq i64 %rem, 0 + br i1 %cmp4, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %out, i64 %call + store i32 5, i32 addrspace(3)* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %mul6 = shl i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; This test checks if the "packetize uniform" Vecz choice works on uniform +; values used by varying values, but not on uniform values used by other uniform +; values only. + +; CHECK: define spir_kernel void @__vecz_v4_reduce(ptr addrspace(3) %in, ptr addrspace(3) %out) +; CHECK: insertelement <4 x i64> poison, i64 %{{.+}}, {{(i32|i64)}} 0 +; CHECK: shufflevector <4 x i64> %{{.+}}, <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: phi <4 x i32> +; CHECK: mul <4 x i32> %{{.+}}, {{<(i32 3(, )?)+>|splat \(i32 3\)}} +; CHECK: urem <4 x i64> +; CHECK: icmp eq <4 x i64> %{{.+}}, zeroinitializer + +; The branch condition is actually Uniform, despite the divergence analysis +; CHECK: icmp ugt i64 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll new file mode 100644 index 0000000000000..873ea7a983eae --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization0.ll @@ -0,0 +1,377 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization0 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; \ / +; d +; | +; e +; / \ +; / \ +; f g +; / \ / \ +; h i j k +; \ / \ / +; l m +; \ / +; \ / +; n +; +; * where node e is a uniform branch, and nodes a, f and g are varying +; branches. +; * where nodes b, c, d, h, i, j, k, l, m are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; c +; | +; b +; | +; d +; | +; e +; / \ +; f g +; | | +; i k +; | | +; h j +; | | +; l m +; \ / +; n +; +; instead of: +; +; a +; | +; b +; | +; c +; | +; d +; | +; e +; | +; g +; | +; j +; | +; k +; | +; m +; | +; f +; | +; i +; | +; h +; | +; l +; | +; n +; +; __kernel void partial_linearization0(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (id % 5 == 0) { +; for (int i = 0; i < n * 2; i++) ret++; +; } else { +; for (int i = 0; i < n / 4; i++) ret++; +; } +; +; if (n > 10) { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n + 10; i++) ret++; +; } else { // varying +; for (int i = 0; i < n + 10; i++) ret *= 2; +; } +; ret += id * 10; +; } else { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n + 8; i++) ret++; +; } else { // varying +; for (int i = 0; i < n + 8; i++) ret *= 2; +; } +; ret += id / 2; +; } +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization0(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %rem = srem i32 %conv, 5 + %cmp = icmp eq i32 %rem, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ 0, %if.then ], [ %inc, %for.body ] + %storemerge8 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge8, %mul + br i1 %cmp2, label %for.body, label %if.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.0, 1 + %inc4 = add nsw i32 %storemerge8, 1 + br label %for.cond + +if.else: ; preds = %entry + br label %for.cond6 + +for.cond6: ; preds = %for.body9, %if.else + %ret.1 = phi i32 [ 0, %if.else ], [ %inc10, %for.body9 ] + %storemerge = phi i32 [ 0, %if.else ], [ %inc12, %for.body9 ] + %div = sdiv i32 %n, 4 + %cmp7 = icmp slt i32 %storemerge, %div + br i1 %cmp7, label %for.body9, label %if.end + +for.body9: ; preds = %for.cond6 + %inc10 = add nsw i32 %ret.1, 1 + %inc12 = add nsw i32 %storemerge, 1 + br label %for.cond6 + +if.end: ; preds = %for.cond6, %for.cond + %ret.2 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond6 ] + %cmp14 = icmp sgt i32 %n, 10 + %rem175 = and i32 %conv, 1 + %cmp18 = icmp eq i32 %rem175, 0 + br i1 %cmp14, label %if.then16, label %if.else44 + +if.then16: ; preds = %if.end + br i1 %cmp18, label %if.then20, label %if.else30 + +if.then20: ; preds = %if.then16 + br label %for.cond22 + +for.cond22: ; preds = %for.body25, %if.then20 + %ret.3 = phi i32 [ %ret.2, %if.then20 ], [ %inc26, %for.body25 ] + %storemerge7 = phi i32 [ 0, %if.then20 ], [ %inc28, %for.body25 ] + %add = add nsw i32 %n, 10 + %cmp23 = icmp slt i32 
%storemerge7, %add + br i1 %cmp23, label %for.body25, label %if.end41 + +for.body25: ; preds = %for.cond22 + %inc26 = add nsw i32 %ret.3, 1 + %inc28 = add nsw i32 %storemerge7, 1 + br label %for.cond22 + +if.else30: ; preds = %if.then16 + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.else30 + %ret.4 = phi i32 [ %ret.2, %if.else30 ], [ %mul37, %for.body36 ] + %storemerge6 = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ] + %add33 = add nsw i32 %n, 10 + %cmp34 = icmp slt i32 %storemerge6, %add33 + br i1 %cmp34, label %for.body36, label %if.end41 + +for.body36: ; preds = %for.cond32 + %mul37 = shl nsw i32 %ret.4, 1 + %inc39 = add nsw i32 %storemerge6, 1 + br label %for.cond32 + +if.end41: ; preds = %for.cond32, %for.cond22 + %ret.5 = phi i32 [ %ret.3, %for.cond22 ], [ %ret.4, %for.cond32 ] + %mul42 = mul nsw i32 %conv, 10 + %add43 = add nsw i32 %ret.5, %mul42 + br label %if.end73 + +if.else44: ; preds = %if.end + br i1 %cmp18, label %if.then48, label %if.else59 + +if.then48: ; preds = %if.else44 + br label %for.cond50 + +for.cond50: ; preds = %for.body54, %if.then48 + %ret.6 = phi i32 [ %ret.2, %if.then48 ], [ %inc55, %for.body54 ] + %storemerge4 = phi i32 [ 0, %if.then48 ], [ %inc57, %for.body54 ] + %add51 = add nsw i32 %n, 8 + %cmp52 = icmp slt i32 %storemerge4, %add51 + br i1 %cmp52, label %for.body54, label %if.end70 + +for.body54: ; preds = %for.cond50 + %inc55 = add nsw i32 %ret.6, 1 + %inc57 = add nsw i32 %storemerge4, 1 + br label %for.cond50 + +if.else59: ; preds = %if.else44 + br label %for.cond61 + +for.cond61: ; preds = %for.body65, %if.else59 + %ret.7 = phi i32 [ %ret.2, %if.else59 ], [ %mul66, %for.body65 ] + %storemerge2 = phi i32 [ 0, %if.else59 ], [ %inc68, %for.body65 ] + %add62 = add nsw i32 %n, 8 + %cmp63 = icmp slt i32 %storemerge2, %add62 + br i1 %cmp63, label %for.body65, label %if.end70 + +for.body65: ; preds = %for.cond61 + %mul66 = shl nsw i32 %ret.7, 1 + %inc68 = add nsw i32 %storemerge2, 1 + br label %for.cond61 + +if.end70: ; preds = %for.cond61, %for.cond50 + %ret.8 = phi i32 [ %ret.6, %for.cond50 ], [ %ret.7, %for.cond61 ] + %div71 = sdiv i32 %conv, 2 + %add72 = add nsw i32 %ret.8, %div71 + br label %if.end73 + +if.end73: ; preds = %if.end70, %if.end41 + %storemerge3 = phi i32 [ %add72, %if.end70 ], [ %add43, %if.end41 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 
1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization0, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization0 +; CHECK: br label %[[FORCOND6PREHEADER:.+]] + +; CHECK: [[FORCOND6PREHEADER]]: +; CHECK: br label %[[FORCOND6:.+]] + +; CHECK: [[FORCONDPREHEADER:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{((%[0-9A-Za-z\.]+))|(false)}}, label %[[FORBODY:.+]], label %[[IFENDLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND6]]: +; CHECK: %[[CMP7:.+]] = icmp +; CHECK: br i1 %[[CMP7]], label %[[FORBODY9:.+]], label %[[IFENDLOOPEXIT6:.+]] + +; CHECK: [[FORBODY9]]: +; CHECK: br label %[[FORCOND6]] + +; CHECK: [[IFENDLOOPEXIT]]: +; CHECK: br label %[[IFEND:.+]] + +; CHECK: [[IFENDLOOPEXIT6]]: +; CHECK: br label %[[FORCONDPREHEADER]] + +; CHECK: [[IFEND]]: +; CHECK: %[[CMP14:.+]] = icmp +; CHECK: br i1 %[[CMP14]], label %[[IFTHEN16:.+]], label %[[IFELSE44:.+]] + +; CHECK: [[IFTHEN16]]: +; CHECK: br label %[[FORCOND32PREHEADER:.+]] + +; CHECK: [[FORCOND32PREHEADER:.+]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[FORCOND22PREHEADER:.+]]: +; CHECK: br label %[[FORCOND22:.+]] + +; CHECK: [[FORCOND22]]: +; CHECK: br i1 {{(%([0-9A-Za-z\.])+)|(false)}}, label %[[FORBODY25:.+]], label %[[IFEND41LOOPEXIT:.+]] + +; CHECK: [[FORBODY25]]: +; CHECK: br label %[[FORCOND22]] + +; CHECK: [[FORCOND32]]: +; CHECK: %[[CMP34:.+]] = icmp +; CHECK: br i1 %[[CMP34]], label %[[FORBODY36:.+]], label %[[IFEND41LOOPEXIT4:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[IFEND41LOOPEXIT]]: +; CHECK: br label %[[IFEND41:.+]] + +; CHECK: [[IFEND41LOOPEXIT4]]: +; CHECK: br label %[[FORCOND22PREHEADER]] + +; CHECK: [[IFEND41]]: +; CHECK: br label %[[IFEND73:.+]] + +; CHECK: [[IFELSE44]]: +; CHECK: br label %[[FORCOND61PREHEADER:.+]] + +; CHECK: [[FORCOND61PREHEADER]]: +; CHECK: br label %[[FORCOND61:.+]] + +; CHECK: [[FORCOND50PREHEADER:.+]]: +; CHECK: br label %[[FORCOND50:.+]] + +; CHECK: [[FORCOND50]]: +; CHECK: br i1 {{(%([0-9A-Za-z\.])+)|(false)}}, label %[[FORBODY54:.+]], label %[[IFEND70LOOPEXIT:.+]] + +; CHECK: [[FORBODY54]]: +; CHECK: br label %[[FORCOND50]] + +; CHECK: [[FORCOND61]]: +; CHECK: %[[CMP63:.+]] = icmp +; CHECK: br i1 %[[CMP63]], label %[[FORBODY65:.+]], label %[[IFEND70LOOPEXIT5:.+]] + +; CHECK: [[FORBODY65]]: +; CHECK: br label %[[FORCOND61]] + +; CHECK: [[IFEND70LOOPEXIT]]: +; CHECK: br label %[[IFEND70:.+]] + +; CHECK: [[IFEND70LOOPEXIT5]]: +; CHECK: br label %[[FORCOND50PREHEADER]] + +; CHECK: [[IFEND70]]: +; CHECK: br label %[[IFEND73]] + +; CHECK: [[IFEND73]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll new file mode 100644 index 0000000000000..7e5becea883fa --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization1.ll @@ -0,0 +1,261 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in 
compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization1 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-. +; / \ | +; c d | +; / \ / | +; e f --' +; \ | +; \ g +; \| +; h +; +; * where nodes c and f are uniform branches, and node b is a varying +; branch. +; * where nodes c, d, e, f, g and h are divergent. +; +; With partial linearization, it can be transformed in the following way: +; +; a +; | +; b <. +; | | +; d | +; | | +; c | +; | | +; f -' +; | +; g +; | +; e +; | +; h +; +; __kernel void partial_linearization1(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (id + i % 2 == 0) { +; if (n > 2) { +; goto e; +; } +; } else { +; for (int i = 0; i < n + 10; i++) ret++; +; } +; if (n <= 2) break; +; } +; +; ret += n * 2; +; for (int i = 0; i < n * 2; i++) ret -= i; +; ret /= n; +; goto early; +; +; e: +; for (int i = 0; i < n + 5; i++) ret /= 2; +; ret -= n; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization1(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end14, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %ret.2, %if.end14 ] + %cmp = icmp eq i32 %conv, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + %cmp2 = icmp sgt i32 %n, 2 + br i1 %cmp2, label %e, label %if.end10 + +if.else: ; preds = %while.body + br label %for.cond + +for.cond: ; preds = %for.body, %if.else + %ret.1 = phi i32 [ %ret.0, %if.else ], [ %inc, %for.body ] + %storemerge = phi i32 [ 0, %if.else ], [ %inc9, %for.body ] + %add6 = add nsw i32 %n, 10 + %cmp7 = icmp slt i32 %storemerge, %add6 + br i1 %cmp7, label %for.body, label %if.end10 + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.1, 1 + %inc9 = add nsw i32 %storemerge, 1 + br label %for.cond + +if.end10: ; preds = %for.cond, %if.then + %ret.2 = phi i32 [ %ret.0, %if.then ], [ %ret.1, %for.cond ] + %cmp11 = icmp slt i32 %n, 3 + br i1 %cmp11, label %while.end, label %if.end14 + +if.end14: ; preds = %if.end10 + br label %while.body + +while.end: ; preds = %if.end10 + %mul = mul i32 %n, 2 + %add15 = add nsw i32 %ret.2, %mul + br label %for.cond17 + +for.cond17: ; preds = %for.body21, %while.end + %ret.3 = phi i32 [ %add15, %while.end ], [ %sub, %for.body21 ] + %storemerge1 = phi i32 [ 0, %while.end ], [ %inc23, %for.body21 ] + %mul18 = shl nsw i32 %n, 1 + %cmp19 = icmp slt i32 %storemerge1, %mul18 + br i1 %cmp19, label %for.body21, label %for.end24 + +for.body21: ; preds = %for.cond17 + %sub = sub nsw i32 %ret.3, %storemerge1 + %inc23 = add nsw i32 
%storemerge1, 1 + br label %for.cond17 + +for.end24: ; preds = %for.cond17 + %0 = icmp eq i32 %ret.3, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %ret.3, %5 + br label %early + +e: ; preds = %if.then + br label %for.cond26 + +for.cond26: ; preds = %for.body30, %e + %ret.4 = phi i32 [ %ret.0, %e ], [ %div31, %for.body30 ] + %storemerge3 = phi i32 [ 0, %e ], [ %inc33, %for.body30 ] + %add27 = add nsw i32 %n, 5 + %cmp28 = icmp slt i32 %storemerge3, %add27 + br i1 %cmp28, label %for.body30, label %for.end34 + +for.body30: ; preds = %for.cond26 + %div31 = sdiv i32 %ret.4, 2 + %inc33 = add nsw i32 %storemerge3, 1 + br label %for.cond26 + +for.end34: ; preds = %for.cond26 + %sub35 = sub nsw i32 %ret.4, %n + br label %early + +early: ; preds = %for.end34, %for.end24 + %storemerge2 = phi i32 [ %div, %for.end24 ], [ %sub35, %for.end34 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization1, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization1 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[IFTHEN:.+]]: +; CHECK: br label %[[IFEND10:.+]] + +; CHECK: [[FORCOND26PREHEADER:.+]]: +; CHECK: br label %[[FORCOND26:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[IFEND10LOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[IFEND10LOOPEXIT]]: +; CHECK: br label %[[IFTHEN]] + +; CHECK: [[IFEND10]]: +; CHECK: %[[CMP11:.+]] = icmp +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[WHILEEND:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: br label %[[FORCOND17:.+]] + +; CHECK: [[WHILEENDELSE:.+]]: +; CHECK: br 
label %[[FORCOND26PREHEADER]] + +; CHECK: [[FORCOND17]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY21:.+]], label %[[FOREND24:.+]] + +; CHECK: [[FORBODY21]]: +; CHECK: br label %[[FORCOND17]] + +; CHECK: [[FOREND24]]: +; CHECK: br label %[[WHILEENDELSE]] + +; CHECK: [[FORCOND26]]: +; CHECK: %[[CMP28:.+]] = icmp +; CHECK: br i1 %[[CMP28]], label %[[FORBODY30:.+]], label %[[FOREND34:.+]] + +; CHECK: [[FORBODY30]]: +; CHECK: br label %[[FORCOND26]] + +; CHECK: [[FOREND34]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll new file mode 100644 index 0000000000000..17d186cc11900 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization10.ll @@ -0,0 +1,465 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization10 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; / g <---. | +; / / \ | | +; / h i | | +; f / \ / \ | | +; | j k l | | +; | /| / \ / | | +; | m | n o --' | +; | / |/ | +; |/ q ----------' +; p | +; \ r +; \ / +; s +; +; * where nodes b, c, g, h, j, k and q are uniform branches, and node i is a +; varying branch. +; * where nodes k, l, o, n, m, p, q, r and s are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; / g <---. 
| +; f / \ | | +; | / \ | | +; | h i | | +; | / \ | | | +; | j | l | | +; | | \ / | | +; | | k | | +; | \ | | | +; | \ o ---' | +; | \ / | +; | n | +; \ | | +; \ q -------' +; \ / +; m +; | +; r +; | +; p +; | +; s +; +; __kernel void partial_linearization10(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0) { // b +; // c +; for (int i = 0; i < n * 2; i++) ret++; +; if (n <= 10) { +; // f +; goto f; +; } +; } else { +; // d +; for (int i = 0; i < n / 4; i++) ret++; +; } +; // e +; ret++; +; while (1) { +; if (n & 1) { // g +; // h +; if (n < 3) { +; // j +; goto j; +; } +; } else { +; // i +; if (ret + id >= n) { +; // l +; ret /= n * n + ret; +; goto o; +; } +; } +; // k +; if (n & 1) { +; // n +; ret += n * ret; +; goto n; +; } +; // o +; o: +; ret++; +; } +; j: +; if (n < 2) { +; // m +; ret += n * 2 + 20; +; goto p; +; } else { +; goto q; +; } +; n: +; ret *= 4; +; q: +; if (n & 1) { +; // r +; ret++; +; goto r; +; } +; } +; +; r: +; for (int i = 0; i < n / 4; i++) ret++; +; goto s; +; +; f: +; ret /= n; +; goto p; +; +; p: +; for (int i = 0; i < n * 2; i++) ret++; +; +; s: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization10(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end55, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %ret.5, %if.end55 ] + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.1 = phi i32 [ %ret.0, %if.then ], [ %inc, %for.body ] + %storemerge5 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge5, %mul + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.1, 1 + %inc4 = add nsw i32 %storemerge5, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp5 = icmp slt i32 %n, 11 + br i1 %cmp5, label %f, label %if.end17 + +if.else: ; preds = %while.body + br label %for.cond9 + +for.cond9: ; preds = %for.body12, %if.else + %ret.2 = phi i32 [ %ret.0, %if.else ], [ %inc13, %for.body12 ] + %storemerge = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div = sdiv i32 %n, 4 + %cmp10 = icmp slt i32 %storemerge, %div + br i1 %cmp10, label %for.body12, label %if.end17 + +for.body12: ; preds = %for.cond9 + %inc13 = add nsw i32 %ret.2, 1 + %inc15 = add nsw i32 %storemerge, 1 + br label %for.cond9 + +if.end17: ; preds = %for.cond9, %for.end + %ret.3 = phi i32 [ %ret.1, %for.end ], [ %ret.2, %for.cond9 ] + br label %while.body20 + +while.body20: ; preds = %o, %if.end17 + %storemerge1.in = phi i32 [ %ret.3, %if.end17 ], [ %ret.4, %o ] + %storemerge1 = add nsw i32 %storemerge1.in, 1 + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %if.else26, label %if.then21 + +if.then21: ; preds = %while.body20 + %cmp22 = icmp slt i32 %n, 3 + br i1 %cmp22, label %j, label %if.end34 + +if.else26: ; preds = %while.body20 + %add = add nsw i32 %storemerge1, %conv + %cmp27 = icmp slt i32 %add, %n + br i1 %cmp27, label %if.end34, label %if.then29 + +if.then29: ; preds = %if.else26 + %mul30 = mul nsw i32 %n, %n + %add31 = add nsw i32 
%storemerge1, %mul30 + %0 = icmp eq i32 %add31, 0 + %1 = select i1 %0, i32 1, i32 %add31 + %div32 = sdiv i32 %storemerge1, %1 + br label %o + +if.end34: ; preds = %if.else26, %if.then21 + %and35 = and i32 %n, 1 + %tobool36 = icmp eq i32 %and35, 0 + br i1 %tobool36, label %o, label %if.then37 + +if.then37: ; preds = %if.end34 + %mul38 = mul nsw i32 %storemerge1, %n + %add39 = add nsw i32 %mul38, %storemerge1 + %mul50 = shl nsw i32 %add39, 2 + br label %q + +o: ; preds = %if.end34, %if.then29 + %ret.4 = phi i32 [ %div32, %if.then29 ], [ %storemerge1, %if.end34 ] + br label %while.body20 + +j: ; preds = %if.then21 + %cmp42 = icmp eq i32 %n, 2 + br i1 %cmp42, label %q, label %if.then44 + +if.then44: ; preds = %j + %mul45 = mul i32 %n, 2 + %add46 = add nsw i32 %mul45, 20 + %add47 = add nsw i32 %add46, %storemerge1 + br label %p + +q: ; preds = %j, %if.then37 + %ret.5 = phi i32 [ %mul50, %if.then37 ], [ %storemerge1, %j ] + %and51 = and i32 %n, 1 + %tobool52 = icmp eq i32 %and51, 0 + br i1 %tobool52, label %if.end55, label %if.then53 + +if.then53: ; preds = %q + br label %for.cond57 + +if.end55: ; preds = %q + br label %while.body + +for.cond57: ; preds = %for.body61, %if.then53 + %ret.6.in = phi i32 [ %ret.5, %if.then53 ], [ %ret.6, %for.body61 ] + %storemerge2 = phi i32 [ 0, %if.then53 ], [ %inc64, %for.body61 ] + %ret.6 = add nsw i32 %ret.6.in, 1 + %div58 = sdiv i32 %n, 4 + %cmp59 = icmp slt i32 %storemerge2, %div58 + br i1 %cmp59, label %for.body61, label %s + +for.body61: ; preds = %for.cond57 + %inc64 = add nsw i32 %storemerge2, 1 + br label %for.cond57 + +f: ; preds = %for.end + %2 = icmp eq i32 %ret.1, -2147483648 + %3 = icmp eq i32 %n, -1 + %4 = and i1 %3, %2 + %5 = icmp eq i32 %n, 0 + %6 = or i1 %5, %4 + %7 = select i1 %6, i32 1, i32 %n + %div66 = sdiv i32 %ret.1, %7 + br label %p + +p: ; preds = %f, %if.then44 + %storemerge3 = phi i32 [ %add47, %if.then44 ], [ %div66, %f ] + br label %for.cond68 + +for.cond68: ; preds = %for.body72, %p + %ret.7 = phi i32 [ %storemerge3, %p ], [ %inc73, %for.body72 ] + %storemerge4 = phi i32 [ 0, %p ], [ %inc75, %for.body72 ] + %mul69 = shl nsw i32 %n, 1 + %cmp70 = icmp slt i32 %storemerge4, %mul69 + br i1 %cmp70, label %for.body72, label %s + +for.body72: ; preds = %for.cond68 + %inc73 = add nsw i32 %ret.7, 1 + %inc75 = add nsw i32 %storemerge4, 1 + br label %for.cond68 + +s: ; preds = %for.cond68, %for.cond57 + %ret.8 = phi i32 [ %ret.6, %for.cond57 ], [ %ret.7, %for.cond68 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.8, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin 
nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization10, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization10 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]] + +; CHECK: [[FORCOND9PREHEADER]]: +; CHECK: br label %[[FORCOND9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]] + +; CHECK: [[FORCOND9]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND9]] + +; CHECK: [[IFEND17LOOPEXIT]]: +; CHECK: br label %[[IFEND17]] + +; CHECK: [[IFEND17]]: +; CHECK: br label %[[WHILEBODY20:.+]] + +; CHECK: [[WHILEBODY20]]: +; CHECK: %[[AND:.+]] = and i32 +; CHECK: %[[TOBOOL:.+]] = icmp eq i32 %[[AND]] +; CHECK: br i1 %[[TOBOOL]], label %[[IFELSE26:.+]], label %[[IFTHEN21:.+]] + +; CHECK: [[IFTHEN21]]: +; CHECK: %[[CMP22:.+]] = icmp +; CHECK: br i1 %[[CMP22]], label %[[J:.+]], label %[[IFEND34:.+]] + +; CHECK: [[IFELSE26]]: +; CHECK: br label %[[IFTHEN29:.+]] + +; CHECK: [[IFTHEN29]]: +; CHECK: br label %[[IFEND34]] + +; CHECK: [[IFEND34]]: +; CHECK: br label %[[O:.+]] + +; CHECK: [[IFTHEN37:.+]]: +; CHECK: br label %[[IFTHEN37ELSE:.+]] + +; CHECK: [[IFTHEN37ELSE]]: +; CHECK: br i1 %{{.+}}, label %[[JELSE:.+]], label %[[JSPLIT:.+]] + +; CHECK: [[O]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]] + +; CHECK: [[WHILEBODY20PUREEXIT]]: +; CHECK: br label %[[IFTHEN37]] + +; CHECK: [[J]]: +; CHECK: br label %[[WHILEBODY20PUREEXIT]] + +; CHECK: [[JELSE]]: +; CHECK: br label %[[Q:.+]] + +; CHECK: [[JSPLIT]]: +; CHECK: br label %[[Q]] + +; CHECK: [[IFTHEN44:.+]]: +; CHECK: br label %[[IFTHEN44ELSE:.+]] + +; CHECK: [[IFTHEN44ELSE]]: +; CHECK: br label %[[FORCOND57PREHEADER:.+]] + +; CHECK: [[Q]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFTHEN44]] + +; CHECK: [[FORCOND57PREHEADER]]: +; CHECK: br label %[[FORCOND57:.+]] + +; CHECK: [[FORCOND57PREHEADERELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]] + +; CHECK: [[FORCOND57]]: +; CHECK: %[[CMP59:.+]] = icmp +; CHECK: br i1 %[[CMP59]], label %[[FORBODY61:.+]], label %[[SLOOPEXIT2:.+]] + +; CHECK: [[FORBODY61]]: +; CHECK: br label %[[FORCOND57]] + +; CHECK: [[F]]: +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[FELSE]]: +; CHECK: br label %[[P:.+]] + +; CHECK: [[FSPLIT]]: +; CHECK: br label %[[P]] + +; CHECK: [[P]]: +; CHECK: br label %[[FORCOND68:.+]] + +; CHECK: [[FORCOND68]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label 
%[[FORBODY72:.+]], label %[[SLOOPEXIT:.+]] + +; CHECK: [[FORBODY72]]: +; CHECK: br label %[[FORCOND68]] + +; CHECK: [[SLOOPEXIT]]: +; CHECK: br label %[[S:.+]] + +; CHECK: [[SLOOPEXIT2]]: +; CHECK: br label %[[FORCOND57PREHEADERELSE]] + +; CHECK: [[S]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll new file mode 100644 index 0000000000000..7721a7577a09a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization11.ll @@ -0,0 +1,357 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization11 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-------. +; | | +; c <---. | +; / \ | | +; d e | | +; / \ / \ | | +; i f g | | +; | / \ / \| | +; | j h --' | +; | | \ | +; | | k | +; | \ / | +; | \ / | +; | \ / | +; | \ / | +; | l -----' +; | / +; \ m +; \ / +; n +; +; * where nodes c, d, f, g, and l are uniform branches, and node e is a +; varying branch. +; * where nodes i, f, g, j, h, k, l, m and n are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; b <----. +; | | +; c <--. 
| +; / \ | | +; d e | | +; | | | | +; | g | | +; \ / | | +; f | | +; | | | +; h ---' | +; | | +; k | +; | | +; j | +; | | +; l -----' +; | +; m +; | +; i +; | +; n +; +; __kernel void partial_linearization11(__global int *out, int n) { +; // a +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; // b +; while (1) { +; if (n < 5) { // c +; // d +; for (int i = 0; i < n * 2; i++) ret++; +; if (n <= 3) { +; // i +; goto i; +; } +; } else { +; // e +; if (ret + id >= n) { +; // g +; ret /= n * n + ret; +; if (n <= 10) { +; goto k; +; } else { +; goto h; +; } +; } +; } +; // f +; ret *= n; +; if (n & 1) { +; goto j; +; } +; +; // h +; h: +; ret++; +; } +; +; j: +; ret += n * 2 + 20; +; goto l; +; +; k: +; ret *= n; +; goto l; +; +; l: +; if (n & 1) { +; // m +; ret++; +; goto m; +; } +; } +; +; m: +; for (int i = 0; i < n / 4; i++) ret++; +; goto n; +; +; i: +; ret /= n; +; +; n: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization11(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end33, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %storemerge, %if.end33 ] + br label %while.body2 + +while.body2: ; preds = %h, %while.body + %ret.1 = phi i32 [ %ret.0, %while.body ], [ %inc24, %h ] + %cmp = icmp slt i32 %n, 5 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body2 + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.2 = phi i32 [ %ret.1, %if.then ], [ %inc, %for.body ] + %storemerge2 = phi i32 [ 0, %if.then ], [ %inc6, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp4 = icmp slt i32 %storemerge2, %mul + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.2, 1 + %inc6 = add nsw i32 %storemerge2, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp7 = icmp slt i32 %n, 4 + br i1 %cmp7, label %i44, label %if.end20 + +if.else: ; preds = %while.body2 + %add = add nsw i32 %ret.1, %conv + %cmp10 = icmp slt i32 %add, %n + br i1 %cmp10, label %if.end20, label %if.then12 + +if.then12: ; preds = %if.else + %mul13 = mul nsw i32 %n, %n + %add14 = add nsw i32 %ret.1, %mul13 + %0 = icmp eq i32 %ret.1, -2147483648 + %1 = icmp eq i32 %add14, -1 + %2 = and i1 %0, %1 + %3 = icmp eq i32 %add14, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %add14 + %div = sdiv i32 %ret.1, %5 + %cmp15 = icmp slt i32 %n, 11 + br i1 %cmp15, label %k, label %h + +if.end20: ; preds = %if.else, %for.end + %ret.3 = phi i32 [ %ret.2, %for.end ], [ %ret.1, %if.else ] + %mul21 = mul nsw i32 %ret.3, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %h, label %j + +h: ; preds = %if.end20, %if.then12 + %ret.4 = phi i32 [ %div, %if.then12 ], [ %mul21, %if.end20 ] + %inc24 = add nsw i32 %ret.4, 1 + br label %while.body2 + +j: ; preds = %if.end20 + %mul25 = mul i32 %n, 2 + %add26 = add nsw i32 %mul25, 20 + %add27 = add nsw i32 %add26, %mul21 + br label %l + +k: ; preds = %if.then12 + %mul28 = mul nsw i32 %div, %n + br label %l + +l: ; preds = %k, %j + %storemerge = phi i32 [ %add27, %j ], [ %mul28, %k ] + %and29 = and i32 %n, 1 + %tobool30 = icmp eq i32 %and29, 0 + br i1 %tobool30, label %if.end33, label %if.then31 + +if.then31: ; preds = %l + br label 
%for.cond35 + +if.end33: ; preds = %l + br label %while.body + +for.cond35: ; preds = %for.body39, %if.then31 + %ret.5.in = phi i32 [ %storemerge, %if.then31 ], [ %ret.5, %for.body39 ] + %storemerge1 = phi i32 [ 0, %if.then31 ], [ %inc42, %for.body39 ] + %ret.5 = add nsw i32 %ret.5.in, 1 + %div36 = sdiv i32 %n, 4 + %cmp37 = icmp slt i32 %storemerge1, %div36 + br i1 %cmp37, label %for.body39, label %n46 + +for.body39: ; preds = %for.cond35 + %inc42 = add nsw i32 %storemerge1, 1 + br label %for.cond35 + +i44: ; preds = %for.end + %6 = icmp eq i32 %ret.2, -2147483648 + %7 = icmp eq i32 %n, -1 + %8 = and i1 %7, %6 + %9 = icmp eq i32 %n, 0 + %10 = or i1 %9, %8 + %11 = select i1 %10, i32 1, i32 %n + %div45 = sdiv i32 %ret.2, %11 + br label %n46 + +n46: ; preds = %i44, %for.cond35 + %ret.6 = phi i32 [ %div45, %i44 ], [ %ret.5, %for.cond35 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization11, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization11 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[WHILEBODY2:.+]] + +; CHECK: [[WHILEBODY2]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[IFELSE:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: br label %[[IFEND20:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[IFTHEN12:.+]] + +; CHECK: [[IFTHEN12]]: +; CHECK: br label %[[IFEND20]] + +; CHECK: [[IFEND20]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[H]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY2]], label %[[WHILEBODY2PUREEXIT:.+]] + +; CHECK: [[WHILEBODY2PUREEXIT:.+]]: +; CHECK: br label %[[K:.+]] + +; CHECK: [[J:.+]]: +; CHECK: br label %[[L:.+]] + +; CHECK: [[K]]: +; CHECK: br label %[[KELSE:.+]] + +; 
CHECK: [[KELSE]]: +; CHECK: br label %[[J]] + +; CHECK: [[L]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[FORCOND35PREHEADER:.+]] + +; CHECK: [[FORCOND35PREHEADER]]: +; CHECK: br label %[[FORCOND35:.+]] + +; CHECK: [[FORCOND35PREHEADERELSE:.+]]: +; CHECK: br label %[[I44:.+]] + +; CHECK: [[FORCOND35]]: +; CHECK: %[[CMP37:.+]] = icmp +; CHECK: br i1 %[[CMP37]], label %[[FORBODY39:.+]], label %[[N46LOOPEXIT:.+]] + +; CHECK: [[FORBODY39]]: +; CHECK: br label %[[FORCOND35]] + +; CHECK: [[I44]]: +; CHECK: br label %[[N46:.+]] + +; CHECK: [[N46LOOPEXIT]]: +; CHECK: br label %[[FORCOND35PREHEADERELSE]] + +; CHECK: [[N46]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll new file mode 100644 index 0000000000000..be2f0f909e0c3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization12.ll @@ -0,0 +1,627 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization12 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; / g <---. | +; f / \ | | +; | h i | | +; | / / \ | | +; | / k l | | +; | / |\ /| | | +; |/ |/ \| | | +; j m n | | +; /| / \ / | | +; / | o p --' | +; / | / / | +; | | / r | +; | | / | | +; | |/ s ------' +; | | / +; | /| t +; | / | / +; |/ | / +; q | / +; | |/ +; | u +; \ / +; v +; +; * where nodes b, c, g, j, k, l, m, p and s are uniform branches, +; and node i is a varying branch. +; * where nodes k, l, o, n, m, p, q, s, r, t and v are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; b <----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; f g <--. 
| +; | / \ | | +; | h i | | +; | / | | | +; | / l | | +; |/ | | | +; j k | | +; |\ | | | +; | \ n | | +; | \ | | | +; | | m | | +; | | | | | +; | | p -' | +; | | / | +; | | r | +; | | | | +; | | s -----' +; | |/ +; | o +; | / +; | t +; |/ +; u +; | +; q +; | +; v +; +; __kernel void partial_linearization12(__global int *out, int n) { +; // a +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0) { // b +; // c +; for (int i = 0; i < n * 2; i++) ret++; +; if (n < 5) { +; // f +; goto f; +; } +; } else { +; // d +; for (int i = 0; i < n / 4; i++) ret++; +; } +; // e +; ret++; +; while (1) { +; if (n <= 2) { // g +; // h +; ret -= n * ret; +; for (int i = 0; i < n * 2; i++) ret++; +; // j +; goto j; +; } else { +; // i +; if (ret + id >= n) { +; // k +; ret /= n * n + ret; +; if (n < 5) { +; // m +; ret -= n; +; goto m; +; } else { +; // n +; ret += n; +; goto n; +; } +; } else { +; // l +; if (n >= 5) { +; // m +; ret += n; +; goto m; +; } else { +; // n +; ret -= n; +; goto n; +; } +; } +; } +; // m +; m: +; if (n & 1) { +; // o +; ret *= n; +; goto q; +; } else { +; // p +; goto p; +; } +; +; // n +; n: +; ret *= ret; +; // p +; p: +; if (n > 3) { +; goto r; +; } +; ret++; +; } +; +; // r +; r: +; ret *= 4; +; for (int i = 0; i < n / 4; i++) ret++; +; +; // s +; if (n & 1) { +; goto t; +; } +; ret++; +; } +; +; f: +; ret /= n; +; goto j; +; +; j: +; if (n == 2) { +; goto q; +; } else { +; goto u; +; } +; +; t: +; for (int i = 0; i < n + 1; i++) ret++; +; goto u; +; +; q: +; for (int i = 0; i < n / 4; i++) ret++; +; goto v; +; +; u: +; for (int i = 0; i < n * 2; i++) ret++; +; +; v: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization12(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end79, %entry + %storemerge = phi i32 [ 0, %entry ], [ %inc80, %if.end79 ] + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + br label %for.cond + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ %storemerge, %if.then ], [ %inc, %for.body ] + %storemerge10 = phi i32 [ 0, %if.then ], [ %inc4, %for.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge10, %mul + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.0, 1 + %inc4 = add nsw i32 %storemerge10, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp5 = icmp slt i32 %n, 5 + br i1 %cmp5, label %f, label %if.end17 + +if.else: ; preds = %while.body + br label %for.cond9 + +for.cond9: ; preds = %for.body12, %if.else + %ret.1 = phi i32 [ %storemerge, %if.else ], [ %inc13, %for.body12 ] + %storemerge1 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div = sdiv i32 %n, 4 + %cmp10 = icmp slt i32 %storemerge1, %div + br i1 %cmp10, label %for.body12, label %if.end17 + +for.body12: ; preds = %for.cond9 + %inc13 = add nsw i32 %ret.1, 1 + %inc15 = add nsw i32 %storemerge1, 1 + br label %for.cond9 + +if.end17: ; preds = %for.cond9, %for.end + %ret.2 = phi i32 [ %ret.0, %for.end ], [ %ret.1, %for.cond9 ] + br label %while.body20 + +while.body20: ; preds = %if.end63, %if.end17 + %storemerge2.in = phi i32 [ %ret.2, %if.end17 ], [ %ret.4, %if.end63 ] 
+ %storemerge2 = add nsw i32 %storemerge2.in, 1 + %cmp21 = icmp slt i32 %n, 3 + br i1 %cmp21, label %if.then23, label %if.else35 + +if.then23: ; preds = %while.body20 + %mul24 = mul nsw i32 %storemerge2, %n + %sub = sub nsw i32 %storemerge2, %mul24 + br label %for.cond26 + +for.cond26: ; preds = %for.body30, %if.then23 + %ret.3 = phi i32 [ %sub, %if.then23 ], [ %inc31, %for.body30 ] + %storemerge9 = phi i32 [ 0, %if.then23 ], [ %inc33, %for.body30 ] + %mul27 = shl nsw i32 %n, 1 + %cmp28 = icmp slt i32 %storemerge9, %mul27 + br i1 %cmp28, label %for.body30, label %j + +for.body30: ; preds = %for.cond26 + %inc31 = add nsw i32 %ret.3, 1 + %inc33 = add nsw i32 %storemerge9, 1 + br label %for.cond26 + +if.else35: ; preds = %while.body20 + %add = add nsw i32 %storemerge2, %conv + %cmp36 = icmp slt i32 %add, %n + br i1 %cmp36, label %if.else48, label %if.then38 + +if.then38: ; preds = %if.else35 + %mul39 = mul nsw i32 %n, %n + %add40 = add nsw i32 %storemerge2, %mul39 + %0 = icmp eq i32 %add40, 0 + %1 = select i1 %0, i32 1, i32 %add40 + %div41 = sdiv i32 %storemerge2, %1 + %cmp42 = icmp slt i32 %n, 5 + br i1 %cmp42, label %if.then44, label %if.else46 + +if.then44: ; preds = %if.then38 + %sub45 = sub nsw i32 %div41, %n + br label %m + +if.else46: ; preds = %if.then38 + %add47 = add nsw i32 %div41, %n + br label %n58 + +if.else48: ; preds = %if.else35 + %cmp49 = icmp sgt i32 %n, 4 + br i1 %cmp49, label %if.then51, label %if.else53 + +if.then51: ; preds = %if.else48 + %add52 = add nsw i32 %storemerge2, %n + br label %m + +if.else53: ; preds = %if.else48 + %sub54 = sub nsw i32 %storemerge2, %n + br label %n58 + +m: ; preds = %if.then51, %if.then44 + %storemerge7 = phi i32 [ %add52, %if.then51 ], [ %sub45, %if.then44 ] + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %p, label %if.then55 + +if.then55: ; preds = %m + %mul56 = mul nsw i32 %storemerge7, %n + br label %q + +n58: ; preds = %if.else53, %if.else46 + %storemerge3 = phi i32 [ %sub54, %if.else53 ], [ %add47, %if.else46 ] + %mul59 = mul nsw i32 %storemerge3, %storemerge3 + br label %p + +p: ; preds = %n58, %m + %ret.4 = phi i32 [ %mul59, %n58 ], [ %storemerge7, %m ] + %cmp60 = icmp sgt i32 %n, 3 + br i1 %cmp60, label %r, label %if.end63 + +if.end63: ; preds = %p + br label %while.body20 + +r: ; preds = %p + %mul65 = shl nsw i32 %ret.4, 2 + br label %for.cond67 + +for.cond67: ; preds = %for.body71, %r + %ret.5 = phi i32 [ %mul65, %r ], [ %inc72, %for.body71 ] + %storemerge4 = phi i32 [ 0, %r ], [ %inc74, %for.body71 ] + %div68 = sdiv i32 %n, 4 + %cmp69 = icmp slt i32 %storemerge4, %div68 + br i1 %cmp69, label %for.body71, label %for.end75 + +for.body71: ; preds = %for.cond67 + %inc72 = add nsw i32 %ret.5, 1 + %inc74 = add nsw i32 %storemerge4, 1 + br label %for.cond67 + +for.end75: ; preds = %for.cond67 + %and76 = and i32 %n, 1 + %tobool77 = icmp eq i32 %and76, 0 + br i1 %tobool77, label %if.end79, label %t + +if.end79: ; preds = %for.end75 + %inc80 = add nsw i32 %ret.5, 1 + br label %while.body + +f: ; preds = %for.end + %2 = icmp eq i32 %n, 0 + %3 = select i1 %2, i32 1, i32 %n + %div81 = sdiv i32 %ret.0, %3 + br label %j + +j: ; preds = %f, %for.cond26 + %ret.6 = phi i32 [ %div81, %f ], [ %ret.3, %for.cond26 ] + %cmp82 = icmp eq i32 %n, 2 + br i1 %cmp82, label %q, label %u + +t: ; preds = %for.end75 + br label %for.cond87 + +for.cond87: ; preds = %for.body91, %t + %ret.7 = phi i32 [ %ret.5, %t ], [ %inc92, %for.body91 ] + %storemerge5 = phi i32 [ 0, %t ], [ %inc94, %for.body91 ] + %cmp89 = icmp sgt i32 %storemerge5, 
%n + br i1 %cmp89, label %u, label %for.body91 + +for.body91: ; preds = %for.cond87 + %inc92 = add nsw i32 %ret.7, 1 + %inc94 = add nsw i32 %storemerge5, 1 + br label %for.cond87 + +q: ; preds = %j, %if.then55 + %ret.8 = phi i32 [ %mul56, %if.then55 ], [ %ret.6, %j ] + br label %for.cond97 + +for.cond97: ; preds = %for.body101, %q + %ret.9 = phi i32 [ %ret.8, %q ], [ %inc102, %for.body101 ] + %storemerge8 = phi i32 [ 0, %q ], [ %inc104, %for.body101 ] + %div98 = sdiv i32 %n, 4 + %cmp99 = icmp slt i32 %storemerge8, %div98 + br i1 %cmp99, label %for.body101, label %v + +for.body101: ; preds = %for.cond97 + %inc102 = add nsw i32 %ret.9, 1 + %inc104 = add nsw i32 %storemerge8, 1 + br label %for.cond97 + +u: ; preds = %for.cond87, %j + %ret.10 = phi i32 [ %ret.6, %j ], [ %ret.7, %for.cond87 ] + br label %for.cond107 + +for.cond107: ; preds = %for.body111, %u + %ret.11 = phi i32 [ %ret.10, %u ], [ %inc112, %for.body111 ] + %storemerge6 = phi i32 [ 0, %u ], [ %inc114, %for.body111 ] + %mul108 = shl nsw i32 %n, 1 + %cmp109 = icmp slt i32 %storemerge6, %mul108 + br i1 %cmp109, label %for.body111, label %v + +for.body111: ; preds = %for.cond107 + %inc112 = add nsw i32 %ret.11, 1 + %inc114 = add nsw i32 %storemerge6, 1 + br label %for.cond107 + +v: ; preds = %for.cond107, %for.cond97 + %ret.12 = phi i32 [ %ret.9, %for.cond97 ], [ %ret.11, %for.cond107 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.12, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization12, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization12 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]] + +; CHECK: [[FORCOND9PREHEADER]]: +; CHECK: br label %[[FORCOND9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; 
CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]] + +; CHECK: [[FORCOND9]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND9]] + +; CHECK: [[IFEND17LOOPEXIT]]: +; CHECK: br label %[[IFEND17]] + +; CHECK: [[IFEND17]]: +; CHECK: br label %[[WHILEBODY20:.+]] + +; CHECK: [[WHILEBODY20]]: +; CHECK: %[[CMP21:.+]] = icmp +; CHECK: br i1 %[[CMP21]], label %[[IFTHEN23:.+]], label %[[IFELSE35:.+]] + +; CHECK: [[IFTHEN23]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[IFTHEN23ELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]] + +; CHECK: [[IFTHEN23SPLIT:.+]]: +; CHECK: br label %[[FORCOND26:.+]] + +; CHECK: [[FORCOND26]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30:.+]], label %[[JLOOPEXIT:.+]] + +; CHECK: [[FORBODY30]]: +; CHECK: br label %[[FORCOND26]] + +; CHECK: [[IFELSE35]]: +; CHECK: br label %[[IFTHEN38:.+]] + +; CHECK: [[IFTHEN38]]: +; CHECK: %[[CMP42:.+]] = icmp slt i32 +; CHECK: br i1 %[[CMP42]], label %[[IFTHEN44:.+]], label %[[IFELSE46:.+]] + +; CHECK: [[IFTHEN44]]: +; CHECK: br label %[[IFELSE48:.+]] + +; CHECK: [[IFELSE46]]: +; CHECK: br label %[[IFELSE48]] + +; CHECK: [[IFELSE48]]: +; CHECK: %[[CMP49:.+]] = icmp +; CHECK: br i1 %[[CMP49]], label %[[IFTHEN51:.+]], label %[[IFELSE53:.+]] + +; CHECK: [[IFTHEN51]]: +; CHECK: br label %[[N58:.+]] + +; CHECK: [[IFELSE53]]: +; CHECK: br label %[[N58]] + +; CHECK: [[M:.+]]: +; CHECK: br label %[[P:.+]] + +; CHECK: [[IFTHEN55:.+]]: +; CHECK: br label %[[IFTHEN55ELSE:.+]] + +; CHECK: [[IFTHEN55ELSE]]: +; CHECK: br label %[[FORCOND87PREHEADER:.+]] + +; CHECK: [[N58]]: +; CHECK: br label %[[M]] + +; CHECK: [[P]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]] + +; CHECK: [[WHILEBODY20PUREEXIT]]: +; CHECK: br label %[[R:.+]] + +; CHECK: [[R]]: +; CHECK: br label %[[FORCOND67:.+]] + +; CHECK: [[FORCOND67]]: +; CHECK: %[[CMP69:.+]] = icmp +; CHECK: br i1 %[[CMP69]], label %[[FORBODY71:.+]], label %[[FOREND75:.+]] + +; CHECK: [[FORBODY71]]: +; CHECK: br label %[[FORCOND67]] + +; CHECK: [[FOREND75]]: +; CHECK: br label %[[IFEND79:.+]] + +; CHECK: [[FORCOND87PREHEADER]]: +; CHECK: br label %[[FORCOND87:.+]] + +; CHECK: [[FORCOND87PREHEADERELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHEN23ELSE]], label %[[IFTHEN23SPLIT]] + +; CHECK: [[IFEND79]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFTHEN55]] + +; CHECK: [[F]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[FELSE]]: +; CHECK: br label %[[U:.+]] + +; CHECK: [[FSPLIT]]: +; CHECK: br label %[[J:.+]] + +; CHECK: [[JLOOPEXIT]]: +; CHECK: br label %[[J]] + +; CHECK: [[J]]: +; CHECK: %[[CMP82:.+]] = icmp +; CHECK: br i1 %[[CMP82]], label %[[Q:.+]], label %[[U]] + +; CHECK: [[FORCOND87]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[ULOOPEXIT:.+]], label %[[FORBODY91:.+]] + +; CHECK: [[FORBODY91]]: +; CHECK: br label %[[FORCOND87]] + +; CHECK: [[Q]]: +; CHECK: br label %[[FORCOND97:.+]] + +; CHECK: [[FORCOND97]]: +; CHECK: %[[CMP99:.+]] = icmp +; CHECK: br i1 %[[CMP99]], label %[[FORBODY101:.+]], label %[[VLOOPEXIT:.+]] + +; CHECK: [[FORBODY101]]: +; CHECK: br label %[[FORCOND97]] + +; CHECK: [[ULOOPEXIT]]: +; CHECK: br label %[[FORCOND87PREHEADERELSE]] + +; CHECK: [[U]]: +; 
CHECK: br label %[[FORCOND107:.+]] + +; CHECK: [[FORCOND107]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY111:.+]], label %[[VLOOPEXIT2:.+]] + +; CHECK: [[FORBODY111]]: +; CHECK: br label %[[FORCOND107]] + +; CHECK: [[VLOOPEXIT]]: +; CHECK: br label %[[V:.+]] + +; CHECK: [[VLOOPEXIT2]]: +; CHECK: br label %[[Q]] + +; CHECK: [[V]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll new file mode 100644 index 0000000000000..e044bef6c2a43 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization13.ll @@ -0,0 +1,222 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization13 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; \ / \ +; | \ +; | d +; | / \ +; | | e +; | \ / +; | f +; | / \ +; | | g +; | \ / +; \ h +; \ / +; i +; +; * where nodes d and f are uniform branches, and nodes a and c are varying +; branches. +; * where nodes b, c, i are divergent. 
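+; * note: a "varying" branch is one whose condition may differ between
+;   work-items, and a "divergent" node is one that is not guaranteed to be
+;   reached by all work-items together; this is the sense in which these
+;   terms are used throughout the partial linearization tests.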
+; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; c +; | +; d +; / \ +; | e +; \ / +; f +; / \ +; | g +; \ / +; h +; | +; b +; | +; i +; +; __kernel void partial_linearization13(__global int *out, int n) { +; size_t tid = get_global_id(0); +; size_t size = get_global_size(0); +; // a +; if (tid + 1 < size) { +; // b +; out[tid] = n; +; } else if (tid + 1 == size) { // c +; size_t leftovers = 1 + (size & 1); +; switch (leftovers) { // d +; case 2: // e +; out[tid] = 2 * n + 1; +; // fall through +; case 1: // f +; out[tid] += 3 * n - 1; +; break; +; } +; switch (leftovers) { // g +; case 2: +; out[tid] /= n; +; // fall through +; case 1: // h +; out[tid]--; +; break; +; } +; } +; // i +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization13(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %call1 = call i64 @__mux_get_global_size(i32 0) #2 + %add = add i64 %call, 1 + %cmp = icmp ult i64 %add, %call1 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %n, i32 addrspace(1)* %arrayidx, align 4 + br label %if.end17 + +if.else: ; preds = %entry + %add2 = add i64 %call, 1 + %cmp3 = icmp eq i64 %add2, %call1 + br i1 %cmp3, label %if.then4, label %if.end17 + +if.then4: ; preds = %if.else + %0 = and i64 %call1, 1 + %trunc = icmp eq i64 %0, 0 + br i1 %trunc, label %sw.bb8, label %sw.bb + +sw.bb: ; preds = %if.then4 + %mul = shl nsw i32 %n, 1 + %add6 = or i32 %mul, 1 + %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %add6, i32 addrspace(1)* %arrayidx7, align 4 + br label %sw.bb8 + +sw.bb8: ; preds = %sw.bb, %if.then4 + %mul9 = mul nsw i32 %n, 3 + %sub = add nsw i32 %mul9, -1 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx10, align 4 + %add11 = add nsw i32 %sub, %1 + store i32 %add11, i32 addrspace(1)* %arrayidx10, align 4 + %2 = and i64 %call1, 1 + %trunc2 = icmp ne i64 %2, 0 + %trunc2.off = add i1 %trunc2, true + %switch = icmp ult i1 %trunc2.off, true + br i1 %switch, label %sw.bb12, label %sw.bb14 + +sw.bb12: ; preds = %sw.bb8 + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + %3 = load i32, i32 addrspace(1)* %arrayidx13, align 4 + %4 = icmp eq i32 %3, -2147483648 + %5 = icmp eq i32 %n, -1 + %6 = and i1 %5, %4 + %7 = icmp eq i32 %n, 0 + %8 = or i1 %7, %6 + %9 = select i1 %8, i32 1, i32 %n + %div = sdiv i32 %3, %9 + store i32 %div, i32 addrspace(1)* %arrayidx13, align 4 + br label %sw.bb14 + +sw.bb14: ; preds = %sw.bb12, %sw.bb8 + %arrayidx15 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + %10 = load i32, i32 addrspace(1)* %arrayidx15, align 4 + %dec = add nsw i32 %10, -1 + store i32 %dec, i32 addrspace(1)* %arrayidx15, align 4 + br label %if.end17 + +if.end17: ; preds = %sw.bb14, %if.else, %if.then + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_size(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" 
"no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization13, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization13 +; CHECK: br label %[[IFELSE:.+]] + +; CHECK: [[IFTHEN:.+]]: +; CHECK: br label %[[IFEND17:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[IFTHEN4:.+]] + +; CHECK: [[IFTHEN4]]: +; CHECK: %[[TMP:.+]] = and i64 %call1, 1 +; CHECK: %[[TRUNC:.+]] = icmp eq i64 %[[TMP]], 0 +; FIXME: We shouldn't need to mask this comparison, as it's truly uniform even +; on inactive lanes. +; CHECK: %[[TRUNC_ACTIVE:.+]] = select i1 {{%.*}}, i1 %[[TRUNC]], i1 false +; CHECK: %[[TRUNC_ACTIVE_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[TRUNC_ACTIVE]]) +; CHECK: br i1 %[[TRUNC_ACTIVE_ANY]], label %[[SWBB8:.+]], label %[[SWBB:.+]] + +; CHECK: [[SWBB]]: +; CHECK: br label %[[SWBB8]] + +; CHECK: [[SWBB8]]: +; CHECK: %[[TMP2:.+]] = and i64 %call1, 1 +; CHECK: %[[TRUNC2:.+]] = icmp eq i64 %[[TMP2]], 0 +; CHECK: br i1 %[[TRUNC2]], label %[[SWBB14:.+]], label %[[SWBB12:.+]] + +; CHECK: [[SWBB12]]: +; CHECK: br label %[[SWBB14]] + +; CHECK: [[SWBB14]]: +; CHECK: br label %[[IFTHEN]] + +; CHECK: [[IFEND17]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll new file mode 100644 index 0000000000000..165092cd8c1ba --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization14.ll @@ -0,0 +1,292 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization14 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c <-. +; | / \ | +; | d e | +; |/ \ / | +; f g --' +; \ | +; \ h +; \| +; i +; +; * where nodes a, d and g are uniform branches, and node c is a varying +; branch. +; * where nodes d, e, f, g, h and i are divergent. +; +; With partial linearization, it can be transformed in the following way: +; +; a +; / \ +; b c <. +; | | | +; | e | +; | | | +; | d | +; | | | +; | g -' +; \ | +; \ h +; \| +; f +; | +; i +; +; __kernel void partial_linearization14(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; if (n < 5) { +; for (int i = 0; i < n + 10; i++) ret++; +; goto f; +; } else { +; while (1) { +; if (id + i % 2 == 0) { +; if (n > 2) { +; goto f; +; } +; } else { +; for (int i = 0; i < n + 10; i++) ret++; +; } +; if (n <= 2) break; +; } +; } +; +; ret += n * 2; +; for (int i = 0; i < n * 2; i++) ret -= i; +; ret /= n; +; goto early; +; +; f: +; for (int i = 0; i < n + 5; i++) ret /= 2; +; ret -= n; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization14(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 5 + br i1 %cmp, label %for.cond, label %while.body + +for.cond: ; preds = %for.body, %entry + %ret.0 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %storemerge4 = phi i32 [ %inc5, %for.body ], [ 0, %entry ] + %add = add nsw i32 %n, 10 + %cmp3 = icmp slt i32 %storemerge4, %add + br i1 %cmp3, label %for.body, label %f + +for.body: ; preds = %for.cond + %inc = add nuw nsw i32 %ret.0, 1 + %inc5 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond + +while.body: ; preds = %if.end24, %entry + %ret.1 = phi i32 [ 0, %entry ], [ %ret.3, %if.end24 ] + %cmp7 = icmp eq i32 %conv, 0 + br i1 %cmp7, label %if.then9, label %for.cond15 + +if.then9: ; preds = %while.body + %cmp10 = icmp sgt i32 %n, 2 + br i1 %cmp10, label %f, label %if.end24 + +for.cond15: ; preds = %for.body19, %while.body + %ret.2 = phi i32 [ %inc20, %for.body19 ], [ %ret.1, %while.body ] + %storemerge = phi i32 [ %inc22, %for.body19 ], [ 0, %while.body ] + %add16 = add nsw i32 %n, 10 + %cmp17 = icmp slt i32 %storemerge, %add16 + br i1 %cmp17, label %for.body19, label %if.end24 + +for.body19: ; preds = %for.cond15 + %inc20 = add nsw i32 %ret.2, 1 + %inc22 = add nuw nsw i32 %storemerge, 1 + br label %for.cond15 + +if.end24: ; preds = %for.cond15, %if.then9 + %ret.3 = phi i32 [ %ret.1, %if.then9 ], [ %ret.2, %for.cond15 ] + %cmp25 = icmp slt i32 %n, 3 + br i1 %cmp25, label %if.end29, label %while.body + +if.end29: ; preds = %if.end24 + %mul = mul i32 %n, 2 + %add30 = add nsw i32 %ret.3, %mul + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.end29 + %ret.4 = phi i32 [ %add30, %if.end29 ], [ %sub, %for.body36 ] + %storemerge1 = phi i32 [ 0, %if.end29 ], [ %inc38, %for.body36 ] + %mul33 = shl nsw i32 %n, 1 + %cmp34 = icmp slt i32 %storemerge1, %mul33 + br i1 %cmp34, label %for.body36, label %for.end39 + +for.body36: ; preds = %for.cond32 + %sub = sub nsw i32 %ret.4, 
%storemerge1 + %inc38 = add nuw nsw i32 %storemerge1, 1 + br label %for.cond32 + +for.end39: ; preds = %for.cond32 + %0 = icmp eq i32 %ret.4, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %ret.4, %5 + br label %early + +f: ; preds = %if.then9, %for.cond + %ret.5 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %if.then9 ] + br label %for.cond41 + +for.cond41: ; preds = %for.body45, %f + %ret.6 = phi i32 [ %ret.5, %f ], [ %div46, %for.body45 ] + %storemerge3 = phi i32 [ 0, %f ], [ %inc48, %for.body45 ] + %add42 = add nsw i32 %n, 5 + %cmp43 = icmp slt i32 %storemerge3, %add42 + br i1 %cmp43, label %for.body45, label %for.end49 + +for.body45: ; preds = %for.cond41 + %div46 = sdiv i32 %ret.6, 2 + %inc48 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond41 + +for.end49: ; preds = %for.cond41 + %sub50 = sub nsw i32 %ret.6, %n + br label %early + +early: ; preds = %for.end49, %for.end39 + %storemerge2 = phi i32 [ %div, %for.end39 ], [ %sub50, %for.end49 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization14, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization14 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODYPREHEADER:.+]] + +; CHECK: [[WHILEBODYPREHEADER]]: +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[FLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCOND15PREHEADER:.+]] + +; CHECK: [[FORCOND15PREHEADER]]: +; CHECK: br label %[[FORCOND15:.+]] + +; CHECK: 
[[IFTHEN9:.+]]: +; CHECK: br label %[[IFEND24:.+]] + +; CHECK: [[FORCOND15]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19:.+]], label %[[IFEND24LOOPEXIT:.+]] + +; CHECK: [[FORBODY19]]: +; CHECK: br label %[[FORCOND15]] + +; CHECK: [[IFEND24LOOPEXIT]]: +; CHECK: br label %[[IFTHEN9]] + +; CHECK: [[IFEND24]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFEND29:.+]] + +; CHECK: [[IFEND29]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[IFEND29ELSE:.+]]: +; CHECK: br label %[[FLOOPEXIT2:.+]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36:.+]], label %[[FOREND39:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[FOREND39]]: +; CHECK: br label %[[IFEND29ELSE]] + +; CHECK: [[FLOOPEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[FLOOPEXIT2]]: +; CHECK: br label %[[F]] + +; CHECK: [[F]]: +; CHECK: br label %[[FORCOND41:.+]] + +; CHECK: [[FORCOND41]]: +; CHECK: %[[CMP43:.+]] = icmp +; CHECK: br i1 %[[CMP43]], label %[[FORBODY45:.+]], label %[[FOREND49:.+]] + +; CHECK: [[FORBODY45]]: +; CHECK: br label %[[FORCOND41]] + +; CHECK: [[FOREND49]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll new file mode 100644 index 0000000000000..96155f725946f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization15.ll @@ -0,0 +1,385 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization15 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; / e | +; / | | +; / g <---. | +; / / \ | | +; f h i | | +; | / \ / \ | | +; | | j k | | +; | \ / \ / | | +; | l m --' | +; | / | +; | o ----------' +; | | +; n p +; \ / +; q +; +; * where nodes b, c, g, h, j and o are uniform branches, and node i is a +; varying branch. +; * where nodes j, k, m, l, and o are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; b <-----. +; / \ | +; c d | +; / \ / | +; f e | +; | | | +; | g <---. 
| +; | / \ | | +; | h i | | +; | | | | | +; | | k | | +; | \ / | | +; | j | | +; | | | | +; | m ----' | +; | | | +; | l | +; | | | +; | o ------' +; | | +; n p +; \ / +; q +; +; __kernel void partial_linearization15(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0) { // b +; // c +; for (int i = 0; i < n * 2; i++) ret++; +; if (n <= 10) { +; // f +; goto f; +; } +; } else { +; // d +; for (int i = 0; i < n / 4; i++) ret++; +; } +; // e +; ret++; +; while (1) { +; if (n & 1) { // g +; // h +; if (n < 3) { +; goto l; +; } +; } else { +; // i +; if (ret + id >= n) { +; // k +; ret /= n * n + ret; +; goto m; +; } +; } +; // j +; if (n & 1) { +; goto l; +; } +; // m +; m: +; ret++; +; } +; l: +; ret *= 4; +; o: +; if (n & 1) { +; // p +; ret++; +; goto p; +; } +; } +; +; p: +; for (int i = 0; i < n / 4; i++) ret++; +; goto q; +; +; f: +; ret /= n; +; goto n; +; +; n: +; for (int i = 0; i < n * 2; i++) ret++; +; +; q: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization15(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %l, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %mul40, %l ] + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %for.cond, label %for.cond9 + +for.cond: ; preds = %for.body, %while.body + %ret.1 = phi i32 [ %inc, %for.body ], [ %ret.0, %while.body ] + %storemerge3 = phi i32 [ %inc4, %for.body ], [ 0, %while.body ] + %mul = shl nsw i32 %n, 1 + %cmp2 = icmp slt i32 %storemerge3, %mul + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %inc = add nsw i32 %ret.1, 1 + %inc4 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp5 = icmp slt i32 %n, 11 + br i1 %cmp5, label %f, label %if.end17 + +for.cond9: ; preds = %for.body12, %while.body + %ret.2 = phi i32 [ %inc13, %for.body12 ], [ %ret.0, %while.body ] + %storemerge = phi i32 [ %inc15, %for.body12 ], [ 0, %while.body ] + %div = sdiv i32 %n, 4 + %cmp10 = icmp slt i32 %storemerge, %div + br i1 %cmp10, label %for.body12, label %if.end17 + +for.body12: ; preds = %for.cond9 + %inc13 = add nsw i32 %ret.2, 1 + %inc15 = add nuw nsw i32 %storemerge, 1 + br label %for.cond9 + +if.end17: ; preds = %for.cond9, %for.end + %ret.3 = phi i32 [ %ret.1, %for.end ], [ %ret.2, %for.cond9 ] + br label %while.body20 + +while.body20: ; preds = %m, %if.end17 + %storemerge1.in = phi i32 [ %ret.3, %if.end17 ], [ %ret.4, %m ] + %storemerge1 = add nsw i32 %storemerge1.in, 1 + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %if.else26, label %if.then21 + +if.then21: ; preds = %while.body20 + %cmp22 = icmp slt i32 %n, 3 + br i1 %cmp22, label %l, label %if.end34 + +if.else26: ; preds = %while.body20 + %add = add nsw i32 %storemerge1, %conv + %cmp27 = icmp slt i32 %add, %n + br i1 %cmp27, label %if.end34, label %if.then29 + +if.then29: ; preds = %if.else26 + %mul30 = mul nsw i32 %n, %n + %add31 = add nsw i32 %storemerge1, %mul30 + %0 = icmp eq i32 %add31, 0 + %1 = select i1 %0, i32 1, i32 %add31 + %div32 = sdiv i32 %storemerge1, %1 + br label %m + +if.end34: ; preds = %if.else26, %if.then21 + %and35 = and i32 %n, 1 + %tobool36 = icmp eq i32 %and35, 0 + br i1 
%tobool36, label %m, label %l + +m: ; preds = %if.end34, %if.then29 + %ret.4 = phi i32 [ %div32, %if.then29 ], [ %storemerge1, %if.end34 ] + br label %while.body20 + +l: ; preds = %if.end34, %if.then21 + %mul40 = shl nsw i32 %storemerge1, 2 + %and41 = and i32 %n, 1 + %tobool42 = icmp eq i32 %and41, 0 + br i1 %tobool42, label %while.body, label %if.then43 + +if.then43: ; preds = %l + %inc44 = or i32 %mul40, 1 + br label %for.cond47 + +for.cond47: ; preds = %for.body51, %if.then43 + %ret.5 = phi i32 [ %inc44, %if.then43 ], [ %inc52, %for.body51 ] + %storemerge2 = phi i32 [ 0, %if.then43 ], [ %inc54, %for.body51 ] + %div48 = sdiv i32 %n, 4 + %cmp49 = icmp slt i32 %storemerge2, %div48 + br i1 %cmp49, label %for.body51, label %q + +for.body51: ; preds = %for.cond47 + %inc52 = add nsw i32 %ret.5, 1 + %inc54 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond47 + +f: ; preds = %for.end + %2 = icmp eq i32 %ret.1, -2147483648 + %3 = icmp eq i32 %n, -1 + %4 = and i1 %3, %2 + %5 = icmp eq i32 %n, 0 + %6 = or i1 %5, %4 + %7 = select i1 %6, i32 1, i32 %n + %div56 = sdiv i32 %ret.1, %7 + br label %for.cond59 + +for.cond59: ; preds = %for.body63, %f + %ret.6 = phi i32 [ %div56, %f ], [ %inc64, %for.body63 ] + %storemerge4 = phi i32 [ 0, %f ], [ %inc66, %for.body63 ] + %mul60 = shl nsw i32 %n, 1 + %cmp61 = icmp slt i32 %storemerge4, %mul60 + br i1 %cmp61, label %for.body63, label %q + +for.body63: ; preds = %for.cond59 + %inc64 = add nsw i32 %ret.6, 1 + %inc66 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond59 + +q: ; preds = %for.cond59, %for.cond47 + %ret.7 = phi i32 [ %ret.5, %for.cond47 ], [ %ret.6, %for.cond59 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.7, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization15, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization15 +; CHECK: br label %[[WHILEBODY:.+]] 
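+
+; (Reading note: on the CHECK lines in this test, a pattern such as
+; %[[WHILEBODY:.+]] binds a FileCheck variable to whatever label the
+; vectorizer actually emitted, and a later [[WHILEBODY]] reuses that binding,
+; so the test pins down the linearized branch structure without hard-coding
+; block names.)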
+ +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND9PREHEADER:.+]] + +; CHECK: [[FORCOND9PREHEADER]]: +; CHECK: br label %[[FORCOND9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 false, label %[[FORBODY:.+]], label %[[FOREND:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[F:.+]], label %[[IFEND17:.+]] + +; CHECK: [[FORCOND9]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[IFEND17LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND9]] + +; CHECK: [[IFEND17LOOPEXIT]]: +; CHECK: br label %[[IFEND17]] + +; CHECK: [[IFEND17]]: +; CHECK: br label %[[WHILEBODY20:.+]] + +; CHECK: [[WHILEBODY20]]: +; CHECK: %[[AND:.+]] = and i32 +; CHECK: %[[TOBOOL:.+]] = icmp eq i32 %[[AND]] +; CHECK: br i1 %[[TOBOOL]], label %[[IFELSE26:.+]], label %[[IFTHEN21:.+]] + +; CHECK: [[IFTHEN21]]: +; CHECK: br label %[[M:.+]] + +; CHECK: [[IFELSE26]]: +; CHECK: br label %[[IFTHEN29:.+]] + +; CHECK: [[IFTHEN29]]: +; CHECK: br label %[[IFEND34:.+]] + +; CHECK: [[IFEND34]]: +; CHECK: br label %[[M:.+]] + +; CHECK: [[M]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY20]], label %[[WHILEBODY20PUREEXIT:.+]] + +; CHECK: [[WHILEBODY20PUREEXIT]]: +; CHECK: br label %[[L:.+]] + +; CHECK: [[L]]: +; CHECK: %[[TOBOOL42:.+]] = icmp +; CHECK: br i1 %[[TOBOOL42]], label %[[WHILEBODY]], label %[[IFTHEN43:.+]] + +; CHECK: [[IFTHEN43]]: +; CHECK: br label %[[FORCOND47:.+]] + +; CHECK: [[FORCOND47]]: +; CHECK: %[[CMP49:.+]] = icmp +; CHECK: br i1 %[[CMP49]], label %[[FORBODY51:.+]], label %[[QLOOPEXIT2:.+]] + +; CHECK: [[FORBODY51]]: +; CHECK: br label %[[FORCOND47]] + +; CHECK: [[F]]: +; CHECK: br label %[[FORCOND59:.+]] + +; CHECK: [[FORCOND59]]: +; CHECK: br i1 false, label %[[FORBODY63:.+]], label %[[QLOOPEXIT:.+]] + +; CHECK: [[FORBODY63]]: +; CHECK: br label %[[FORCOND59]] + +; CHECK: [[QLOOPEXIT]]: +; CHECK: br label %[[Q:.+]] + +; CHECK: [[QLOOPEXIT2]]: +; CHECK: br label %[[Q]] + +; CHECK: [[Q]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll new file mode 100644 index 0000000000000..48295e243c7ab --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization16.ll @@ -0,0 +1,319 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization16 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c <-. 
+; / / \ | +; | d e | +; | / \ / | +; | f g --' +; |/ | +; h i +; \ / +; \ / +; j +; +; * where nodes a, d and g are uniform branches, and node c is a varying +; branch. +; * where nodes d, e, f, g, i and j are divergent. +; +; With partial linearization, it can be transformed in the following way: +; +; a +; / \ +; b c <. +; | | | +; | e | +; | | | +; | d | +; | | | +; | g -' +; | | +; | i +; \ | +; \ f +; \| +; h +; | +; j +; +; __kernel void partial_linearization16(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; if (n < 5) { +; for (int i = 0; i < n + 10; i++) ret++; +; goto h; +; } else { +; while (1) { +; if (id + i % 2 == 0) { +; if (n > 2) { +; goto f; +; } +; } else { +; for (int i = 0; i < n + 10; i++) ret++; +; } +; if (n <= 2) break; +; } +; } +; +; ret += n * 2; +; for (int i = 0; i < n * 2; i++) ret -= i; +; ret /= n; +; goto early; +; +; f: +; for (int i = 0; i < n + 5; i++) ret /= 2; +; ret -= n; +; +; h: +; for (int i = 0; i < n * 2; i++) ret -= i; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization16(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 5 + br i1 %cmp, label %for.cond, label %while.body + +for.cond: ; preds = %for.body, %entry + %ret.0 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %storemerge4 = phi i32 [ %inc5, %for.body ], [ 0, %entry ] + %add = add nsw i32 %n, 10 + %cmp3 = icmp slt i32 %storemerge4, %add + br i1 %cmp3, label %for.body, label %h + +for.body: ; preds = %for.cond + %inc = add nuw nsw i32 %ret.0, 1 + %inc5 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond + +while.body: ; preds = %if.end24, %entry + %ret.1 = phi i32 [ 0, %entry ], [ %ret.3, %if.end24 ] + %cmp7 = icmp eq i32 %conv, 0 + br i1 %cmp7, label %if.then9, label %for.cond15 + +if.then9: ; preds = %while.body + %cmp10 = icmp sgt i32 %n, 2 + br i1 %cmp10, label %for.cond41, label %if.end24 + +for.cond15: ; preds = %for.body19, %while.body + %ret.2 = phi i32 [ %inc20, %for.body19 ], [ %ret.1, %while.body ] + %storemerge = phi i32 [ %inc22, %for.body19 ], [ 0, %while.body ] + %add16 = add nsw i32 %n, 10 + %cmp17 = icmp slt i32 %storemerge, %add16 + br i1 %cmp17, label %for.body19, label %if.end24 + +for.body19: ; preds = %for.cond15 + %inc20 = add nsw i32 %ret.2, 1 + %inc22 = add nuw nsw i32 %storemerge, 1 + br label %for.cond15 + +if.end24: ; preds = %for.cond15, %if.then9 + %ret.3 = phi i32 [ %ret.1, %if.then9 ], [ %ret.2, %for.cond15 ] + %cmp25 = icmp slt i32 %n, 3 + br i1 %cmp25, label %if.end29, label %while.body + +if.end29: ; preds = %if.end24 + %mul = mul i32 %n, 2 + %add30 = add nsw i32 %ret.3, %mul + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.end29 + %ret.4 = phi i32 [ %add30, %if.end29 ], [ %sub, %for.body36 ] + %storemerge1 = phi i32 [ 0, %if.end29 ], [ %inc38, %for.body36 ] + %mul33 = shl nsw i32 %n, 1 + %cmp34 = icmp slt i32 %storemerge1, %mul33 + br i1 %cmp34, label %for.body36, label %for.end39 + +for.body36: ; preds = %for.cond32 + %sub = sub nsw i32 %ret.4, %storemerge1 + %inc38 = add nuw nsw i32 %storemerge1, 1 + br label %for.cond32 + +for.end39: ; preds = %for.cond32 + %0 = icmp eq i32 %ret.4, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = 
icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %ret.4, %5 + br label %early + +for.cond41: ; preds = %for.body45, %if.then9 + %ret.5 = phi i32 [ %div46, %for.body45 ], [ %ret.1, %if.then9 ] + %storemerge2 = phi i32 [ %inc48, %for.body45 ], [ 0, %if.then9 ] + %add42 = add nsw i32 %n, 5 + %cmp43 = icmp slt i32 %storemerge2, %add42 + br i1 %cmp43, label %for.body45, label %for.end49 + +for.body45: ; preds = %for.cond41 + %div46 = sdiv i32 %ret.5, 2 + %inc48 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond41 + +for.end49: ; preds = %for.cond41 + %sub50 = sub nsw i32 %ret.5, %n + br label %h + +h: ; preds = %for.end49, %for.cond + %ret.6 = phi i32 [ %sub50, %for.end49 ], [ %ret.0, %for.cond ] + br label %for.cond52 + +for.cond52: ; preds = %for.body56, %h + %ret.7 = phi i32 [ %ret.6, %h ], [ %sub57, %for.body56 ] + %storemerge3 = phi i32 [ 0, %h ], [ %inc59, %for.body56 ] + %mul53 = shl nsw i32 %n, 1 + %cmp54 = icmp slt i32 %storemerge3, %mul53 + br i1 %cmp54, label %for.body56, label %early + +for.body56: ; preds = %for.cond52 + %sub57 = sub nsw i32 %ret.7, %storemerge3 + %inc59 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond52 + +early: ; preds = %for.cond52, %for.end39 + %ret.8 = phi i32 [ %div, %for.end39 ], [ %ret.7, %for.cond52 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.8, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization16, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization16 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODYPREHEADER:.+]] + +; CHECK: [[WHILEBODYPREHEADER]]: +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label 
%[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCOND15PREHEADER:.+]] + +; CHECK: [[FORCOND15PREHEADER]]: +; CHECK: br label %[[FORCOND15:.+]] + +; CHECK: [[IFTHEN9:.+]]: +; CHECK: br label %[[IFEND24:.+]] + +; CHECK: [[FORCOND41PREHEADER:.+]]: +; CHECK: br label %[[FORCOND41:.+]] + +; CHECK: [[FORCOND15]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY19:.+]], label %[[IFEND24LOOPEXIT:.+]] + +; CHECK: [[FORBODY19]]: +; CHECK: br label %[[FORCOND15]] + +; CHECK: [[IFEND24LOOPEXIT]]: +; CHECK: br label %[[IFTHEN9]] + +; CHECK: [[IFEND24]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[IFEND29:.+]] + +; CHECK: [[IFEND29]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[IFEND29ELSE:.+]]: +; CHECK: br label %[[FORCOND41PREHEADER]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY36:.+]], label %[[FOREND39:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[FOREND39]]: +; CHECK: br label %[[IFEND29ELSE]] + +; CHECK: [[FORCOND41]]: +; CHECK: %[[CMP43:.+]] = icmp +; CHECK: br i1 %[[CMP43]], label %[[FORBODY45:.+]], label %[[FOREND49:.+]] + +; CHECK: [[FORBODY45]]: +; CHECK: br label %[[FORCOND41]] + +; CHECK: [[FOREND49]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[H]]: +; CHECK: br label %[[FORCOND52:.+]] + +; CHECK: [[FORCOND52]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY56:.+]], label %[[EARLYLOOPEXIT:.+]] + +; CHECK: [[FORBODY56]]: +; CHECK: br label %[[FORCOND52]] + +; CHECK: [[EARLYLOOPEXIT]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll new file mode 100644 index 0000000000000..0ed3fe5c32596 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization17.ll @@ -0,0 +1,377 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization17 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <----. +; / \ | +; c d | +; / / \ | +; e f g -' +; / \ | | +; .--> h | i j +; | / \ | \ / +; '- k l '-> m +; | \ / +; n \ / +; \ o +; \ / +; \ / +; p +; +; * where nodes b, d, and h are uniform branches, and nodes e and g are varying +; branches. +; * where nodes h, j, m, o, and p are divergent. 
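+; (Informally: a branch is uniform when every work-item in a vector group
+; takes the same direction, and varying when work-items may disagree; a
+; block is divergent when reaching it depends on a varying branch, so only
+; a subset of the work-items may need to execute it.)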
+; +; With partial linearization, it can be transformed in the following way: +; +; a +; | +; b <----. +; / \ | +; c d | +; / / \ | +; e f g -' +; / | | +; .--> h i | +; | / \ | | +; '- k l | | +; \ \ | / +; n \ | / +; \ \|/ +; `-> j +; | +; m +; | +; o +; | +; p +; +; __kernel void partial_linearization17(__global int *out, int n, int x) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (n > 10) { +; goto c; +; } else if (n < 5) { +; goto f; +; } +; if (id + i++ % 2 == 0) { +; break; +; } +; } +; +; // j +; for (int i = 0; i < n + 10; i++) ret++; +; goto m; +; +; f: +; ret += x / 2; +; for (int i = 0; i < x / 2; i++) ret += i; +; goto m; +; +; c: +; for (int i = 0; i < n - 5; i++) ret += 2; +; // e +; if (id % 2 == 0) { +; goto h; +; } else { +; goto m; +; } +; +; m: +; ret <<= 2; +; goto o; +; +; h: +; for (int i = 0; i < x / 2; i++) { +; if (x < 5) { +; goto l; +; } +; } +; // n +; ret += id << 3; +; goto p; +; +; l: +; ret += id << 3; +; +; o: +; for (int i = 0; i < x / 2; i++) ret += i; +; +; p: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization17(i32 addrspace(1)* %out, i32 noundef %n, i32 noundef %x) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end5, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end5 ] + %cmp = icmp sgt i32 %n, 10 + br i1 %cmp, label %for.cond28, label %if.else + +if.else: ; preds = %while.body + %cmp2 = icmp slt i32 %n, 5 + br i1 %cmp2, label %f, label %if.end5 + +if.end5: ; preds = %if.else + %inc = add nuw nsw i32 %i.0, 1 + %rem = and i32 %i.0, 1 + %add = sub nsw i32 0, %rem + %cmp6 = icmp eq i32 %conv, %add + br i1 %cmp6, label %for.cond, label %while.body + +for.cond: ; preds = %for.body, %if.end5 + %ret.0 = phi i32 [ %inc14, %for.body ], [ 0, %if.end5 ] + %storemerge = phi i32 [ %inc15, %for.body ], [ 0, %if.end5 ] + %add11 = add nsw i32 %n, 10 + %cmp12 = icmp slt i32 %storemerge, %add11 + br i1 %cmp12, label %for.body, label %m + +for.body: ; preds = %for.cond + %inc14 = add nuw nsw i32 %ret.0, 1 + %inc15 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +f: ; preds = %if.else + %div = sdiv i32 %x, 2 + br label %for.cond18 + +for.cond18: ; preds = %for.body22, %f + %ret.1 = phi i32 [ %div, %f ], [ %add23, %for.body22 ] + %storemerge3 = phi i32 [ 0, %f ], [ %inc25, %for.body22 ] + %div19 = sdiv i32 %x, 2 + %cmp20 = icmp slt i32 %storemerge3, %div19 + br i1 %cmp20, label %for.body22, label %m + +for.body22: ; preds = %for.cond18 + %add23 = add nsw i32 %storemerge3, %ret.1 + %inc25 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond18 + +for.cond28: ; preds = %for.body32, %while.body + %ret.2 = phi i32 [ %add33, %for.body32 ], [ 0, %while.body ] + %storemerge4 = phi i32 [ %inc35, %for.body32 ], [ 0, %while.body ] + %add29 = add nsw i32 %n, 5 + %cmp30 = icmp slt i32 %storemerge4, %add29 + br i1 %cmp30, label %for.body32, label %for.end36 + +for.body32: ; preds = %for.cond28 + %add33 = add nuw nsw i32 %ret.2, 2 + %inc35 = add nuw nsw i32 %storemerge4, 1 + br label %for.cond28 + +for.end36: ; preds = %for.cond28 + %rem375 = and i32 %conv, 1 + %cmp38 = icmp eq i32 %rem375, 0 + br i1 %cmp38, label %for.cond43, label %m + +m: ; preds = %for.end36, %for.cond18, %for.cond + %ret.3 = phi i32 [ 
%ret.0, %for.cond ], [ %ret.1, %for.cond18 ], [ %ret.2, %for.end36 ] + %shl = shl i32 %ret.3, 2 + br label %o + +for.cond43: ; preds = %for.inc52, %for.end36 + %storemerge6 = phi i32 [ %inc53, %for.inc52 ], [ 0, %for.end36 ] + %div44 = sdiv i32 %x, 2 + %cmp45 = icmp slt i32 %storemerge6, %div44 + br i1 %cmp45, label %for.body47, label %for.end54 + +for.body47: ; preds = %for.cond43 + %cmp48 = icmp slt i32 %x, 5 + br i1 %cmp48, label %l, label %for.inc52 + +for.inc52: ; preds = %for.body47 + %inc53 = add nuw nsw i32 %storemerge6, 1 + br label %for.cond43 + +for.end54: ; preds = %for.cond43 + %shl55 = mul i32 %conv, 8 + %add56 = add nsw i32 %ret.2, %shl55 + br label %p + +l: ; preds = %for.body47 + %shl57 = mul i32 %conv, 8 + %add58 = add nsw i32 %ret.2, %shl57 + br label %o + +o: ; preds = %l, %m + %storemerge1 = phi i32 [ %shl, %m ], [ %add58, %l ] + br label %for.cond60 + +for.cond60: ; preds = %for.body64, %o + %ret.4 = phi i32 [ %storemerge1, %o ], [ %add65, %for.body64 ] + %storemerge2 = phi i32 [ 0, %o ], [ %inc67, %for.body64 ] + %div61 = sdiv i32 %x, 2 + %cmp62 = icmp slt i32 %storemerge2, %div61 + br i1 %cmp62, label %for.body64, label %p + +for.body64: ; preds = %for.cond60 + %add65 = add nsw i32 %storemerge2, %ret.4 + %inc67 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond60 + +p: ; preds = %for.cond60, %for.end54 + %ret.5 = phi i32 [ %add56, %for.end54 ], [ %ret.4, %for.cond60 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.5, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization17, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization17 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[FORCOND28PREHEADER:.+]], label %[[IFELSE:.+]] + +; CHECK: [[FORCOND28PREHEADER]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] 
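+
+; (The *ELSE, *SPLIT and *PUREEXIT labels matched below do not exist in the
+; input IR; they are presumably blocks introduced by the cfg-convert step
+; named in the RUN line as it splits and duplicates regions during partial
+; linearization. The test only constrains how they branch, not their names.)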
+ +; CHECK: [[FORCOND28PREHEADERELSE:.+]]: +; CHECK: br label %[[M:.+]] + +; CHECK: [[FORCOND28PREHEADERSPLIT:.+]]: +; CHECK: br label %[[FORCOND28:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: %[[CMP2:.+]] = icmp +; CHECK: br i1 %[[CMP2]], label %[[F:.+]], label %[[IFEND5:.+]] + +; CHECK: [[IFEND5]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCONDPREHEADERELSE:.+]]: +; CHECK: br i1 %{{.+}}, label %[[FELSE:.+]], label %[[FSPLIT:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[MLOOPEXIT2:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[F]]: +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[FELSE]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND28PREHEADERELSE]], label %[[FORCOND28PREHEADERSPLIT]] + +; CHECK: [[FSPLIT]]: +; CHECK: br label %[[FORCOND18:.+]] + +; CHECK: [[FORCOND18]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22:.+]], label %[[MLOOPEXIT:.+]] + +; CHECK: [[FORBODY22]]: +; CHECK: br label %[[FORCOND18]] + +; CHECK: [[FORCOND28]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY32:.+]], label %[[FOREND36:.+]] + +; CHECK: [[FORBODY32]]: +; CHECK: br label %[[FORCOND28]] + +; CHECK: [[FOREND36]]: +; CHECK: br label %[[FORCOND43PREHEADER:.+]] + +; CHECK: [[FORCOND43PREHEADER]]: +; CHECK: br label %[[FORCOND43:.+]] + +; CHECK: [[MLOOPEXIT]]: +; CHECK: br label %[[M]] + +; CHECK: [[MLOOPEXIT2]]: +; CHECK: br label %[[FORCONDPREHEADERELSE]] + +; CHECK: [[M]]: +; CHECK: br label %[[O:.+]] + +; CHECK: [[FORCOND43]]: +; CHECK: %[[CMP14:.+]] = icmp +; CHECK: br i1 %[[CMP14]], label %[[FORBODY47:.+]], label %[[FOREND54:.+]] + +; CHECK: [[FORBODY47]]: +; CHECK: %[[CMP48:.+]] = icmp +; CHECK: br i1 %[[CMP48]], label %[[L:.+]], label %[[FORINC52:.+]] + +; CHECK: [[FORINC52]]: +; CHECK: br label %[[FORCOND43]] + +; CHECK: [[FOREND54]]: +; CHECK: br label %[[M]] + +; CHECK: [[L]]: +; CHECK: br label %[[M]] + +; CHECK: [[O]]: +; CHECK: br label %[[FORCOND60:.+]] + +; CHECK: [[FORCOND60]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY64:.+]], label %[[PLOOPEXIT:.+]] + +; CHECK: [[FORBODY64]]: +; CHECK: br label %[[FORCOND60]] + +; CHECK: [[PLOOPEXIT]]: +; CHECK: br label %[[P:.+]] + +; CHECK: [[P]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll new file mode 100644 index 0000000000000..903ba12b02fd9 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization18.ll @@ -0,0 +1,289 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization18 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <--. +; / \ | +; c d -' +; / \ | +; e f | +; | \| +; | g +; | / +; | h +; \ / \ +; i j +; \ / +; k +; +; * where nodes b, and h are uniform branches, and nodes c and d are varying +; branches. +; * where nodes e, f, g, i and k are divergent. +; +; With partial linearization, it can be transformed in the following way: +; +; a +; | +; b <--. +; / \ | +; c d -' +; | | +; f | +; | | +; e | +; \ / +; g +; | +; h +; / \ +; | j +; \ / +; i +; | +; k +; +; __kernel void partial_linearization18(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (n > 5) { +; if (id + i % 2 == 0) { +; goto e; +; } else { +; goto f; +; } +; } +; if (++i + id > 3) { +; goto g; +; } +; } +; +; f: +; for (int i = 0; i < n + 5; i++) ret += 2; +; goto g; +; +; g: +; for (int i = 1; i < n * 2; i++) ret *= i; +; goto h; +; +; e: +; for (int i = 0; i < n + 5; i++) ret++; +; goto i; +; +; h: +; if (n > 3) { +; i: +; ret++; +; } else { +; ret *= 3; +; } +; +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization18(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ] + %cmp = icmp sgt i32 %n, 5 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %while.body + %rem = and i32 %i.0, 1 + %add = sub nsw i32 0, %rem + %cmp2 = icmp eq i32 %conv, %add + br i1 %cmp2, label %for.cond26, label %for.cond + +if.end: ; preds = %while.body + %inc = add nuw nsw i32 %i.0, 1 + %add5 = add nsw i32 %inc, %conv + %cmp6 = icmp sgt i32 %add5, 3 + br i1 %cmp6, label %g, label %while.body + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ %add14, %for.body ], [ 0, %if.then ] + %storemerge2 = phi i32 [ %inc15, %for.body ], [ 0, %if.then ] + %add11 = add nsw i32 %n, 5 + %cmp12 = icmp slt i32 %storemerge2, %add11 + br i1 %cmp12, label %for.body, label %g + +for.body: ; preds = %for.cond + %add14 = add nuw nsw i32 %ret.0, 2 + %inc15 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond + +g: ; preds = %for.cond, %if.end + %ret.1 = phi i32 [ 0, %if.end ], [ %ret.0, %for.cond ] + br label %for.cond17 + +for.cond17: ; preds = %for.body20, %g + %ret.2 = phi i32 [ %ret.1, %g ], [ %mul21, %for.body20 ] + %storemerge = phi i32 [ 1, %g ], [ %inc23, %for.body20 ] + %mul = shl nsw i32 %n, 1 + %cmp18 = icmp slt i32 %storemerge, %mul + br i1 %cmp18, label %for.body20, label %h + +for.body20: ; preds = %for.cond17 + %mul21 = mul nsw i32 %storemerge, %ret.2 + %inc23 = add nuw nsw i32 %storemerge, 1 + br label %for.cond17 + +for.cond26: ; preds = %for.body30, %if.then + %ret.3 = phi i32 [ %inc31, %for.body30 ], [ 0, %if.then ] + %storemerge3 = phi i32 [ %inc33, %for.body30 ], [ 0, %if.then ] + %add27 = add nsw i32 %n, 5 + %cmp28 = icmp slt i32 %storemerge3, %add27 + br i1 %cmp28, label %for.body30, label %i38 + +for.body30: ; preds = %for.cond26 + %inc31 = add nuw nsw i32 %ret.3, 1 + %inc33 
= add nuw nsw i32 %storemerge3, 1 + br label %for.cond26 + +h: ; preds = %for.cond17 + %cmp35 = icmp sgt i32 %n, 3 + br i1 %cmp35, label %i38, label %if.else40 + +i38: ; preds = %h, %for.cond26 + %ret.4 = phi i32 [ %ret.3, %for.cond26 ], [ %ret.2, %h ] + %inc39 = add nsw i32 %ret.4, 1 + br label %if.end42 + +if.else40: ; preds = %h + %mul41 = mul nsw i32 %ret.2, 3 + br label %if.end42 + +if.end42: ; preds = %if.else40, %i38 + %storemerge1 = phi i32 [ %mul41, %if.else40 ], [ %inc39, %i38 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge1, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization18, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization18 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[IFTHENELSE:.+]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[IFTHENSPLIT:.+]]: +; CHECK: br label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND26PREHEADER:.+]]: +; CHECK: br label %[[FORCOND26:.+]] + +; CHECK: [[IFEND]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[GLOOPEXIT2:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[GLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[GLOOPEXIT]]: +; CHECK: br label %[[FORCOND26PREHEADER]] + +; CHECK: [[GLOOPEXIT2]]: +; CHECK: br label %[[GLOOPEXIT2ELSE:.+]] + +; CHECK: [[GLOOPEXIT2ELSE]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHENELSE]], label %[[IFTHENSPLIT]] + +; CHECK: [[G]]: +; CHECK: br label %[[FORCOND17:.+]] + +; CHECK: 
[[FORCOND17]]:
+; CHECK: %[[CMP18:.+]] = icmp
+; CHECK: br i1 %[[CMP18]], label %[[FORBODY20:.+]], label %[[H:.+]]
+
+; CHECK: [[FORBODY20]]:
+; CHECK: br label %[[FORCOND17]]
+
+; CHECK: [[FORCOND26]]:
+; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY30:.+]], label %[[I38LOOPEXIT:.+]]
+
+; CHECK: [[FORBODY30]]:
+; CHECK: br label %[[FORCOND26]]
+
+; CHECK: [[H]]:
+; CHECK: %[[CMP35:.+]] = icmp
+; CHECK: br i1 %[[CMP35]], label %[[I38:.+]], label %[[IFELSE40:.+]]
+
+; CHECK: [[I38LOOPEXIT]]:
+; CHECK: br label %[[G]]
+
+; CHECK: [[I38]]:
+; CHECK: br label %[[IFEND42:.+]]
+
+; CHECK: [[IFELSE40]]:
+; CHECK: br label %[[I38]]
+
+; CHECK: [[IFEND42]]:
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
new file mode 100644
index 0000000000000..6810eb855c5f4
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization19.ll
@@ -0,0 +1,308 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k partial_linearization19 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s
+
+; The CFG of the following kernel is:
+;
+;       a
+;       |
+;       b <----.
+;      / \     |
+;     c   \    |
+;    / \   \   |
+;   d   e   f -'
+;   |   |   |
+;    \   \  g
+;     \   \ / \
+;      \   h   i <,
+;       \   \ /  /
+;        \   j  /
+;         \   /
+;          `-'
+;
+; * where nodes b, c, and g are uniform branches, and node f is a varying
+;   branch.
+; * where nodes g, h, i and j are divergent.
+;
+; With partial linearization, it can be transformed in the following way:
+;
+;       a
+;       |
+;       b <----.
+;      / \     |
+;     c   \    |
+;    / \   \   |
+;   d   e   f -'
+;   |   |   |
+;    \  |  /
+;     \ | /
+;      \|/
+;       g
+;       |
+;       i
+;       |
+;       h
+;       |
+;       j
+;
+; The uniform branch `g` has been linearized because both its successors are
+; divergent. Had `g` not been linearized, only one of its two successors
+; could ever execute on a given path, depending on how the uniform condition
+; evaluates, whereas what we want is for it to be possible to execute both
+; successors no matter what the uniform condition evaluates to.
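+;
+; (As a rough scalar sketch of the idea, not the pass's actual output:
+; instead of
+;
+;     if (uniform_cond) { h(); } else { i(); }
+;
+; the linearized path behaves more like
+;
+;     i();   // side effects guarded by a lane mask
+;     h();   // side effects guarded by a lane mask
+;
+; so lanes that must reach `h` and lanes that must reach `i` are both
+; serviced regardless of how the uniform condition evaluates.)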
+; +; __kernel void partial_linearization19(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; int i = 0; +; +; while (1) { +; if (n > 5) { +; if (n == 6) { +; goto d; +; } else { +; goto e; +; } +; } +; if (++i + id > 3) { +; break; +; } +; } +; +; // g +; if (n == 3) { +; goto h; +; } else { +; goto i; +; } +; +; d: +; for (int i = 0; i < n + 5; i++) ret += 2; +; goto i; +; +; e: +; for (int i = 1; i < n * 2; i++) ret += i; +; goto h; +; +; i: +; for (int i = 0; i < n + 5; i++) ret++; +; goto j; +; +; h: +; for (int i = 0; i < n; i++) ret++; +; goto j; +; +; j: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization19(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ] + %cmp = icmp sgt i32 %n, 5 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %while.body + %cmp2 = icmp eq i32 %n, 6 + br i1 %cmp2, label %for.cond, label %for.cond20 + +if.end: ; preds = %while.body + %inc = add nuw nsw i32 %i.0, 1 + %add = add nsw i32 %inc, %conv + %cmp5 = icmp sgt i32 %add, 3 + br i1 %cmp5, label %while.end, label %while.body + +while.end: ; preds = %if.end + %cmp9 = icmp eq i32 %n, 3 + br i1 %cmp9, label %h, label %i28 + +for.cond: ; preds = %for.body, %if.then + %ret.0 = phi i32 [ %add17, %for.body ], [ 0, %if.then ] + %storemerge3 = phi i32 [ %inc18, %for.body ], [ 0, %if.then ] + %add14 = add nsw i32 %n, 5 + %cmp15 = icmp slt i32 %storemerge3, %add14 + br i1 %cmp15, label %for.body, label %i28 + +for.body: ; preds = %for.cond + %add17 = add nuw nsw i32 %ret.0, 2 + %inc18 = add nuw nsw i32 %storemerge3, 1 + br label %for.cond + +for.cond20: ; preds = %for.body23, %if.then + %ret.1 = phi i32 [ %add24, %for.body23 ], [ 0, %if.then ] + %storemerge2 = phi i32 [ %inc26, %for.body23 ], [ 1, %if.then ] + %mul = shl nsw i32 %n, 1 + %cmp21 = icmp slt i32 %storemerge2, %mul + br i1 %cmp21, label %for.body23, label %h + +for.body23: ; preds = %for.cond20 + %add24 = add nuw nsw i32 %storemerge2, %ret.1 + %inc26 = add nuw nsw i32 %storemerge2, 1 + br label %for.cond20 + +i28: ; preds = %for.cond, %while.end + %ret.2 = phi i32 [ 0, %while.end ], [ %ret.0, %for.cond ] + br label %for.cond30 + +for.cond30: ; preds = %for.body34, %i28 + %ret.3 = phi i32 [ %ret.2, %i28 ], [ %inc35, %for.body34 ] + %storemerge = phi i32 [ 0, %i28 ], [ %inc37, %for.body34 ] + %add31 = add nsw i32 %n, 5 + %cmp32 = icmp slt i32 %storemerge, %add31 + br i1 %cmp32, label %for.body34, label %j + +for.body34: ; preds = %for.cond30 + %inc35 = add nuw nsw i32 %ret.3, 1 + %inc37 = add nuw nsw i32 %storemerge, 1 + br label %for.cond30 + +h: ; preds = %for.cond20, %while.end + %ret.4 = phi i32 [ 0, %while.end ], [ %ret.1, %for.cond20 ] + br label %for.cond40 + +for.cond40: ; preds = %for.body43, %h + %ret.5 = phi i32 [ %ret.4, %h ], [ %inc44, %for.body43 ] + %storemerge1 = phi i32 [ 0, %h ], [ %inc46, %for.body43 ] + %cmp41 = icmp slt i32 %storemerge1, %n + br i1 %cmp41, label %for.body43, label %j + +for.body43: ; preds = %for.cond40 + %inc44 = add nsw i32 %ret.5, 1 + %inc46 = add nuw nsw i32 %storemerge1, 1 + br label %for.cond40 + +j: ; preds = %for.cond40, %for.cond30 + %ret.6 = phi 
i32 [ %ret.3, %for.cond30 ], [ %ret.5, %for.cond40 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization19, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization19 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFEND:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: %[[CMP2:.+]] = icmp +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[IFTHENELSE:.+]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[IFTHENSPLIT:.+]]: +; CHECK: br i1 %[[CMP2MERGE:.+]], label %[[FORCONDPREHEADER:.+]], label %[[FORCOND20PREHEADER:.+]] + +; CHECK: [[FORCOND20PREHEADER]]: +; CHECK: br label %[[FORCOND20:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[IFEND]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: %[[CMP2MERGE]] = phi i1 [ %[[CMP2]], %[[IFTHEN]] ], [ false, %[[IFEND]] ] +; CHECK: br label %[[WHILEEND:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: br label %[[WHILEENDELSE:.+]] + +; CHECK: [[WHILEENDELSE]]: +; CHECK: br i1 %{{.+}}, label %[[IFTHENELSE]], label %[[IFTHENSPLIT]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[I28LOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND20]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY23:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY23]]: +; CHECK: br label %[[FORCOND20]] + +; CHECK: [[I28LOOPEXIT]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[I28:.+]]: +; CHECK: br label %[[FORCOND30:.+]] + +; CHECK: [[FORCOND30]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY34:.+]], label %[[JLOOPEXIT:.+]] + 
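+; (The {{(%[0-9A-Za-z\.]+)|(false)}} alternation used above accepts either a
+; named i1 condition or the literal constant false, presumably so the test
+; stays stable whether or not the preceding passes fold the loop-exit
+; condition to a constant.)
+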
+; CHECK: [[FORBODY34]]: +; CHECK: br label %[[FORCOND30]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[H]]: +; CHECK: br label %[[FORCOND40:.+]] + +; CHECK: [[FORCOND40]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY43:.+]], label %[[JLOOPEXIT2:.+]] + +; CHECK: [[FORBODY43]]: +; CHECK: br label %[[FORCOND40]] + +; CHECK: [[JLOOPEXIT]]: +; CHECK: br label %[[J:.+]] + +; CHECK: [[JLOOPEXIT2]]: +; CHECK: br label %[[I28]] + +; CHECK: [[J]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll new file mode 100644 index 0000000000000..9e59e6bf7092b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization2.ll @@ -0,0 +1,274 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization2 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; / \ +; / \ +; b c +; / \ / \ +; d e f g +; \ \ / / +; \ X / +; \ / \ / +; h i +; \ / +; j +; +; * where node a is a uniform branch, and nodes b and c are varying branches. +; * where nodes d, e, f, g are divergent. 
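+; (Each of d, e, f and g is selected by one of the varying branches b or c,
+; so within a single vector group some work-items may take d while others
+; take e, and likewise f or g on the other uniform side; this is why each
+; pair is chained onto one linearized path below.)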
+; +; With partial linearization we will have a CFG of the form: +; +; a +; / \ +; / \ +; / \ +; b c +; / \ +; e - d f - g +; \ / +; i +; | +; h +; | +; j +; +; __kernel void partial_linearization2(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (n < 10) { // uniform +; if (id % 3 == 0) { // varying +; for (int i = 0; i < n - 1; i++) { ret /= 2; } goto h; +; } else { // varying +; for (int i = 0; i < n / 3; i++) { ret -= 2; } goto i; +; } +; } else { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n * 2; i++) { ret += 1; } goto h; +; } else { // varying +; for (int i = 0; i < n + 5; i++) { ret *= 2; } goto i; +; } +; } +; +; h: +; ret += 5; +; goto end; +; +; i: +; ret *= 10; +; goto end; +; +; end: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization2(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 10 + br i1 %cmp, label %if.then, label %if.else17 + +if.then: ; preds = %entry + %rem = srem i32 %conv, 3 + %cmp2 = icmp eq i32 %rem, 0 + br i1 %cmp2, label %if.then4, label %if.else + +if.then4: ; preds = %if.then + br label %for.cond + +for.cond: ; preds = %for.body, %if.then4 + %ret.0 = phi i32 [ 0, %if.then4 ], [ %div, %for.body ] + %storemerge5 = phi i32 [ 0, %if.then4 ], [ %inc, %for.body ] + %sub = add nsw i32 %n, -1 + %cmp5 = icmp slt i32 %storemerge5, %sub + br i1 %cmp5, label %for.body, label %h + +for.body: ; preds = %for.cond + %div = sdiv i32 %ret.0, 2 + %inc = add nsw i32 %storemerge5, 1 + br label %for.cond + +if.else: ; preds = %if.then + br label %for.cond8 + +for.cond8: ; preds = %for.body12, %if.else + %ret.1 = phi i32 [ 0, %if.else ], [ %sub13, %for.body12 ] + %storemerge4 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div9 = sdiv i32 %n, 3 + %cmp10 = icmp slt i32 %storemerge4, %div9 + br i1 %cmp10, label %for.body12, label %i42 + +for.body12: ; preds = %for.cond8 + %sub13 = add nsw i32 %ret.1, -2 + %inc15 = add nsw i32 %storemerge4, 1 + br label %for.cond8 + +if.else17: ; preds = %entry + %rem181 = and i32 %conv, 1 + %cmp19 = icmp eq i32 %rem181, 0 + br i1 %cmp19, label %if.then21, label %if.else30 + +if.then21: ; preds = %if.else17 + br label %for.cond23 + +for.cond23: ; preds = %for.body26, %if.then21 + %ret.2 = phi i32 [ 0, %if.then21 ], [ %add, %for.body26 ] + %storemerge3 = phi i32 [ 0, %if.then21 ], [ %inc28, %for.body26 ] + %mul = shl nsw i32 %n, 1 + %cmp24 = icmp slt i32 %storemerge3, %mul + br i1 %cmp24, label %for.body26, label %h + +for.body26: ; preds = %for.cond23 + %add = add nsw i32 %ret.2, 1 + %inc28 = add nsw i32 %storemerge3, 1 + br label %for.cond23 + +if.else30: ; preds = %if.else17 + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.else30 + %ret.3 = phi i32 [ 0, %if.else30 ], [ %mul37, %for.body36 ] + %storemerge = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ] + %add33 = add nsw i32 %n, 5 + %cmp34 = icmp slt i32 %storemerge, %add33 + br i1 %cmp34, label %for.body36, label %i42 + +for.body36: ; preds = %for.cond32 + %mul37 = shl nsw i32 %ret.3, 1 + %inc39 = add nsw i32 %storemerge, 1 + br label %for.cond32 + +h: ; preds = %for.cond23, %for.cond + %ret.4 = phi i32 [ %ret.0, %for.cond ], [ %ret.2, %for.cond23 ] + %add41 = add nsw i32 %ret.4, 5 + br label 
%end + +i42: ; preds = %for.cond32, %for.cond8 + %ret.5 = phi i32 [ %ret.1, %for.cond8 ], [ %ret.3, %for.cond32 ] + %mul43 = mul nsw i32 %ret.5, 10 + br label %end + +end: ; preds = %i42, %h + %storemerge2 = phi i32 [ %mul43, %i42 ], [ %add41, %h ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization2, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization2 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE17:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br label %[[FORCOND8PREHEADER:.+]] + +; CHECK: [[FORCOND8PREHEADER:.+]]: +; CHECK: br label %[[FORCOND8:.+]] + +; CHECK: [[FORCONDPREHEADER:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND8]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[I42LOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND8]] + +; CHECK: [[IFELSE17]]: +; CHECK: br label %[[FORCOND32PREHEADER:.+]] + +; CHECK: [[FORCOND32PREHEADER]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[FORCOND23PREHEADER:.+]]: +; CHECK: br label %[[FORCOND23:.+]] + +; CHECK: [[FORCOND23]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26:.+]], label %[[HLOOPEXIT3:.+]] + +; CHECK: [[FORBODY26]]: +; CHECK: br label %[[FORCOND23]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 false, label %[[FORBODY36:.+]], label %[[I42LOOPEXIT4:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[I42:.+]] + +; CHECK: [[HLOOPEXIT3]]: +; CHECK: br label %[[I42]] + +; CHECK: [[H:.+]]: +; CHECK: br label %[[END:.+]] + +; CHECK: [[I42LOOPEXIT]]: +; CHECK: br label %[[FORCONDPREHEADER]] + +; CHECK: [[I42LOOPEXIT4]]: +; CHECK: br label %[[FORCOND23PREHEADER]] + +; 
CHECK: [[I42]]: +; CHECK: br label %[[H]] + +; CHECK: [[END]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll new file mode 100644 index 0000000000000..56369b161964e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization20.ll @@ -0,0 +1,236 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization20 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <--------. +; / \ | +; | c | +; | / \ | +; | f h <--. | +; | | / \ | | +; | | | d -' | +; | | | | | +; | | | e ---' +; | | | / +; | | | / +; | | |/ +; | | / +; \|/ +; g +; +; * where nodes b, d, and e are uniform branches, and node h is a varying +; branch. +; * where nodes b, d and g are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; b <--. +; | | +; c | +; /| | +; f h <. 
| +; | | | | +; | d -' | +; | | | +; | e ---' +; \| +; g +; +; __kernel void partial_linearization20(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0 && n < 5) { +; goto g; +; } +; if (n == 6) { +; goto f; +; } +; while (1) { +; if (ret++ + id >= n) { +; goto d; +; } +; if (n & 1) { +; goto g; +; } +; +; d: +; if (n > 3) { +; goto e; +; } +; } +; e: +; if (n & 1) { +; goto g; +; } +; } +; +; f: +; for (int i = 0; i < n + 1; i++) ret++; +; g: +; out[id] = ret; +; } + +; ModuleID = 'kernel.opencl' +source_filename = "kernel.opencl" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization20(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %e, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %inc, %e ] + %n.off = add i32 %n, -1 + %0 = icmp ult i32 %n.off, 4 + br i1 %0, label %g, label %if.end + +if.end: ; preds = %while.body + %cmp4 = icmp eq i32 %n, 6 + br i1 %cmp4, label %for.cond, label %while.body9 + +while.body9: ; preds = %d, %if.end + %ret.1 = phi i32 [ %ret.0, %if.end ], [ %inc, %d ] + %inc = add nsw i32 %ret.1, 1 + %add = add nsw i32 %ret.1, %conv + %cmp10 = icmp sge i32 %add, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + %or.cond1 = or i1 %tobool, %cmp10 + br i1 %or.cond1, label %d, label %g + +d: ; preds = %while.body9 + %cmp16 = icmp sgt i32 %n, 3 + br i1 %cmp16, label %e, label %while.body9 + +e: ; preds = %d + %and20 = and i32 %n, 1 + %tobool21 = icmp eq i32 %and20, 0 + br i1 %tobool21, label %while.body, label %g + +for.cond: ; preds = %for.body, %if.end + %ret.2 = phi i32 [ %inc27, %for.body ], [ %ret.0, %if.end ] + %storemerge = phi i32 [ %inc28, %for.body ], [ 0, %if.end ] + %cmp25 = icmp sgt i32 %storemerge, %n + br i1 %cmp25, label %g, label %for.body + +for.body: ; preds = %for.cond + %inc27 = add nsw i32 %ret.2, 1 + %inc28 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +g: ; preds = %for.cond, %e, %while.body9, %while.body + %ret.3 = phi i32 [ %ret.0, %while.body ], [ %inc, %e ], [ %ret.2, %for.cond ], [ %inc, %while.body9 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" 
"use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization20, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization20 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[IFEND:.+]] + +; CHECK: [[IFEND]]: +; CHECK: %[[CMP4:.+]] = icmp +; CHECK: br i1 %[[CMP4]], label %[[FORCONDPREHEADER:.+]], label %[[WHILEBODY9PREHEADER:.+]] + +; CHECK: [[WHILEBODY9PREHEADER]]: +; CHECK: br label %[[WHILEBODY9:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[FORCONDPREHEADERELSE:.+]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[FORCONDPREHEADERSPLIT:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[WHILEBODY9]]: +; CHECK: br label %[[D:.+]] + +; CHECK: [[D]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY9]], label %[[WHILEBODY9PUREEXIT:.+]] + +; CHECK: [[WHILEBODY9PUREEXIT]]: +; CHECK: br label %[[E:.+]] + +; CHECK: [[E]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[GLOOPEXIT1:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[GLOOPEXIT:.+]], label %[[FORBODY:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[GLOOPEXIT]]: +; CHECK: br label %[[G]] + +; CHECK: [[GLOOPEXIT1]]: +; CHECK: br label %[[GLOOPEXIT1ELSE:.+]] + +; CHECK: [[GLOOPEXIT1ELSE]]: +; CHECK: br label %[[GLOOPEXIT2:.+]] + +; CHECK: [[GLOOPEXIT2]]: +; CHECK: br label %[[GLOOPEXIT2ELSE:.+]] + +; CHECK: [[GLOOPEXIT2ELSE]]: +; CHECK: br i1 %{{.+}}, label %[[FORCONDPREHEADERELSE]], label %[[FORCONDPREHEADERSPLIT]] + +; CHECK: [[G]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll new file mode 100644 index 0000000000000..bc11225496785 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization21.ll @@ -0,0 +1,197 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization21 -vecz-passes=vecz-loop-rotate,cfg-convert -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <------. +; / \ | +; | c <--. 
| +; | / \ | | +; | | d -' | +; | | / \ | +; | | | e -' +; | | | / +; | | | / +; | | |/ +; | | / +; \|/ +; f +; +; * where nodes b, d, and e are uniform branches, and node c is a varying +; branch. +; * where nodes b, d, e and f are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; b <--. +; | | +; c <. | +; | | | +; d -' | +; | | +; e ---' +; | +; f +; +; __kernel void partial_linearization21(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0 && n < 5) { +; goto f; +; } +; while (1) { +; if (n <= 2) { +; goto f; +; } else { +; if (ret + id >= n) { +; goto d; +; } +; } +; if (n & 1) { +; goto f; +; } +; +; d: +; if (n > 3) { +; goto e; +; } +; } +; +; e: +; if (n & 1) { +; goto f; +; } +; } +; +; f: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization21(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %e, %entry + %n.off = add i32 %n, -1 + %0 = icmp ult i32 %n.off, 4 + %cmp6 = icmp slt i32 %n, 3 + %or.cond1 = or i1 %cmp6, %0 + br i1 %or.cond1, label %f, label %if.else + +while.body5: ; preds = %d + %cmp6.old = icmp eq i32 %n, 3 + br i1 %cmp6.old, label %if.else, label %f + +if.else: ; preds = %while.body5, %while.body + %cmp9 = icmp sge i32 %conv, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + %or.cond2 = or i1 %tobool, %cmp9 + br i1 %or.cond2, label %d, label %f + +d: ; preds = %if.else + %cmp16 = icmp sgt i32 %n, 3 + br i1 %cmp16, label %e, label %while.body5 + +e: ; preds = %d + %and20 = and i32 %n, 1 + %tobool21 = icmp eq i32 %and20, 0 + br i1 %tobool21, label %while.body, label %f + +f: ; preds = %e, %if.else, %while.body5, %while.body + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 0, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization21, !3, !4, !5, 
!6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization21 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[IFELSEPREHEADER:.+]] + +; CHECK: [[IFELSEPREHEADER]]: +; CHECK: br label %[[IFELSE:.+]] + +; CHECK: [[WHILEBODY5:.+]]: + +; CHECK: br i1 %{{.+}}, label %[[IFELSE]], label %[[IFELSEPUREEXIT:.+]] + +; CHECK: [[IFELSEPUREEXIT]]: +; CHECK: br label %[[E:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[D:.+]] + +; CHECK: [[D]]: +; CHECK: br label %[[WHILEBODY5]] + +; CHECK: [[E]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[FLOOPEXIT:.+]] + +; CHECK: [[FLOOPEXIT]]: +; CHECK: br label %[[FLOOPEXITELSE:.+]] + +; CHECK: [[FLOOPEXITELSE]]: +; CHECK: br label %[[FLOOPEXIT1:.+]] + +; CHECK: [[FLOOPEXIT1]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[F]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll new file mode 100644 index 0000000000000..7be8b4bbc187d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization22.ll @@ -0,0 +1,263 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization22 -vecz-passes="function(lower-switch),vecz-loop-rotate,indvars,cfg-convert" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <------. +; / \ | +; f c <--. | +; |\ / \ | | +; | | d -' | +; | |\ / \ | +; | | | e -' +; | | |\ / +; | | | g +; | | |/ +; | | / +; \|/ +; h +; +; * where nodes b, d, and e are uniform branches, and node c is a varying +; branch. +; * where nodes b, d, e and f are divergent. +; +; With partial linearization, it will be transformed as follows: +; +; a +; | +; b <--. +; /| | +; f c <. 
| +; | | | | +; | d -' | +; | | | +; | e ---' +; \| +; g +; | +; h +; +; __kernel void partial_linearization22(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n > 0 && n < 5) { +; goto f; +; } +; while (1) { +; if (n <= 2) { +; goto f; +; } else { +; if (ret + id >= n) { +; goto d; +; } +; } +; if (n & 1) { +; goto h; +; } +; +; d: +; if (n > 3) { +; goto e; +; } +; } +; +; e: +; if (n & 1) { +; goto g; +; } +; } +; +; f: +; if (n == 2) { +; goto h; +; } +; +; g: +; for (int i = 0; i < n + 1; i++) ret++; +; goto h; +; +; h: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization22(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %e, %entry + %n.off = add i32 %n, -1 + %0 = icmp ult i32 %n.off, 4 + %cmp6 = icmp slt i32 %n, 3 + %or.cond1 = or i1 %cmp6, %0 + br i1 %or.cond1, label %f, label %if.else + +while.body5: ; preds = %d + switch i32 %n, label %g [ + i32 3, label %if.else + i32 2, label %h + ] + +if.else: ; preds = %while.body5, %while.body + %cmp9 = icmp sge i32 %conv, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + %or.cond2 = or i1 %tobool, %cmp9 + br i1 %or.cond2, label %d, label %h + +d: ; preds = %if.else + %cmp16 = icmp sgt i32 %n, 3 + br i1 %cmp16, label %e, label %while.body5 + +e: ; preds = %d + %and20 = and i32 %n, 1 + %tobool21 = icmp eq i32 %and20, 0 + br i1 %tobool21, label %while.body, label %g + +f: ; preds = %while.body + %cmp24 = icmp eq i32 %n, 2 + br i1 %cmp24, label %h, label %g + +g: ; preds = %f, %e, %while.body5 + br label %for.cond + +for.cond: ; preds = %for.body, %g + %ret.0 = phi i32 [ 0, %g ], [ %inc, %for.body ] + %storemerge = phi i32 [ 0, %g ], [ %inc31, %for.body ] + %cmp29 = icmp sgt i32 %storemerge, %n + br i1 %cmp29, label %h, label %for.body + +for.body: ; preds = %for.cond + %inc = add nuw nsw i32 %ret.0, 1 + %inc31 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +h: ; preds = %for.cond, %f, %if.else, %while.body5 + %ret.1 = phi i32 [ 0, %f ], [ %ret.0, %for.cond ], [ 0, %if.else ], [ 0, %while.body5 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.1, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" 
"stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization22, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization22 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP6:.+]] = icmp slt +; CHECK: %[[ORCOND1:.+]] = or i1 %[[CMP6]] +; CHECK: %[[F_EXIT_MASK:.+]] = select i1 +; CHECK: %[[ORCOND2:.+]] = call i1 @__vecz_b_divergence_any(i1 %[[ORCOND1]]) +; CHECK: br i1 %[[ORCOND2]], label %[[F:.+]], label %[[IFELSEPREHEADER:.+]] + +; CHECK: [[IFELSEPREHEADER]]: +; CHECK: br label %[[IFELSE:.+]] + +; CHECK: [[LEAFBLOCK1:.*]]: +; CHECK: %[[SWITCHLEAF:.+]] = icmp eq i32 %n, 3 +; CHECK: br i1 %{{.+}}, label %[[IFELSE]], label %[[IFELSEPUREEXIT:.+]] + +; CHECK: [[IFELSEPUREEXIT]]: +; CHECK: br label %[[E:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[D:.+]] + +; CHECK: [[D]]: +; CHECK: br label %[[LEAFBLOCK1]] + +; CHECK: [[E]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: %[[CMP24MERGE:.+]] = phi i1 [ %[[G_EXIT_MASK:.+]], %[[F]] ], [ false, %[[E]] ] +; CHECK: br label %[[HLOOPEXIT1:.+]] + +; CHECK: [[F]]: +; CHECK: %[[CMP24:.+]] = icmp eq i32 %n, 2 +; CHECK: %[[G_EXIT_MASK]] = select i1 %[[CMP24]], i1 false, i1 %[[F_EXIT_MASK]] +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[FELSE:.+]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[FSPLIT:.+]]: +; CHECK: %[[CMP24_ANY:.+]] = call i1 @__vecz_b_divergence_any(i1 %cmp24.merge) +; CHECK: br i1 %[[CMP24_ANY]], label %[[H:.+]], label %[[G]] + +; CHECK: [[GLOOPEXIT:.+]]: +; CHECK: br label %[[GLOOPEXITELSE:.+]] + +; CHECK: [[GLOOPEXITELSE]]: +; CHECK: br i1 %{{.+}}, label %[[FELSE]], label %[[FSPLIT]] + +; CHECK: [[G]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 true, label %[[HLOOPEXIT:.+]], label %[[FORBODY:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + + + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[HLOOPEXIT1]]: +; CHECK: br label %[[HLOOPEXIT1ELSE:.+]] + +; CHECK: [[HLOOPEXIT1ELSE]]: +; CHECK: br label %[[GLOOPEXIT]] + +;; CHECK: [[H]]: +;; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll new file mode 100644 index 0000000000000..58a1f2548f38e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization23.ll @@ -0,0 +1,247 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization23 -vecz-passes=cfg-convert -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; / \ +; / \ +; b c +; / \ / \ +; d e f g +; \ \ / / +; \ X / +; \ / \ / +; h i +; \ / +; j +; +; * where node a is a uniform branch, and nodes b and c are varying branches. +; * where nodes d, e, f and g are divergent. +; +; With partial linearization we will have a CFG of the form: +; +; a +; / \ +; / \ +; / \ +; b c +; / \ +; e - d f - g +; \ / +; i +; | +; h +; | +; j +; +; The purpose of this test is to make sure we correctly handle blending in `i`, +; which cannot be considered a blend block since it is not the join point of +; either div-causing block. +; We want to make sure the incoming values of the phi nodes in `i` are correctly +; translated into select instructions for the predecessors which get linearized. +; +; __kernel void partial_linearization23(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (n > 10) { +; if (id % 3 == 0) { +; ret = n - 1; goto h; +; } else { +; for (int i = 0; i < n / 3; i++) { ret += 2; } goto i; +; } +; } else { +; if (id % 2 == 0) { +; ret = n * 2; goto h; +; } else { +; for (int i = 0; i < n + 5; i++) { ret *= 2; } goto i; +; } +; } +; +; h: +; ret += 5; +; goto end; +; +; i: +; ret *= 10; +; goto end; +; +; end: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @partial_linearization23(i32 addrspace(1)* %out, i32 %n) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %n, 10 + br i1 %cmp, label %if.then, label %if.else7 + +if.then: ; preds = %entry + %rem = srem i32 %conv, 3 + %cmp2 = icmp eq i32 %rem, 0 + br i1 %cmp2, label %if.then4, label %for.cond.preheader + +for.cond.preheader: ; preds = %if.then + %div = sdiv i32 %n, 3 + %cmp52 = icmp sgt i32 %n, 2 + br i1 %cmp52, label %for.body.lr.ph, label %i24 + +for.body.lr.ph: ; preds = %for.cond.preheader + %min.iters.check = icmp ult i32 %div, 8 + br i1 %min.iters.check, label %scalar.ph, label %vector.ph + +vector.ph: ; preds = %for.body.lr.ph + %n.vec = and i32 %div, -8 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi6 = phi i32 [ 0, %vector.ph ], [ %0, %vector.body ] + %vec.phi11 = phi i32 [ 0, %vector.ph ], [ %1, %vector.body ] + %vec.phi17 = phi i32 [ 0, %vector.ph ], [ %2, %vector.body ] + %vec.phi22 = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ] + %vec.phi104 = phi i32 [ 0, %vector.ph ], [ %4, %vector.body ] + %vec.phi109 = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ] + %vec.phi1015 = phi i32 [ 0, %vector.ph ], [ %6, %vector.body ] + %vec.phi1020 = phi i32 [ 0, %vector.ph ], [ %7, %vector.body ] + %0 = add nuw nsw i32 %vec.phi6, 
2 + %1 = add nuw nsw i32 %vec.phi11, 2 + %2 = add nuw nsw i32 %vec.phi17, 2 + %3 = add nuw nsw i32 %vec.phi22, 2 + %4 = add nuw nsw i32 %vec.phi104, 2 + %5 = add nuw nsw i32 %vec.phi109, 2 + %6 = add nuw nsw i32 %vec.phi1015, 2 + %7 = add nuw nsw i32 %vec.phi1020, 2 + %index.next = add i32 %index, 8 + %8 = icmp eq i32 %index.next, %n.vec + br i1 %8, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %.lcssa25 = phi i32 [ %0, %vector.body ] + %.lcssa210 = phi i32 [ %1, %vector.body ] + %.lcssa216 = phi i32 [ %2, %vector.body ] + %.lcssa221 = phi i32 [ %3, %vector.body ] + %.lcssa3 = phi i32 [ %4, %vector.body ] + %.lcssa8 = phi i32 [ %5, %vector.body ] + %.lcssa14 = phi i32 [ %6, %vector.body ] + %.lcssa19 = phi i32 [ %7, %vector.body ] + %bin.rdx7 = add nuw i32 %.lcssa3, %.lcssa25 + %bin.rdx12 = add nuw i32 %.lcssa8, %.lcssa210 + %bin.rdx18 = add nuw i32 %.lcssa14, %.lcssa216 + %bin.rdx23 = add nuw i32 %.lcssa19, %.lcssa221 + %bin.rdx1113 = add i32 %bin.rdx7, %bin.rdx12 + %bin.rdx1124 = add i32 %bin.rdx18, %bin.rdx23 + %bin.rdx1325 = add i32 %bin.rdx1113, %bin.rdx1124 + %cmp.n = icmp eq i32 %div, %n.vec + br i1 %cmp.n, label %i24, label %scalar.ph + +scalar.ph: ; preds = %middle.block, %for.body.lr.ph + %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %for.body.lr.ph ] + %bc.merge.rdx = phi i32 [ %bin.rdx1325, %middle.block ], [ 0, %for.body.lr.ph ] + %9 = add i32 %bc.resume.val, 1 + %10 = icmp sgt i32 %div, %9 + %smax = select i1 %10, i32 %div, i32 %9 + %11 = shl i32 %smax, 1 + %12 = shl i32 %bc.resume.val, 1 + br label %for.body + +if.then4: ; preds = %if.then + %sub = add nsw i32 %n, -1 + br label %h + +for.body: ; preds = %for.body, %scalar.ph + %storemerge44 = phi i32 [ %bc.resume.val, %scalar.ph ], [ %inc, %for.body ] + %inc = add nuw nsw i32 %storemerge44, 1 + %cmp5 = icmp slt i32 %inc, %div + br i1 %cmp5, label %for.body, label %i24.loopexit + +if.else7: ; preds = %entry + %rem81 = and i32 %conv, 1 + %cmp9 = icmp eq i32 %rem81, 0 + br i1 %cmp9, label %if.then11, label %for.cond14.preheader + +for.cond14.preheader: ; preds = %if.else7 + %add15 = add nsw i32 %n, 5 + %cmp165 = icmp sgt i32 %add15, 0 + br i1 %cmp165, label %for.body18.preheader, label %i24 + +for.body18.preheader: ; preds = %for.cond14.preheader + %13 = add i32 %n, 5 + br label %for.body18 + +if.then11: ; preds = %if.else7 + %mul = shl nsw i32 %n, 1 + br label %h + +for.body18: ; preds = %for.body18.preheader, %for.body18 + %storemerge7 = phi i32 [ %inc21, %for.body18 ], [ 0, %for.body18.preheader ] + %ret.16 = phi i32 [ %mul19, %for.body18 ], [ 0, %for.body18.preheader ] + %mul19 = shl nsw i32 %ret.16, 1 + %inc21 = add nuw nsw i32 %storemerge7, 1 + %exitcond = icmp ne i32 %inc21, %13 + br i1 %exitcond, label %for.body18, label %i24.loopexit1 + +h: ; preds = %if.then11, %if.then4 + %storemerge3 = phi i32 [ %mul, %if.then11 ], [ %sub, %if.then4 ] + %add23 = add nsw i32 %storemerge3, 5 + br label %end + +i24.loopexit: ; preds = %for.body + %14 = add i32 %bc.merge.rdx, %11 + %15 = sub i32 %14, %12 + br label %i24 + +i24.loopexit1: ; preds = %for.body18 + %mul19.lcssa = phi i32 [ %mul19, %for.body18 ] + br label %i24 + +i24: ; preds = %i24.loopexit1, %i24.loopexit, %for.cond14.preheader, %middle.block, %for.cond.preheader + %ret.2 = phi i32 [ 0, %for.cond.preheader ], [ %bin.rdx1325, %middle.block ], [ 0, %for.cond14.preheader ], [ %15, %i24.loopexit ], [ %mul19.lcssa, %i24.loopexit1 ] + %mul25 = mul nsw i32 %ret.2, 10 + br label %end + +end: ; preds = %i24, %h + %storemerge2 = phi 
i32 [ %mul25, %i24 ], [ %add23, %h ] + %sext = shl i64 %call, 32 + %idxprom = ashr exact i64 %sext, 32 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization23 +; CHECK: i24: +; CHECK: %i24.entry_mask{{.+}} = or i1 +; CHECK: %i24.entry_mask{{.+}} = or i1 +; CHECK: %i24.entry_mask{{.+}} = or i1 +; CHECK: %i24.entry_mask{{.+}} = or i1 + diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll new file mode 100644 index 0000000000000..ffabf74a42b22 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization3.ll @@ -0,0 +1,269 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization3 -vecz-passes="function(instcombine,simplifycfg),mergereturn,vecz-loop-rotate,function(loop(indvars)),cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; / \ +; / \ +; b c +; / \ / \ +; d e f g +; \ \ / / +; \ h / +; \ \ / +; \ i +; \ / +; j +; +; * where node a is a uniform branch, and nodes b and c are varying branches. +; * where nodes d, e, f, g, i and j are divergent. 
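+;
+; As an illustrative sketch of what the cfg-convert pass does with the varying
+; branch in node b (`br i1 %cmp2, label %if.then4, label %if.else` below), it
+; computes a per-lane entry mask for each successor, conceptually:
+;
+;   %if.then4.entry_mask = and i1 %if.then.entry_mask, %cmp2
+;   %if.else.entry_mask = and i1 %if.then.entry_mask, %cmp2.not
+;
+; so that both successors can execute unconditionally once the CFG is
+; linearized, with results blended back via selects. The mask names above are
+; hypothetical and are not part of the CHECK-ed output.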
+; +; With partial linearization we will have a CFG of the form: +; +; a +; / \ +; b c +; | | +; e g +; | | +; d f +; \ / +; h +; | +; i +; | +; j +; +; __kernel void partial_linearization3(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (n < 10) { // uniform +; if (id % 3 == 0) { // varying +; for (int i = 0; i < n - 1; i++) { ret /= 2; } goto end; +; } else { // varying +; for (int i = 0; i < n / 3; i++) { ret -= 2; } goto h; +; } +; } else { // uniform +; if (id % 2 == 0) { // varying +; for (int i = 0; i < n * 2; i++) { ret += 1; } goto h; +; } else { // varying +; for (int i = 0; i < n + 5; i++) { ret *= 2; } goto i; +; } +; } +; +; h: +; ret += 5; +; +; i: +; ret *= 10; +; +; end: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization3(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp slt i32 %n, 10 + br i1 %cmp, label %if.then, label %if.else17 + +if.then: ; preds = %entry + %rem = srem i32 %conv, 3 + %cmp2 = icmp eq i32 %rem, 0 + br i1 %cmp2, label %if.then4, label %if.else + +if.then4: ; preds = %if.then + br label %for.cond + +for.cond: ; preds = %for.body, %if.then4 + %ret.0 = phi i32 [ 0, %if.then4 ], [ %div, %for.body ] + %storemerge4 = phi i32 [ 0, %if.then4 ], [ %inc, %for.body ] + %sub = add nsw i32 %n, -1 + %cmp5 = icmp slt i32 %storemerge4, %sub + br i1 %cmp5, label %for.body, label %end + +for.body: ; preds = %for.cond + %div = sdiv i32 %ret.0, 2 + %inc = add nsw i32 %storemerge4, 1 + br label %for.cond + +if.else: ; preds = %if.then + br label %for.cond8 + +for.cond8: ; preds = %for.body12, %if.else + %ret.1 = phi i32 [ 0, %if.else ], [ %sub13, %for.body12 ] + %storemerge3 = phi i32 [ 0, %if.else ], [ %inc15, %for.body12 ] + %div9 = sdiv i32 %n, 3 + %cmp10 = icmp slt i32 %storemerge3, %div9 + br i1 %cmp10, label %for.body12, label %h + +for.body12: ; preds = %for.cond8 + %sub13 = add nsw i32 %ret.1, -2 + %inc15 = add nsw i32 %storemerge3, 1 + br label %for.cond8 + +if.else17: ; preds = %entry + %rem181 = and i32 %conv, 1 + %cmp19 = icmp eq i32 %rem181, 0 + br i1 %cmp19, label %if.then21, label %if.else30 + +if.then21: ; preds = %if.else17 + br label %for.cond23 + +for.cond23: ; preds = %for.body26, %if.then21 + %ret.2 = phi i32 [ 0, %if.then21 ], [ %add, %for.body26 ] + %storemerge2 = phi i32 [ 0, %if.then21 ], [ %inc28, %for.body26 ] + %mul = shl nsw i32 %n, 1 + %cmp24 = icmp slt i32 %storemerge2, %mul + br i1 %cmp24, label %for.body26, label %h + +for.body26: ; preds = %for.cond23 + %add = add nsw i32 %ret.2, 1 + %inc28 = add nsw i32 %storemerge2, 1 + br label %for.cond23 + +if.else30: ; preds = %if.else17 + br label %for.cond32 + +for.cond32: ; preds = %for.body36, %if.else30 + %ret.3 = phi i32 [ 0, %if.else30 ], [ %mul37, %for.body36 ] + %storemerge = phi i32 [ 0, %if.else30 ], [ %inc39, %for.body36 ] + %add33 = add nsw i32 %n, 5 + %cmp34 = icmp slt i32 %storemerge, %add33 + br i1 %cmp34, label %for.body36, label %i42 + +for.body36: ; preds = %for.cond32 + %mul37 = shl nsw i32 %ret.3, 1 + %inc39 = add nsw i32 %storemerge, 1 + br label %for.cond32 + +h: ; preds = %for.cond23, %for.cond8 + %ret.4 = phi i32 [ %ret.1, %for.cond8 ], [ %ret.2, %for.cond23 ] + %add41 = add nsw i32 %ret.4, 5 + br label %i42 + +i42: ; preds = %h, 
%for.cond32 + %ret.5 = phi i32 [ %add41, %h ], [ %ret.3, %for.cond32 ] + %mul43 = mul nsw i32 %ret.5, 10 + br label %end + +end: ; preds = %i42, %for.cond + %ret.6 = phi i32 [ %mul43, %i42 ], [ %ret.0, %for.cond ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.6, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization3, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization3 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE17:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br label %[[FORCOND8PREHEADER:.+]] + +; CHECK: [[FORCOND8PREHEADER:.+]]: +; CHECK: br label %[[FORCOND8:.+]] + +; CHECK: [[FORCONDPREHEADER:.+]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[CMP5:.+]] = icmp +; CHECK: br i1 %[[CMP5]], label %[[FORBODY:.+]], label %[[ENDLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND8]]: +; CHECK: %[[CMP10:.+]] = icmp +; CHECK: br i1 %[[CMP10]], label %[[FORBODY12:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY12]]: +; CHECK: br label %[[FORCOND8]] + +; CHECK: [[IFELSE17]]: +; CHECK: br label %[[FORCOND32PREHEADER:.+]] + +; CHECK: [[FORCOND32PREHEADER]]: +; CHECK: br label %[[FORCOND32:.+]] + +; CHECK: [[FORCOND23PREHEADER:.+]]: +; CHECK: br label %[[FORCOND23:.+]] + +; CHECK: [[FORCOND23]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY26:.+]], label %[[HLOOPEXIT2:.+]] + +; CHECK: [[FORBODY26]]: +; CHECK: br label %[[FORCOND23]] + +; CHECK: [[FORCOND32]]: +; CHECK: br i1 false, label %[[FORBODY36:.+]], label %[[ENDLOOPEXIT2:.+]] + +; CHECK: [[FORBODY36]]: +; CHECK: br label %[[FORCOND32]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[FORCONDPREHEADER]] + +; CHECK: [[HLOOPEXIT2]]: +; CHECK: br label %[[H:.+]] + +; CHECK: [[H]]: +; CHECK: br label %[[END:.+]] + +; CHECK: [[ENDLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[ENDLOOPEXIT2]]: +; CHECK: br label %[[FORCOND23PREHEADER]] + +; CHECK: [[END]]: +; CHECK: ret void diff --git 
a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll new file mode 100644 index 0000000000000..a9158f7ff59c7 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization4.ll @@ -0,0 +1,195 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization4 -vecz-passes=cfg-convert,cleanup-divergence -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-. +; / \ | +; e c | +; | / \| +; | f d +; |/ +; g +; +; * where node b is a uniform branch, and node c is a varying branch. +; * where nodes f, d and g are divergent. +; +; With partial linearization we will have a CFG of the form: +; +; a +; | +; b <--. +; / \ | +; e c | +; | | | +; | d -' +; \ / +; f +; | +; g +; +; __kernel void partial_linearization4(__global int *out, int n) { +; int id = get_global_id(0); +; +; int x = id / n; +; int y = id % n; +; int i = 0; +; for (;;) { +; if (n > 20) goto e; +; if (x + y > n) goto f; +; y++; +; x++; +; i++; +; } +; +; goto g; +; +; e: +; i *= 2 + n; +; goto g; +; +; f: +; i /= i + n; +; +; g: +; out[id] = x + y + i; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization4(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %0 = icmp eq i32 %conv, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %conv, %5 + %6 = icmp eq i32 %conv, -2147483648 + %7 = icmp eq i32 %n, -1 + %8 = and i1 %7, %6 + %9 = icmp eq i32 %n, 0 + %10 = or i1 %9, %8 + %11 = select i1 %10, i32 1, i32 %n + %rem = srem i32 %conv, %11 + br label %for.cond + +for.cond: ; preds = %if.end5, %entry + %x.0 = phi i32 [ %div, %entry ], [ %inc6, %if.end5 ] + %y.0 = phi i32 [ %rem, %entry ], [ %inc, %if.end5 ] + %storemerge = phi i32 [ 0, %entry ], [ %inc7, %if.end5 ] + %cmp = icmp sgt i32 %n, 20 + br i1 %cmp, label %e, label %if.end + +if.end: ; preds = %for.cond + %add = add nsw i32 %y.0, %x.0 + %cmp2 = icmp sgt i32 %add, %n + br i1 %cmp2, label %f, label %if.end5 + +if.end5: ; preds = %if.end + %inc = add nsw i32 %y.0, 1 + %inc6 = add nsw i32 %x.0, 1 + %inc7 = add nsw i32 %storemerge, 1 + br label %for.cond + +e: ; preds = %for.cond + %add8 = add nsw i32 %n, 2 + %mul = mul nsw i32 %storemerge, %add8 + br label %g + +f: ; preds = %if.end + %add9 = add nsw i32 %storemerge, %n + %12 = icmp eq i32 %add9, 0 + %13 = select i1 %12, i32 1, i32 %add9 + %div10 = sdiv i32 %storemerge, %13 + br 
label %g + +g: ; preds = %f, %e + %storemerge1 = phi i32 [ %div10, %f ], [ %mul, %e ] + %add11 = add i32 %y.0, %x.0 + %add12 = add i32 %add11, %storemerge1 + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %add12, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization4, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization4 +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[E:.+]], label %[[IFEND:.+]] + +; CHECK: [[IFEND]]: +; CHECK: br label %[[IFEND5:.+]] + +; CHECK: [[IFEND5]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND:.+]], label %[[FORCONDPUREEXIT:.+]] + +; CHECK: [[FORCONDPUREEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[E]]: +; CHECK: br label %[[FORCONDPUREEXIT]] + +; CHECK: [[EELSE:.+]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[ESPLIT:.+]]: +; CHECK: br label %[[G]] + +; CHECK: [[F]]: +; CHECK: br label %[[FELSE:.+]] + +; CHECK: [[FELSE]]: +; CHECK: br i1 %{{.+}}, label %[[EELSE]], label %[[ESPLIT]] + +; CHECK: [[G]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll new file mode 100644 index 0000000000000..a65b8bad7dd25 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization5.ll @@ -0,0 +1,221 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization5 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; |\ / \ +; | d e +; | \ / +; | f +; \ / +; g +; +; * where node c is a uniform branch, and nodes a and b are varying branches. +; * where nodes b, c, d, f, g are divergent. +; +; With partial linearization we will have a CFG of the form: +; +; a +; | +; c +; / \ +; | e +; \ / +; b +; | +; d +; | +; f +; | +; g +; +; __kernel void partial_linearization5(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; if (id % 2 == 0) { // a +; if (id == 4) { // b +; goto g; +; } else { +; goto d; +; } +; } else { // c +; if (n % 2 == 0) { +; goto d; +; } else { +; goto e; +; } +; } +; +; d: +; for (int i = 0; i < n / 4; i++) { ret += i - 2; } +; goto f; +; +; e: +; for (int i = 0; i < n + 5; i++) { ret += i + 5; } +; +; f: +; ret *= ret % n; +; ret *= ret + 4; +; +; g: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization5(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %rem1 = and i32 %conv, 1 + %cmp = icmp eq i32 %rem1, 0 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + %cmp2 = icmp eq i32 %conv, 4 + br i1 %cmp2, label %g, label %d + +if.else5: ; preds = %entry + %rem62 = and i32 %n, 1 + %cmp7 = icmp eq i32 %rem62, 0 + br i1 %cmp7, label %d, label %e + +d: ; preds = %if.else5, %if.then + br label %for.cond + +for.cond: ; preds = %for.body, %d + %ret.0 = phi i32 [ 0, %d ], [ %add, %for.body ] + %storemerge3 = phi i32 [ 0, %d ], [ %inc, %for.body ] + %div = sdiv i32 %n, 4 + %cmp11 = icmp slt i32 %storemerge3, %div + br i1 %cmp11, label %for.body, label %f + +for.body: ; preds = %for.cond + %sub = add i32 %ret.0, -2 + %add = add i32 %sub, %storemerge3 + %inc = add nsw i32 %storemerge3, 1 + br label %for.cond + +e: ; preds = %if.else5 + br label %for.cond14 + +for.cond14: ; preds = %for.body18, %e + %ret.1 = phi i32 [ 0, %e ], [ %add20, %for.body18 ] + %storemerge = phi i32 [ 0, %e ], [ %inc22, %for.body18 ] + %add15 = add nsw i32 %n, 5 + %cmp16 = icmp slt i32 %storemerge, %add15 + br i1 %cmp16, label %for.body18, label %f + +for.body18: ; preds = %for.cond14 + %add19 = add i32 %ret.1, 5 + %add20 = add i32 %add19, %storemerge + %inc22 = add nsw i32 %storemerge, 1 + br label %for.cond14 + +f: ; preds = %for.cond14, %for.cond + %ret.2 = phi i32 [ %ret.0, %for.cond ], [ %ret.1, %for.cond14 ] + %0 = icmp eq i32 %ret.2, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %rem24 = srem i32 %ret.2, %5 + %mul = mul nsw i32 %rem24, %ret.2 + %add25 = add nsw i32 %mul, 4 + %mul26 = mul nsw i32 %add25, %mul + br label %g + +g: ; preds = %f, %if.then + %ret.3 = phi i32 [ %mul26, %f ], [ 0, %if.then ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; 
Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization5, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization5 +; CHECK: br label %[[IFELSE5:.+]] + +; CHECK: [[IFTHEN:.+]]: +; CHECK: br label %[[D:.+]] + +; CHECK: [[IFELSE5]]: +; CHECK: %[[CMP7:.+]] = icmp +; CHECK: br i1 %[[CMP7]], label %[[IFTHEN]], label %[[FORCOND14PREHEADER:.+]] + +; CHECK: [[FORCOND14PREHEADER]]: +; CHECK: br label %[[FORCOND14:.+]] + +; CHECK: [[D]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: %[[CMP11:.+]] = icmp +; CHECK: br i1 %[[CMP11]], label %[[FORBODY:.+]], label %[[FLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FORCOND14]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY18:.+]], label %[[FLOOPEXIT2:.+]] + +; CHECK: [[FORBODY18]]: +; CHECK: br label %[[FORCOND14]] + +; CHECK: [[FLOOPEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[FLOOPEXIT2]]: +; CHECK: br label %[[IFTHEN]] + +; CHECK: [[F]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[G]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll new file mode 100644 index 0000000000000..5425139b5d888 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization6.ll @@ -0,0 +1,200 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization6 -vecz-passes=cfg-convert,cleanup-divergence -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-. +; / \ | +; c d | +; / \ / | +; e f --' +; \ | +; \ g +; \| +; h +; +; * where nodes b and c are uniform branches, and node f is a varying +; branch. +; * where nodes g and h are divergent. +; +; With partial linearization, it can be transformed in the following way: +; +; a +; | +; b <-. +; / \ | +; c d | +; / \ / | +; e f --' +; \ | +; \ | +; \| +; g +; | +; h +; +; __kernel void partial_linearization6(__global int *out, int n) { +; int id = get_global_id(0); +; int ret = 0; +; +; while (1) { +; if (n % 2 == 0) { +; if (n > 2) { +; goto e; +; } +; } else { +; ret += n + 1; +; } +; if (id == n) break; +; } +; +; ret += n * 2; +; ret /= n; +; goto early; +; +; e: +; ret += n * 4; +; ret -= n; +; +; early: +; out[id] = ret; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization6(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end10, %entry + %ret.0 = phi i32 [ 0, %entry ], [ %ret.1, %if.end10 ] + %rem1 = and i32 %n, 1 + %cmp = icmp eq i32 %rem1, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %while.body + %cmp2 = icmp sgt i32 %n, 2 + br i1 %cmp2, label %e, label %if.end6 + +if.else: ; preds = %while.body + %add = add nsw i32 %n, 1 + %add5 = add nsw i32 %add, %ret.0 + br label %if.end6 + +if.end6: ; preds = %if.else, %if.then + %ret.1 = phi i32 [ %add5, %if.else ], [ %ret.0, %if.then ] + %cmp7 = icmp eq i32 %conv, %n + br i1 %cmp7, label %while.end, label %if.end10 + +if.end10: ; preds = %if.end6 + br label %while.body + +while.end: ; preds = %if.end6 + %mul = shl nsw i32 %n, 1 + %add11 = add nsw i32 %ret.1, %mul + %0 = icmp eq i32 %add11, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %add11, %5 + br label %early + +e: ; preds = %if.then + %mul12 = mul i32 %n, 4 + %n.neg = sub i32 0, %n + %add13 = add i32 %mul12, %n.neg + %sub = add i32 %add13, %ret.0 + br label %early + +early: ; preds = %e, %while.end + %storemerge = phi i32 [ %div, %while.end ], [ %sub, %e ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %storemerge, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" 
"no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization6, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization6 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: %[[CMP2:.+]] = icmp +; CHECK: br i1 %[[CMP2]], label %[[E:.+]], label %[[IFEND6:.+]] + +; CHECK: [[IFELSE]]: +; CHECK: br label %[[IFEND6]] + +; CHECK: [[IFEND6]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label %[[WHILEEND:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: br label %[[WHILEENDELSE:.+]] + +; CHECK: [[WHILEENDELSE]]: +; CHECK: br i1 %{{.+}}, label %[[EELSE:.+]], label %[[ESPLIT:.+]] + +; CHECK: [[E]]: +; CHECK: br label %[[WHILEBODYPUREEXIT]] + +; CHECK: [[EELSE]]: +; CHECK: br label %[[EARLY:.+]] + +; CHECK: [[ESPLIT]]: +; CHECK: br label %[[EARLY]] + +; CHECK: [[EARLY]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll new file mode 100644 index 0000000000000..1c59a75ab15d8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization7.ll @@ -0,0 +1,228 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization7 -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; / \ +; b c +; / \ / \ +; d e f +; \ / \ / +; g h +; \ / +; i +; +; * where nodes a, c and e are uniform branches, and node b is a varying +; branch. +; * where nodes d, e, g and i are divergent. 
+; +; With partial linearization, it can be transformed in the following way: +; +; a +; / \ +; b c +; | /| +; d / | +; |/ | +; e f +; |\ | +; | \ | +; | \| +; g - h +; | +; i +; +; __kernel void partial_linearization7(__global int *out, int n) { +; int id = get_global_id(0); +; int i = 0; +; +; if (n > 10) { // a +; if (n + id > 10) { // b +; i = n * 10; // d +; goto g; +; } else { +; goto e; +; } +; } else { +; if (n < 5) { // c +; goto e; +; } else { +; for (int j = 0; j < n; j++) { i++; } +; goto h; +; } +; } +; +; e: +; if (n > 5) { +; goto g; +; } else { +; i = n * 3 / 5; +; goto h; +; } +; +; g: +; for (int j = 0; j < n; j++) { i++; } +; goto i; +; +; h: +; i = n + id / 3; +; +; i: +; out[id] = i; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization7(i32 addrspace(1)* %out, i32 noundef %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %n, 10 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + %add = add nsw i32 %conv, %n + %cmp2 = icmp sgt i32 %add, 10 + br i1 %cmp2, label %if.then4, label %e + +if.then4: ; preds = %if.then + %mul = mul nsw i32 %n, 10 + br label %g + +if.else5: ; preds = %entry + %cmp6 = icmp slt i32 %n, 5 + br i1 %cmp6, label %e, label %if.else9 + +if.else9: ; preds = %if.else5 + br label %for.cond + +for.cond: ; preds = %for.body, %if.else9 + %storemerge = phi i32 [ 0, %if.else9 ], [ %inc12, %for.body ] + %cmp10 = icmp slt i32 %storemerge, %n + br i1 %cmp10, label %for.body, label %h + +for.body: ; preds = %for.cond + %inc12 = add nsw i32 %storemerge, 1 + br label %for.cond + +e: ; preds = %if.else5, %if.then + %cmp13 = icmp sgt i32 %n, 5 + br i1 %cmp13, label %g, label %h + +g: ; preds = %e, %if.then4 + %i.1 = phi i32 [ %mul, %if.then4 ], [ 0, %e ] + br label %for.cond19 + +for.cond19: ; preds = %for.body22, %g + %i.2 = phi i32 [ %i.1, %g ], [ %inc23, %for.body22 ] + %storemerge1 = phi i32 [ 0, %g ], [ %inc25, %for.body22 ] + %cmp20 = icmp slt i32 %storemerge1, %n + br i1 %cmp20, label %for.body22, label %i29 + +for.body22: ; preds = %for.cond19 + %inc23 = add nsw i32 %i.2, 1 + %inc25 = add nsw i32 %storemerge1, 1 + br label %for.cond19 + +h: ; preds = %e, %for.cond + %div27 = sdiv i32 %conv, 3 + %add28 = add nsw i32 %div27, %n + br label %i29 + +i29: ; preds = %h, %for.cond19 + %i.3 = phi i32 [ %add28, %h ], [ %i.2, %for.cond19 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %i.3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" 
"no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization7, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization7 +; CHECK: %[[CMP:.+]] = icmp +; CHECK: br i1 %[[CMP]], label %[[IFTHEN:.+]], label %[[IFELSE5:.+]] + +; CHECK: [[IFTHEN]]: +; CHECK: br label %[[IFTHEN4:.+]] + +; CHECK: [[IFTHEN4]]: +; CHECK: br label %[[E:.+]] + +; CHECK: [[IFELSE5]]: +; CHECK: %[[CMP6:.+]] = icmp +; CHECK: br i1 %[[CMP6]], label %[[E]], label %[[FORCONDPREHEADER:.+]] + +; CHECK: [[FORCONDPREHEADER]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY:.+]], label %[[HLOOPEXIT:.+]] + +; CHECK: [[FORBODY]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[E]]: +; CHECK: %[[CMP13:.+]] = icmp +; CHECK: br i1 %[[CMP13]], label %[[G:.+]], label %[[H:.+]] + +; CHECK: [[G]]: +; CHECK: br label %[[FORCOND19:.+]] + +; CHECK: [[FORCOND19]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(false)}}, label %[[FORBODY22:.+]], label %[[I29LOOPEXIT:.+]] + +; CHECK: [[FORBODY22]]: +; CHECK: br label %[[FORCOND19]] + +; CHECK: [[HLOOPEXIT]]: +; CHECK: br label %[[H]] + +; CHECK: [[H]]: +; CHECK: br label %[[G]] + +; CHECK: [[I29LOOPEXIT]]: +; CHECK: br label %[[I29:.+]] + +; CHECK: [[I29]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll new file mode 100644 index 0000000000000..b5c22f6b5c588 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization8.ll @@ -0,0 +1,191 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization8 -vecz-passes=cfg-convert -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <-. +; / \ | +; e c | +; | / \| +; | f d +; |/ +; g +; +; * where nodes b and c varying branches. +; * where nodes e, f, d and g are divergent. +; +; With partial linearization we will have a CFG of the form: +; +; a +; | +; b <. 
+; | | +; c | +; | | +; d -' +; | +; f +; | +; e +; | +; g +; +; __kernel void partial_linearization8(__global int *out, int n) { +; int id = get_global_id(0); +; +; int x = id / n; +; int y = id % n; +; int i = 0; +; for (;;) { +; if (i + id > n) goto e; +; if (x + y > n) goto f; +; y++; +; x++; +; i++; +; } +; +; goto g; +; +; e: +; i *= 2 + n; +; goto g; +; +; f: +; i /= i + n; +; +; g: +; out[id] = x + y + i; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization8(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %0 = icmp eq i32 %conv, -2147483648 + %1 = icmp eq i32 %n, -1 + %2 = and i1 %1, %0 + %3 = icmp eq i32 %n, 0 + %4 = or i1 %3, %2 + %5 = select i1 %4, i32 1, i32 %n + %div = sdiv i32 %conv, %5 + %6 = icmp eq i32 %conv, -2147483648 + %7 = icmp eq i32 %n, -1 + %8 = and i1 %7, %6 + %9 = icmp eq i32 %n, 0 + %10 = or i1 %9, %8 + %11 = select i1 %10, i32 1, i32 %n + %rem = srem i32 %conv, %11 + br label %for.cond + +for.cond: ; preds = %if.end6, %entry + %x.0 = phi i32 [ %div, %entry ], [ %inc7, %if.end6 ] + %y.0 = phi i32 [ %rem, %entry ], [ %inc, %if.end6 ] + %storemerge = phi i32 [ 0, %entry ], [ %inc8, %if.end6 ] + %add = add nsw i32 %storemerge, %conv + %cmp = icmp sgt i32 %add, %n + br i1 %cmp, label %e, label %if.end + +if.end: ; preds = %for.cond + %add2 = add nsw i32 %y.0, %x.0 + %cmp3 = icmp sgt i32 %add2, %n + br i1 %cmp3, label %f, label %if.end6 + +if.end6: ; preds = %if.end + %inc = add nsw i32 %y.0, 1 + %inc7 = add nsw i32 %x.0, 1 + %inc8 = add nsw i32 %storemerge, 1 + br label %for.cond + +e: ; preds = %for.cond + %add9 = add nsw i32 %n, 2 + %mul = mul nsw i32 %storemerge, %add9 + br label %g + +f: ; preds = %if.end + %add10 = add nsw i32 %storemerge, %n + %12 = icmp eq i32 %add10, 0 + %13 = select i1 %12, i32 1, i32 %add10 + %div11 = sdiv i32 %storemerge, %13 + br label %g + +g: ; preds = %f, %e + %storemerge1 = phi i32 [ %div11, %f ], [ %mul, %e ] + %add12 = add i32 %y.0, %x.0 + %add13 = add i32 %add12, %storemerge1 + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %add13, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void 
(i32 addrspace(1)*, i32)* @partial_linearization8, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization8 +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br label %[[IFEND:.+]] + +; CHECK: [[IFEND]]: +; CHECK: br label %[[IFEND6:.+]] + +; CHECK: [[IFEND6]]: +; CHECK: br i1 %{{.+}}, label %[[FORCOND]], label %[[FORCONDPUREEXIT:.+]] + +; CHECK: [[FORCONDPUREEXIT]]: +; CHECK: br label %[[F:.+]] + +; CHECK: [[E:.+]]: +; CHECK: br label %[[G:.+]] + +; CHECK: [[F]]: +; CHECK: br label %[[FELSE:.+]] + +; CHECK: [[FELSE]]: +; CHECK: br label %[[E]] + +; CHECK: [[G]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll new file mode 100644 index 0000000000000..12ff83e3ac98d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization9.ll @@ -0,0 +1,148 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k partial_linearization9 -vecz-passes=cfg-convert,cleanup-divergence -S < %s | FileCheck %s + +; The CFG of the following kernel is: +; +; a +; | +; b <--. +; | | +; c <. | +; | | | +; d -' | +; | | +; e ---' +; | +; f +; +; * where node e is a varying branch. +; * where node f is divergent. +; +; With partial linearization we will have a CFG of the form: +; +; a +; | +; b <--. +; | | +; c <. 
| +; | | | +; d -' | +; | | +; e ---' +; | +; f +; +; __kernel void partial_linearization9(__global int *out, int n) { +; int id = get_global_id(0); +; int i = 0; +; +; while (1) { +; int j = 0; +; for (; ; i++) { +; if (j++ > n) break; +; } +; if (i++ + id > n) break; +; } +; +; out[id] = i; +; } + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @partial_linearization9(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %if.end7, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc3, %if.end7 ] + br label %for.cond + +for.cond: ; preds = %for.inc, %while.body + %i.1 = phi i32 [ %i.0, %while.body ], [ %inc3, %for.inc ] + %j.0 = phi i32 [ 0, %while.body ], [ %inc, %for.inc ] + %cmp = icmp sgt i32 %j.0, %n + %inc3 = add nsw i32 %i.1, 1 + br i1 %cmp, label %for.end, label %for.inc + +for.inc: ; preds = %for.cond + %inc = add nsw i32 %j.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %add = add nsw i32 %i.1, %conv + %cmp4 = icmp sgt i32 %add, %n + br i1 %cmp4, label %while.end, label %if.end7 + +if.end7: ; preds = %for.end + br label %while.body + +while.end: ; preds = %for.end + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %inc3, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind readonly } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32)* @partial_linearization9, !3, !4, !5, !6, !7, !8} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int"} +!7 = !{!"kernel_arg_type_qual", !"", !""} +!8 = !{!"kernel_arg_name", !"out", !"n"} + +; CHECK: spir_kernel void @__vecz_v4_partial_linearization9 +; CHECK: br label %[[WHILEBODY:.+]] + +; CHECK: [[WHILEBODY]]: +; CHECK: br label %[[FORCOND:.+]] + +; CHECK: [[FORCOND]]: +; CHECK: br i1 {{(%[0-9A-Za-z\.]+)|(true)}}, label %[[FOREND:.+]], label %[[FORINC:.+]] + +; CHECK: [[FORINC]]: +; CHECK: br label %[[FORCOND]] + +; CHECK: [[FOREND]]: +; CHECK: br i1 %{{.+}}, label %[[WHILEBODY]], label %[[WHILEBODYPUREEXIT:.+]] + +; CHECK: [[WHILEBODYPUREEXIT]]: +; CHECK: br label 
%[[WHILEEND:.+]] + +; CHECK: [[WHILEEND]]: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll new file mode 100644 index 0000000000000..2f8b137532493 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/partial_linearization_exit_masks.ll @@ -0,0 +1,65 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test < %s + +; This test ensures that VECZ does not crash during control flow conversion due +; to a missing exit mask. As such, we need only verify that the return code from +; veczc is 0, and FileCheck is not required. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(1)* %out, i32 %n) { +entry: + %call = tail call i32 @__mux_get_global_id(i32 0) + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %for.body.preheader, label %if.end.thread + +for.body.preheader: + %cmp2 = icmp sgt i32 %n, 1 + %0 = and i32 %call, 1 + %cmp3 = icmp eq i32 %0, 0 + br i1 %cmp2, label %if.end2, label %if.else + +if.end.thread: + %cmp4 = icmp eq i32 %call, 0 + br i1 %cmp4, label %if.end, label %for.cond.preheader + +if.else: + br i1 %cmp3, label %if.end, label %for.body + +for.cond.preheader: + %cmp5 = icmp sgt i32 %n, 1 + br i1 %cmp5, label %for.body, label %if.end + +for.body: + br i1 0, label %if.end, label %for.body + +if.end: + %div = sdiv i32 %call, 2 + br label %if.end2 + +if.end2: + %ret = phi i32 [ 0, %for.body.preheader ], [ %div, %if.end ] + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 0 + store i32 %ret, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +declare i32 @__mux_get_global_id(i32) + +declare spir_func i32 @_Z3maxii(i32, i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll new file mode 100644 index 0000000000000..08f72b45bf6de --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline.ll @@ -0,0 +1,48 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k foo -w 2 -debug-vecz-pipeline -S < %s 2>&1 | FileCheck %s +; RUN: veczc -k foo -w 2 -vecz-passes scalarize -debug-vecz-pipeline -S < %s 2>&1 | FileCheck %s --check-prefix=PASSES1 +; RUN: veczc -k foo -w 2 -vecz-passes scalarize,packetizer -debug-vecz-pipeline -S < %s 2>&1 | FileCheck %s --check-prefix=PASSES2 + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Don't check specific passes, but assume that *some* analyses and passes are run. +; CHECK: Running analysis: {{.*}}> on __vecz_v2_foo +; CHECK: Running pass: {{.*}} on __vecz_v2_foo + +; PASSES1: Running pass: RequireAnalysisPass<{{(class )?}}compiler::utils::DeviceInfoAnalysis, +; PASSES1-NOT: Running pass: +; PASSES1: Running pass: Function scalarization on __vecz_v2_foo +; PASSES1-NOT: Running pass: +; PASSES1-NOT: Running pass: + +; PASSES2: Running pass: RequireAnalysisPass<{{(class )?}}compiler::utils::DeviceInfoAnalysis, +; PASSES2-NOT: Running pass: +; PASSES2: Running pass: Function scalarization on __vecz_v2_foo +; PASSES2: Running pass: Function packetization on __vecz_v2_foo +; PASSES2-NOT: Running pass: +; PASSES2-NOT: Running pass: + +define spir_kernel void @foo(i32 addrspace(1)* %out) { + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idx + store i32 0, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll new file mode 100644 index 0000000000000..cad700234785a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/pass_pipeline_printafter.ll @@ -0,0 +1,44 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k foo -w 2 -vecz-passes scalarize,mask-memops,packetizer -print-after mask-memops -S < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +; CHECK: IR Dump After Simplify masked memory operations{{( on __vecz_v2_foo)?}} +; CHECK-NEXT: define spir_kernel void @__vecz_v2_foo(ptr addrspace(1) %out) #0 { +; CHECK-NEXT: %idx = call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: %arrayidx = getelementptr i32, ptr addrspace(1) %out, i64 %idx +; CHECK-NEXT: store i32 0, ptr addrspace(1) %arrayidx, align 4 +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +; CHECK: define spir_kernel void @__vecz_v2_foo(ptr addrspace(1) %out) {{.*}} { +; CHECK-NEXT: %idx = call i64 @__mux_get_global_id(i32 0) +; CHECK-NEXT: %arrayidx = getelementptr i32, ptr addrspace(1) %out, i64 %idx +; CHECK-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(1) %arrayidx, align 4 +; CHECK-NEXT: ret void +; CHECK-NEXT: } + +define spir_kernel void @foo(i32 addrspace(1)* %out) { + %idx = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idx + store i32 0, i32 addrspace(1)* %arrayidx, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll new file mode 100644 index 0000000000000..4b289100a3ffb --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_interleaved.ll @@ -0,0 +1,89 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k codegen_2 -vecz-simd-width 16 -vecz-choices=TargetIndependentPacketization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @codegen_2(i32 addrspace(1)* nocapture readonly %in, i32 addrspace(1)* nocapture %out, i32 %size, i32 %reps) local_unnamed_addr { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = sext i32 %reps to i64 + %mul = mul i64 %call, %conv + %add = add i64 %call, 1 + %mul2 = mul i64 %add, %conv + %cmp19 = icmp ult i64 %mul, %mul2 + br i1 %cmp19, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %conv4 = sext i32 %size to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.inc, %entry + %sum.0.lcssa = phi i32 [ 0, %entry ], [ %sum.1, %for.inc ] + %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %sum.0.lcssa, i32 addrspace(1)* %arrayidx8, align 4, !tbaa !9 + ret void + +for.body: ; preds = %for.inc, %for.body.lr.ph + %i.021 = phi i64 [ %mul, %for.body.lr.ph ], [ %inc, %for.inc ] + %sum.020 = phi i32 [ 0, %for.body.lr.ph ], [ %sum.1, %for.inc ] + %cmp5 = icmp ult i64 %i.021, %conv4 + br i1 %cmp5, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %i.021 + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4, !tbaa !9 + %add7 = add nsw i32 %0, %sum.020 + br label %for.inc + +for.inc: ; preds = %if.then, %for.body + %sum.1 = phi i32 [ %add7, %if.then ], [ %sum.020, %for.body ] + %inc = add nuw i64 %i.021, 1 + %cmp = icmp ult i64 %inc, %mul2 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +declare i64 @__mux_get_global_id(i32) local_unnamed_addr + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!1} +!opencl.kernels = !{!2} +!host.build_options = !{!8} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 2} +!2 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32, i32)* @codegen_2, !3, !4, !5, !6, !7} +!3 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 0, i32 0} +!4 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"} +!5 = !{!"kernel_arg_type", !"int*", !"int*", !"int", !"int"} +!6 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int", !"int"} +!7 = !{!"kernel_arg_type_qual", !"const", !"", !"", !""} +!8 = !{!""} +!9 = !{!10, !10, i64 0} +!10 = !{!"int", !11, i64 0} +!11 = !{!"omnipotent char", !12, i64 0} +!12 = !{!"Simple C/C++ TBAA"} + + +; It checks that the PHI node did not prevent the interleave factor from being determined +; CHECK: define spir_kernel void @__vecz_v16_codegen_2 +; CHECK-NOT: call <16 x i32> @__vecz_b_masked_gather_load4_4_Dv16_jDv16_u3ptrU3AS1Dv16_b +; CHECK: call <16 x i32> @__vecz_b_masked_interleaved_load4_V_Dv16_ju3ptrU3AS1Dv16_b diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll new file mode 100644 index 0000000000000..aabbd65bf1059 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_node_debug_info.ll @@ -0,0 +1,138 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance 
with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Check that debug info intrinsics are correctly placed after +; phi nodes. + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +; CHECK: define spir_kernel void @__vecz_v4_loop_phi( +define spir_kernel void @loop_phi(i32 addrspace(3)* %a, i32 addrspace(3)* %b) #0 !dbg !4 { +entry: + %a.addr = alloca i32 addrspace(3)*, align 8 + %b.addr = alloca i32 addrspace(3)*, align 8 + %tid = alloca i64, align 8 + %i = alloca i32, align 4 + store i32 addrspace(3)* %a, i32 addrspace(3)** %a.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(3)** %a.addr, metadata !12, metadata !30), !dbg !31 + store i32 addrspace(3)* %b, i32 addrspace(3)** %b.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(3)** %b.addr, metadata !13, metadata !30), !dbg !31 + call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !30), !dbg !32 + %call = call i64 @__mux_get_local_id(i32 0) #3, !dbg !32 + store i64 %call, i64* %tid, align 8, !dbg !32 + call void @llvm.dbg.declare(metadata i32* %i, metadata !19, metadata !30), !dbg !33 + %0 = load i64, i64* %tid, align 8, !dbg !33 + %conv = trunc i64 %0 to i32, !dbg !33 + store i32 %conv, i32* %i, align 4, !dbg !33 + br label %for.cond, !dbg !33 + + +; CHECK: for.cond: +; CHECK: %[[PHI1:.+]] = phi {{i[0-9]+}} [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ] +; CHECK: #dbg_value(i64 %[[PHI1]], !{{[0-9]+}}, +; CHECK-SAME: !DIExpression({{.*}}), +; CHECK-SAME: !{{[0-9]+}} +; Check we haven't inserted a llvm.dbg.value intrinsic before the last of the PHIs. 
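+; (Illustrative note, not part of the original test: LLVM requires all PHI
+; nodes to be grouped at the top of their basic block, so a debug record for
+; a vectorized PHI must be emitted after the final PHI, roughly
+;     %a = phi i32 [ 0, %entry ], [ %x, %latch ]
+;     %b = phi i64 [ 1, %entry ], [ %y, %latch ]
+;       #dbg_value(i64 %b, ...)
+; and never between %a and %b; the names above are hypothetical.)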
+; CHECK-NOT: phi +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4, !dbg !34 + %cmp = icmp slt i32 %1, 128, !dbg !34 + br i1 %cmp, label %for.body, label %for.end, !dbg !33 + +for.body: ; preds = %for.cond + %2 = load i32, i32* %i, align 4, !dbg !36 + %idxprom = sext i32 %2 to i64, !dbg !36 + %3 = load i32 addrspace(3)*, i32 addrspace(3)** %b.addr, align 8, !dbg !36 + %arrayidx = getelementptr inbounds i32, i32 addrspace(3)* %3, i64 %idxprom, !dbg !36 + %4 = load i32, i32 addrspace(3)* %arrayidx, align 4, !dbg !36 + %5 = load i32, i32* %i, align 4, !dbg !36 + %idxprom2 = sext i32 %5 to i64, !dbg !36 + %6 = load i32 addrspace(3)*, i32 addrspace(3)** %a.addr, align 8, !dbg !36 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(3)* %6, i64 %idxprom2, !dbg !36 + store i32 %4, i32 addrspace(3)* %arrayidx3, align 4, !dbg !36 + br label %for.inc, !dbg !38 + +for.inc: ; preds = %for.body + %7 = load i32, i32* %i, align 4, !dbg !34 + %add = add nsw i32 %7, 32, !dbg !34 + store i32 %add, i32* %i, align 4, !dbg !34 + br label %for.cond, !dbg !34 + +for.end: ; preds = %for.cond +; CHECK: ret void + ret void, !dbg !39 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +declare i64 @__mux_get_local_id(i32) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nobuiltin } + +!llvm.dbg.cu = !{!0} +!opencl.kernels = !{!21} +!llvm.module.flags = !{!28} +!llvm.ident = !{!29} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.1 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2) +!1 = !DIFile(filename: "kernel.opencl", directory: "/home/Aorta/build") +!2 = !{} +!3 = !{!4} +!4 = distinct !DISubprogram(name: "loop_phi", scope: !1, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11) +!5 = !DISubroutineType(types: !6) +!6 = !{null, !7, !9} +!7 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64, align: 64) +!8 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64, align: 64) +!10 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !8) +!11 = !{!12, !13, !14, !19} +!12 = !DILocalVariable(name: "a", arg: 1, scope: !4, file: !1, line: 2, type: !7) +!13 = !DILocalVariable(name: "b", arg: 2, scope: !4, file: !1, line: 2, type: !9) +!14 = !DILocalVariable(name: "tid", scope: !4, file: !1, line: 3, type: !15) +!15 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !16, line: 33, baseType: !17) +!16 = !DIFile(filename: "/home/Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "/home/Aorta/build") +!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !16, line: 31, baseType: !18) +!18 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) +!19 = !DILocalVariable(name: "i", 
scope: !20, file: !1, line: 4, type: !8) +!20 = distinct !DILexicalBlock(scope: !4, file: !1, line: 4) +!21 = !{void (i32 addrspace(3)*, i32 addrspace(3)*)* @loop_phi, !22, !23, !24, !25, !26, !27} +!22 = !{!"kernel_arg_addr_space", i32 3, i32 3} +!23 = !{!"kernel_arg_access_qual", !"none", !"none"} +!24 = !{!"kernel_arg_type", !"int*", !"int*"} +!25 = !{!"kernel_arg_base_type", !"int*", !"int*"} +!26 = !{!"kernel_arg_type_qual", !"", !"const"} +!27 = !{!"reqd_work_group_size", i32 32, i32 1, i32 1} +!28 = !{i32 2, !"Debug Info Version", i32 3} +!29 = !{!"clang version 3.8.1 "} +!30 = !DIExpression() +!31 = !DILocation(line: 2, scope: !4) +!32 = !DILocation(line: 3, scope: !4) +!33 = !DILocation(line: 4, scope: !20) +!34 = !DILocation(line: 4, scope: !35) +!35 = distinct !DILexicalBlock(scope: !20, file: !1, line: 4) +!36 = !DILocation(line: 5, scope: !37) +!37 = distinct !DILexicalBlock(scope: !35, file: !1, line: 4) +!38 = !DILocation(line: 6, scope: !37) +!39 = !DILocation(line: 7, scope: !4) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll new file mode 100644 index 0000000000000..0885f8a058592 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather.ll @@ -0,0 +1,65 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k phi_memory -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @phi_memory(i32 addrspace(1)* %input, i32 addrspace(1)* %output, i32 %size) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %idx.ext = sext i32 %conv to i64 + %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %idx.ext + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %output.addr.0 = phi i32 addrspace(1)* [ %add.ptr, %entry ], [ %add.ptr2, %for.body ] + %storemerge = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %storemerge, %size + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %add = add nsw i32 %storemerge, %conv + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %idxprom + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + store i32 %0, i32 addrspace(1)* %output.addr.0, align 4 + %add.ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %output.addr.0, i64 1 + %inc = add nsw i32 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind } + +; It checks that the contiguity of the load and store is identified through the +; loop-incrementing pointer PHI node +; +; CHECK: void @__vecz_v4_phi_memory +; CHECK: %[[LD:.+]] = load <4 x i32> +; CHECK: store <4 x i32> %[[LD]] +; CHECK-NOT: scatter_store +; CHECK-NOT: gather_load diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll new file mode 100644 index 0000000000000..e13dc4ed88a66 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/phi_scatter_gather_2.ll @@ -0,0 +1,60 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k phi_memory -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @phi_memory(i32 addrspace(1)* %input, i32 addrspace(1)* %output, i64 %size) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %add.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %call + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %output.addr.0 = phi i32 addrspace(1)* [ %add.ptr, %entry ], [ %add.ptr2, %for.body ] + %storemerge = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i64 %storemerge, %size + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %add = add nsw i64 %storemerge, %call + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %add + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + store i32 %0, i32 addrspace(1)* %output.addr.0, align 4 + %add.ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %output.addr.0, i64 %call + %inc = add nsw i64 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind } + +; It checks that the NON-contiguity of the store is identified through the +; loop-incrementing pointer PHI node +; +; CHECK: void @__vecz_v4_phi_memory +; CHECK: %[[LD:.+]] = load <4 x i32> +; CHECK: call void @__vecz_b_scatter_store4_Dv4_jDv4_u3ptrU3AS1(<4 x i32> %[[LD]] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/poison_ub.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/poison_ub.ll new file mode 100644 index 0000000000000..027a688a8614a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/poison_ub.ll @@ -0,0 +1,43 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -w 4 -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i32:32-f80:128-n8:16:32:64-S128" +target triple = "spir-unknown-unknown" + +; Function Attrs: convergent nounwind readonly +declare i32 @__mux_get_local_id(i32) #2 + +; Function Attrs: convergent nounwind +define spir_kernel void @test() #0 { +entry: + %call8 = call i32 @__mux_get_local_id(i32 0) #3 + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* poison, i32 %call8 + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %conv9 = uitofp i8 %0 to float + %phitmp = fptoui float %conv9 to i8 + %arrayidx16 = getelementptr inbounds i8, i8 addrspace(1)* poison, i32 %call8 + store i8 %phitmp, i8 addrspace(1)* %arrayidx16, align 1 + ret void +} + +; The "poison"s in the above IR should "optimize" to a trap call and an unreachable +; terminator instruction. +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: unreachable diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll new file mode 100644 index 0000000000000..7d52fda58e6f3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/predicate_with_switch.ll @@ -0,0 +1,61 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k predicate_with_switch -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_local_id(i32) + +declare i64 @__mux_get_global_id(i32) + +@predicate_with_switch.tmpIn = internal addrspace(3) global [16 x i32] poison, align 4 + +define spir_kernel void @predicate_with_switch(i32 addrspace(1)* %A, i32 addrspace(1)* %B) #0 { +entry: + %call = call i64 @__mux_get_local_id(i32 0) #2 + %call1 = call i64 @__mux_get_global_id(i32 0) #2 + switch i64 %call, label %if.end [ + i64 0, label %return + i64 200, label %return + ] + +if.end: + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %A, i64 %call1 + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @predicate_with_switch.tmpIn, i64 0, i64 %call + store i32 %0, i32 addrspace(3)* %arrayidx3, align 4 + %sub = add i64 %call, -1 + %arrayidx4 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @predicate_with_switch.tmpIn, i64 0, i64 %sub + %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %B, i64 %call1 + store i32 %1, i32 addrspace(1)* %arrayidx5, align 4 + br label %return + +return: + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_predicate_with_switch + +; We should use masked stores +; CHECK: vecz_b_masked_store4 +; CHECK: vecz_b_masked_store4 + +; We should *not* have unconditional stores +; CHECK-NOT: store <4 x i32> +; CHECK-NOT: store <4 x i32> diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll new file mode 100644 index 0000000000000..34c892ca5dea6 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/preserve-fast-math.ll @@ -0,0 +1,35 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -S -vecz-passes=packetizer < %s | FileCheck %s
+
+; CHECK: %{{.*}} = fcmp nnan ninf olt <4 x float> %{{.*}}, %{{.*}}
+
+define spir_kernel void @fast_nan(float addrspace(1)* %src1, float addrspace(1)* %src2, i16 addrspace(1)* %dst, i32 %width) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %src1, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %src2, i64 %call
+  %1 = load float, float addrspace(1)* %arrayidx2, align 4
+  %cmp = fcmp nnan ninf olt float %0, %1
+  %conv4 = zext i1 %cmp to i16
+  %arrayidx6 = getelementptr inbounds i16, i16 addrspace(1)* %dst, i64 %call
+  store i16 %conv4, i16 addrspace(1)* %arrayidx6, align 2
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
new file mode 100644
index 0000000000000..695e6d0a39696
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/printf_float.ll
@@ -0,0 +1,88 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+;     https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test_float -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+@.str = private unnamed_addr addrspace(2) constant [8 x i8] c"blah %d\00", align 1
+@.strf = private unnamed_addr addrspace(2) constant [7 x i8] c"%#16A\0A\00", align 1
+
+; Function Attrs: nounwind
+define spir_kernel void @printf_kernel(i32 addrspace(1)* %in, i32 addrspace(1)* %stridesX, i32 addrspace(1)* %dst, i32 %width, i32 %height) #0 {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0) #3
+  %cmp = icmp eq i32 %width, 13
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([8 x i8], [8 x i8] addrspace(2)* @.str, i64 0, i64 0), i32 %0) #3
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+define spir_kernel void @test_float(float* %in) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds float, float* %in, i64 %call
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, %0
+  %conv = fpext float %mul to double
+  %call8 = call spir_func i32 (i8 addrspace(2)*, ...) 
@printf(i8 addrspace(2)* getelementptr inbounds ([7 x i8], [7 x i8] addrspace(2)* @.strf, i64 0, i64 0), double %conv) + ret void +} + + + +declare i64 @__mux_get_global_id(i32) #1 + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*, i32, i32)* @printf_kernel, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1, i32 0, i32 0} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"int*", !"int*", !"int*", !"int", !"int"} +!4 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*", !"int", !"int"} +!5 = !{!"kernel_arg_type_qual", !"", !"", !"", !"", !""} +!6 = !{!"clang version 3.8.0 "} + +; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [7 x i8] c"%#16A\0A\00", align 1 + +; CHECK: define spir_kernel void @__vecz_v4_test_float +; CHECK: %[[CONV2:.+]] = fpext <4 x float> %{{.+}} to <4 x double> +; CHECK: %[[V2:[0-9]+]] = extractelement <4 x double> %[[CONV2]], {{(i32|i64)}} 0 +; CHECK: %[[V3:[0-9]+]] = extractelement <4 x double> %[[CONV2]], {{(i32|i64)}} 1 +; CHECK: %[[V4:[0-9]+]] = extractelement <4 x double> %[[CONV2]], {{(i32|i64)}} 2 +; CHECK: %[[V5:[0-9]+]] = extractelement <4 x double> %[[CONV2]], {{(i32|i64)}} 3 +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V2]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V3]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V4]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V5]]) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll new file mode 100644 index 0000000000000..533f710b34a01 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/regression_by_all.ll @@ -0,0 +1,126 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+
+; RUN: veczc -k regression_by_all -vecz-passes=vecz-loop-rotate,cfg-convert -S < %s | FileCheck %s
+
+; The purpose of this test is to make sure that block `c` is not treated as
+; by_all simply because one of its predecessors is by_all. Because `c` also
+; has a divergence-causing block (b) as one of its predecessors, it cannot be
+; considered by_all.
+
+; The CFG of the following kernel is:
+;
+;   a
+;   |\
+;   | b
+;   |/ \
+;   c   d
+;    \ /
+;     e
+;
+; * where node a is a uniform branch, and node b is a varying branch.
+; * where nodes c, d and e are divergent.
+;
+; With partial linearization we will have a CFG of the form:
+;
+;    a
+;   /|
+;  | b
+;  | |
+;  | d
+;   \|
+;    c
+;    |
+;    e
+;
+; __kernel void regression_by_all(__global int *out, int n) {
+;   int id = get_global_id(0);
+;   int ret = 0;
+;
+;   if (n % 2 == 0) {
+;     goto d;
+;   } else {
+;     ret = 1;
+;     if (id % 2 != 0) {
+;       goto d;
+;     } else {
+;       for (int i = 0; i < n; ++i) { ret++; }
+;       goto e;
+;     }
+;   }
+;
+; d:
+;   ret += id;
+;   ret *= n;
+;
+; e:
+;   out[id] = ret;
+; }

; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @regression_by_all(i32 addrspace(1)* %out, i32 %n) {
+entry:
+  %call = call i64 @__mux_get_global_id(i32 0)
+  %conv = trunc i64 %call to i32
+  %rem1 = and i32 %n, 1
+  %cmp = icmp eq i32 %rem1, 0
+  br i1 %cmp, label %d, label %if.else
+
+if.else:                                          ; preds = %entry
+  %rem22 = and i32 %conv, 1
+  %cmp3 = icmp eq i32 %rem22, 0
+  br i1 %cmp3, label %for.cond, label %d
+
+for.cond:                                         ; preds = %if.else, %for.body
+  %ret.0 = phi i32 [ %inc, %for.body ], [ 1, %if.else ]
+  %storemerge = phi i32 [ %inc9, %for.body ], [ 0, %if.else ]
+  %cmp7 = icmp slt i32 %storemerge, %n
+  br i1 %cmp7, label %for.body, label %e
+
+for.body:                                         ; preds = %for.cond
+  %inc = add nuw nsw i32 %ret.0, 1
+  %inc9 = add nuw nsw i32 %storemerge, 1
+  br label %for.cond
+
+d:                                                ; preds = %if.else, %entry
+  %ret.1 = phi i32 [ 0, %entry ], [ 1, %if.else ]
+  %add = add nsw i32 %ret.1, %conv
+  %mul = mul nsw i32 %add, %n
+  br label %e
+
+e:                                                ; preds = %for.cond, %d
+  %ret.2 = phi i32 [ %mul, %d ], [ %ret.0, %for.cond ]
+  %idxprom = sext i32 %conv to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %ret.2, i32 addrspace(1)* %arrayidx, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK: spir_kernel void @__vecz_v4_regression_by_all
+; CHECK: br i1 %[[CMP:.+]], label %[[D:.+]], label %[[IFELSE:.+]]
+
+; CHECK: [[D]]:
+; CHECK-NOT: %d.entry_mask = and i1 true, true
+; CHECK: %d.entry_mask = phi i1
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
new file mode 100644
index 0000000000000..cc64e2641a2b0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr.ll
@@ -0,0 +1,52 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-passes=remove-int-ptr -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_intptr_cast_i8( +; CHECK: %shl = shl i64 %call, 2 +; CHECK: %remove_intptr = getelementptr i8, ptr addrspace(1) %in, i64 %shl +; CHECK: %remove_intptr1 = ptrtoint ptr addrspace(1) %remove_intptr to i64 +; CHECK: store i64 %remove_intptr1, ptr addrspace(1) %out, align 8 +define spir_kernel void @intptr_cast_i8(i8 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = ptrtoint i8 addrspace(1)* %in to i64 + %shl = shl i64 %call, 2 + %add = add i64 %shl, %0 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_intptr_cast_i16( +; CHECK: %shl = shl i64 %call, 2 +; CHECK: %remove_intptr = getelementptr i8, ptr addrspace(1) %in, i64 %shl +; CHECK: %remove_intptr1 = ptrtoint ptr addrspace(1) %remove_intptr to i64 +; CHECK: store i64 %remove_intptr1, ptr addrspace(1) %out, align 8 +define spir_kernel void @intptr_cast_i16(i16 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = ptrtoint i16 addrspace(1)* %in to i64 + %shl = shl i64 %call, 2 + %add = add i64 %shl, %0 + store i64 %add, i64 addrspace(1)* %out, align 8 + ret void +} + +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll new file mode 100644 index 0000000000000..8dd706f51977b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_2.ll @@ -0,0 +1,42 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @remove_intptr(i8 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = ptrtoint i8 addrspace(1)* %in to i64 + %shl = shl nuw nsw i64 %call, 2 + %add = add i64 %shl, %0 + %1 = inttoptr i64 %add to i32 addrspace(1)* + %2 = load i32, i32 addrspace(1)* %1, align 4 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %2, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @__vecz_v4_remove_intptr +; CHECK-NOT: ptrtoint +; CHECK-NOT: inttoptr +; CHECK: %remove_intptr = getelementptr i8, ptr addrspace(1) %in +; CHECK: %[[LOAD:.+]] = load <4 x i32>, ptr addrspace(1) %remove_intptr, align 4 +; CHECK: store <4 x i32> %[[LOAD]] diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll new file mode 100644 index 0000000000000..64234b9019781 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/remove_intptr_phi.ll @@ -0,0 +1,52 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @remove_intptr(i8 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = ptrtoint i8 addrspace(1)* %in to i64 + %shl = shl nuw nsw i64 %call, 2 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %shl + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %for.body, %entry + %x.07 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %intin.06 = phi i64 [ %0, %entry ], [ %add, %for.body ] + %add = add i64 %intin.06, 4 + %1 = inttoptr i64 %add to i32 addrspace(1)* + %2 = load i32, i32 addrspace(1)* %1, align 4 + store i32 %2, i32 addrspace(1)* %arrayidx, align 4 + %inc = add nuw nsw i32 %x.07, 1 + %exitcond.not = icmp eq i32 %inc, 4 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @__vecz_v4_remove_intptr +; CHECK-NOT: ptrtoint +; CHECK-NOT: inttoptr +; CHECK: %[[RPHI:.+]] = phi ptr addrspace(1) [ %in, %entry ], [ %[[RGEP:.+]], %for.body ] +; CHECK: %[[RGEP]] = getelementptr i8, ptr addrspace(1) %[[RPHI]], i{{32|64}} 4 +; CHECK: load i32, ptr addrspace(1) %[[RGEP]], align 4 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/reqd-sg-size-auto.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/reqd-sg-size-auto.ll new file mode 100644 index 0000000000000..7f4a881552699 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/reqd-sg-size-auto.ll @@ -0,0 +1,55 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Let vecz pick the right vectorization factor for this kernel +; RUN: veczc --vecz-auto -k bar_sg8 -k foo_sg13 -S < %s | FileCheck %s +; RUN: veczc --vecz-auto -k bar_sg8:4 -k foo_sg13:8 -S < %s | FileCheck %s + +; Check we auto-vectorize to 8, despite any other options telling us a +; different vectorization factor. +; CHECK: define void @__vecz_v8_bar_sg8 +define void @bar_sg8(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 !intel_reqd_sub_group_size !0 { + %id = call i64 @__mux_get_global_id(i32 0) + %in.addr = getelementptr i32, ptr addrspace(1) %in, i64 %id + %x = load i32, ptr addrspace(1) %in.addr +; CHECK: = add <8 x i32> + %y = add i32 %x, 1 + %out.addr = getelementptr i32, ptr addrspace(1) %out, i64 %id + store i32 %y, ptr addrspace(1) %out.addr + ret void +} + +; Check we auto-vectorize to 13, despite any other options telling us a +; different vectorization factor. This is a silly number, but if we're told +; to do it we must obey. 
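+; The required factor is taken from the !intel_reqd_sub_group_size metadata +; attached to each kernel; for this one the relevant pieces are: +; define void @foo_sg13(...) !intel_reqd_sub_group_size !1 { ... } +; !1 = !{i32 13}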
+; CHECK: define void @__vecz_v13_foo_sg13 +define void @foo_sg13(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 !intel_reqd_sub_group_size !1 { + %id = call i64 @__mux_get_global_id(i32 0) + %in.addr = getelementptr i32, ptr addrspace(1) %in, i64 %id + %x = load i32, ptr addrspace(1) %in.addr +; CHECK: = add <13 x i32> + %y = add i32 %x, 1 + %out.addr = getelementptr i32, ptr addrspace(1) %out, i64 %id + store i32 %y, ptr addrspace(1) %out.addr + ret void +} + +declare i64 @__mux_get_global_id(i32) + +attributes #0 = { "mux-kernel"="entry-point" } + +!0 = !{i32 8} +!1 = !{i32 13} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll new file mode 100644 index 0000000000000..dcf78d89930d2 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/roscc_simplify.ll @@ -0,0 +1,53 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s -w 16 | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out, i64 addrspace(1)* %N) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %0 = load i64, i64 addrspace(1)* %N, align 8 + %cmp = icmp ult i64 %call, %0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %call + %1 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in2, i64 %call + %2 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %add = add nsw i32 %2, %1 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %add, i32 addrspace(1)* %arrayidx2, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @__vecz_v16_add +; CHECK: entry: +; CHECK: br i1 %{{.+}}, label %[[END:.+]], label %[[THEN:.+]] +; CHECK-EMPTY: +; CHECK-NEXT: [[THEN]]: +; CHECK: br label %[[END]] +; CHECK-EMPTY: +; CHECK-NEXT: [[END]]: +; CHECK-NEXT: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll new file mode 100644 index 0000000000000..1c9a90a942684 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_load_store_in_varying_branch.ll @@ -0,0 +1,51 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with 
the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare spir_func i32 @__mux_get_local_id(i32); +declare spir_func i32 @__mux_get_global_id(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %lid = call i32 @__mux_get_local_id(i32 0) + %cmp = icmp eq i32 %lid, 0 + br i1 %cmp, label %if, label %merge + +if: + %single_load = load i32, i32 addrspace(1)* %in + %single_add = add i32 %single_load, 42 + store i32 %single_add, i32 addrspace(1)* %in + br label %merge + +merge: + %multi_load = load i32, i32 addrspace(1)* %in + %multi_add = add i32 %multi_load, 42 + %gid = call i32 @__mux_get_global_id(i32 0) + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid + store i32 %multi_add, i32 addrspace(1)* %slot + + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %[[BITCAST:[0-9]+]] = bitcast <4 x i1> %cmp3 to i4 +; CHECK: %[[MASK:.+]] = icmp ne i4 %[[BITCAST]], 0 +; CHECK: %single_load{{[0-9]*}} = call i32 @__vecz_b_masked_load4_ju3ptrU3AS1b(ptr addrspace(1) %in, i1 %[[MASK]]) +; CHECK: %multi_load = load i32, ptr addrspace(1) %in diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll new file mode 100644 index 0000000000000..fb2b8e8076f5f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat.ll @@ -0,0 +1,36 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_global_id(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %load = load i32, i32 addrspace(1)* %in + %gid = call i32 @__mux_get_global_id(i32 0) + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid + store i32 %load, i32 addrspace(1)* %slot + + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: entry: +; CHECK: %load = load i32, ptr addrspace(1) %in diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll new file mode 100644 index 0000000000000..c563b79b6917e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_load_store_in_varying_branch.ll @@ -0,0 +1,47 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_local_id(i32); +declare i32 @__mux_get_global_id(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %lid = call i32 @__mux_get_local_id(i32 0) + %cmp = icmp eq i32 %lid, 0 + br i1 %cmp, label %if, label %merge + +if: + %secretly_scalar_load = load i32, i32 addrspace(1)* %in + %add = add i32 %secretly_scalar_load, 42 + store i32 %add, i32 addrspace(1)* %in + br label %merge + +merge: + %load = load i32, i32 addrspace(1)* %in + %gid = call i32 @__mux_get_global_id(i32 0) + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid + store i32 %load, i32 addrspace(1)* %slot + + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %load = load i32, ptr addrspace(1) %in diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll new file mode 100644 index 0000000000000..62ea24d8e2c5e --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_after_varying_branch.ll @@ -0,0 +1,44 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_local_id(i32); +declare i32 @__mux_get_global_id(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %lid = call i32 @__mux_get_local_id(i32 0) + %cmp = icmp eq i32 %lid, 0 + br i1 %cmp, label %if, label %merge + +if: + br label %merge + +merge: + %load = load i32, i32 addrspace(1)* %in + %gid = call i32 @__mux_get_global_id(i32 0) + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid + store i32 %load, i32 addrspace(1)* %slot + + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %load = load i32, ptr addrspace(1) %in diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll new file mode 100644 index 0000000000000..e7b76a778e784 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_splat_in_varying_branch.ll @@ -0,0 +1,55 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_local_id(i32); +declare i32 @__mux_get_global_id(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %lid = call i32 @__mux_get_local_id(i32 0) + %and = and i32 %lid, 1 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if, label %merge + +if: + %lid1 = call i32 @__mux_get_local_id(i32 1) + %cmp1 = icmp eq i32 %lid1, 0 + br i1 %cmp1, label %deeper_if, label %deeper_merge + +deeper_if: + br label %deeper_merge + +deeper_merge: + %load = load i32, i32 addrspace(1)* %in + %gid = call i32 @__mux_get_global_id(i32 0) + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %gid + store i32 %load, i32 addrspace(1)* %slot + br label %merge + +merge: + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %[[LOAD:.+]] = load i32, ptr addrspace(1) %in +; CHECK: %[[SPLAT_IN:.+]] = insertelement <4 x i32> poison, i32 %[[LOAD]], {{(i32|i64)}} 0 +; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLAT_IN]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: call void @__vecz_b_masked_store4_Dv4_ju3ptrU3AS1Dv4_b(<4 x i32> %[[SPLAT]], ptr addrspace(1){{( nonnull)? %.*}}, <4 x i1> %{{.+}}) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll new file mode 100644 index 0000000000000..e5fe580b0ac22 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalar_vector_user.ll @@ -0,0 +1,78 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k scalar_vector_user -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "Unknown buffer" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind readnone +declare i64 @__mux_get_local_id(i32) #0 + +; Function Attrs: nounwind readnone +declare spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float>, <4 x float>, <4 x float>) #0 + +declare spir_func void @_Z7vstore4Dv4_fmPU3AS1f(<4 x float>, i64, float addrspace(1)*) + +declare spir_func <4 x float> @_Z6vload4mPU3AS3Kf(i64, float addrspace(1)*) +; Function Attrs: inlinehint norecurse nounwind readnone +declare spir_func float @_Z3madfff(float, float, float) local_unnamed_addr #2 + +define spir_kernel void @scalar_vector_user(float addrspace(1)* %inout, i64 %n) { +entry: + %lid = tail call i64 @__mux_get_local_id(i32 0) #0 + %inout.address = getelementptr inbounds float, float addrspace(1)* %inout, i64 %lid + br label %loop + +loop: ; preds = %entry, %loop + %madv4.prev = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4, %loop ] + %i = phi i64 [ 0, %entry ], [ %i.inc, %loop ] + %i.inc = add nuw nsw i64 %i, 1 + %cmp = icmp slt i64 %i.inc, %n + %inout.vload = tail call spir_func <4 x float> @_Z6vload4mPU3AS3Kf(i64 0, float addrspace(1)* %inout.address) + %inout.vec0 = shufflevector <4 x float> %inout.vload, <4 x float> poison, <4 x i32> zeroinitializer + %madv4 = tail call spir_func <4 x float> @_Z3madDv4_fS_S_(<4 x float> %inout.vload, <4 x float> %inout.vec0, <4 x float> %madv4.prev) #0 + br i1 %cmp, label %loop, label %end + +end: ; preds = %loop + %mad.vec0 = extractelement <4 x float> %madv4, i32 0 + store float %mad.vec0, float addrspace(1)* %inout.address, align 4 + tail call spir_func void @_Z7vstore4Dv4_fmPU3AS1f(<4 x float> %madv4, i64 0, float addrspace(1)* %inout.address) + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { noduplicate } +attributes #2 = { inlinehint norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } + +; The purpose of this test is to make sure we correctly scalarize an instruction +; used by both a scalar and a vector instruction. We would previously try to +; scalarize its users twice, resulting in invalid IR. 
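+; In this kernel the dual-use value is %madv4: lane 0 is extracted into +; %mad.vec0 (a scalar user) while the whole vector is passed to vstore4 (a +; vector user), so the scalarizer must rewrite each of its users exactly once.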
+ +; CHECK: define spir_kernel void @__vecz_v4_scalar_vector_user +; CHECK: loop: +; CHECK: %madv4.prev{{.*}} = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4[[S0:[0-9]+]], %loop ]{{$}} +; CHECK: %madv4.prev{{.*}} = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4[[S1:[0-9]+]], %loop ]{{$}} +; CHECK: %madv4.prev{{.*}} = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4[[S2:[0-9]+]], %loop ]{{$}} +; CHECK: %madv4.prev{{.*}} = phi <4 x float> [ zeroinitializer, %entry ], [ %madv4[[S3:[0-9]+]], %loop ]{{$}} + +; make sure the above PHI incomings are unique by looking for their definitions +; CHECK: %madv4[[S0]] = +; CHECK: %madv4[[S1]] = +; CHECK: %madv4[[S2]] = +; CHECK: %madv4[[S3]] = diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll new file mode 100644 index 0000000000000..97ccb3494c1ac --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls.ll @@ -0,0 +1,86 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=scalarize -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_calls(<4 x float>* %pa, <4 x float>* %pb, <4 x i32>* %pc, <4 x float>* %pd) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <4 x float>, <4 x float>* %pa, i64 %idx + %b = getelementptr <4 x float>, <4 x float>* %pb, i64 %idx + %c = getelementptr <4 x i32>, <4 x i32>* %pc, i64 %idx + %d = getelementptr <4 x float>, <4 x float>* %pd, i64 %idx + %0 = load <4 x float>, <4 x float>* %a, align 16 + %1 = load <4 x float>, <4 x float>* %b, align 16 + %2 = load <4 x i32>, <4 x i32>* %c, align 16 + %call = call spir_func <4 x float> @_Z14convert_float4Dv4_i(<4 x i32> %2) + %3 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %call) + store <4 x float> %3, <4 x float>* %d, align 16 + ret void +} + +declare spir_func <4 x float> @_Z14convert_float4Dv4_i(<4 x i32>) +declare spir_func float @_Z13convert_floati(i32) +declare <4x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; CHECK: define spir_kernel void @__vecz_v4_test_calls(ptr %pa, ptr %pb, ptr %pc, ptr %pd) +; CHECK: entry: +; CHECK: %[[A_0:.+]] = getelementptr float, ptr %a, i32 0 +; CHECK: %[[A_1:.+]] = getelementptr float, ptr %a, i32 1 +; CHECK: %[[A_2:.+]] = getelementptr float, ptr %a, i32 2 +; CHECK: %[[A_3:.+]] = getelementptr float, ptr %a, i32 3 +; CHECK: %[[LA_0:.+]] = load float, ptr %[[A_0]] +; CHECK: %[[LA_1:.+]] = load float, ptr %[[A_1]] +; CHECK: %[[LA_2:.+]] = load float, ptr %[[A_2]] +; 
CHECK: %[[LA_3:.+]] = load float, ptr %[[A_3]] +; CHECK: %[[B_0:.+]] = getelementptr float, ptr %b, i32 0 +; CHECK: %[[B_1:.+]] = getelementptr float, ptr %b, i32 1 +; CHECK: %[[B_2:.+]] = getelementptr float, ptr %b, i32 2 +; CHECK: %[[B_3:.+]] = getelementptr float, ptr %b, i32 3 +; CHECK: %[[LB_0:.+]] = load float, ptr %[[B_0]] +; CHECK: %[[LB_1:.+]] = load float, ptr %[[B_1]] +; CHECK: %[[LB_2:.+]] = load float, ptr %[[B_2]] +; CHECK: %[[LB_3:.+]] = load float, ptr %[[B_3]] +; CHECK: %[[C_0:.+]] = getelementptr i32, ptr %c, i32 0 +; CHECK: %[[C_1:.+]] = getelementptr i32, ptr %c, i32 1 +; CHECK: %[[C_2:.+]] = getelementptr i32, ptr %c, i32 2 +; CHECK: %[[C_3:.+]] = getelementptr i32, ptr %c, i32 3 +; CHECK: %[[LC_0:.+]] = load i32, ptr %[[C_0]] +; CHECK: %[[LC_1:.+]] = load i32, ptr %[[C_1]] +; CHECK: %[[LC_2:.+]] = load i32, ptr %[[C_2]] +; CHECK: %[[LC_3:.+]] = load i32, ptr %[[C_3]] +; CHECK: %[[CALL1:.+]] = call spir_func float @_Z13convert_floati(i32 %[[LC_0]]) +; CHECK: %[[CALL2:.+]] = call spir_func float @_Z13convert_floati(i32 %[[LC_1]]) +; CHECK: %[[CALL3:.+]] = call spir_func float @_Z13convert_floati(i32 %[[LC_2]]) +; CHECK: %[[CALL4:.+]] = call spir_func float @_Z13convert_floati(i32 %[[LC_3]]) +; CHECK: %[[FMAD_0:.+]] = call float @llvm.fmuladd.f32(float %[[LA_0]], float %[[LB_0]], float %[[CALL1]]) +; CHECK: %[[FMAD_1:.+]] = call float @llvm.fmuladd.f32(float %[[LA_1]], float %[[LB_1]], float %[[CALL2]]) +; CHECK: %[[FMAD_2:.+]] = call float @llvm.fmuladd.f32(float %[[LA_2]], float %[[LB_2]], float %[[CALL3]]) +; CHECK: %[[FMAD_3:.+]] = call float @llvm.fmuladd.f32(float %[[LA_3]], float %[[LB_3]], float %[[CALL4]]) +; CHECK: %[[D_0:.+]] = getelementptr float, ptr %d, i32 0 +; CHECK: %[[D_1:.+]] = getelementptr float, ptr %d, i32 1 +; CHECK: %[[D_2:.+]] = getelementptr float, ptr %d, i32 2 +; CHECK: %[[D_3:.+]] = getelementptr float, ptr %d, i32 3 +; CHECK: store float %[[FMAD_0]], ptr %[[D_0]] +; CHECK: store float %[[FMAD_1]], ptr %[[D_1]] +; CHECK: store float %[[FMAD_2]], ptr %[[D_2]] +; CHECK: store float %[[FMAD_3]], ptr %[[D_3]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll new file mode 100644 index 0000000000000..f016562ea54ef --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_calls_uniform.ll @@ -0,0 +1,47 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_calls -vecz-passes=scalarize -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_calls(<4 x float>* %a, <4 x float>* %b, <4 x i32>* %c, <4 x float>* %d) { +entry: + %0 = load <4 x float>, <4 x float>* %a, align 16 + %1 = load <4 x float>, <4 x float>* %b, align 16 + %2 = load <4 x i32>, <4 x i32>* %c, align 16 + %call = call spir_func <4 x float> @_Z14convert_float4Dv4_i(<4 x i32> %2) + %3 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %call) + store <4 x float> %3, <4 x float>* %d, align 16 + ret void +} + +declare spir_func <4 x float> @_Z14convert_float4Dv4_i(<4 x i32>) +declare spir_func float @_Z13convert_floati(i32) +declare <4x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) + +; Checks that this function gets vectorized; because every instruction is +; uniform, the process of vectorization makes no actual changes whatsoever! +; CHECK: define spir_kernel void @__vecz_v4_test_calls(ptr %a, ptr %b, ptr %c, ptr %d) +; CHECK: entry: +; CHECK: %[[LA:.+]] = load <4 x float>, ptr %a, align 16 +; CHECK: %[[LB:.+]] = load <4 x float>, ptr %b, align 16 +; CHECK: %[[LC:.+]] = load <4 x i32>, ptr %c, align 16 +; CHECK: %[[CALL:.+]] = call spir_func <4 x float> @_Z14convert_float4Dv4_i(<4 x i32> %[[LC]]) +; CHECK: %[[FMAD:.+]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %[[LA]], <4 x float> %[[LB]], <4 x float> %[[CALL]]) +; CHECK: store <4 x float> %[[FMAD]], ptr %d, align 16 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll new file mode 100644 index 0000000000000..2e7c1a2202c71 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_debug_info.ll @@ -0,0 +1,183 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; Check that debug info is preserved in the vectorized kernel. +; Specifically, that the scalarization pass doesn't destroy DI +; intrinsics attached to the vector instructions it scalarizes. 
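+; For example, a declare such as +; call void @llvm.dbg.declare(metadata <2 x i32>* %a, metadata !23, metadata !34) +; must still describe the source variable "a" after its <2 x i32> operations +; have been split into scalar i32 operations.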
+ +; RUN: veczc -k mul2 -vecz-passes="scalarize,function(mem2reg)" -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + + +; Function Attrs: nounwind +define spir_kernel void @mul2(<2 x i32> addrspace(1)* %in1, <2 x i32> addrspace(1)* %in2, <2 x i32> addrspace(1)* %out) #0 !dbg !4 { +entry: + %in1.addr = alloca <2 x i32> addrspace(1)*, align 8 + %in2.addr = alloca <2 x i32> addrspace(1)*, align 8 + %out.addr = alloca <2 x i32> addrspace(1)*, align 8 + %tid = alloca i64, align 8 + %a = alloca <2 x i32>, align 8 + %b = alloca <2 x i32>, align 8 + %tmp = alloca <2 x i32>, align 8 + store <2 x i32> addrspace(1)* %in1, <2 x i32> addrspace(1)** %in1.addr, align 8 + call void @llvm.dbg.declare(metadata <2 x i32> addrspace(1)** %in1.addr, metadata !16, metadata !34), !dbg !35 + store <2 x i32> addrspace(1)* %in2, <2 x i32> addrspace(1)** %in2.addr, align 8 + call void @llvm.dbg.declare(metadata <2 x i32> addrspace(1)** %in2.addr, metadata !17, metadata !34), !dbg !35 + store <2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)** %out.addr, align 8 + call void @llvm.dbg.declare(metadata <2 x i32> addrspace(1)** %out.addr, metadata !18, metadata !34), !dbg !35 + call void @llvm.dbg.declare(metadata i64* %tid, metadata !19, metadata !34), !dbg !36 + %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !36 + store i64 %call, i64* %tid, align 8, !dbg !36 + call void @llvm.dbg.declare(metadata <2 x i32>* %a, metadata !23, metadata !34), !dbg !37 + %0 = load i64, i64* %tid, align 8, !dbg !37 + %1 = load <2 x i32> addrspace(1)*, <2 x i32> addrspace(1)** %in1.addr, align 8, !dbg !37 + %arrayidx = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %1, i64 %0, !dbg !37 + %2 = load <2 x i32>, <2 x i32> addrspace(1)* %arrayidx, align 8, !dbg !37 + store <2 x i32> %2, <2 x i32>* %a, align 8, !dbg !37 + call void @llvm.dbg.declare(metadata <2 x i32>* %b, metadata !24, metadata !34), !dbg !38 + %3 = load i64, i64* %tid, align 8, !dbg !38 + %4 = load <2 x i32> addrspace(1)*, <2 x i32> addrspace(1)** %in2.addr, align 8, !dbg !38 + %arrayidx1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %4, i64 %3, !dbg !38 + %5 = load <2 x i32>, <2 x i32> addrspace(1)* %arrayidx1, align 8, !dbg !38 + store <2 x i32> %5, <2 x i32>* %b, align 8, !dbg !38 + call void @llvm.dbg.declare(metadata <2 x i32>* %tmp, metadata !25, metadata !34), !dbg !39 + %6 = load <2 x i32>, <2 x i32>* %a, align 8, !dbg !39 + %7 = load <2 x i32>, <2 x i32>* %b, align 8, !dbg !39 + %mul = mul <2 x i32> %6, %7, !dbg !39 + store <2 x i32> %mul, <2 x i32>* %tmp, align 8, !dbg !39 + %8 = load <2 x i32>, <2 x i32>* %tmp, align 8, !dbg !40 + %9 = load i64, i64* %tid, align 8, !dbg !40 + %10 = load <2 x i32> addrspace(1)*, <2 x i32> addrspace(1)** %out.addr, align 8, !dbg !40 + %arrayidx2 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %10, i64 %9, !dbg !40 + store <2 x i32> %8, <2 x i32> addrspace(1)* %arrayidx2, align 8, !dbg !40 + ret void, !dbg !41 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +declare i64 @__mux_get_global_id(i32) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { 
nounwind readnone } +attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nobuiltin } + +!llvm.dbg.cu = !{!0} +!opencl.kernels = !{!26} +!llvm.module.flags = !{!32} +!llvm.ident = !{!33} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2) +!1 = !DIFile(filename: "", directory: "Aorta/vecz_build") +!2 = !{} +!3 = !{!4} +!4 = distinct !DISubprogram(name: "mul2", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !15) +!5 = !DIFile(filename: "kernel.opencl", directory: "Aorta/vecz_build") +!6 = !DISubroutineType(types: !7) +!7 = !{null, !8, !8, !8} +!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, align: 64) +!9 = !DIDerivedType(tag: DW_TAG_typedef, name: "int2", file: !10, line: 63, baseType: !11) +!10 = !DIFile(filename: "Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "Aorta/vecz_build") +!11 = !DICompositeType(tag: DW_TAG_array_type, baseType: !12, size: 64, align: 64, flags: DIFlagVector, elements: !13) +!12 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!13 = !{!14} +!14 = !DISubrange(count: 2) +!15 = !{!16, !17, !18, !19, !23, !24, !25} +!16 = !DILocalVariable(name: "in1", arg: 1, scope: !4, file: !5, line: 1, type: !8) +!17 = !DILocalVariable(name: "in2", arg: 2, scope: !4, file: !5, line: 1, type: !8) +!18 = !DILocalVariable(name: "out", arg: 3, scope: !4, file: !5, line: 1, type: !8) +!19 = !DILocalVariable(name: "tid", scope: !4, file: !5, line: 3, type: !20) +!20 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !10, line: 33, baseType: !21) +!21 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !10, line: 31, baseType: !22) +!22 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) +!23 = !DILocalVariable(name: "a", scope: !4, file: !5, line: 4, type: !9) +!24 = !DILocalVariable(name: "b", scope: !4, file: !5, line: 5, type: !9) +!25 = !DILocalVariable(name: "tmp", scope: !4, file: !5, line: 6, type: !9) +!26 = !{void (<2 x i32> addrspace(1)*, <2 x i32> addrspace(1)*, <2 x i32> addrspace(1)*)* @mul2, !27, !28, !29, !30, !31} +!27 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1} +!28 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!29 = !{!"kernel_arg_type", !"int2*", !"int2*", !"int2*"} +!30 = !{!"kernel_arg_base_type", !"int __attribute__((ext_vector_type(2)))*", !"int __attribute__((ext_vector_type(2)))*", !"int __attribute__((ext_vector_type(2)))*"} +!31 = !{!"kernel_arg_type_qual", !"", !"", !""} +!32 = !{i32 2, !"Debug Info Version", i32 3} +!33 = !{!"clang version 3.8.0 "} +!34 = !DIExpression() +!35 = !DILocation(line: 1, scope: !4) +!36 = !DILocation(line: 3, scope: !4) +!37 = !DILocation(line: 4, scope: !4) +!38 = !DILocation(line: 5, scope: !4) +!39 = !DILocation(line: 6, scope: !4) +!40 = !DILocation(line: 7, scope: !4) +!41 = !DILocation(line: 8, scope: !4) + +; Vectorized kernel function +; CHECK: @__vecz_v[[WIDTH:[0-9]+]]_mul2({{.*}} !dbg [[VECZ_SUBPROG:![0-9]+]] + +; Check that intrinsics for user variable locations are still present +; CHECK: #dbg_value({{.*}} %in1, [[DI_IN1:![0-9]+]], 
[[EXPR:!DIExpression()]] +; CHECK-SAME: [[PARAM_LOC:![0-9]+]] + +; CHECK: #dbg_value({{.*}} %in2, [[DI_IN2:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[PARAM_LOC]] + +; CHECK: #dbg_value({{.*}} %out, [[DI_OUT:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[PARAM_LOC]] + +; CHECK: #dbg_value(i64 %call, [[DI_TID:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[TID_LOC:![0-9]+]] + +; CHECK: #dbg_declare(ptr %a, [[DI_A:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[A_LOC:![0-9]+]] + +; CHECK: #dbg_declare(ptr %b, [[DI_B:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[B_LOC:![0-9]+]] + +; CHECK: #dbg_declare(ptr %tmp, [[DI_TMP:![0-9]+]], [[EXPR]] +; CHECK-SAME: [[TMP_LOC:![0-9]+]] + +; Debug info metadata entries +; CHECK:[[PTR_TYPE:![0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[DI_INT2:![0-9]+]], size: 64, align: 64) +; CHECK:[[DI_INT2]] = !DIDerivedType(tag: DW_TAG_typedef, name: "int2" + +; CHECK: [[VECZ_SUBPROG]] = distinct !DISubprogram(name: "mul2" +; CHECK-SAME: retainedNodes: [[VECZ_VARS:![0-9]+]] + +; CHECK: [[VECZ_VARS]] = !{[[DI_IN1]], [[DI_IN2]], [[DI_OUT]], [[DI_TID]], [[DI_A]], [[DI_B]], [[DI_TMP]]} + +; CHECK: [[DI_IN1]] = !DILocalVariable(name: "in1", arg: 1, scope: [[VECZ_SUBPROG]], +; CHECK-SAME:line: 1, type: [[PTR_TYPE]] + +; CHECK: [[DI_IN2]] = !DILocalVariable(name: "in2", arg: 2, scope: [[VECZ_SUBPROG]], +; CHECK-SAME:line: 1, type: [[PTR_TYPE]] + +; CHECK: [[DI_OUT]] = !DILocalVariable(name: "out", arg: 3, scope: [[VECZ_SUBPROG]], +; CHECK-SAME: line: 1, type: [[PTR_TYPE]] + +; CHECK: [[DI_TID]] = !DILocalVariable(name: "tid", scope: [[VECZ_SUBPROG]] +; CHECK-SAME:line: 3 + +; CHECK: [[DI_A]] = !DILocalVariable(name: "a", scope: [[VECZ_SUBPROG]], +; CHECK-SAME:line: 4 + +; CHECK: [[DI_B]] = !DILocalVariable(name: "b", scope: [[VECZ_SUBPROG]], +; CHECK-SAME: line: 5 + +; CHECK: [[DI_TMP]] = !DILocalVariable(name: "tmp", scope: [[VECZ_SUBPROG]], +; CHECK-SAME: line: 6 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll new file mode 100644 index 0000000000000..3e4db8b32697c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions.ll @@ -0,0 +1,142 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_instructions -vecz-passes=scalarize -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +declare i64 @__mux_get_global_id(i32) + +define spir_kernel void @test_instructions(<4 x i32>* %pa, <4 x i32>* %pb, <4 x i32>* %pc) { +entry: + %idx = call i64 @__mux_get_global_id(i32 0) + %a = getelementptr <4 x i32>, <4 x i32>* %pa, i64 %idx + %b = getelementptr <4 x i32>, <4 x i32>* %pb, i64 %idx + %c = getelementptr <4 x i32>, <4 x i32>* %pc, i64 %idx + %0 = load <4 x i32>, <4 x i32>* %a, align 16 + %1 = load <4 x i32>, <4 x i32>* %b, align 16 + %add = add <4 x i32> %1, %0 + store <4 x i32> %add, <4 x i32>* %c, align 16 + %arrayidx3 = getelementptr inbounds <4 x i32>, <4 x i32>* %a, i64 1 + %2 = load <4 x i32>, <4 x i32>* %arrayidx3, align 16 + %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32>* %b, i64 1 + %3 = load <4 x i32>, <4 x i32>* %arrayidx4, align 16 + %cmp = icmp sgt <4 x i32> %2, %3 + %sext = sext <4 x i1> %cmp to <4 x i32> + %arrayidx5 = getelementptr inbounds <4 x i32>, <4 x i32>* %c, i64 1 + store <4 x i32> %sext, <4 x i32>* %arrayidx5, align 16 + %arrayidx6 = getelementptr inbounds <4 x i32>, <4 x i32>* %a, i64 2 + %4 = load <4 x i32>, <4 x i32>* %arrayidx6, align 16 + %cmp7 = icmp slt <4 x i32> %4, <i32 11, i32 12, i32 13, i32 14> + %sext8 = sext <4 x i1> %cmp7 to <4 x i32> + %arrayidx9 = getelementptr inbounds <4 x i32>, <4 x i32>* %c, i64 2 + store <4 x i32> %sext8, <4 x i32>* %arrayidx9, align 16 + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test_instructions(ptr %pa, ptr %pb, ptr %pc) +; CHECK: entry: +; CHECK: %[[A_0:.+]] = getelementptr i32, ptr %a, i32 0 +; CHECK: %[[A_1:.+]] = getelementptr i32, ptr %a, i32 1 +; CHECK: %[[A_2:.+]] = getelementptr i32, ptr %a, i32 2 +; CHECK: %[[A_3:.+]] = getelementptr i32, ptr %a, i32 3 +; CHECK: %[[LA_0:.+]] = load i32, ptr %[[A_0]] +; CHECK: %[[LA_1:.+]] = load i32, ptr %[[A_1]] +; CHECK: %[[LA_2:.+]] = load i32, ptr %[[A_2]] +; CHECK: %[[LA_3:.+]] = load i32, ptr %[[A_3]] +; CHECK: %[[B_0:.+]] = getelementptr i32, ptr %b, i32 0 +; CHECK: %[[B_1:.+]] = getelementptr i32, ptr %b, i32 1 +; CHECK: %[[B_2:.+]] = getelementptr i32, ptr %b, i32 2 +; CHECK: %[[B_3:.+]] = getelementptr i32, ptr %b, i32 3 +; CHECK: %[[LB_0:.+]] = load i32, ptr %[[B_0]] +; CHECK: %[[LB_1:.+]] = load i32, ptr %[[B_1]] +; CHECK: %[[LB_2:.+]] = load i32, ptr %[[B_2]] +; CHECK: %[[LB_3:.+]] = load i32, ptr %[[B_3]] +; CHECK: %[[ADD1:.+]] = add i32 %[[LB_0]], %[[LA_0]] +; CHECK: %[[ADD2:.+]] = add i32 %[[LB_1]], %[[LA_1]] +; CHECK: %[[ADD3:.+]] = add i32 %[[LB_2]], %[[LA_2]] +; CHECK: %[[ADD4:.+]] = add i32 %[[LB_3]], %[[LA_3]] +; CHECK: %[[C_0:.+]] = getelementptr i32, ptr %c, i32 0 +; CHECK: %[[C_1:.+]] = getelementptr i32, ptr %c, i32 1 +; CHECK: %[[C_2:.+]] = getelementptr i32, ptr %c, i32 2 +; CHECK: %[[C_3:.+]] = getelementptr i32, ptr %c, i32 3 +; CHECK: store i32 %[[ADD1]], ptr %[[C_0]] +; CHECK: store i32 %[[ADD2]], ptr %[[C_1]] +; CHECK: store i32 %[[ADD3]], ptr %[[C_2]] +; CHECK: store i32 %[[ADD4]], ptr %[[C_3]] +; CHECK: %arrayidx3 = getelementptr <4 x i32>, ptr %a, i64 1 +; CHECK: %[[A1_0:.+]] = getelementptr i32, ptr %arrayidx3, i32 0 +; CHECK: %[[A1_1:.+]] = getelementptr i32, ptr %arrayidx3, i32 1 +; CHECK: %[[A1_2:.+]] = getelementptr i32, ptr %arrayidx3, i32 2 +; CHECK: %[[A1_3:.+]] = getelementptr i32, ptr %arrayidx3, i32 3 +; CHECK: %[[LA1_0:.+]] = 
load i32, ptr %[[A1_0]] +; CHECK: %[[LA1_1:.+]] = load i32, ptr %[[A1_1]] +; CHECK: %[[LA1_2:.+]] = load i32, ptr %[[A1_2]] +; CHECK: %[[LA1_3:.+]] = load i32, ptr %[[A1_3]] +; CHECK: %arrayidx4 = getelementptr <4 x i32>, ptr %b, i64 1 +; CHECK: %[[B1_0:.+]] = getelementptr i32, ptr %arrayidx4, i32 0 +; CHECK: %[[B1_1:.+]] = getelementptr i32, ptr %arrayidx4, i32 1 +; CHECK: %[[B1_2:.+]] = getelementptr i32, ptr %arrayidx4, i32 2 +; CHECK: %[[B1_3:.+]] = getelementptr i32, ptr %arrayidx4, i32 3 +; CHECK: %[[LB1_0:.+]] = load i32, ptr %[[B1_0]] +; CHECK: %[[LB1_1:.+]] = load i32, ptr %[[B1_1]] +; CHECK: %[[LB1_2:.+]] = load i32, ptr %[[B1_2]] +; CHECK: %[[LB1_3:.+]] = load i32, ptr %[[B1_3]] +; CHECK: %[[CMP5:.+]] = icmp sgt i32 %[[LA1_0]], %[[LB1_0]] +; CHECK: %[[CMP6:.+]] = icmp sgt i32 %[[LA1_1]], %[[LB1_1]] +; CHECK: %[[CMP8:.+]] = icmp sgt i32 %[[LA1_2]], %[[LB1_2]] +; CHECK: %[[CMP9:.+]] = icmp sgt i32 %[[LA1_3]], %[[LB1_3]] +; CHECK: %[[SEXT10:.+]] = sext i1 %[[CMP5]] to i32 +; CHECK: %[[SEXT11:.+]] = sext i1 %[[CMP6]] to i32 +; CHECK: %[[SEXT12:.+]] = sext i1 %[[CMP8]] to i32 +; CHECK: %[[SEXT13:.+]] = sext i1 %[[CMP9]] to i32 +; CHECK: %arrayidx5 = getelementptr <4 x i32>, ptr %c, i64 1 +; CHECK: %[[C1_0:.+]] = getelementptr i32, ptr %arrayidx5, i32 0 +; CHECK: %[[C1_1:.+]] = getelementptr i32, ptr %arrayidx5, i32 1 +; CHECK: %[[C1_2:.+]] = getelementptr i32, ptr %arrayidx5, i32 2 +; CHECK: %[[C1_3:.+]] = getelementptr i32, ptr %arrayidx5, i32 3 +; CHECK: store i32 %[[SEXT10]], ptr %[[C1_0]] +; CHECK: store i32 %[[SEXT11]], ptr %[[C1_1]] +; CHECK: store i32 %[[SEXT12]], ptr %[[C1_2]] +; CHECK: store i32 %[[SEXT13]], ptr %[[C1_3]] +; CHECK: %arrayidx6 = getelementptr <4 x i32>, ptr %a, i64 2 +; CHECK: %[[A2_0:.+]] = getelementptr i32, ptr %arrayidx6, i32 0 +; CHECK: %[[A2_1:.+]] = getelementptr i32, ptr %arrayidx6, i32 1 +; CHECK: %[[A2_2:.+]] = getelementptr i32, ptr %arrayidx6, i32 2 +; CHECK: %[[A2_3:.+]] = getelementptr i32, ptr %arrayidx6, i32 3 +; CHECK: %[[LA2_0:.+]] = load i32, ptr %[[A2_0]] +; CHECK: %[[LA2_1:.+]] = load i32, ptr %[[A2_1]] +; CHECK: %[[LA2_2:.+]] = load i32, ptr %[[A2_2]] +; CHECK: %[[LA2_3:.+]] = load i32, ptr %[[A2_3]] +; CHECK: %[[CMP714:.+]] = icmp slt i32 %[[LA2_0]], 11 +; CHECK: %[[CMP715:.+]] = icmp slt i32 %[[LA2_1]], 12 +; CHECK: %[[CMP716:.+]] = icmp slt i32 %[[LA2_2]], 13 +; CHECK: %[[CMP717:.+]] = icmp slt i32 %[[LA2_3]], 14 +; CHECK: %[[SEXT818:.+]] = sext i1 %[[CMP714]] to i32 +; CHECK: %[[SEXT819:.+]] = sext i1 %[[CMP715]] to i32 +; CHECK: %[[SEXT820:.+]] = sext i1 %[[CMP716]] to i32 +; CHECK: %[[SEXT821:.+]] = sext i1 %[[CMP717]] to i32 +; CHECK: %arrayidx9 = getelementptr <4 x i32>, ptr %c, i64 2 +; CHECK: %[[C2_0:.+]] = getelementptr i32, ptr %arrayidx9, i32 0 +; CHECK: %[[C2_1:.+]] = getelementptr i32, ptr %arrayidx9, i32 1 +; CHECK: %[[C2_2:.+]] = getelementptr i32, ptr %arrayidx9, i32 2 +; CHECK: %[[C2_3:.+]] = getelementptr i32, ptr %arrayidx9, i32 3 +; CHECK: store i32 %[[SEXT818]], ptr %[[C2_0]] +; CHECK: store i32 %[[SEXT819]], ptr %[[C2_1]] +; CHECK: store i32 %[[SEXT820]], ptr %[[C2_2]] +; CHECK: store i32 %[[SEXT821]], ptr %[[C2_3]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll new file mode 100644 index 0000000000000..74bc119bb130c --- /dev/null +++ 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_instructions_uniform.ll @@ -0,0 +1,67 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_instructions -vecz-passes=scalarize -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_instructions(<4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c) { +entry: + %0 = load <4 x i32>, <4 x i32>* %a, align 16 + %1 = load <4 x i32>, <4 x i32>* %b, align 16 + %add = add <4 x i32> %1, %0 + store <4 x i32> %add, <4 x i32>* %c, align 16 + %arrayidx3 = getelementptr inbounds <4 x i32>, <4 x i32>* %a, i64 1 + %2 = load <4 x i32>, <4 x i32>* %arrayidx3, align 16 + %arrayidx4 = getelementptr inbounds <4 x i32>, <4 x i32>* %b, i64 1 + %3 = load <4 x i32>, <4 x i32>* %arrayidx4, align 16 + %cmp = icmp sgt <4 x i32> %2, %3 + %sext = sext <4 x i1> %cmp to <4 x i32> + %arrayidx5 = getelementptr inbounds <4 x i32>, <4 x i32>* %c, i64 1 + store <4 x i32> %sext, <4 x i32>* %arrayidx5, align 16 + %arrayidx6 = getelementptr inbounds <4 x i32>, <4 x i32>* %a, i64 2 + %4 = load <4 x i32>, <4 x i32>* %arrayidx6, align 16 + %cmp7 = icmp slt <4 x i32> %4, <i32 11, i32 12, i32 13, i32 14> + %sext8 = sext <4 x i1> %cmp7 to <4 x i32> + %arrayidx9 = getelementptr inbounds <4 x i32>, <4 x i32>* %c, i64 2 + store <4 x i32> %sext8, <4 x i32>* %arrayidx9, align 16 + ret void +} + +; Checks that this function gets vectorized; because every instruction is +; uniform, the process of vectorization makes no actual changes whatsoever! 
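+; Unlike scalarization_instructions.ll above, this kernel never calls +; __mux_get_global_id, so every value is uniform across work-items and the +; scalarize pass has nothing to split.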
+; CHECK: define spir_kernel void @__vecz_v4_test_instructions(ptr %a, ptr %b, ptr %c) +; CHECK: entry: +; CHECK: %[[LA:.+]] = load <4 x i32>, ptr %a, align 16 +; CHECK: %[[LB:.+]] = load <4 x i32>, ptr %b, align 16 +; CHECK: %[[ADD:.+]] = add <4 x i32> %[[LB]], %[[LA]] +; CHECK: store <4 x i32> %[[ADD]], ptr %c, align 16 +; CHECK: %[[A1:.+]] = getelementptr inbounds <4 x i32>, ptr %a, i64 1 +; CHECK: %[[LA1:.+]] = load <4 x i32>, ptr %[[A1]], align 16 +; CHECK: %[[B1:.+]] = getelementptr inbounds <4 x i32>, ptr %b, i64 1 +; CHECK: %[[LB1:.+]] = load <4 x i32>, ptr %[[B1]], align 16 +; CHECK: %[[CMP:.+]] = icmp sgt <4 x i32> %[[LA1]], %[[LB1]] +; CHECK: %[[SEXT:.+]] = sext <4 x i1> %[[CMP]] to <4 x i32> +; CHECK: %[[C1:.+]] = getelementptr inbounds <4 x i32>, ptr %c, i64 1 +; CHECK: store <4 x i32> %[[SEXT]], ptr %[[C1]], align 16 +; CHECK: %[[A2:.+]] = getelementptr inbounds <4 x i32>, ptr %a, i64 2 +; CHECK: %[[LA2:.+]] = load <4 x i32>, ptr %[[A2]], align 16 +; CHECK: %[[CMP7:.+]] = icmp slt <4 x i32> %[[LA2]], +; CHECK: %[[SEXT8:.+]] = sext <4 x i1> %[[CMP7]] to <4 x i32> +; CHECK: %[[C2:.+]] = getelementptr inbounds <4 x i32>, ptr %c, i64 2 +; CHECK: store <4 x i32> %[[SEXT8]], ptr %[[C2]], align 16 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll new file mode 100644 index 0000000000000..712271d2b12b3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarization_masked_load_store.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -vecz-passes=scalarize -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s

+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+declare i64 @__mux_get_global_id(i32)

+declare <2 x float> @__vecz_b_masked_load4_Dv2_fPDv2_fDv2_b(<2 x float>*, <2 x i1>)
+declare void @__vecz_b_masked_store4_Dv2_fPDv2_fDv2_b(<2 x float>, <2 x float>*, <2 x i1>)

+define spir_kernel void @scalarize_masked_memops(<2 x float>* %pa, <2 x float>* %pz) {
+entry:
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %head = insertelement <2 x i64> poison, i64 %idx, i64 0
+  %splat = shufflevector <2 x i64> %head, <2 x i64> poison, <2 x i32> zeroinitializer
+  %idxs = add <2 x i64> %splat, <i64 0, i64 1>
+  %mask = icmp slt <2 x i64> %idxs, <i64 8, i64 8>
+  %aptr = getelementptr <2 x float>, <2 x float>* %pa, i64 %idx
+  %ld = call <2 x float> @__vecz_b_masked_load4_Dv2_fPDv2_fDv2_b(<2 x float>* %aptr, <2 x i1> %mask)
+  %zptr = getelementptr <2 x float>, <2 x float>* %pz, i64 %idx
+  call void @__vecz_b_masked_store4_Dv2_fPDv2_fDv2_b(<2 x float> %ld, <2 x float>* %zptr, <2 x i1> %mask)
+  ret void
+  ; CHECK: %idx = call i64 @__mux_get_global_id(i32 0)
+  ; CHECK: %[[IDXS0:.*]] = add i64 %idx, 0
+  ; CHECK: %[[IDXS1:.*]] = add i64 %idx, 1
+  ; CHECK: %[[MASK0:.*]] = icmp slt i64 %[[IDXS0]], 8
+  ; CHECK: %[[MASK1:.*]] = icmp slt i64 %[[IDXS1]], 8
+  ; CHECK: %aptr = getelementptr <2 x float>, ptr %pa, i64 %idx
+  ; CHECK: %[[TMP1:.*]] = getelementptr float, ptr %aptr, i32 0
+  ; CHECK: %[[TMP2:.*]] = getelementptr float, ptr %aptr, i32 1
+  ; CHECK: %[[TMP3:.*]] = call float @__vecz_b_masked_load4_fu3ptrb(ptr %[[TMP1]], i1 %[[MASK0]])
+  ; CHECK: %[[TMP4:.*]] = call float @__vecz_b_masked_load4_fu3ptrb(ptr %[[TMP2]], i1 %[[MASK1]])
+  ; CHECK: %zptr = getelementptr <2 x float>, ptr %pz, i64 %idx
+  ; CHECK: %[[TMP6:.*]] = getelementptr float, ptr %zptr, i32 0
+  ; CHECK: %[[TMP7:.*]] = getelementptr float, ptr %zptr, i32 1
+  ; CHECK: call void @__vecz_b_masked_store4_fu3ptrb(float %[[TMP3]], ptr %[[TMP6]], i1 %[[MASK0]])
+  ; CHECK: call void @__vecz_b_masked_store4_fu3ptrb(float %[[TMP4]], ptr %[[TMP7]], i1 %[[MASK1]])
+  ; CHECK: ret void

+}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-bitcast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-bitcast.ll
new file mode 100644
index 0000000000000..443104d84af75
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-bitcast.ll
@@ -0,0 +1,135 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define dso_local spir_kernel void @bitcast1(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %gid = tail call i64 @__mux_get_global_id(i32 noundef 0) + %pin = getelementptr inbounds <2 x float>, ptr addrspace(1) %in, i64 %gid + %pout = getelementptr inbounds <4 x half>, ptr addrspace(1) %out, i64 %gid + %0 = load <2 x float>, ptr addrspace(1) %pin, align 4 + %1 = bitcast <2 x float> %0 to <4 x half> + store <4 x half> %1, ptr addrspace(1) %pout, align 4 + ret void +} + +; CHECK-LABEL: define{{.*}}spir_kernel void @__vecz_v4_bitcast1 +; CHECK: [[A0:%.+]] = load float, +; CHECK-NEXT: [[C0:%.+]] = load float, +; CHECK-NEXT: [[A1:%.+]] = bitcast float [[A0]] to i32 +; CHECK-NEXT: [[A2:%.+]] = trunc i32 [[A1]] to i16 +; CHECK-NEXT: [[A3:%.+]] = bitcast i16 [[A2]] to half +; CHECK-NEXT: [[B1:%.+]] = bitcast float [[A0]] to i32 +; CHECK-NEXT: [[B2:%.+]] = lshr i32 [[B1]], 16 +; CHECK-NEXT: [[B3:%.+]] = trunc i32 [[B2]] to i16 +; CHECK-NEXT: [[B4:%.+]] = bitcast i16 [[B3]] to half +; CHECK-NEXT: [[C1:%.+]] = bitcast float [[C0]] to i32 +; CHECK-NEXT: [[C2:%.+]] = trunc i32 [[C1]] to i16 +; CHECK-NEXT: [[C3:%.+]] = bitcast i16 [[C2]] to half +; CHECK-NEXT: [[D1:%.+]] = bitcast float [[C0]] to i32 +; CHECK-NEXT: [[D2:%.+]] = lshr i32 [[D1]], 16 +; CHECK-NEXT: [[D3:%.+]] = trunc i32 [[D2]] to i16 +; CHECK-NEXT: [[D4:%.+]] = bitcast i16 [[D3]] to half +; CHECK: store half [[A3]], +; CHECK-NEXT: store half [[B4]], +; CHECK-NEXT: store half [[C3]], +; CHECK-NEXT: store half [[D4]], +; CHECK-NEXT: ret void + +define dso_local spir_kernel void @bitcast2(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %gid = tail call i64 @__mux_get_global_id(i32 noundef 0) + %pin = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %gid + %pout = getelementptr inbounds <2 x float>, ptr addrspace(1) %out, i64 %gid + %0 = load <4 x half>, ptr addrspace(1) %pin, align 4 + %1 = bitcast <4 x half> %0 to <2 x float> + store <2 x float> %1, ptr addrspace(1) %pout, align 4 + ret void +} + +; CHECK-LABEL: define{{.*}}spir_kernel void @__vecz_v4_bitcast2 +; CHECK: [[A0:%.+]] = load half, +; CHECK-NEXT: [[B0:%.+]] = load half, +; CHECK-NEXT: [[C0:%.+]] = load half, +; CHECK-NEXT: [[D0:%.+]] = load half, +; CHECK-NEXT: [[A1:%.+]] = bitcast half [[A0]] to i16 +; CHECK-NEXT: [[A2:%.+]] = zext i16 [[A1]] to i32 +; CHECK-NEXT: [[B1:%.+]] = bitcast half [[B0]] to i16 +; CHECK-NEXT: [[B2:%.+]] = zext i16 [[B1]] to i32 +; CHECK-NEXT: [[B3:%.+]] = shl i32 [[B2]], 16 +; CHECK-NEXT: [[AB4:%.+]] = or i32 [[A2]], [[B3]] +; CHECK-NEXT: [[AB5:%.+]] = bitcast i32 [[AB4]] to float +; CHECK-NEXT: [[C1:%.+]] = bitcast half [[C0]] to i16 +; CHECK-NEXT: [[C2:%.+]] = zext i16 [[C1]] to i32 +; CHECK-NEXT: [[D1:%.+]] = bitcast half [[D0]] to i16 +; CHECK-NEXT: [[D2:%.+]] = zext i16 [[D1]] to i32 +; CHECK-NEXT: [[D3:%.+]] = shl i32 [[D2]], 16 +; CHECK-NEXT: [[CD4:%.+]] = or i32 [[C2]], [[D3]] +; CHECK-NEXT: [[CD5:%.+]] = bitcast i32 [[CD4]] to float +; CHECK: store float [[AB5]], +; CHECK-NEXT: store float [[CD5]], +; CHECK-NEXT: ret void + +define dso_local spir_kernel void @bitcast3(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %gid = tail call i64 @__mux_get_global_id(i32 noundef 0) + %pin = getelementptr 
inbounds <2 x i32>, ptr addrspace(1) %in, i64 %gid
+  %pout = getelementptr inbounds <2 x float>, ptr addrspace(1) %out, i64 %gid
+  %0 = load <2 x i32>, ptr addrspace(1) %pin, align 4
+  %1 = bitcast <2 x i32> %0 to <2 x float>
+  store <2 x float> %1, ptr addrspace(1) %pout, align 4
+  ret void
+}

+; CHECK-LABEL: define{{.*}}spir_kernel void @__vecz_v4_bitcast3
+; CHECK: [[A0:%.+]] = load i32,
+; CHECK-NEXT: [[B0:%.+]] = load i32,
+; CHECK-NEXT: [[A1:%.+]] = bitcast i32 [[A0]] to float
+; CHECK-NEXT: [[B1:%.+]] = bitcast i32 [[B0]] to float
+; CHECK: store float [[A1]],
+; CHECK-NEXT: store float [[B1]],
+; CHECK-NEXT: ret void

+define dso_local spir_kernel void @bitcast4(ptr addrspace(1) %in, ptr addrspace(1) %out) {
+entry:
+  %gid = tail call i64 @__mux_get_global_id(i32 noundef 0)
+  %pin = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %gid
+  %pout = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %gid
+  %0 = load i32, ptr addrspace(1) %pin, align 4
+  %1 = insertelement <2 x i32> poison, i32 %0, i32 0
+  %2 = bitcast <2 x i32> %1 to <4 x i16>
+  %3 = shufflevector <4 x i16> %2, <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  store <4 x i16> %3, ptr addrspace(1) %pout, align 4
+  ret void
+}

+; CHECK-LABEL: define{{.*}}spir_kernel void @__vecz_v4_bitcast4
+; CHECK: [[A0:%.+]] = load i32,
+; CHECK-NEXT: [[A1:%.+]] = trunc i32 [[A0]] to i16
+; CHECK-NEXT: [[B0:%.+]] = lshr i32 %0, 16
+; CHECK-NEXT: [[B1:%.+]] = trunc i32 [[B0]] to i16
+; CHECK: store i16 [[A1]],
+; CHECK-NEXT: store i16 [[B1]],
+; CHECK-NEXT: store i16 [[A1]],
+; CHECK-NEXT: store i16 [[B1]],
+; CHECK-NEXT: ret void

+declare i64 @__mux_get_global_id(i32 noundef)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
new file mode 100644
index 0000000000000..7d361eaa47399
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gather.ll
@@ -0,0 +1,55 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k splat -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define dso_local spir_kernel void @splat(i32 addrspace(1)* %data, i32 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 noundef 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %data, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %splat.splatinsert = insertelement <4 x i32> poison, i32 %0, i64 0 + %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + %add = add <4 x i32> %splat.splat, + %call1 = tail call spir_func i32 @not_scalarizable(<4 x i32> noundef %add) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32 noundef) +declare spir_func i32 @not_scalarizable(<4 x i32> noundef) + +; It checks that the scalarizer scalarizes the add and reconstructs the vector +; using insert element instructions to be consumed by the unscalarizable +; function. +; CHECK: void @__vecz_v4_splat({{.*}}) +; CHECK: entry: +; CHECK: %[[LD:.*]] = load i32 +; CHECK: %[[ADD0:.*]] = add i32 %[[LD]] +; CHECK: %[[ADD1:.*]] = add i32 %[[LD]] +; CHECK: %[[ADD2:.*]] = add i32 %[[LD]] +; CHECK: %[[ADD3:.*]] = add i32 %[[LD]] +; CHECK: %[[INS0:.*]] = insertelement <4 x i32> poison, i32 %[[ADD0]], i32 0 +; CHECK: %[[INS1:.+]] = insertelement <4 x i32> %[[INS0]], i32 %[[ADD1]], i32 1 +; CHECK: %[[INS2:.+]] = insertelement <4 x i32> %[[INS1]], i32 %[[ADD2]], i32 2 +; CHECK: %[[INS3:.+]] = insertelement <4 x i32> %[[INS2]], i32 %[[ADD3]], i32 3 +; CHECK-NOT: shufflevector <4 x i32> +; CHECK: %{{.*}} = tail call spir_func i32 @not_scalarizable(<4 x i32> noundef %[[INS3]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll new file mode 100644 index 0000000000000..b40ac87870871 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-gep.ll @@ -0,0 +1,72 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k gep -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s

+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+define dso_local spir_kernel void @gep(ptr addrspace(1) %data, ptr addrspace(1) %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 noundef 0)
+  %ptrdata = getelementptr inbounds <2 x ptr>, ptr addrspace(1) %data, i64 %call
+  %ptrdatavec = load <2 x ptr addrspace(1)>, ptr addrspace(1) %ptrdata
+  %ptrdatavec.gep = getelementptr inbounds i32, <2 x ptr addrspace(1)> %ptrdatavec, i64 1
+  %vec1 = call <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)> %ptrdatavec, i32 16, <2 x i1> zeroinitializer, <2 x i32> zeroinitializer)
+  %vec2 = call <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)> %ptrdatavec.gep, i32 16, <2 x i1> zeroinitializer, <2 x i32> zeroinitializer)
+  %vec.add = add <2 x i32> %vec1, %vec2
+  %ptrout = getelementptr inbounds <2 x i32>, ptr addrspace(1) %out, i64 %call
+  store <2 x i32> %vec.add, ptr addrspace(1) %ptrout
+  ret void
+}

+declare i64 @__mux_get_global_id(i32 noundef)

+declare <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)>, i32, <2 x i1>, <2 x i32>)

+; Full scalarization has not completely removed the vectors: ideally the gather
+; operations would have been replaced by non-vector loads. For now, check that
+; at least we do not crash.

+; CHECK: void @__vecz_v4_gep({{.*}})
+; CHECK: entry:
+; CHECK: %call = tail call i64 @__mux_get_global_id(i32 noundef 0)
+; CHECK: %ptrdata = getelementptr <2 x ptr>, ptr addrspace(1) %data, i64 %call
+; CHECK: %0 = getelementptr ptr addrspace(1), ptr addrspace(1) %ptrdata, i32 0
+; CHECK: %1 = getelementptr ptr addrspace(1), ptr addrspace(1) %ptrdata, i32 1
+; CHECK: %ptrdatavec1 = load ptr addrspace(1), ptr addrspace(1) %0, align 1
+; CHECK: %ptrdatavec2 = load ptr addrspace(1), ptr addrspace(1) %1, align 1
+; CHECK: %2 = insertelement <2 x ptr addrspace(1)> poison, ptr addrspace(1) %ptrdatavec1, i32 0
+; CHECK: %3 = insertelement <2 x ptr addrspace(1)> %2, ptr addrspace(1) %ptrdatavec2, i32 1
+; CHECK: %ptrdatavec.gep3 = getelementptr i32, ptr addrspace(1) %ptrdatavec1, i64 1
+; CHECK: %ptrdatavec.gep4 = getelementptr i32, ptr addrspace(1) %ptrdatavec2, i64 1
+; CHECK: %4 = insertelement <2 x ptr addrspace(1)> poison, ptr addrspace(1) %ptrdatavec.gep3, i32 0
+; CHECK: %5 = insertelement <2 x ptr addrspace(1)> %4, ptr addrspace(1) %ptrdatavec.gep4, i32 1
+; CHECK: %vec1 = call <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)> %3, i32 16, <2 x i1> zeroinitializer, <2 x i32> zeroinitializer)
+; CHECK: %6 = extractelement <2 x i32> %vec1, i32 0
+; CHECK: %7 = extractelement <2 x i32> %vec1, i32 1
+; CHECK: %vec2 = call <2 x i32> @llvm.masked.gather.v2i32.v2p1(<2 x ptr addrspace(1)> %5, i32 16, <2 x i1> zeroinitializer, <2 x i32> zeroinitializer)
+; CHECK: %8 = extractelement <2 x i32> %vec2, i32 0
+; CHECK: %9 = extractelement <2 x i32> %vec2, i32 1
+; CHECK: %vec.add5 = add i32 %6, %8
+; CHECK: %vec.add6 = add i32 %7, %9
+; CHECK: %ptrout = getelementptr <2 x i32>, ptr addrspace(1) %out, i64 %call
+; CHECK: %10 = getelementptr i32, ptr addrspace(1) %ptrout, i32 0
+; CHECK: %11 = getelementptr i32, ptr addrspace(1) %ptrout, i32 1
+; CHECK: store i32 %vec.add5, ptr addrspace(1) %10, align 4
+; CHECK: store i32 %vec.add6, ptr addrspace(1) %11, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll new file mode 100644 index 0000000000000..4492b16c1c978 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize-splat.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k splat -vecz-simd-width=4 -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define dso_local spir_kernel void @splat(float addrspace(1)* %data, float addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 noundef 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %data, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %splat.splatinsert = insertelement <4 x float> poison, float %0, i64 0 + %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer + %call1 = tail call spir_func float @not_scalarizable(<4 x float> noundef %splat.splat) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32 noundef) +declare spir_func float @not_scalarizable(<4 x float> noundef) + +; It checks that the scalarizer turns the original vector splat back into a vector splat, +; instead of a series of insertelement instructions. 
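+;
+; For reference (a sketch; value names illustrative, not matched by the CHECK
+; lines): the canonical splat idiom the pass is expected to rebuild is
+;
+;   %ins = insertelement <4 x float> poison, float %ld, i64 0
+;   %splat = shufflevector <4 x float> %ins, <4 x float> poison, <4 x i32> zeroinitializer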
+; CHECK: void @__vecz_v4_splat({{.*}})
+; CHECK: entry:
+; CHECK: %[[LD:.*]] = load float
+; CHECK: %[[INS0:.*]] = insertelement <4 x float> poison, float %[[LD]], {{i32|i64}} 0
+; CHECK-NOT: %{{.*}} = insertelement <4 x float> %{{.*}}, float %[[LD]], {{i32|i64}} 1
+; CHECK-NOT: %{{.*}} = insertelement <4 x float> %{{.*}}, float %[[LD]], {{i32|i64}} 2
+; CHECK-NOT: %{{.*}} = insertelement <4 x float> %{{.*}}, float %[[LD]], {{i32|i64}} 3
+; CHECK: %[[SPLAT:.*]] = shufflevector <4 x float> %[[INS0]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK: %{{.*}} = tail call spir_func float @not_scalarizable(<4 x float> noundef %[[SPLAT]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
new file mode 100644
index 0000000000000..d7bbd4a2d9ed8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scalarize_mixed_gep.ll
@@ -0,0 +1,46 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+; RUN: veczc -k bar -vecz-simd-width=4 -S -o - %s | FileCheck %s

+target triple = "spir64-unknown-unknown"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

+declare i64 @__mux_get_global_id(i32)

+define void @bar(i64** %ptrptrs, i64 %val) {
+  %idx = call i64 @__mux_get_global_id(i32 0)
+  %arrayidxa = getelementptr inbounds i64*, i64** %ptrptrs, i64 %idx
+  %ptrs = load i64*, i64** %arrayidxa, align 4
+  %addr = getelementptr inbounds i64, i64* %ptrs, <4 x i32> <i32 2, i32 2, i32 2, i32 2>

+  %elt0 = extractelement <4 x i64*> %addr, i32 0
+  %elt1 = extractelement <4 x i64*> %addr, i32 1
+  %elt2 = extractelement <4 x i64*> %addr, i32 2
+  %elt3 = extractelement <4 x i64*> %addr, i32 3

+  store i64 %val, i64* %elt0
+  store i64 %val, i64* %elt1
+  store i64 %val, i64* %elt2
+  store i64 %val, i64* %elt3
+  ret void
+}

+; Checks that the GEP with mixed scalar/vector operands in the kernel
+; gets scalarized/re-packetized correctly.

+; CHECK: define void @__vecz_v4_bar
+; CHECK: %[[ADDR:.+]] = getelementptr {{i64|i8}}, <4 x ptr> %{{.+}}, {{i64 2|i64 16}}
+; CHECK: call void @__vecz_b_scatter_store8_Dv4_mDv4_u3ptr(<4 x i64> %.splat{{.*}}, <4 x ptr> %[[ADDR]])
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
new file mode 100644
index 0000000000000..530b01b7a0d88
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/scan_fact.ll
@@ -0,0 +1,197 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k scan_fact -vecz-passes=cfg-convert -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@scan_fact.temp = internal addrspace(3) global [16 x i32] poison, align 4 + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #0 + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_local_id(i32) #0 + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_local_size(i32) #0 + +; Function Attrs: convergent nounwind +define spir_kernel void @scan_fact(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +entry: + %call = call i64 @__mux_get_local_id(i32 0) #3 + %call1 = call i64 @__mux_get_global_id(i32 0) #3 + %call2 = call i64 @__mux_get_local_size(i32 0) #3 + %mul = shl i64 %call1, 1 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %mul + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %mul3 = shl i64 %call, 1 + %arrayidx4 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %mul3 + store i32 %0, i32 addrspace(3)* %arrayidx4, align 4 + %mul5 = shl i64 %call1, 1 + %add = or i64 %mul5, 1 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %add + %1 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %mul7 = shl i64 %call, 1 + %add8 = or i64 %mul7, 1 + %arrayidx9 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %add8 + store i32 %1, i32 addrspace(3)* %arrayidx9, align 4 + %mul10 = shl i64 %call, 1 + %add11 = or i64 %mul10, 1 + %arrayidx12 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %add11 + %2 = load i32, i32 addrspace(3)* %arrayidx12, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %storemerge = phi i64 [ 1, %entry ], [ %mul29, %for.inc ] + %mul13 = shl i64 %call2, 1 + %cmp = icmp ult i64 %storemerge, %mul13 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + call void @__mux_work_group_barrier(i32 1, i32 1, i32 272) #4 + %mul14 = shl i64 %call, 1 + %mul15 = mul i64 %storemerge, %mul14 + %mul16 = shl i64 %call2, 1 + %cmp17 = icmp ult i64 %mul15, %mul16 + br i1 %cmp17, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %mul18 = mul i64 %storemerge, 2 + %add19 = add i64 %mul15, -1 + %sub = add i64 %add19, %mul18 + %arrayidx20 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub + %3 = load i32, i32 addrspace(3)* %arrayidx20, align 4 + %add21 = add i64 %mul15, -1 + %sub22 = add i64 %add21, %storemerge + %arrayidx23 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub22 + %4 = load i32, i32 addrspace(3)* %arrayidx23, align 4 + %mul24 = mul nsw i32 %4, %3 + %mul25 = mul i64 %storemerge, 2 + %add26 = add i64 %mul15, -1 + %sub27 = add i64 %add26, %mul25 + 
%arrayidx28 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub27 + store i32 %mul24, i32 addrspace(3)* %arrayidx28, align 4 + br label %for.inc + +for.inc: ; preds = %if.then, %for.body + %mul29 = shl i64 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %cmp30 = icmp eq i64 %call, 0 + br i1 %cmp30, label %if.then31, label %if.end35 + +if.then31: ; preds = %for.end + %mul32 = mul i64 %call2, 2 + %sub33 = add i64 %mul32, -1 + %arrayidx34 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub33 + store i32 1, i32 addrspace(3)* %arrayidx34, align 4 + br label %if.end35 + +if.end35: ; preds = %if.then31, %for.end + br label %for.cond37 + +for.cond37: ; preds = %for.inc62, %if.end35 + %storemerge1 = phi i64 [ %call2, %if.end35 ], [ %shr, %for.inc62 ] + %cmp38 = icmp eq i64 %storemerge1, 0 + call void @__mux_work_group_barrier(i32 1, i32 1, i32 272) #4 + %mul64 = shl i64 %call, 1 + br i1 %cmp38, label %for.end63, label %for.body39 + +for.body39: ; preds = %for.cond37 + %mul42 = mul i64 %storemerge1, %mul64 + %mul43 = shl i64 %call2, 1 + %cmp44 = icmp ult i64 %mul42, %mul43 + br i1 %cmp44, label %if.then45, label %for.inc62 + +if.then45: ; preds = %for.body39 + %add46 = add i64 %mul42, -1 + %sub47 = add i64 %add46, %storemerge1 + %arrayidx48 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub47 + %5 = load i32, i32 addrspace(3)* %arrayidx48, align 4 + %mul49 = mul i64 %storemerge1, 2 + %add50 = add i64 %mul42, -1 + %sub51 = add i64 %add50, %mul49 + %arrayidx52 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub51 + %6 = load i32, i32 addrspace(3)* %arrayidx52, align 4 + %add53 = add i64 %mul42, -1 + %sub54 = add i64 %add53, %storemerge1 + %arrayidx55 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub54 + store i32 %6, i32 addrspace(3)* %arrayidx55, align 4 + %mul56 = mul nsw i32 %6, %5 + %mul57 = mul i64 %storemerge1, 2 + %add58 = add i64 %mul42, -1 + %sub59 = add i64 %add58, %mul57 + %arrayidx60 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %sub59 + store i32 %mul56, i32 addrspace(3)* %arrayidx60, align 4 + br label %for.inc62 + +for.inc62: ; preds = %if.then45, %for.body39 + %shr = lshr i64 %storemerge1, 1 + br label %for.cond37 + +for.end63: ; preds = %for.cond37 + %add65 = or i64 %mul64, 1 + %arrayidx66 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %add65 + %7 = load i32, i32 addrspace(3)* %arrayidx66, align 4 + %mul67 = shl i64 %call1, 1 + %arrayidx68 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %mul67 + store i32 %7, i32 addrspace(1)* %arrayidx68, align 4 + %sub69 = add i64 %call2, -1 + %cmp70 = icmp eq i64 %call, %sub69 + br i1 %cmp70, label %if.then71, label %if.else + +if.then71: ; preds = %for.end63 + %mul72 = shl i64 %call, 1 + %add73 = or i64 %mul72, 1 + %arrayidx74 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %add73 + %8 = load i32, i32 addrspace(3)* %arrayidx74, align 4 + %mul75 = mul nsw i32 %8, %2 + %mul76 = shl i64 %call1, 1 + %add77 = or i64 %mul76, 1 + %arrayidx78 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %add77 + store i32 %mul75, i32 addrspace(1)* %arrayidx78, align 4 + br label %if.end85 + +if.else: ; preds = %for.end63 + %mul79 = mul i64 %call, 2 + %add80 = add i64 %mul79, 2 + 
%arrayidx81 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* @scan_fact.temp, i64 0, i64 %add80
+  %9 = load i32, i32 addrspace(3)* %arrayidx81, align 4
+  %mul82 = shl i64 %call1, 1
+  %add83 = or i64 %mul82, 1
+  %arrayidx84 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %add83
+  store i32 %9, i32 addrspace(1)* %arrayidx84, align 4
+  br label %if.end85

+if.end85:                                         ; preds = %if.else, %if.then71
+  ret void
+}

+declare void @__mux_work_group_barrier(i32, i32, i32)

+; The purpose of this test is simply to make sure we manage to vectorize it.
+; Previously we would not, because a phi node of a uniform loop had an
+; incoming value from a divergent block, even though all the incoming values
+; of the phi node were the same. We would thus consider the phi node varying,
+; which made the loop divergent, with a barrier in it.

+; CHECK: spir_kernel void @__vecz_v4_scan_fact
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
new file mode 100644
index 0000000000000..e82e58b6ac662
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/secretly_scalar_load_store.ll
@@ -0,0 +1,49 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_global_id(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %gid = call i32 @__mux_get_global_id(i32 0) + %and = and i32 %gid, 1 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if, label %early_ret + +early_ret: +; just to prevent ROSCC from sticking its oar in + %gid1 = call i32 @__mux_get_global_id(i32 1) + ret void + +if: + %single_load = load i32, i32 addrspace(1)* %in + %single_add = add i32 %single_load, 42 + store i32 %single_add, i32 addrspace(1)* %in + + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %[[BITCAST:.*]] = bitcast <4 x i1> %cmp{{[0-9]*}} to i4 +; CHECK: %[[MASK:.*]] = icmp ne i4 %[[BITCAST]], 0 +; CHECK: %[[single_load:single_load[0-9]*]] = call i32 @__vecz_b_masked_load4_ju3ptrU3AS1b(ptr addrspace(1) %in, i1 %[[MASK]]) +; CHECK: %[[single_add:single_add[0-9]*]] = add i32 %[[single_load]], 42 +; CHECK: call void @__vecz_b_masked_store4_ju3ptrU3AS1b(i32 %[[single_add]], ptr addrspace(1) %in, i1 %[[MASK]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll new file mode 100644 index 0000000000000..0b4377802877b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/select-no-crash.ll @@ -0,0 +1,93 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-fail-quietly -k test -vecz-passes="cfg-convert" -S < %s + +; This tests only that the kernel does not crash the vectorizer. 
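+; Note that the RUN line does not pipe the output into FileCheck, and
+; -vecz-fail-quietly presumably allows vectorization to fail without a hard
+; error, so the test passes as long as veczc exits cleanly.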
+ +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @test(i32 addrspace(1)* %out, i32 %n) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + br label %while.body + +while.body: ; preds = %e, %entry + %n.off = add i32 %n, -1 + %0 = icmp ult i32 %n.off, 4 + %cmp6 = icmp slt i32 %n, 3 + %or.cond1 = or i1 %cmp6, %0 + br i1 %or.cond1, label %f, label %if.else + +while.body5: ; preds = %d + switch i32 %n, label %g [ + i32 3, label %if.else + i32 2, label %h + ] + +if.else: ; preds = %while.body5, %while.body + %cmp9 = icmp sge i32 %conv, %n + %and = and i32 %n, 1 + %tobool = icmp eq i32 %and, 0 + %or.cond2 = or i1 %tobool, %cmp9 + br i1 %or.cond2, label %d, label %h + +d: ; preds = %if.else + %cmp16 = icmp sgt i32 %n, 3 + br i1 %cmp16, label %e, label %while.body5 + +e: ; preds = %d + %and20 = and i32 %n, 1 + %tobool21 = icmp eq i32 %and20, 0 + br i1 %tobool21, label %while.body, label %g + +f: ; preds = %while.body + %cmp24 = icmp eq i32 %n, 2 + br i1 %cmp24, label %h, label %g + +g: ; preds = %f, %e, %while.body5 + br label %for.cond + +for.cond: ; preds = %for.body, %g + %ret.0 = phi i32 [ 0, %g ], [ %inc, %for.body ] + %storemerge = phi i32 [ 0, %g ], [ %inc31, %for.body ] + %cmp29 = icmp sgt i32 %storemerge, %n + br i1 %cmp29, label %h, label %for.body + +for.body: ; preds = %for.cond + %inc = add nuw nsw i32 %ret.0, 1 + %inc31 = add nuw nsw i32 %storemerge, 1 + br label %for.cond + +h: ; preds = %for.cond, %f, %if.else, %while.body5 + %ret.1 = phi i32 [ 0, %f ], [ %ret.0, %for.cond ], [ 0, %if.else ], [ 0, %while.body5 ] + %idxprom = sext i32 %conv to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %ret.1, i32 addrspace(1)* %arrayidx, align 4 + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll new file mode 100644 index 0000000000000..2728251ca02b3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_1.ll @@ -0,0 +1,54 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not 
use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %idx.ext = sext i32 %mul3 to i64 + %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idx.ext + %0 = load i8, i8 addrspace(1)* %add.ptr, align 1 + %arrayidx4 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr, i64 1 + %1 = load i8, i8 addrspace(1)* %arrayidx4, align 1 + %add7 = add i8 %1, %0 + %idxprom = sext i32 %add to i64 + %arrayidx11 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom + store i8 %add7, i8 addrspace(1)* %arrayidx11, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @load16 +; CHECK: load <4 x i8> +; CHECK: load <4 x i8> +; CHECK-NOT: load <4 x i8> +; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load +; CHECK: shufflevector <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK-NOT: shufflevector <4 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll new file mode 100644 index 0000000000000..b1082899dce4d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_2.ll @@ -0,0 +1,55 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %conv4 = sext i32 %mul3 to i64 + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %conv4 + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %add5 = or i64 %conv4, 1 + %arrayidx6 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %add5 + %1 = load i8, i8 addrspace(1)* %arrayidx6, align 1 + %add9 = add i8 %1, %0 + %idxprom = sext i32 %add to i64 + %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom + store i8 %add9, i8 addrspace(1)* %arrayidx13, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @load16 +; CHECK: load <4 x i8> +; CHECK: load <4 x i8> +; CHECK-NOT: load <4 x i8> +; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load +; CHECK: shufflevector <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK-NOT: shufflevector <4 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll new file mode 100644 index 0000000000000..373c37fb20114 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_3.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %idxprom = sext i32 %mul3 to i64 + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %add7 = or i32 %mul3, 1 + %idxprom8 = sext i32 %add7 to i64 + %arrayidx9 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom8 + %1 = load i8, i8 addrspace(1)* %arrayidx9, align 1 + %add12 = add i8 %1, %0 + %idxprom16 = sext i32 %add to i64 + %arrayidx17 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom16 + store i8 %add12, i8 addrspace(1)* %arrayidx17, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @load16 +; CHECK: load <4 x i8> +; CHECK: load <4 x i8> +; CHECK-NOT: load <4 x i8> +; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load +; CHECK: shufflevector <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK-NOT: shufflevector <4 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll new file mode 100644 index 0000000000000..240d52a220cda --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_4.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %add4 = or i32 %mul3, 1 + %idxprom = sext i32 %add4 to i64 + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %idxprom8 = sext i32 %mul3 to i64 + %arrayidx9 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom8 + %1 = load i8, i8 addrspace(1)* %arrayidx9, align 1 + %sub = sub i8 %0, %1 + %idxprom15 = sext i32 %add to i64 + %arrayidx16 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom15 + store i8 %sub, i8 addrspace(1)* %arrayidx16, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @load16 +; CHECK: load <4 x i8> +; CHECK: load <4 x i8> +; CHECK-NOT: load <4 x i8> +; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load +; CHECK: shufflevector <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK-NOT: shufflevector <4 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll new file mode 100644 index 0000000000000..23533d2130155 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_5.ll @@ -0,0 +1,70 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %conv = trunc i64 %call to i32 + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %conv2 = trunc i64 %call1 to i32 + %mul = mul nsw i32 %conv2, %stride + %add = add nsw i32 %mul, %conv + %mul3 = shl nsw i32 %add, 1 + %idxprom = sext i32 %mul3 to i64 + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom + %0 = load i8, i8 addrspace(1)* %arrayidx, align 1 + %add7 = or i32 %mul3, 1 + %idxprom8 = sext i32 %add7 to i64 + %arrayidx9 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom8 + %1 = load i8, i8 addrspace(1)* %arrayidx9, align 1 + %add13 = add nsw i32 %mul3, 2 + %idxprom14 = sext i32 %add13 to i64 + %arrayidx15 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom14 + %2 = load i8, i8 addrspace(1)* %arrayidx15, align 1 + %add19 = add nsw i32 %mul3, 3 + %idxprom20 = sext i32 %add19 to i64 + %arrayidx21 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom20 + %3 = load i8, i8 addrspace(1)* %arrayidx21, align 1 + %add24 = add i8 %1, %0 + %add26 = add i8 %add24, %2 + %add28 = add i8 %add26, %3 + %idxprom32 = sext i32 %add to i64 + %arrayidx33 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom32 + store i8 %add28, i8 addrspace(1)* %arrayidx33, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; CHECK: spir_kernel void @load16 +; CHECK: load <4 x i8> +; CHECK: load <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK: load <4 x i8> +; CHECK: load <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK: shufflevector <4 x i8> +; CHECK-NOT: load <4 x i8> +; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load +; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load +; CHECK-NOT: shufflevector <4 x i8> +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll new file mode 100644 index 0000000000000..12b0cca975cd0 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/shuffled_load_6.ll @@ -0,0 +1,58 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -S < %s | FileCheck %s

+target triple = "spir64-unknown-unknown"
+target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128"

+define spir_kernel void @load16(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i32 %stride) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %conv = trunc i64 %call to i32
+  %call1 = tail call i64 @__mux_get_global_id(i32 1)
+  %conv2 = trunc i64 %call1 to i32
+  %mul = mul nsw i32 %conv2, %stride
+  %add = add nsw i32 %mul, %conv
+  %mul3 = shl nsw i32 %add, 1
+  %add4 = add nsw i32 %mul3, 3
+  %idxprom = sext i32 %add4 to i64
+  %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom
+  %0 = load i8, i8 addrspace(1)* %arrayidx, align 1
+  %shl = shl i8 %0, 1
+  %add10 = add nsw i32 %mul3, 2
+  %idxprom11 = sext i32 %add10 to i64
+  %arrayidx12 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 %idxprom11
+  %1 = load i8, i8 addrspace(1)* %arrayidx12, align 1
+  %sub = sub i8 %shl, %1
+  %idxprom18 = sext i32 %add to i64
+  %arrayidx19 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 %idxprom18
+  store i8 %sub, i8 addrspace(1)* %arrayidx19, align 1
+  ret void
+}

+declare i64 @__mux_get_global_id(i32)

+; CHECK: spir_kernel void @load16
+; CHECK: load <4 x i8>
+; CHECK: load <4 x i8>
+; CHECK-NOT: load <4 x i8>
+; CHECK-NOT: call <4 x i8> @__vecz_b_interleaved_load
+; CHECK-NOT: call <4 x i8> @__vecz_b_gather_load
+; CHECK: shufflevector <4 x i8>
+; CHECK: shufflevector <4 x i8>
+; CHECK-NOT: shufflevector <4 x i8>
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/simplify-masked-memops.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/simplify-masked-memops.ll
new file mode 100644
index 0000000000000..b28b347ade826
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/simplify-masked-memops.ll
@@ -0,0 +1,42 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k foo -vecz-passes=mask-memops -S < %s | FileCheck %s

+define void @foo(i16 %x, i32 %y, ptr addrspace(1) %p) {
+entry:
+  call void @__vecz_b_masked_store2_tu3ptrU3AS1b(i16 %x, ptr addrspace(1) %p, i1 true)
+  call void @__vecz_b_masked_store2_ju3ptrU3AS1b(i32 %y, ptr addrspace(1) %p, i1 true)
+  %f = call float @__vecz_b_masked_load2_fu3ptrU3AS1b(ptr addrspace(1) %p, i1 true)
+  %v4f = call <4 x float> @__vecz_b_masked_load2_Dv4_fu3ptrU3AS1Dv4_b(ptr addrspace(1) %p, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}

+; Check we correctly set the alignment on the optimized loads and stores. The
+; alignment must come from the builtin, not from the natural/preferred
+; alignment for that type.
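+;
+; For example, %v4f above comes from __vecz_b_masked_load2_Dv4_fu3ptrU3AS1Dv4_b:
+; judging by the builtin names, the digit after "load"/"store" encodes the
+; access alignment in bytes (2 here), which is why the CHECK lines below expect
+; every optimized access to carry "align 2", even though a <4 x float> load
+; would naturally prefer a 16-byte alignment.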
+; CHECK: define void @__vecz_v4_foo(i16 %x, i32 %y, ptr addrspace(1) %p) +; CHECK: entry: +; CHECK: store i16 %x, ptr addrspace(1) %p, align 2 +; CHECK-NEXT: store i32 %y, ptr addrspace(1) %p, align 2 +; CHECK-NEXT: %f = load float, ptr addrspace(1) %p, align 2 +; CHECK-NEXT: %v4f = load <4 x float>, ptr addrspace(1) %p, align 2 +; CHECK-NEXT: ret void + +declare void @__vecz_b_masked_store2_tu3ptrU3AS1b(i16, ptr addrspace(1), i1) +declare void @__vecz_b_masked_store2_ju3ptrU3AS1b(i32, ptr addrspace(1), i1) +declare float @__vecz_b_masked_load2_fu3ptrU3AS1b(ptr addrspace(1), i1) +declare <4 x float> @__vecz_b_masked_load2_Dv4_fu3ptrU3AS1Dv4_b(ptr addrspace(1), <4 x i1>) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll new file mode 100644 index 0000000000000..dc7c2fed68520 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext.ll @@ -0,0 +1,69 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @squash(<4 x i8> addrspace(1)* %data, i32 addrspace(1)* %output) #0 { +entry: + %gid = call i64 @__mux_get_global_id(i64 0) #2 + %data.ptr = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %data, i64 %gid + %data.ld = load <4 x i8>, <4 x i8> addrspace(1)* %data.ptr, align 8 + %ele0 = extractelement <4 x i8> %data.ld, i32 0 + %ele1 = extractelement <4 x i8> %data.ld, i32 1 + %ele2 = extractelement <4 x i8> %data.ld, i32 2 + %ele3 = extractelement <4 x i8> %data.ld, i32 3 + %zext0 = sext i8 %ele0 to i32 + %zext1 = sext i8 %ele1 to i32 + %zext2 = sext i8 %ele2 to i32 + %zext3 = sext i8 %ele3 to i32 + %sum1 = add i32 %zext0, %zext1 + %sum2 = xor i32 %sum1, %zext2 + %sum3 = and i32 %sum2, %zext3 + %output.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %gid + store i32 %sum3, i32 addrspace(1)* %output.ptr, align 8 + ret void +} + +declare i64 @__mux_get_global_id(i64) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" 
"stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind } + +; It checks that the <4 x i8> is converted into a i32 and uses shifts +; to implement the extract elements and sexts. +; +; CHECK: void @__vecz_v4_squash +; CHECK: %[[DATA:.+]] = load <16 x i8> +; CHECK-NOT: shufflevector +; CHECK: %[[FREEZE:.+]] = freeze <16 x i8> %[[DATA]] +; CHECK: %[[SQUASH:.+]] = bitcast <16 x i8> %[[FREEZE]] to <4 x i32> +; CHECK: %[[EXTR0:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 24(, )?)+>|splat \(i32 24\)}} +; CHECK: %[[SEXT0:.+]] = ashr <4 x i32> %[[EXTR0]], {{<(i32 24(, )?)+>|splat \(i32 24\)}} +; CHECK: %[[EXTR1:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 16(, )?)+>|splat \(i32 16\)}} +; CHECK: %[[SEXT1:.+]] = ashr <4 x i32> %[[EXTR1]], {{<(i32 24(, )?)+>|splat \(i32 24\)}} +; CHECK: %[[EXTR2:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 8(, )?)+>|splat \(i32 8\)}} +; CHECK: %[[SEXT2:.+]] = ashr <4 x i32> %[[EXTR2]], {{<(i32 24(, )?)+>|splat \(i32 24\)}} +; CHECK: %[[SEXT3:.+]] = ashr <4 x i32> %[[SQUASH]], {{<(i32 24(, )?)+>|splat \(i32 24\)}} +; CHECK: %[[SUM1:.+]] = add <4 x i32> %[[SEXT0]], %[[SEXT1]] +; CHECK: %[[SUM2:.+]] = xor <4 x i32> %[[SUM1]], %[[SEXT2]] +; CHECK: %[[SUM3:.+]] = and <4 x i32> %[[SUM2]], %[[SEXT3]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll new file mode 100644 index 0000000000000..c329b342b5835 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_sext_bigendian.ll @@ -0,0 +1,69 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | FileCheck %s

+; ModuleID = 'kernel.opencl'
+target datalayout = "E-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+; Function Attrs: nounwind
+define spir_kernel void @squash(<4 x i8> addrspace(1)* %data, i32 addrspace(1)* %output) #0 {
+entry:
+  %gid = call i64 @__mux_get_global_id(i64 0) #2
+  %data.ptr = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %data, i64 %gid
+  %data.ld = load <4 x i8>, <4 x i8> addrspace(1)* %data.ptr, align 8
+  %ele0 = extractelement <4 x i8> %data.ld, i32 3
+  %ele1 = extractelement <4 x i8> %data.ld, i32 2
+  %ele2 = extractelement <4 x i8> %data.ld, i32 1
+  %ele3 = extractelement <4 x i8> %data.ld, i32 0
+  %zext0 = sext i8 %ele0 to i32
+  %zext1 = sext i8 %ele1 to i32
+  %zext2 = sext i8 %ele2 to i32
+  %zext3 = sext i8 %ele3 to i32
+  %sum1 = add i32 %zext0, %zext1
+  %sum2 = xor i32 %sum1, %zext2
+  %sum3 = and i32 %sum2, %zext3
+  %output.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %gid
+  store i32 %sum3, i32 addrspace(1)* %output.ptr, align 8
+  ret void
+}

+declare i64 @__mux_get_global_id(i64) #1

+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }

+; This test checks that the <4 x i8> is converted into an i32, using shifts to
+; implement the extractelement and sext operations.
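+;
+; An approximate OpenCL C source for the kernel above (a hypothetical
+; reconstruction for readability; only the IR is authoritative):
+;
+;   __kernel void squash(__global char4 *data, __global int *output) {
+;     size_t gid = get_global_id(0);
+;     char4 d = data[gid];
+;     output[gid] = (((int)d.s3 + (int)d.s2) ^ (int)d.s1) & (int)d.s0;
+;   }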
+;
+; CHECK: void @__vecz_v4_squash
+; CHECK: %[[DATA:.+]] = load <16 x i8>
+; CHECK-NOT: shufflevector
+; CHECK: %[[FREEZE:.+]] = freeze <16 x i8> %[[DATA]]
+; CHECK: %[[SQUASH:.+]] = bitcast <16 x i8> %[[FREEZE]] to <4 x i32>
+; CHECK: %[[EXTR0:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK: %[[SEXT0:.+]] = ashr <4 x i32> %[[EXTR0]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK: %[[EXTR1:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 16(, )?)+>|splat \(i32 16\)}}
+; CHECK: %[[SEXT1:.+]] = ashr <4 x i32> %[[EXTR1]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK: %[[EXTR2:.+]] = shl <4 x i32> %[[SQUASH]], {{<(i32 8(, )?)+>|splat \(i32 8\)}}
+; CHECK: %[[SEXT2:.+]] = ashr <4 x i32> %[[EXTR2]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK: %[[SEXT3:.+]] = ashr <4 x i32> %[[SQUASH]], {{<(i32 24(, )?)+>|splat \(i32 24\)}}
+; CHECK: %[[SUM1:.+]] = add <4 x i32> %[[SEXT0]], %[[SEXT1]]
+; CHECK: %[[SUM2:.+]] = xor <4 x i32> %[[SUM1]], %[[SEXT2]]
+; CHECK: %[[SUM3:.+]] = and <4 x i32> %[[SUM2]], %[[SEXT3]]
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
new file mode 100644
index 0000000000000..94e72dc92e09f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext.ll
@@ -0,0 +1,122 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -vecz-passes=squash-small-vecs -S < %s | FileCheck %s

+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+; This test checks that the <4 x i8> is converted into an i32, using shifts and
+; masks to implement the extractelement and zext operations.
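+;
+; An approximate OpenCL C source for the first kernel below (again a
+; hypothetical reconstruction; only the IR is authoritative):
+;
+;   __kernel void squashv4i8(__global uchar4 *data, __global uint *output) {
+;     size_t gid = get_global_id(0);
+;     uchar4 d = data[gid];
+;     output[gid] = (((uint)d.s0 + (uint)d.s1) ^ (uint)d.s2) & (uint)d.s3;
+;   }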
+; CHECK: void @__vecz_v4_squashv4i8( +; CHECK: %[[DATA:.+]] = load <4 x i8> +; CHECK: %[[FREEZE:.+]] = freeze <4 x i8> %[[DATA]] +; CHECK: %[[SQUASH:.+]] = bitcast <4 x i8> %[[FREEZE]] to i32 +; CHECK: %[[ZEXT0:.+]] = and i32 %[[SQUASH]], 255 +; CHECK: %[[EXTR1:.+]] = lshr i32 %[[SQUASH]], 8 +; CHECK: %[[ZEXT1:.+]] = and i32 %[[EXTR1]], 255 +; CHECK: %[[EXTR2:.+]] = lshr i32 %[[SQUASH]], 16 +; CHECK: %[[ZEXT2:.+]] = and i32 %[[EXTR2]], 255 +; CHECK: %[[EXTR3:.+]] = lshr i32 %[[SQUASH]], 24 +; CHECK: %[[ZEXT3:.+]] = and i32 %[[EXTR3]], 255 +; CHECK: %[[SUM1:.+]] = add i32 %[[ZEXT0]], %[[ZEXT1]] +; CHECK: %[[SUM2:.+]] = xor i32 %[[SUM1]], %[[ZEXT2]] +; CHECK: %[[SUM3:.+]] = and i32 %[[SUM2]], %[[ZEXT3]] +; CHECK: ret void +define spir_kernel void @squashv4i8(ptr addrspace(1) %data, ptr addrspace(1) %output) #0 { +entry: + %gid = call i64 @__mux_get_global_id(i64 0) #1 + %data.ptr = getelementptr inbounds <4 x i8>, ptr addrspace(1) %data, i64 %gid + %data.ld = load <4 x i8>, ptr addrspace(1) %data.ptr, align 4 + %ele0 = extractelement <4 x i8> %data.ld, i32 0 + %ele1 = extractelement <4 x i8> %data.ld, i32 1 + %ele2 = extractelement <4 x i8> %data.ld, i32 2 + %ele3 = extractelement <4 x i8> %data.ld, i32 3 + %zext0 = zext i8 %ele0 to i32 + %zext1 = zext i8 %ele1 to i32 + %zext2 = zext i8 %ele2 to i32 + %zext3 = zext i8 %ele3 to i32 + %sum1 = add i32 %zext0, %zext1 + %sum2 = xor i32 %sum1, %zext2 + %sum3 = and i32 %sum2, %zext3 + %output.ptr = getelementptr inbounds i32, ptr addrspace(1) %output, i64 %gid + store i32 %sum3, ptr addrspace(1) %output.ptr, align 4 + ret void +} + +; CHECK: void @__vecz_v4_squashv2i32( +; CHECK: %[[DATA:.+]] = load <2 x i32> +; CHECK: %[[FREEZE:.+]] = freeze <2 x i32> %[[DATA]] +; CHECK: %[[SQUASH:.+]] = bitcast <2 x i32> %[[FREEZE]] to i64 +; CHECK: %[[ZEXT0:.+]] = and i64 %[[SQUASH]], 4294967295 +; CHECK: %[[EXTR1:.+]] = lshr i64 %[[SQUASH]], 32 +; CHECK: %[[ZEXT1:.+]] = and i64 %[[EXTR1]], 4294967295 +; CHECK: %[[SUM1:.+]] = add i64 %[[ZEXT0]], %[[ZEXT1]] +define spir_kernel void @squashv2i32(ptr addrspace(1) %data, ptr addrspace(1) %output) #0 { +entry: + %gid = call i64 @__mux_get_global_id(i64 0) #1 + %data.ptr = getelementptr inbounds <2 x i32>, ptr addrspace(1) %data, i64 %gid + %data.ld = load <2 x i32>, ptr addrspace(1) %data.ptr, align 4 + %ele0 = extractelement <2 x i32> %data.ld, i32 0 + %ele1 = extractelement <2 x i32> %data.ld, i32 1 + %zext0 = zext i32 %ele0 to i64 + %zext1 = zext i32 %ele1 to i64 + %sum = add i64 %zext0, %zext1 + %output.ptr = getelementptr inbounds i64, ptr addrspace(1) %output, i64 %gid + store i64 %sum, ptr addrspace(1) %output.ptr, align 4 + ret void +} + +; Check we don't squash vectors we consider too large. +; CHECK: void @__vecz_v4_squashv8i32( +; CHECK-NOT: bitcast +define spir_kernel void @squashv8i32(ptr addrspace(1) %data, ptr addrspace(1) %output) #0 { +entry: + %gid = call i64 @__mux_get_global_id(i64 0) #1 + %data.ptr = getelementptr inbounds <8 x i32>, ptr addrspace(1) %data, i64 %gid + %data.ld = load <8 x i32>, ptr addrspace(1) %data.ptr, align 32 + %ele0 = extractelement <8 x i32> %data.ld, i32 0 + %ele1 = extractelement <8 x i32> %data.ld, i32 1 + %zext0 = zext i32 %ele0 to i256 + %zext1 = zext i32 %ele1 to i256 + %sum = add i256 %zext0, %zext1 + %output.ptr = getelementptr inbounds i256, ptr addrspace(1) %output, i64 %gid + store i256 %sum, ptr addrspace(1) %output.ptr, align 32 + ret void +} + +; Check we don't squash vectors we consider too large. 
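+; (Like the <8 x i32> case above, <4 x i64> would squash to a 256-bit
+; integer; the kernels that do get squashed fit in an i32 or i64, so the
+; cut-off is presumably the width of a legal scalar integer.)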
+; CHECK: void @__vecz_v4_squashv4i64( +; CHECK-NOT: bitcast +define spir_kernel void @squashv4i64(ptr addrspace(1) %data, ptr addrspace(1) %output) #0 { +entry: + %gid = call i64 @__mux_get_global_id(i64 0) #1 + %data.ptr = getelementptr inbounds <4 x i64>, ptr addrspace(1) %data, i64 %gid + %data.ld = load <4 x i64>, ptr addrspace(1) %data.ptr, align 32 + %ele0 = extractelement <4 x i64> %data.ld, i32 0 + %ele1 = extractelement <4 x i64> %data.ld, i32 1 + %zext0 = zext i64 %ele0 to i256 + %zext1 = zext i64 %ele1 to i256 + %sum = add i256 %zext0, %zext1 + %output.ptr = getelementptr inbounds i256, ptr addrspace(1) %output, i64 %gid + store i256 %sum, ptr addrspace(1) %output.ptr, align 32 + ret void +} + +declare i64 @__mux_get_global_id(i64) + +attributes #0 = { nounwind } +attributes #1 = { nobuiltin nounwind } diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll new file mode 100644 index 0000000000000..e336a961b2576 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_extract_zext_bigendian.ll @@ -0,0 +1,69 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k squash -vecz-choices=TargetIndependentPacketization -vecz-passes="squash-small-vecs,function(dce),packetizer" -S < %s | FileCheck %s

+; ModuleID = 'kernel.opencl'
+target datalayout = "E-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+; Function Attrs: nounwind
+define spir_kernel void @squash(<4 x i8> addrspace(1)* %data, i32 addrspace(1)* %output) #0 {
+entry:
+  %gid = call i64 @__mux_get_global_id(i64 0) #2
+  %data.ptr = getelementptr inbounds <4 x i8>, <4 x i8> addrspace(1)* %data, i64 %gid
+  %data.ld = load <4 x i8>, <4 x i8> addrspace(1)* %data.ptr, align 8
+  %ele0 = extractelement <4 x i8> %data.ld, i32 3
+  %ele1 = extractelement <4 x i8> %data.ld, i32 2
+  %ele2 = extractelement <4 x i8> %data.ld, i32 1
+  %ele3 = extractelement <4 x i8> %data.ld, i32 0
+  %zext0 = zext i8 %ele0 to i32
+  %zext1 = zext i8 %ele1 to i32
+  %zext2 = zext i8 %ele2 to i32
+  %zext3 = zext i8 %ele3 to i32
+  %sum1 = add i32 %zext0, %zext1
+  %sum2 = xor i32 %sum1, %zext2
+  %sum3 = and i32 %sum2, %zext3
+  %output.ptr = getelementptr inbounds i32, i32 addrspace(1)* %output, i64 %gid
+  store i32 %sum3, i32 addrspace(1)* %output.ptr, align 8
+  ret void
+}

+declare i64 @__mux_get_global_id(i64) #1

+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }

+; This test checks that the <4 x i8> is converted into an i32, using shifts and
+; masks to implement the extractelement and zext operations.
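+;
+; Note that the kernel extracts elements 3, 2, 1, 0 and the datalayout is
+; big-endian ("E-..."), under which element 3 is the least-significant byte of
+; the bitcast value, so the expected shifts and masks below come out identical
+; to those in the little-endian squash_extract_zext.ll test.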
+; +; CHECK: void @__vecz_v4_squash +; CHECK: %[[DATA:.+]] = load <16 x i8> +; CHECK-NOT: shufflevector +; CHECK: %[[FREEZE:.+]] = freeze <16 x i8> %[[DATA]] +; CHECK: %[[SQUASH:.+]] = bitcast <16 x i8> %[[FREEZE]] to <4 x i32> +; CHECK: %[[ZEXT0:.+]] = and <4 x i32> %[[SQUASH]], {{<(i32 255(, )?)+>|splat \(i32 255\)}} +; CHECK: %[[EXTR1:.+]] = lshr <4 x i32> %[[SQUASH]], {{<(i32 8(, )?)+>|splat \(i32 8\)}} +; CHECK: %[[ZEXT1:.+]] = and <4 x i32> %[[EXTR1]], {{<(i32 255(, )?)+>|splat \(i32 255\)}} +; CHECK: %[[EXTR2:.+]] = lshr <4 x i32> %[[SQUASH]], {{<(i32 16(, )?)+>|splat \(i32 16\)}} +; CHECK: %[[ZEXT2:.+]] = and <4 x i32> %[[EXTR2]], {{<(i32 255(, )?)+>|splat \(i32 255\)}} +; CHECK: %[[EXTR3:.+]] = lshr <4 x i32> %[[SQUASH]], {{<(i32 24(, )?)+>|splat \(i32 24\)}} +; CHECK: %[[ZEXT3:.+]] = and <4 x i32> %[[EXTR3]], {{<(i32 255(, )?)+>|splat \(i32 255\)}} +; CHECK: %[[SUM1:.+]] = add <4 x i32> %[[ZEXT0]], %[[ZEXT1]] +; CHECK: %[[SUM2:.+]] = xor <4 x i32> %[[SUM1]], %[[ZEXT2]] +; CHECK: %[[SUM3:.+]] = and <4 x i32> %[[SUM2]], %[[ZEXT3]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll new file mode 100644 index 0000000000000..5615f7107d892 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/squash_float2_gather.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k squash -vecz-passes="squash-small-vecs,packetizer" -S < %s | FileCheck %s

+; ModuleID = 'kernel.opencl'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+; Function Attrs: nounwind
+define spir_kernel void @squash(i64 addrspace(1)* %idx, <2 x float> addrspace(1)* %data, <2 x float> addrspace(1)* %output) #0 {
+entry:
+  %gid = call i64 @__mux_get_global_id(i64 0) #2
+  %idx.ptr = getelementptr inbounds i64, i64 addrspace(1)* %idx, i64 %gid
+  %idx.ld = load i64, i64 addrspace(1)* %idx.ptr, align 8
+  %data.ptr = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %data, i64 %idx.ld
+  %data.ld = load <2 x float>, <2 x float> addrspace(1)* %data.ptr, align 8
+  %output.ptr = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %output, i64 %gid
+  store <2 x float> %data.ld, <2 x float> addrspace(1)* %output.ptr, align 8
+  ret void
+}

+declare i64 @__mux_get_global_id(i64) #1

+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nobuiltin nounwind }

+; This test checks that the <2 x float> is converted into an i64 for the
+; purpose of the gather load.
+;
+; CHECK: void @__vecz_v4_squash
+; CHECK: %[[GID:.+]] = call i64 @__mux_get_global_id(i64 0) #[[ATTRS:[0-9]+]]
+; CHECK: %[[IDX_PTR:.+]] = getelementptr i64, ptr addrspace(1) %idx, i64 %[[GID]]
+; CHECK: %[[WIDE_LOAD:.+]] = load <4 x i64>, ptr addrspace(1) %[[IDX_PTR]], align 8
+; CHECK: %[[DATA_PTR:.+]] = getelementptr <2 x float>, ptr addrspace(1) %data, <4 x i64> %[[WIDE_LOAD]]
+; CHECK: %[[GATHER:.+]] = call <4 x i64> @__vecz_b_gather_load8_Dv4_mDv4_u3ptrU3AS1(<4 x ptr addrspace(1)> %[[DATA_PTR]])
+; CHECK: %[[UNSQUASH:.+]] = bitcast <4 x i64> %[[GATHER]] to <8 x float>
+; CHECK: %[[OUTPUT_PTR:.+]] = getelementptr <2 x float>, ptr addrspace(1) %output, i64 %[[GID]]
+; CHECK: store <8 x float> %[[UNSQUASH]], ptr addrspace(1) %[[OUTPUT_PTR]], align 8
+; CHECK: ret void

+; CHECK: attributes #[[ATTRS]] = { nobuiltin nounwind }
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
new file mode 100644
index 0000000000000..73993e3c2883b
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned.ll
@@ -0,0 +1,76 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +%struct.PerItemKernelInfo = type <{ <4 x i64>, i32, i32 }> + +; Function start +; CHECK: spir_kernel void @__vecz_v4_foo( + +; There should be exactly 4 vector stores +; CHECK: store <4 x i64> +; CHECK: store <4 x i64> +; CHECK: store <4 x i64> +; CHECK: store <4 x i64> +; CHECK-NOT: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{.*}}(<4 x i64> +; CHECK-NOT: call void @__vecz_b_interleaved_store1_5_Dv4_{{.*}}(<4 x i64> + +; There is one interleaved store from the scalar write +; CHECK: call void @__vecz_b_interleaved_store1_10_Dv4_j{{(u3ptrU3AS1|PU3AS1j)}}(<4 x i32> + +; There shouldn't be any other stores +; CHECK-NOT: call void @__vecz_b_{{.*}}_store + +; Function end +; CHECK: ret void + +define dso_local spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* nocapture noundef writeonly %info) !reqd_work_group_size !11 { +entry: + %call = tail call i64 @__mux_get_global_id(i32 noundef 0) + %call1 = tail call i64 @__mux_get_global_id(i32 noundef 1) + %call2 = tail call i64 @__mux_get_global_id(i32 noundef 2) + %call3 = tail call i64 @__mux_get_global_size(i32 noundef 0) + %call5 = tail call i64 @__mux_get_global_size(i32 noundef 1) + %mul7 = mul nuw nsw i64 %call5, %call2 + %reass.add = add nuw nsw i64 %mul7, %call1 + %reass.mul = mul nuw nsw i64 %reass.add, %call3 + %add8 = add nuw nsw i64 %reass.mul, %call + %vecinit = insertelement <4 x i64> poison, i64 %call3, i64 0 + %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1 + %call12 = tail call i64 @__mux_get_global_size(i32 noundef 2) + %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2 + %call14 = tail call i64 @__mux_get_global_size(i32 noundef 3) + %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3 + %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0 + store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1 + %call16 = tail call i32 @__mux_get_work_dim() + %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1 + store i32 %call16, i32 addrspace(1)* %work_dim, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare i64 @__mux_get_global_size(i32) + +declare i32 @__mux_get_work_dim() + +!11 = !{i32 4, i32 1, i32 1} + diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll new file mode 100644 index 0000000000000..dfba183808512 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_aligned_scalarized.ll @@ -0,0 +1,63 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s -vecz-choices=FullScalarization | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%struct.PerItemKernelInfo = type <{ <4 x i64>, i32, i32 }> + +define spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* %info) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %call2 = tail call i64 @__mux_get_global_id(i32 2) + %call3 = tail call i64 @__mux_get_global_size(i32 0) + %call5 = tail call i64 @__mux_get_global_size(i32 1) + %mul7 = mul nuw nsw i64 %call5, %call2 + %reass.add = add nuw nsw i64 %mul7, %call1 + %reass.mul = mul nuw nsw i64 %reass.add, %call3 + %add8 = add nuw nsw i64 %reass.mul, %call + %vecinit = insertelement <4 x i64> poison, i64 %call3, i64 0 + %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1 + %call12 = tail call i64 @__mux_get_global_size(i32 2) + %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2 + %call14 = tail call i64 @__mux_get_global_size(i32 3) + %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3 + %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0 + store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1 + %call16 = tail call i32 @__mux_get_work_dim() + %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1 + store i32 %call16, i32 addrspace(1)* %work_dim, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare i64 @__mux_get_global_size(i32) + +declare i32 @__mux_get_work_dim() + +; CHECK: spir_kernel void @foo +; CHECK: call void @__vecz_b_interleaved_store1_5_Dv4_m{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK: call void @__vecz_b_interleaved_store1_5_Dv4_m{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK: call void @__vecz_b_interleaved_store1_5_Dv4_m{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK: call void @__vecz_b_interleaved_store1_5_Dv4_m{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK-NOT: call void @__vecz_b_interleaved_store1_5_Dv4_m{{.*}}(<4 x i64> +; CHECK: call void @__vecz_b_interleaved_store1_10_Dv4_j{{(u3ptrU3AS1|PU3AS1j)}}(<4 x i32> +; CHECK-NOT: call void @__vecz_b_{{.*}}_store +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll new file mode 100644 index 0000000000000..95dfb9f4ef732 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_analysis.ll @@ -0,0 +1,178 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes="print" -S < %s -o /dev/null 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK-LABEL: StrideAnalysis for function '__vecz_v4_foo': +define spir_kernel void @foo(ptr addrspace(1) align 1 %input) { +entry: + %localid0 = tail call i64 @__mux_get_local_id(i32 0) + %localsize0 = tail call i64 @__mux_get_local_size(i32 0) + %groupid0 = tail call i64 @__mux_get_group_id(i32 0) + %globalid0 = tail call i64 @__mux_get_global_id(i32 0) + +; CHECK: Stride for ptr addrspace(1) %input +; CHECK-NEXT: uniform + %lduniform = load i8, ptr addrspace(1) %input, align 1 + +; CHECK: Stride for %arrayidx0 = getelementptr i8, ptr addrspace(1) %input, i64 %globalid0 +; CHECK-NEXT: linear stride of 1 + %arrayidx0 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0 + %ld0 = load i8, ptr addrspace(1) %arrayidx0, align 1 + + %truncglobalid0 = trunc i64 %globalid0 to i32 + +; CHECK: Stride for %arrayidx1 = getelementptr i8, ptr addrspace(1) %input, i64 %sexttruncglobalid0 +; CHECK-NEXT: linear stride of 1 + %sexttruncglobalid0 = sext i32 %truncglobalid0 to i64 + %arrayidx1 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %sexttruncglobalid0 + %ld1 = load i8, ptr addrspace(1) %arrayidx1, align 1 + +; CHECK: Stride for %arrayidx2 = getelementptr i8, ptr addrspace(1) %input, i64 %zexttruncglobalid0 +; CHECK-NEXT: divergent + %zexttruncglobalid0 = zext i32 %truncglobalid0 to i64 + %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %zexttruncglobalid0 + %ld2 = load i8, ptr addrspace(1) %arrayidx2, align 1 + +; CHECK: Stride for %arrayidx3 = getelementptr i32, ptr addrspace(1) %input, i64 %globalid0 +; CHECK-NEXT: linear stride of 4 + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %globalid0 + %ld3 = load i8, ptr addrspace(1) %arrayidx3, align 1 + +; CHECK: Stride for %arrayidx4 = getelementptr i8, ptr addrspace(1) %input, i64 %globalid0mul8 +; CHECK-NEXT: linear stride of 8 + %globalid0mul8 = mul i64 %globalid0, 8 + %arrayidx4 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0mul8 + %ld4 = load i8, ptr addrspace(1) %arrayidx4, align 1 + +; CHECK: Stride for %arrayidx5 = getelementptr i8, ptr addrspace(1) %input, i64 %globalid0mul16 +; CHECK-NEXT: linear stride of 16 + %globalid0mul16 = mul i64 %globalid0mul8, 2 + %arrayidx5 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %globalid0mul16 + %ld5 = load i8, ptr addrspace(1) %arrayidx5, align 1 + +; CHECK: Stride for %arrayidx6 = getelementptr i32, ptr addrspace(1) %input, i64 %globalid0mul8 +; CHECK-NEXT: linear stride of 32 + %arrayidx6 = getelementptr inbounds i32, ptr addrspace(1) %input, i64 %globalid0mul8 + %ld6 = load i32, ptr addrspace(1) %arrayidx6, align 1 + +; CHECK: Stride for %arrayidx7 = getelementptr i16, ptr addrspace(1) %input, i64 %idxprom7 +; CHECK-NEXT: linear stride of 2 + %mul7 = mul i64 %localsize0, %groupid0 + %add7 = add i64 %mul7, %localid0 + %trunc7 = trunc i64 %add7 to i32 + 
%conv7 = add i32 %trunc7, -1 + %idxprom7 = sext i32 %conv7 to i64 + %arrayidx7 = getelementptr inbounds i16, ptr addrspace(1) %input, i64 %idxprom7 + %ld7 = load i16, ptr addrspace(1) %arrayidx7, align 1 + +; CHECK: Stride for %arrayidx8 = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom8 +; CHECK-NEXT: divergent + %mul8 = mul i64 %localsize0, %groupid0 + %add8 = add i64 %mul8, %localid0 + %trunc8 = trunc i64 %add8 to i32 + %conv8 = add i32 %trunc8, -1 + %idxprom8 = zext i32 %conv8 to i64 + %arrayidx8 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom8 + %ld8 = load i8, ptr addrspace(1) %arrayidx8, align 1 + +; CHECK: Stride for %arrayidx9 = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom9 +; CHECK-NEXT: divergent + %mul9 = mul i64 %groupid0, %localsize0 + %add9 = add nuw nsw i64 %localid0, 4294967295 + %conv9 = add i64 %add9, %mul9 + %idxprom9 = and i64 %conv9, 4294967295 + %arrayidx9 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom9 + %ld9 = load i8, ptr addrspace(1) %arrayidx9, align 1 + + ret void +} + +; CHECK-LABEL: StrideAnalysis for function '__vecz_v4_canny_regression': +define spir_kernel void @canny_regression(ptr addrspace(1) align 1 %input) { +entry: + %groupid0 = tail call i64 @__mux_get_group_id(i32 0) + %localid0 = tail call i64 @__mux_get_local_id(i32 0) + %localsize0 = tail call i64 @__mux_get_local_size(i32 0) + %mul = mul i64 %groupid0, %localsize0 + %add = add i64 %mul, %localid0 + %0 = trunc i64 %add to i32 + %conv = add i32 %0, -1 + %trunclocalsize0 = trunc i64 %localsize0 to i32 + +; CHECK: Stride for %arrayidx_pre = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom_pre +; CHECK-NEXT: divergent + %idxprom_pre = zext i32 %conv to i64 + %arrayidx_pre = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom_pre + %ld_pre = load i8, ptr addrspace(1) %arrayidx_pre, align 1 + + br label %for.body + +for.body: +; The below is fundamentally the same stride calculation as %arrayidx_pre - +; make sure the loop and the PHI don't throw off the analysis. 
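+;
+; Roughly, in OpenCL C terms (a hypothetical reconstruction; only the IR is
+; authoritative):
+;
+;   int gx2 = (int)(get_group_id(0) * get_local_size(0) + get_local_id(0)) - 1;
+;   for (int i = 0; i < 2; i++) {
+;     uchar v = input[(size_t)(uint)gx2];  // zext of a possibly-negative i32
+;     gx2 += (int)get_local_size(0);       // index, hence a divergent stride
+;   }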
+; CHECK: Stride for %arrayidx_loop = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom_loop +; CHECK-NEXT: divergent + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gx2.050.us = phi i32 [ %conv, %entry ], [ %conv26.us, %for.body ] + %idxprom_loop = zext i32 %gx2.050.us to i64 + %arrayidx_loop = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom_loop + + %ld_loop = load i8, ptr addrspace(1) %arrayidx_loop, align 1 + + %conv26.us = add i32 %gx2.050.us, %trunclocalsize0 + %iv.next = add nuw nsw i64 %iv, 1 + %exit_cond = icmp ult i64 %iv.next, 2 + br i1 %exit_cond, label %for.body, label %exit + +exit: + ret void +} + +; CHECK-LABEL: StrideAnalysis for function '__vecz_v4_select_regression': +define spir_kernel void @select_regression(ptr addrspace(1) align 1 %input, i1 %cmp) { +entry: + %groupid0 = tail call i64 @__mux_get_group_id(i32 0) + %localid0 = tail call i64 @__mux_get_local_id(i32 0) + %localsize0 = tail call i64 @__mux_get_local_size(i32 0) + %mul = mul i64 %groupid0, %localsize0 + %add = add i64 %mul, %localid0 + %addtrunc = trunc i64 %add to i32 + +; CHECK: Stride for %arrayidx0 = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom0 +; CHECK-NEXT: divergent + %idxprom0 = zext i32 %addtrunc to i64 + %arrayidx0 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom0 + %ld0 = load i8, ptr addrspace(1) %arrayidx0, align 1 + +; The below is fundamentally the same stride calculation as %arrayidx0 - make +; sure the select doesn't throw off the analysis. +; CHECK: Stride for %arrayidx1 = getelementptr i8, ptr addrspace(1) %input, i64 %idxprom1 +; CHECK-NEXT: divergent + %sel1 = select i1 %cmp, i32 %addtrunc, i32 %addtrunc + %idxprom1 = zext i32 %sel1 to i64 + %arrayidx1 = getelementptr inbounds i8, ptr addrspace(1) %input, i64 %idxprom1 + %ld1 = load i8, ptr addrspace(1) %arrayidx1, align 1 + + ret void +} + +declare i64 @__mux_get_local_id(i32) +declare i64 @__mux_get_local_size(i32) +declare i64 @__mux_get_group_id(i32) +declare i64 @__mux_get_global_id(i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll new file mode 100644 index 0000000000000..0b51e0f078b05 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned.ll @@ -0,0 +1,64 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%struct.PerItemKernelInfo = type <{ <4 x i64>, i32 }> + +define spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* %info) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %call2 = tail call i64 @__mux_get_global_id(i32 2) + %call3 = tail call i64 @__mux_get_global_size(i32 0) + %call5 = tail call i64 @__mux_get_global_size(i32 1) + %mul7 = mul nuw nsw i64 %call5, %call2 + %reass.add = add nuw nsw i64 %mul7, %call1 + %reass.mul = mul nuw nsw i64 %reass.add, %call3 + %add8 = add nuw nsw i64 %reass.mul, %call + %vecinit = insertelement <4 x i64> poison, i64 %call3, i64 0 + %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1 + %call12 = tail call i64 @__mux_get_global_size(i32 2) + %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2 + %call14 = tail call i64 @__mux_get_global_size(i32 3) + %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3 + %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0 + store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1 + %call16 = tail call i32 @__mux_get_work_dim() + %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1 + store i32 %call16, i32 addrspace(1)* %work_dim, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare i64 @__mux_get_global_size(i32) + +declare i32 @__mux_get_work_dim() + +; CHECK: spir_kernel void @foo +; CHECK: store <4 x i64> +; CHECK: store <4 x i64> +; CHECK: store <4 x i64> +; CHECK: store <4 x i64> +; CHECK-NOT: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{.*}}(<4 x i64> +; CHECK-NOT: call void @__vecz_b_interleaved_store1_5_Dv4_{{.*}}(<4 x i64> +; CHECK: call void @__vecz_b_interleaved_store1_9_Dv4_j{{(u3ptrU3AS1|PU3AS1j)}}(<4 x i32> +; CHECK-NOT: call void @__vecz_b_{{.*}}_store +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll new file mode 100644 index 0000000000000..ffdb64718d8b8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/stride_misaligned_scalarized.ll @@ -0,0 +1,63 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -S < %s -vecz-choices=FullScalarization | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +%struct.PerItemKernelInfo = type <{ <4 x i64>, i32 }> + +define spir_kernel void @foo(%struct.PerItemKernelInfo addrspace(1)* %info) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %call1 = tail call i64 @__mux_get_global_id(i32 1) + %call2 = tail call i64 @__mux_get_global_id(i32 2) + %call3 = tail call i64 @__mux_get_global_size(i32 0) + %call5 = tail call i64 @__mux_get_global_size(i32 1) + %mul7 = mul nuw nsw i64 %call5, %call2 + %reass.add = add nuw nsw i64 %mul7, %call1 + %reass.mul = mul nuw nsw i64 %reass.add, %call3 + %add8 = add nuw nsw i64 %reass.mul, %call + %vecinit = insertelement <4 x i64> poison, i64 %call3, i64 0 + %vecinit11 = insertelement <4 x i64> %vecinit, i64 %call5, i64 1 + %call12 = tail call i64 @__mux_get_global_size(i32 2) + %vecinit13 = insertelement <4 x i64> %vecinit11, i64 %call12, i64 2 + %call14 = tail call i64 @__mux_get_global_size(i32 3) + %vecinit15 = insertelement <4 x i64> %vecinit13, i64 %call14, i64 3 + %global_size = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 0 + store <4 x i64> %vecinit15, <4 x i64> addrspace(1)* %global_size, align 1 + %call16 = tail call i32 @__mux_get_work_dim() + %work_dim = getelementptr inbounds %struct.PerItemKernelInfo, %struct.PerItemKernelInfo addrspace(1)* %info, i64 %add8, i32 1 + store i32 %call16, i32 addrspace(1)* %work_dim, align 1 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare i64 @__mux_get_global_size(i32) + +declare i32 @__mux_get_work_dim() + +; CHECK: spir_kernel void @foo +; CHECK: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{(u3ptrU3AS1|PU3AS1m)}}(<4 x i64> +; CHECK-NOT: call void @__vecz_b_scatter_store1_Dv4_mDv4_{{.*}}(<4 x i64> +; CHECK: call void @__vecz_b_interleaved_store1_9_Dv4_j{{(u3ptrU3AS1|PU3AS1j)}}(<4 x i32> +; CHECK-NOT: call void @__vecz_b_{{.*}}_store +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll new file mode 100644 index 0000000000000..ee78646485d04 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_phi.ll @@ -0,0 +1,107 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +%struct_type = type { i32, i32 } + +define spir_kernel void @test(i32* %in, i32* %out, %struct_type* %sin) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %inp = getelementptr inbounds i32, i32* %in, i64 %call + %oup = getelementptr inbounds i32, i32* %out, i64 %call + %o = load i32, i32* %oup + ; do this little compare + phi to throw off the InstCombine pass and ensure + ; we end up with a phi %struct_type that must be instantiated + %s = insertvalue %struct_type poison, i32 %o, 1 + %cmpcall = icmp ult i64 16, %call + br i1 %cmpcall, label %lower, label %higher + +lower: + %lowers = insertvalue %struct_type %s, i32 0, 0 + br label %lower.higher.phi + +higher: + %highers = insertvalue %struct_type %s, i32 1, 0 + br label %lower.higher.phi + +lower.higher.phi: + %lowerhigherstruct = phi %struct_type [%lowers, %lower], [%highers, %higher] + br label %for.cond + +for.cond: + %storemerge = phi %struct_type [ %incv, %for.inc ], [ %lowerhigherstruct, %lower.higher.phi ] + %s1 = extractvalue %struct_type %storemerge, 1 + %s1ext = zext i32 %s1 to i64 + %cmp = icmp ult i64 %s1ext, %call + br i1 %cmp, label %for.body, label %for.end + +for.body: + %l = load i32, i32* %inp, align 4 + store i32 %l, i32* %oup, align 4 + br label %for.inc + +for.inc: + %toadd = extractvalue %struct_type %storemerge, 1 + %toadd64 = zext i32 %toadd to i64 + %ca = add i64 %toadd64, %call + %sinp = getelementptr inbounds %struct_type, %struct_type* %sin, i64 %ca + %sinv = load %struct_type, %struct_type* %sinp + %sinintv = extractvalue %struct_type %sinv, 1 + %incv = insertvalue %struct_type %storemerge, i32 %sinintv, 1 + br label %for.cond + +for.end: + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare void @llvm.memset.p0i8.i32(i8*,i8,i32,i32,i1) + +; CHECK: define spir_kernel void @__vecz_v4_test + +; Check if the struct creation has been instantiated +; CHECK: %[[V2:[0-9]+]] = load <4 x i32>, ptr %oup, align 4 +; CHECK: %[[V3:[0-9]+]] = extractelement <4 x i32> %[[V2]], {{(i32|i64)}} 0 +; CHECK: %[[V4:[0-9]+]] = extractelement <4 x i32> %[[V2]], {{(i32|i64)}} 1 +; CHECK: %[[V5:[0-9]+]] = extractelement <4 x i32> %[[V2]], {{(i32|i64)}} 2 +; CHECK: %[[V6:[0-9]+]] = extractelement <4 x i32> %[[V2]], {{(i32|i64)}} 3 +; CHECK: %[[S24:.+]] = insertvalue %struct_type poison, i32 %[[V3]], 1 +; CHECK: %[[S25:.+]] = insertvalue %struct_type poison, i32 %[[V4]], 1 +; CHECK: %[[S26:.+]] = insertvalue %struct_type poison, i32 %[[V5]], 1 +; CHECK: %[[S27:.+]] = insertvalue %struct_type poison, i32 %[[V6]], 1 + +; Check if the phi node has been instantiated +; CHECK: phi %struct_type [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ] +; CHECK: phi %struct_type [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ] +; CHECK: phi %struct_type [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ] +; CHECK: phi %struct_type [ %{{.+}}, %entry ], [ %{{.+}}, %for.cond ] +; CHECK: extractvalue %struct_type %{{.+}}, 1 +; CHECK: extractvalue %struct_type %{{.+}}, 1 +; CHECK: extractvalue %struct_type %{{.+}}, 1 +; CHECK: extractvalue %struct_type %{{.+}}, 1 + +; Check if the operations that use integer types are vectorized +; CHECK: zext <4 x i32> +; CHECK: icmp ugt <4 x i64> +; CHECK: select <4 x i1> +; CHECK: %[[L423:.+]] = call <4 x i32> @__vecz_b_masked_load4_Dv4_ju3ptrDv4_b(ptr %{{.*}}, <4 x i1> +; CHECK: call 
void @__vecz_b_masked_store4_Dv4_ju3ptrDv4_b(<4 x i32> %[[L423]], ptr{{( nonnull)? %.*}}, <4 x i1> + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll new file mode 100644 index 0000000000000..a4b88856af96a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/struct_select.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +%struct_type = type { i32, i64 } + +define spir_kernel void @test(%struct_type* %in1, %struct_type* %in2, %struct_type* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %in1p = getelementptr inbounds %struct_type, %struct_type* %in1, i64 %call + %in2p = getelementptr inbounds %struct_type, %struct_type* %in2, i64 %call + %outp = getelementptr inbounds %struct_type, %struct_type* %out, i64 %call + %in1v = load %struct_type, %struct_type* %in1p + %in2v = load %struct_type, %struct_type* %in2p + %mod = urem i64 %call, 3 + %cmp = icmp eq i64 %mod, 0 + %res = select i1 %cmp, %struct_type %in1v, %struct_type %in2v + store %struct_type %res, %struct_type* %outp + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare void @llvm.memset.p0i8.i32(i8*,i8,i32,i32,i1) + +; CHECK: define spir_kernel void @__vecz_v4_test + +; CHECK: select i1 %{{.+}}, %struct_type %{{.+}}, %struct_type %{{.+}} +; CHECK: select i1 %{{.+}}, %struct_type %{{.+}}, %struct_type %{{.+}} +; CHECK: select i1 %{{.+}}, %struct_type %{{.+}}, %struct_type %{{.+}} +; CHECK: select i1 %{{.+}}, %struct_type %{{.+}}, %struct_type %{{.+}} + +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll new file mode 100644 index 0000000000000..bf1f2b19b178b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_broadcast.ll @@ -0,0 +1,45 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i32 @__mux_get_sub_group_id() +declare i32 @__mux_get_sub_group_local_id() +declare i32 @__mux_sub_group_broadcast_i32(i32, i32) + +; It makes sure broadcast still works when its source operand is uniform +define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + %call = tail call i32 @__mux_get_sub_group_id() + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call + %v = load i32, i32 addrspace(1)* %arrayidx, align 4 + %broadcast = call i32 @__mux_sub_group_broadcast_i32(i32 %v, i32 0) + %idx = tail call i32 @__mux_get_sub_group_local_id() + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %idx + store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_sub_group_broadcast( +; CHECK: [[LD:%.+]] = load i32, ptr addrspace(1) %{{.+}}, align 4 +; CHECK: [[INS:%.+]] = insertelement <4 x i32> poison, i32 [[LD]], i64 0 +; CHECK: [[BCAST:%.+]] = shufflevector <4 x i32> [[INS]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: %idx = tail call i32 @__mux_get_sub_group_local_id() +; CHECK: [[EXT:%.*]] = sext i32 %idx to i64 +; CHECK: %arrayidx2 = getelementptr i32, ptr addrspace(1) %out, i64 [[EXT]] +; CHECK: store <4 x i32> [[BCAST]], ptr addrspace(1) %arrayidx2, align 4 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll new file mode 100644 index 0000000000000..d6b074d1d266f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_builtins.ll @@ -0,0 +1,115 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i32 @__mux_get_sub_group_id() +declare spir_func i32 @__mux_get_sub_group_size() +declare spir_func i32 @__mux_get_sub_group_local_id() +declare spir_func i32 @__mux_sub_group_broadcast_i32(i32, i32) +declare spir_func i64 @__mux_get_global_id(i32) +declare spir_func i1 @__mux_sub_group_any_i1(i1) + +define spir_kernel void @get_sub_group_size(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + %call.i = tail call spir_func i32 @__mux_get_sub_group_id() + %conv = zext i32 %call.i to i64 + %call2 = tail call spir_func i32 @__mux_get_sub_group_size() + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx, align 4 + ret void +; CHECK-LABEL: define spir_kernel void @__vecz_v4_get_sub_group_size( +; CHECK: [[RED:%.*]] = call i32 @__mux_sub_group_reduce_add_i32(i32 4) +; CHECK: store i32 [[RED]], ptr addrspace(1) {{.*}} +} + +define spir_kernel void @get_sub_group_local_id(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + %call = tail call spir_func i32 @__mux_get_sub_group_local_id() + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call + store i32 %call, i32 addrspace(1)* %arrayidx, align 4 + ret void +; CHECK-LABEL: define spir_kernel void @__vecz_v4_get_sub_group_local_id( +; CHECK: %call = tail call spir_func i32 @__mux_get_sub_group_local_id() +; CHECK: [[MUL:%.*]] = shl i32 %call, 2 +; CHECK: [[SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[MUL]], i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[ID:%.*]] = or {{(disjoint )?}}<4 x i32> [[SPLAT]], +; CHECK: [[EXT:%.*]] = sext i32 %call to i64 +; CHECK: %arrayidx = getelementptr i32, ptr addrspace(1) %out, i64 [[EXT]] +; CHECK: store <4 x i32> [[ID]], ptr addrspace(1) %arrayidx +} + +define spir_kernel void @sub_group_broadcast(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + %call = tail call spir_func i32 @__mux_get_sub_group_local_id() + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call + %v = load i32, i32 addrspace(1)* %arrayidx, align 4 + %broadcast = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 %v, i32 0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call + store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: define spir_kernel void @__vecz_v4_sub_group_broadcast( +; CHECK: [[LD:%.*]] = load <4 x i32>, ptr addrspace(1) {{%.*}}, align 4 +; CHECK: [[EXT:%.*]] = extractelement <4 x i32> [[LD]], i64 0 +; CHECK: [[BDCAST:%.*]] = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 [[EXT]], i32 0) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[BDCAST]], i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x i32> [[SPLAT]], ptr addrspace(1) +} + +define spir_kernel void @sub_group_broadcast_wider_than_vf(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { + %call = tail call spir_func i32 @__mux_get_sub_group_local_id() + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %call + %v = load i32, i32 addrspace(1)* %arrayidx, align 4 + %broadcast = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 %v, i32 6) + %arrayidx2 = 
getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %call + store i32 %broadcast, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: define spir_kernel void @__vecz_v4_sub_group_broadcast_wider_than_vf( +; CHECK: [[LD:%.*]] = load <4 x i32>, ptr addrspace(1) {{%.*}}, align 4 +; The sixth sub-group member is the (6 % 4 ==) 2nd vector group member +; CHECK: [[EXT:%.*]] = extractelement <4 x i32> [[LD]], i64 2 +; CHECK: [[BDCAST:%.*]] = call spir_func i32 @__mux_sub_group_broadcast_i32(i32 [[EXT]], i32 1) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[BDCAST]], i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x i32> [[SPLAT]], ptr addrspace(1) +} + +; This used to crash as packetizing get_sub_group_local_id produces a Constant, which we weren't expecting. +define spir_kernel void @regression_sub_group_local_id(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %xy, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_local_id() + %0 = shl i64 %call, 32 + %idxprom = ashr exact i64 %0, 32 + %arrayidx = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %xy, i64 %idxprom + %1 = load <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, align 16 + %2 = insertelement <4 x i32> %1, i32 %call1, i64 0 + %3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arrayidx, i64 0, i64 0 + store i32 %call1, i32 addrspace(1)* %3, align 16 + %call2 = tail call spir_func i32 @__mux_get_sub_group_id() + %4 = insertelement <4 x i32> %2, i32 %call2, i64 1 + store <4 x i32> %4, <4 x i32> addrspace(1)* %arrayidx, align 16 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %6 = icmp ne i32 %5, 0 + %call7 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 %6) + %7 = sext i1 %call7 to i32 + %arrayidx9 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom + store i32 %7, i32 addrspace(1)* %arrayidx9, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll new file mode 100644 index 0000000000000..c69d993acdd18 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions.ll @@ -0,0 +1,260 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) +declare spir_func i32 @__mux_get_sub_group_id() + +declare spir_func i1 @__mux_sub_group_all_i1(i1) +declare spir_func i1 @__mux_sub_group_any_i1(i1) + +declare spir_func i32 @__mux_sub_group_reduce_add_i32(i32) +declare spir_func i64 @__mux_sub_group_reduce_add_i64(i64) +declare spir_func float @__mux_sub_group_reduce_fadd_f32(float) +declare spir_func i32 @__mux_sub_group_reduce_smin_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_umin_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_smax_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_umax_i32(i32) +declare spir_func float @__mux_sub_group_reduce_fmin_f32(float) +declare spir_func float @__mux_sub_group_reduce_fmax_f32(float) + +define spir_kernel void @reduce_all_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %1 = icmp ne i32 %0, 0 + %call2 = tail call spir_func i1 @__mux_sub_group_all_i1(i1 %1) + %2 = sext i1 %call2 to i32 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_all_i32( +; CHECK: [[T2:%.*]] = icmp eq <4 x i32> %{{.*}}, zeroinitializer + +; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[T2]] to i4 +; CHECK: [[R:%.*]] = icmp eq i4 [[T3]], 0 +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_all_i1(i1 [[R]]) +; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32 +; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_any_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %1 = icmp ne i32 %0, 0 + %call2 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 %1) + %2 = sext i1 %call2 to i32 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_any_i32( +; CHECK: [[T2:%.*]] = icmp ne <4 x i32> %{{.*}}, zeroinitializer + +; CHECK: [[T3:%.*]] = bitcast <4 x i1> [[T2]] to i4 +; CHECK: [[R:%.*]] = icmp ne i4 [[T3]], 0 +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_any_i1(i1 [[R]]) +; CHECK: [[EXT:%.*]] = sext i1 %call2 to i32 +; CHECK: store i32 [[EXT]], ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 %0) + %arrayidx3 = getelementptr 
inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_add_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +; Given we've checked a full reduction sequence above, reduce duplicate CHECKs +; below by assuming all reductions work orthogonally. + +define spir_kernel void @reduce_add_i32_uniform(i32 addrspace(1)* %out, i32 %n) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 %n) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_add_i32_uniform( +; LLVM is clever enough to fold this uniform reduction to a shift-left, but +; not when it's expressed as a reduction intrinsic. +; CHECK: [[CALL:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> {{%.*}}) +; CHECK: %call1 = tail call spir_func i32 @__mux_sub_group_reduce_add_i32(i32 [[CALL]]) +; CHECK: [[INS:%.*]] = insertelement <4 x i32> poison, i32 %call1, {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[INS]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x i32> [[SPLAT]], +} + +define spir_kernel void @reduce_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call + %0 = load i64, i64 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i64 @__mux_sub_group_reduce_add_i64(i64 %0) + %arrayidx3 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %conv + store i64 %call2, i64 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_add_i64( +; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %{{.*}}) +; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_add_i64(i64 [[R]]) +; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fadd_f32(float %0) + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv + store float %call2, float addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_add_f32( +; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %{{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fadd_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func 
i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smin_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_smin_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smin_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umin_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_umin_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umin_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smax_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_smax_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_smax_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umax_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %conv + store i32 %call2, i32 addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_umax_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_umax_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call 
spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float %0) + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv + store float %call2, float addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_fmin_f32( +; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %{{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmin_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +define spir_kernel void @reduce_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fmax_f32(float %0) + %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %out, i64 %conv + store float %call2, float addrspace(1)* %arrayidx3, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_fmax_f32( +; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %{{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmax_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +} + +!opencl.ocl.version = !{!0} + +!0 = !{i32 3, i32 0} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll new file mode 100644 index 0000000000000..4719739ded72b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_reductions_spv_khr_uniform_group_instructions.ll @@ -0,0 +1,197 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) +declare spir_func i32 @__mux_get_sub_group_id() + +declare spir_func i32 @__mux_sub_group_reduce_mul_i32(i32) +declare spir_func i64 @__mux_sub_group_reduce_mul_i64(i64) +declare spir_func float @__mux_sub_group_reduce_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_reduce_and_i32(i32) +declare spir_func i32 @__mux_sub_group_reduce_or_i32(i32) +declare spir_func i64 @__mux_sub_group_reduce_xor_i64(i64) + +declare spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1) +declare spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1) +declare spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1) + +; CHECK-LABEL: @__vecz_v4_reduce_mul_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_mul_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_mul_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + store i32 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_mul_i64( +; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %{{.*}}) +; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_mul_i64(i64 [[R]]) +; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_mul_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call + %0 = load i64, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i64 @__mux_sub_group_reduce_mul_i64(i64 %0) + %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv + store i64 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_mul_f32( +; CHECK: [[R:%.*]] = call float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> %{{.*}}) +; CHECK: %call2 = tail call spir_func float @__mux_sub_group_reduce_fmul_f32(float [[R]]) +; CHECK: store float %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func float @__mux_sub_group_reduce_fmul_f32(float %0) + %arrayidx3 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %conv + store float %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + 
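+; As the CHECKs below verify, the bitwise reductions follow the same pattern +; as the arithmetic reductions above: the packetized operand is reduced with +; the matching llvm.vector.reduce.* intrinsic and the scalar result is fed to +; the original __mux builtin. The i1 "logical" variants have no reduction +; intrinsic; the <4 x i1> mask is bitcast to i4 and tested directly (eq -1 for +; and, ne 0 for or, a ctpop-based parity check for xor). +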
+; CHECK-LABEL: @__vecz_v4_reduce_and_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_and_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_and_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + store i32 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_or_i32( +; CHECK: [[R:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %{{.*}}) +; CHECK: %call2 = tail call spir_func i32 @__mux_sub_group_reduce_or_i32(i32 [[R]]) +; CHECK: store i32 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i32 @__mux_sub_group_reduce_or_i32(i32 %0) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + store i32 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_xor_i32( +; CHECK: [[R:%.*]] = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %{{.*}}) +; CHECK: %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 [[R]]) +; CHECK: store i64 %call2, ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %in, i64 %call + %0 = load i64, ptr addrspace(1) %arrayidx, align 4 + %call2 = tail call spir_func i64 @__mux_sub_group_reduce_xor_i64(i64 %0) + %arrayidx3 = getelementptr inbounds i64, ptr addrspace(1) %out, i64 %conv + store i64 %call2, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_logical_and( +; This doesn't generate a reduction intrinsic... 
+; CHECK: [[T:%.*]] = icmp eq i4 {{%.*}}, -1 +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 [[T]]) +; CHECK: [[E:%.*]] = zext i1 %call2 to i32 +; CHECK: store i32 [[E]], ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_and_i1(i1 %1) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + %zext = zext i1 %call2 to i32 + store i32 %zext, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_logical_or( +; CHECK: [[T:%.*]] = icmp ne i4 {{%.*}}, 0 +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 [[T]]) +; CHECK: [[E:%.*]] = zext i1 %call2 to i32 +; CHECK: store i32 [[E]], ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_or_i1(i1 %1) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + %zext = zext i1 %call2 to i32 + store i32 %zext, ptr addrspace(1) %arrayidx3, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_logical_xor( +; CHECK: [[X:%.*]] = call {{.*}}i4 @llvm.ctpop.i4(i4 {{%.*}}) +; CHECK: %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 [[T:%.*]]) +; CHECK: [[E:%.*]] = zext i1 %call2 to i32 +; CHECK: store i32 [[E]], ptr addrspace(1) {{%.*}}, align 4 +define spir_kernel void @reduce_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %call1 = tail call spir_func i32 @__mux_get_sub_group_id() #6 + %conv = zext i32 %call1 to i64 + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call2 = tail call spir_func i1 @__mux_sub_group_reduce_logical_xor_i1(i1 %1) + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %conv + %zext = zext i1 %call2 to i32 + store i32 %zext, ptr addrspace(1) %arrayidx3, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll new file mode 100644 index 0000000000000..ad98dbfe5f788 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans.ll @@ -0,0 +1,204 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S -vecz-passes=packetizer < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32) +declare spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64) +declare spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32) +declare spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float) +declare spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float) + +define spir_kernel void @reduce_scan_incl_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_add_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_add_i32(i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = add <4 x i32> [[SCAN]], [[SPLAT]] +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call + %0 = load i64, i64 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i64 @__mux_sub_group_scan_inclusive_add_i64(i64 %0) + %arrayidx2 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call + store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i64( +; CHECK: [[SCAN:%.*]] = call <4 x i64> @__vecz_b_sub_group_scan_inclusive_add_Dv4_m(<4 x i64> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i64 @__mux_sub_group_scan_exclusive_add_i64(i64 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i64> poison, i64 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> 
[[HEAD]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = add <4 x i64> [[SCAN]], [[SPLAT]] +; CHECK: store <4 x i64> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fadd_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_f32( +; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_Dv4_f(<4 x float> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.0{{.*}}, <4 x float> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fadd_f32(float [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = fadd <4 x float> [[SCAN]], [[SPLAT]] +; CHECK: store <4 x float> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smin_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smin_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_Dv4_i(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_smin_i32(i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]]) +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umin_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umin_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_Dv4_j(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_umin_i32(i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: 
[[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]]) +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_smax_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smax_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_Dv4_i(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_smax_i32(i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]]) +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_umax_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umax_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_Dv4_j(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_sub_group_scan_exclusive_umax_i32(i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]]) +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmin_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmin_f32( +; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_Dv4_f(<4 x float> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fmin_f32(float 
[[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[SCAN]], <4 x float> [[SPLAT]]) +; CHECK: store <4 x float> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmax_f32(float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmax_f32( +; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_Dv4_f(<4 x float> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_sub_group_scan_exclusive_fmax_f32(float [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[SCAN]], <4 x float> [[SPLAT]]) +; CHECK: store <4 x float> [[FINAL]], +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll new file mode 100644 index 0000000000000..691b7aba7100f --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_scans_spv_khr_uniform_group_instructions.ll @@ -0,0 +1,171 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S -vecz-passes=packetizer < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare spir_func i64 @__mux_get_global_id(i32) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32) +declare spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float) + +declare spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32) +declare spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1) +declare spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1) + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_mul_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_excl_mul_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_exclusive_mul_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_mul_f32( +; CHECK: call <4 x float> @__vecz_b_sub_group_scan_inclusive_mul_Dv4_f(<4 x float> %{{.*}}) +define spir_kernel void @reduce_scan_incl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_inclusive_fmul_f32(float %0) + %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_excl_mul_f32( +; CHECK: call <4 x float> @__vecz_b_sub_group_scan_exclusive_mul_Dv4_f(<4 x float> %{{.*}}) +define spir_kernel void @reduce_scan_excl_mul_f32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, ptr addrspace(1) %in, i64 %call + %0 = load float, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func float @__mux_sub_group_scan_exclusive_fmul_f32(float %0) 
+ %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %out, i64 %call + store float %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_and_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_and_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_and_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_and_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_or_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_or_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_or_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_or_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_xor_i32( +; CHECK: call <4 x i32> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_j(<4 x i32> %{{.*}}) +define spir_kernel void @reduce_scan_incl_xor_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %call1 = tail call spir_func i32 @__mux_sub_group_scan_inclusive_xor_i32(i32 %0) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + store i32 %call1, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_logical_and( +; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_and_Dv4_b(<4 x i1> %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_and(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_and_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_logical_or( +; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_or_Dv4_b(<4 x i1> %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_or(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_or_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store 
i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} + +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_logical_xor( +; CHECK: call <4 x i1> @__vecz_b_sub_group_scan_inclusive_xor_Dv4_b(<4 x i1> %{{.*}}) +define spir_kernel void @reduce_scan_incl_logical_xor(ptr addrspace(1) %in, ptr addrspace(1) %out) { +entry: + %call = tail call spir_func i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %call + %0 = load i32, ptr addrspace(1) %arrayidx, align 4 + %1 = trunc i32 %0 to i1 + %call1 = tail call spir_func i1 @__mux_sub_group_scan_inclusive_logical_xor_i1(i1 %1) + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 %call + %2 = zext i1 %call1 to i32 + store i32 %2, ptr addrspace(1) %arrayidx2, align 4 + ret void +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll new file mode 100644 index 0000000000000..b5783f2c9e55c --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle.ll @@ -0,0 +1,190 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S \ +; RUN: --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; See @kernel_varying_idx, below +; CHECK: Could not packetize sub-group shuffle %shuffle9 + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel(ptr %in, ptr %out) +; CHECK: [[VECIDX:%.*]] = urem i32 %size_minus_1, 4 +; CHECK: [[MUXIDX:%.*]] = udiv i32 %size_minus_1, 4 +; CHECK: [[VEC:%.*]] = extractelement <4 x i64> {{%.*}}, i32 [[VECIDX]] +; CHECK: [[SHUFFLE:%.*]] = call i64 @__mux_sub_group_shuffle_i64(i64 [[VEC]], i32 [[MUXIDX]]) +; CHECK: [[SPLATINS:%.*]] = insertelement <4 x i64> poison, i64 [[SHUFFLE]], i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[SPLATINS]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x i64> [[SPLAT]] +define spir_kernel void @kernel(ptr %in, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %size = call i32 @__mux_get_sub_group_size() + %size_minus_1 = sub i32 %size, 1 + %arrayidx.in = getelementptr inbounds i64, ptr %in, i64 %gid + %val = load i64, ptr %arrayidx.in, align 8 + %shuffle1 = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %size_minus_1) + %arrayidx.out = getelementptr inbounds i64, ptr %out, i64 %gid + store i64 %shuffle1, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_vec_data(ptr %in, ptr %out) +; CHECK: [[VECIDX:%.*]] = urem i32 %size_minus_1, 4 +; CHECK: [[MUXIDX:%.*]] = udiv i32 %size_minus_1, 4 +; CHECK: [[BASE:%.*]] = mul i32 %2, 2 +; CHECK: [[IDX0:%.*]] = add i32 [[BASE]], 0 +; CHECK: [[ELT0:%.*]] = extractelement <8 x float> %1, i32 [[IDX0]] +; CHECK: [[TVEC:%.*]] = insertelement <2 x float> poison, float [[ELT0]], i32 0 +; CHECK: [[IDX1:%.*]] = add i32 [[BASE]], 1 +; CHECK: [[ELT1:%.*]] = extractelement <8 x float> %1, i32 [[IDX1]] +; CHECK: [[VEC:%.*]] = insertelement <2 x float> [[TVEC]], float [[ELT1]], i32 1 +; CHECK: [[SHUFFLE:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[VEC]], i32 [[MUXIDX]]) +; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> [[SHUFFLE]], <2 x float> poison, +; CHECK-SAME: <8 x i32> +define spir_kernel void @kernel_vec_data(ptr %in, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %size = call i32 @__mux_get_sub_group_size() + %size_minus_1 = sub i32 %size, 1 + %arrayidx.in = getelementptr inbounds <2 x float>, ptr %in, i64 %gid + %val = load <2 x float>, ptr %arrayidx.in, align 8 + %shuffle2 = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> %val, i32 %size_minus_1) + %arrayidx.out = getelementptr inbounds <2 x float>, ptr %out, i64 %gid + store <2 x float> %shuffle2, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_const_idx(ptr %in, ptr %out) +; CHECK: [[VEC:%.*]] = extractelement <4 x i64> {{%.*}}, i32 1 +; CHECK: [[SHUFFLE:%.*]] = call i64 @__mux_sub_group_shuffle_i64(i64 [[VEC]], i32 0) +; CHECK: [[SPLATINS:%.*]] = insertelement <4 x i64> poison, i64 [[SHUFFLE]], i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[SPLATINS]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x i64> [[SPLAT]] +define spir_kernel void @kernel_const_idx(ptr %in, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.in = getelementptr inbounds i64, ptr %in, i64 %gid + %val = load i64, ptr 
%arrayidx.in, align 8 + %shuffle3 = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 1) + %arrayidx.out = getelementptr inbounds i64, ptr %out, i64 %gid + store i64 %shuffle3, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_vec_data_const_idx(ptr %in, ptr %out) +; We want sub-group member 1, whose 2-element subvector starts at element +; index 2 of the packetized <8 x float> value +; CHECK: [[VEC:%.*]] = call <2 x float> @llvm.vector.extract.v2f32.v8f32(<8 x float> {{%.*}}, i64 2) +; CHECK: [[SHUFFLE:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[VEC]], i32 0) +; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> [[SHUFFLE]], <2 x float> poison, +; CHECK-SAME: <8 x i32> +; CHECK: store <8 x float> [[SPLAT]] +define spir_kernel void @kernel_vec_data_const_idx(ptr %in, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.in = getelementptr inbounds <2 x float>, ptr %in, i64 %gid + %val = load <2 x float>, ptr %arrayidx.in, align 8 + %shuffle4 = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> %val, i32 1) + %arrayidx.out = getelementptr inbounds <2 x float>, ptr %out, i64 %gid + store <2 x float> %shuffle4, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_data(i64 %val, ptr %out) +; It doesn't matter what sub-group index we choose because the data is uniform. +; Just splat it. +; CHECK: [[SPLATINS:%.*]] = insertelement <4 x i64> poison, i64 %val, i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[SPLATINS]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x i64> [[SPLAT]] +define spir_kernel void @kernel_uniform_data(i64 %val, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %size = call i32 @__mux_get_sub_group_size() + %size_minus_1 = sub i32 %size, 1 + %shuffle5 = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %size_minus_1) + %arrayidx.out = getelementptr inbounds i64, ptr %out, i64 %gid + store i64 %shuffle5, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_data_varying_idx(i64 %val, ptr %idxs, ptr %out) +; It doesn't matter what sub-group index we choose because the data is uniform. +; Just splat it. +; CHECK: [[SPLATINS:%.*]] = insertelement <4 x i64> poison, i64 %val, i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[SPLATINS]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x i64> [[SPLAT]] +define spir_kernel void @kernel_uniform_data_varying_idx(i64 %val, ptr %idxs, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.idxs = getelementptr inbounds i32, ptr %idxs, i64 %gid + %idx = load i32, ptr %arrayidx.idxs, align 4 + %shuffle6 = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %idx) + %arrayidx.out = getelementptr inbounds i64, ptr %out, i64 %gid + store i64 %shuffle6, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_vec_data(<2 x float> %val, ptr %out) +; It doesn't matter what sub-group index we choose because the data is uniform. +; Just splat it. 
+; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> %val, <2 x float> poison, +; CHECK-SAME: <8 x i32> +; CHECK: store <8 x float> [[SPLAT]] +define spir_kernel void @kernel_uniform_vec_data(<2 x float> %val, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %size = call i32 @__mux_get_sub_group_size() + %size_minus_1 = sub i32 %size, 1 + %shuffle7 = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> %val, i32 %size_minus_1) + %arrayidx.out = getelementptr inbounds <2 x float>, ptr %out, i64 %gid + store <2 x float> %shuffle7, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_vec_data_varying_idx(<2 x float> %val, ptr %idxs, ptr %out) +; It doesn't matter what sub-group index we choose because the data is uniform. +; Just splat it. +; CHECK: [[SPLAT:%.*]] = shufflevector <2 x float> %val, <2 x float> poison, +; CHECK-SAME: <8 x i32> +; CHECK: store <8 x float> [[SPLAT]] +define spir_kernel void @kernel_uniform_vec_data_varying_idx(<2 x float> %val, ptr %idxs, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.idxs = getelementptr inbounds i32, ptr %idxs, i64 %gid + %idx = load i32, ptr %arrayidx.idxs, align 4 + %shuffle8 = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> %val, i32 %idx) + %arrayidx.out = getelementptr inbounds <2 x float>, ptr %out, i64 %gid + store <2 x float> %shuffle8, ptr %arrayidx.out, align 8 + ret void +} + +; We don't support vectorization of varying indices (for now) - see the check +; above (which is printed before the final IR) +define spir_kernel void @kernel_varying_idx(ptr %in, ptr %idxs, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %size = call i32 @__mux_get_sub_group_size() + %size_minus_1 = sub i32 %size, 1 + %arrayidx.in = getelementptr inbounds i64, ptr %in, i64 %gid + %val = load i64, ptr %arrayidx.in, align 8 + %arrayidx.idxs = getelementptr inbounds i32, ptr %idxs, i64 %gid + %idx = load i32, ptr %arrayidx.idxs, align 4 + %shuffle9 = call i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %idx) + %arrayidx.out = getelementptr inbounds i64, ptr %out, i64 %gid + store i64 %shuffle9, ptr %arrayidx.out, align 8 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare i32 @__mux_get_sub_group_size() + +declare i64 @__mux_sub_group_shuffle_i64(i64 %val, i32 %lid) +declare <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> %val, i32 %lid) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll new file mode 100644 index 0000000000000..5a7c4b4e7f8fb --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_down.ll @@ -0,0 +1,206 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S \ +; RUN: --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) +; CHECK: [[LHS:%.*]] = load <4 x float>, ptr %arrayidx.lhs, align 4 +; CHECK: [[RHS:%.*]] = load <4 x float>, ptr %arrayidx.rhs, align 4 + +; CHECK: [[DELTAS:%.*]] = add <4 x i32> {{%.*}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}} +; CHECK: [[MUXIDS:%.*]] = udiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[VECELTS:%.*]] = urem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> [[MUXIDS]], {{%.*}} + +; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0 +; CHECK: [[SHUFF0:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA0]]) +; CHECK: [[VECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0 +; CHECK: [[ELT0:%.*]] = extractelement <4 x float> [[SHUFF0]], i32 [[VECIDX0]] + +; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1 +; CHECK: [[SHUFF1:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA1]]) +; CHECK: [[VECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1 +; CHECK: [[ELT1:%.*]] = extractelement <4 x float> [[SHUFF1]], i32 [[VECIDX1]] + +; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2 +; CHECK: [[SHUFF2:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA2]]) +; CHECK: [[VECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2 +; CHECK: [[ELT2:%.*]] = extractelement <4 x float> [[SHUFF2]], i32 [[VECIDX2]] + +; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3 +; CHECK: [[SHUFF3:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA3]]) +; CHECK: [[VECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3 +; CHECK: [[ELT3:%.*]] = extractelement <4 x float> [[SHUFF3]], i32 [[VECIDX3]] +define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.lhs = getelementptr inbounds float, ptr %lhsptr, i64 %gid + %lhs = load float, ptr %arrayidx.lhs, align 4 + %arrayidx.rhs = getelementptr inbounds float, ptr %rhsptr, i64 %gid + %rhs = load float, ptr %arrayidx.rhs, align 4 + %shuffle_up = call float @__mux_sub_group_shuffle_down_f32(float %lhs, float %rhs, i32 1) + %arrayidx.out = getelementptr inbounds float, ptr %out, i64 %gid + store float %shuffle_up, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out) +; CHECK: [[DELTAS:%.*]] = add <4 x i32> {{%.*}}, {{<(i32 2(, )?)+>|splat \(i32 2\)}} +; CHECK: [[MUXIDS:%.*]] = udiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[VECELTS:%.*]] = urem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> [[MUXIDS]], {{%.*}} + +; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0 +; CHECK: [[SHUFF0:%.*]] = call <16 x i8> 
@__mux_sub_group_shuffle_down_v16i8( +; CHECK-SAME: <16 x i8> [[LHS:%.*]], <16 x i8> [[RHS:%.*]], i32 [[DELTA0]]) +; CHECK: [[SUBVECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0 +; CHECK: [[ELTBASE0:%.*]] = mul i32 [[SUBVECIDX0]], 4 +; CHECK: [[VECIDX00:%.*]] = add i32 [[ELTBASE0]], 0 +; CHECK: [[ELT00:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX00]] +; CHECK: [[VEC00:%.*]] = insertelement <4 x i8> poison, i8 [[ELT00]], i32 0 +; CHECK: [[VECIDX01:%.*]] = add i32 [[ELTBASE0]], 1 +; CHECK: [[ELT01:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX01]] +; CHECK: [[VEC01:%.*]] = insertelement <4 x i8> [[VEC00]], i8 [[ELT01]], i32 1 +; CHECK: [[VECIDX02:%.*]] = add i32 [[ELTBASE0]], 2 +; CHECK: [[ELT02:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX02]] +; CHECK: [[VEC02:%.*]] = insertelement <4 x i8> [[VEC01]], i8 [[ELT02]], i32 2 +; CHECK: [[VECIDX03:%.*]] = add i32 [[ELTBASE0]], 3 +; CHECK: [[ELT03:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX03]] +; CHECK: [[VEC03:%.*]] = insertelement <4 x i8> [[VEC02]], i8 [[ELT03]], i32 3 + +; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1 +; CHECK: [[SHUFF1:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_down_v16i8( +; CHECK-SAME: <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA1]]) +; CHECK: [[SUBVECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1 +; CHECK: [[ELTBASE1:%.*]] = mul i32 [[SUBVECIDX1]], 4 +; CHECK: [[VECIDX10:%.*]] = add i32 [[ELTBASE1]], 0 +; CHECK: [[ELT10:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX10]] +; CHECK: [[VEC10:%.*]] = insertelement <4 x i8> poison, i8 [[ELT10]], i32 0 +; CHECK: [[VECIDX11:%.*]] = add i32 [[ELTBASE1]], 1 +; CHECK: [[ELT11:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX11]] +; CHECK: [[VEC11:%.*]] = insertelement <4 x i8> [[VEC10]], i8 [[ELT11]], i32 1 +; CHECK: [[VECIDX12:%.*]] = add i32 [[ELTBASE1]], 2 +; CHECK: [[ELT12:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX12]] +; CHECK: [[VEC12:%.*]] = insertelement <4 x i8> [[VEC11]], i8 [[ELT12]], i32 2 +; CHECK: [[VECIDX13:%.*]] = add i32 [[ELTBASE1]], 3 +; CHECK: [[ELT13:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX13]] +; CHECK: [[VEC13:%.*]] = insertelement <4 x i8> [[VEC12]], i8 [[ELT13]], i32 3 + +; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2 +; CHECK: [[SHUFF2:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_down_v16i8( +; CHECK-SAME: <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA2]]) +; CHECK: [[SUBVECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2 +; CHECK: [[ELTBASE2:%.*]] = mul i32 [[SUBVECIDX2]], 4 +; CHECK: [[VECIDX20:%.*]] = add i32 [[ELTBASE2]], 0 +; CHECK: [[ELT20:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX20]] +; CHECK: [[VEC20:%.*]] = insertelement <4 x i8> poison, i8 [[ELT20]], i32 0 +; CHECK: [[VECIDX21:%.*]] = add i32 [[ELTBASE2]], 1 +; CHECK: [[ELT21:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX21]] +; CHECK: [[VEC21:%.*]] = insertelement <4 x i8> [[VEC20]], i8 [[ELT21]], i32 1 +; CHECK: [[VECIDX22:%.*]] = add i32 [[ELTBASE2]], 2 +; CHECK: [[ELT22:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX22]] +; CHECK: [[VEC22:%.*]] = insertelement <4 x i8> [[VEC21]], i8 [[ELT22]], i32 2 +; CHECK: [[VECIDX23:%.*]] = add i32 [[ELTBASE2]], 3 +; CHECK: [[ELT23:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX23]] +; CHECK: [[VEC23:%.*]] = insertelement <4 x i8> [[VEC22]], i8 [[ELT23]], i32 3 + +; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> 
[[MUXDELTAS]], i32 3 +; CHECK: [[SHUFF3:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_down_v16i8( +; CHECK-SAME: <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA3]]) +; CHECK: [[SUBVECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3 +; CHECK: [[ELTBASE3:%.*]] = mul i32 [[SUBVECIDX3]], 4 +; CHECK: [[VECIDX30:%.*]] = add i32 [[ELTBASE3]], 0 +; CHECK: [[ELT30:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX30]] +; CHECK: [[VEC30:%.*]] = insertelement <4 x i8> poison, i8 [[ELT30]], i32 0 +; CHECK: [[VECIDX31:%.*]] = add i32 [[ELTBASE3]], 1 +; CHECK: [[ELT31:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX31]] +; CHECK: [[VEC31:%.*]] = insertelement <4 x i8> [[VEC30]], i8 [[ELT31]], i32 1 +; CHECK: [[VECIDX32:%.*]] = add i32 [[ELTBASE3]], 2 +; CHECK: [[ELT32:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX32]] +; CHECK: [[VEC32:%.*]] = insertelement <4 x i8> [[VEC31]], i8 [[ELT32]], i32 2 +; CHECK: [[VECIDX33:%.*]] = add i32 [[ELTBASE3]], 3 +; CHECK: [[ELT33:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX33]] +; CHECK: [[VEC33:%.*]] = insertelement <4 x i8> [[VEC32]], i8 [[ELT33]], i32 3 +define spir_kernel void @kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.lhs = getelementptr inbounds <4 x i8>, ptr %lhsptr, i64 %gid + %lhs = load <4 x i8>, ptr %arrayidx.lhs, align 4 + %arrayidx.rhs = getelementptr inbounds <4 x i8>, ptr %rhsptr, i64 %gid + %rhs = load <4 x i8>, ptr %arrayidx.rhs, align 4 + %shuffle_up = call <4 x i8> @__mux_sub_group_shuffle_down_v4i8(<4 x i8> %lhs, <4 x i8> %rhs, i32 2) + %arrayidx.out = getelementptr inbounds <4 x i8>, ptr %out, i64 %gid + store <4 x i8> %shuffle_up, ptr %arrayidx.out, align 4 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_delta(ptr %lhsptr, ptr %rhsptr, ptr %deltaptr, ptr %out) +; CHECK: [[LHS:%.*]] = load <4 x float>, ptr %arrayidx.lhs, align 4 +; CHECK: [[RHS:%.*]] = load <4 x float>, ptr %arrayidx.rhs, align 4 +; CHECK: [[DELTALD:%.*]] = load <4 x i32>, ptr %arrayidx.deltas, align 4 + +; CHECK: [[DELTAS:%.*]] = add <4 x i32> {{%.*}}, [[DELTALD]] +; CHECK: [[MUXIDS:%.*]] = udiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[VECELTS:%.*]] = urem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> [[MUXIDS]], {{%.*}} + +; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0 +; CHECK: [[SHUFF0:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA0]]) +; CHECK: [[VECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0 +; CHECK: [[ELT0:%.*]] = extractelement <4 x float> [[SHUFF0]], i32 [[VECIDX0]] + +; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1 +; CHECK: [[SHUFF1:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA1]]) +; CHECK: [[VECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1 +; CHECK: [[ELT1:%.*]] = extractelement <4 x float> [[SHUFF1]], i32 [[VECIDX1]] + +; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2 +; CHECK: [[SHUFF2:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA2]]) +; CHECK: [[VECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2 +; CHECK: [[ELT2:%.*]] = extractelement <4 x float> [[SHUFF2]], i32 
[[VECIDX2]] + +; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3 +; CHECK: [[SHUFF3:%.*]] = call <4 x float> @__mux_sub_group_shuffle_down_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA3]]) +; CHECK: [[VECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3 +; CHECK: [[ELT3:%.*]] = extractelement <4 x float> [[SHUFF3]], i32 [[VECIDX3]] +define spir_kernel void @kernel_varying_delta(ptr %lhsptr, ptr %rhsptr, ptr %deltaptr, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.lhs = getelementptr inbounds float, ptr %lhsptr, i64 %gid + %lhs = load float, ptr %arrayidx.lhs, align 4 + %arrayidx.rhs = getelementptr inbounds float, ptr %rhsptr, i64 %gid + %rhs = load float, ptr %arrayidx.rhs, align 4 + %arrayidx.deltas = getelementptr inbounds i32, ptr %deltaptr, i64 %gid + %delta = load i32, ptr %arrayidx.deltas, align 4 + %shuffle_up = call float @__mux_sub_group_shuffle_down_f32(float %lhs, float %rhs, i32 %delta) + %arrayidx.out = getelementptr inbounds float, ptr %out, i64 %gid + store float %shuffle_up, ptr %arrayidx.out, align 8 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare float @__mux_sub_group_shuffle_down_f32(float %prev, float %curr, i32 %delta) +declare <4 x i8> @__mux_sub_group_shuffle_down_v4i8(<4 x i8> %prev, <4 x i8> %curr, i32 %delta) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll new file mode 100644 index 0000000000000..779596da58a14 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_up.ll @@ -0,0 +1,242 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S \ +; RUN: --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) +; CHECK: [[LHS:%.*]] = load <4 x float>, ptr %arrayidx.lhs, align 4 +; CHECK: [[RHS:%.*]] = load <4 x float>, ptr %arrayidx.rhs, align 4 + +; CHECK: [[DELTAS:%.*]] = sub <4 x i32> {{%.*}}, {{<(i32 1(, )?)+>|splat \(i32 1\)}} +; CHECK: [[QUOTIENT:%.*]] = sdiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[REMAINDER:%.*]] = srem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; CHECK: [[ARGXOR:%.*]] = xor <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[SIGNDIFF:%.*]] = icmp slt <4 x i32> [[ARGXOR]], zeroinitializer +; CHECK: [[REMNONZERO:%.*]] = icmp ne <4 x i32> [[REMAINDER]], zeroinitializer +; CHECK: [[CONDITION:%.*]] = and <4 x i1> [[REMNONZERO]], [[SIGNDIFF]] + +; CHECK: [[MIN1:%.*]] = sub <4 x i32> [[QUOTIENT]], {{<(i32 1(, )?)+>|splat \(i32 1\)}} +; CHECK: [[PLUSR:%.*]] = add <4 x i32> [[REMAINDER]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; CHECK: [[MUXIDS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[MIN1]], <4 x i32> [[QUOTIENT]] +; CHECK: [[VECELTS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[PLUSR]], <4 x i32> [[REMAINDER]] + +; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> {{%.*}}, [[MUXIDS]] + +; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0 +; CHECK: [[SHUFF0:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA0]]) +; CHECK: [[VECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0 +; CHECK: [[ELT0:%.*]] = extractelement <4 x float> [[SHUFF0]], i32 [[VECIDX0]] + +; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1 +; CHECK: [[SHUFF1:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA1]]) +; CHECK: [[VECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1 +; CHECK: [[ELT1:%.*]] = extractelement <4 x float> [[SHUFF1]], i32 [[VECIDX1]] + +; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2 +; CHECK: [[SHUFF2:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA2]]) +; CHECK: [[VECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2 +; CHECK: [[ELT2:%.*]] = extractelement <4 x float> [[SHUFF2]], i32 [[VECIDX2]] + +; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3 +; CHECK: [[SHUFF3:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA3]]) +; CHECK: [[VECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3 +; CHECK: [[ELT3:%.*]] = extractelement <4 x float> [[SHUFF3]], i32 [[VECIDX3]] +define spir_kernel void @kernel(ptr %lhsptr, ptr %rhsptr, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.lhs = getelementptr inbounds float, ptr %lhsptr, i64 %gid + %lhs = load float, ptr %arrayidx.lhs, align 4 + %arrayidx.rhs = getelementptr inbounds float, ptr %rhsptr, i64 %gid + %rhs = load float, ptr %arrayidx.rhs, align 4 + %shuffle_up = call float @__mux_sub_group_shuffle_up_f32(float %lhs, float %rhs, i32 1) + 
%arrayidx.out = getelementptr inbounds float, ptr %out, i64 %gid + store float %shuffle_up, ptr %arrayidx.out, align 8 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out) +; CHECK: [[DELTAS:%.*]] = sub <4 x i32> {{%.*}}, {{<(i32 2(, )?)+>|splat \(i32 2\)}} +; CHECK: [[QUOTIENT:%.*]] = sdiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[REMAINDER:%.*]] = srem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; CHECK: [[ARGXOR:%.*]] = xor <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[SIGNDIFF:%.*]] = icmp slt <4 x i32> [[ARGXOR]], zeroinitializer +; CHECK: [[REMNONZERO:%.*]] = icmp ne <4 x i32> [[REMAINDER]], zeroinitializer +; CHECK: [[CONDITION:%.*]] = and <4 x i1> [[REMNONZERO]], [[SIGNDIFF]] + +; CHECK: [[MIN1:%.*]] = sub <4 x i32> [[QUOTIENT]], {{<(i32 1(, )?)+>|splat \(i32 1\)}} +; CHECK: [[PLUSR:%.*]] = add <4 x i32> [[REMAINDER]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; CHECK: [[MUXIDS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[MIN1]], <4 x i32> [[QUOTIENT]] +; CHECK: [[VECELTS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[PLUSR]], <4 x i32> [[REMAINDER]] + +; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> {{%.*}}, [[MUXIDS]] + +; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0 +; CHECK: [[SHUFF0:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_up_v16i8( +; CHECK-SAME: <16 x i8> [[LHS:%.*]], <16 x i8> [[RHS:%.*]], i32 [[DELTA0]]) +; CHECK: [[SUBVECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0 +; CHECK: [[ELTBASE0:%.*]] = mul i32 [[SUBVECIDX0]], 4 +; CHECK: [[VECIDX00:%.*]] = add i32 [[ELTBASE0]], 0 +; CHECK: [[ELT00:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX00]] +; CHECK: [[VEC00:%.*]] = insertelement <4 x i8> poison, i8 [[ELT00]], i32 0 +; CHECK: [[VECIDX01:%.*]] = add i32 [[ELTBASE0]], 1 +; CHECK: [[ELT01:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX01]] +; CHECK: [[VEC01:%.*]] = insertelement <4 x i8> [[VEC00]], i8 [[ELT01]], i32 1 +; CHECK: [[VECIDX02:%.*]] = add i32 [[ELTBASE0]], 2 +; CHECK: [[ELT02:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX02]] +; CHECK: [[VEC02:%.*]] = insertelement <4 x i8> [[VEC01]], i8 [[ELT02]], i32 2 +; CHECK: [[VECIDX03:%.*]] = add i32 [[ELTBASE0]], 3 +; CHECK: [[ELT03:%.*]] = extractelement <16 x i8> [[SHUFF0]], i32 [[VECIDX03]] +; CHECK: [[VEC03:%.*]] = insertelement <4 x i8> [[VEC02]], i8 [[ELT03]], i32 3 + +; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1 +; CHECK: [[SHUFF1:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_up_v16i8( +; CHECK-SAME: <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA1]]) +; CHECK: [[SUBVECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1 +; CHECK: [[ELTBASE1:%.*]] = mul i32 [[SUBVECIDX1]], 4 +; CHECK: [[VECIDX10:%.*]] = add i32 [[ELTBASE1]], 0 +; CHECK: [[ELT10:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX10]] +; CHECK: [[VEC10:%.*]] = insertelement <4 x i8> poison, i8 [[ELT10]], i32 0 +; CHECK: [[VECIDX11:%.*]] = add i32 [[ELTBASE1]], 1 +; CHECK: [[ELT11:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX11]] +; CHECK: [[VEC11:%.*]] = insertelement <4 x i8> [[VEC10]], i8 [[ELT11]], i32 1 +; CHECK: [[VECIDX12:%.*]] = add i32 [[ELTBASE1]], 2 +; CHECK: [[ELT12:%.*]] = extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX12]] +; CHECK: [[VEC12:%.*]] = insertelement <4 x i8> [[VEC11]], i8 [[ELT12]], i32 2 +; CHECK: [[VECIDX13:%.*]] = add i32 [[ELTBASE1]], 3 +; CHECK: [[ELT13:%.*]] 
= extractelement <16 x i8> [[SHUFF1]], i32 [[VECIDX13]] +; CHECK: [[VEC13:%.*]] = insertelement <4 x i8> [[VEC12]], i8 [[ELT13]], i32 3 + +; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2 +; CHECK: [[SHUFF2:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_up_v16i8( +; CHECK-SAME: <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA2]]) +; CHECK: [[SUBVECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2 +; CHECK: [[ELTBASE2:%.*]] = mul i32 [[SUBVECIDX2]], 4 +; CHECK: [[VECIDX20:%.*]] = add i32 [[ELTBASE2]], 0 +; CHECK: [[ELT20:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX20]] +; CHECK: [[VEC20:%.*]] = insertelement <4 x i8> poison, i8 [[ELT20]], i32 0 +; CHECK: [[VECIDX21:%.*]] = add i32 [[ELTBASE2]], 1 +; CHECK: [[ELT21:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX21]] +; CHECK: [[VEC21:%.*]] = insertelement <4 x i8> [[VEC20]], i8 [[ELT21]], i32 1 +; CHECK: [[VECIDX22:%.*]] = add i32 [[ELTBASE2]], 2 +; CHECK: [[ELT22:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX22]] +; CHECK: [[VEC22:%.*]] = insertelement <4 x i8> [[VEC21]], i8 [[ELT22]], i32 2 +; CHECK: [[VECIDX23:%.*]] = add i32 [[ELTBASE2]], 3 +; CHECK: [[ELT23:%.*]] = extractelement <16 x i8> [[SHUFF2]], i32 [[VECIDX23]] +; CHECK: [[VEC23:%.*]] = insertelement <4 x i8> [[VEC22]], i8 [[ELT23]], i32 3 + +; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3 +; CHECK: [[SHUFF3:%.*]] = call <16 x i8> @__mux_sub_group_shuffle_up_v16i8( +; CHECK-SAME: <16 x i8> [[LHS]], <16 x i8> [[RHS]], i32 [[DELTA3]]) +; CHECK: [[SUBVECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3 +; CHECK: [[ELTBASE3:%.*]] = mul i32 [[SUBVECIDX3]], 4 +; CHECK: [[VECIDX30:%.*]] = add i32 [[ELTBASE3]], 0 +; CHECK: [[ELT30:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX30]] +; CHECK: [[VEC30:%.*]] = insertelement <4 x i8> poison, i8 [[ELT30]], i32 0 +; CHECK: [[VECIDX31:%.*]] = add i32 [[ELTBASE3]], 1 +; CHECK: [[ELT31:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX31]] +; CHECK: [[VEC31:%.*]] = insertelement <4 x i8> [[VEC30]], i8 [[ELT31]], i32 1 +; CHECK: [[VECIDX32:%.*]] = add i32 [[ELTBASE3]], 2 +; CHECK: [[ELT32:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX32]] +; CHECK: [[VEC32:%.*]] = insertelement <4 x i8> [[VEC31]], i8 [[ELT32]], i32 2 +; CHECK: [[VECIDX33:%.*]] = add i32 [[ELTBASE3]], 3 +; CHECK: [[ELT33:%.*]] = extractelement <16 x i8> [[SHUFF3]], i32 [[VECIDX33]] +; CHECK: [[VEC33:%.*]] = insertelement <4 x i8> [[VEC32]], i8 [[ELT33]], i32 3 +define spir_kernel void @kernel_vec_data(ptr %lhsptr, ptr %rhsptr, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.lhs = getelementptr inbounds <4 x i8>, ptr %lhsptr, i64 %gid + %lhs = load <4 x i8>, ptr %arrayidx.lhs, align 4 + %arrayidx.rhs = getelementptr inbounds <4 x i8>, ptr %rhsptr, i64 %gid + %rhs = load <4 x i8>, ptr %arrayidx.rhs, align 4 + %shuffle_up = call <4 x i8> @__mux_sub_group_shuffle_up_v4i8(<4 x i8> %lhs, <4 x i8> %rhs, i32 2) + %arrayidx.out = getelementptr inbounds <4 x i8>, ptr %out, i64 %gid + store <4 x i8> %shuffle_up, ptr %arrayidx.out, align 4 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_delta(ptr %lhsptr, ptr %rhsptr, ptr %deltaptr, ptr %out) +; CHECK: [[LHS:%.*]] = load <4 x float>, ptr %arrayidx.lhs, align 4 +; CHECK: [[RHS:%.*]] = load <4 x float>, ptr %arrayidx.rhs, align 4 +; CHECK: [[DELTALD:%.*]] = load <4 x i32>, ptr %arrayidx.deltas, align 4 + +; CHECK: [[DELTAS:%.*]] = sub <4 x i32> {{%.*}}, 
[[DELTALD]] +; CHECK: [[QUOTIENT:%.*]] = sdiv <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[REMAINDER:%.*]] = srem <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; CHECK: [[ARGXOR:%.*]] = xor <4 x i32> [[DELTAS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[SIGNDIFF:%.*]] = icmp slt <4 x i32> [[ARGXOR]], zeroinitializer +; CHECK: [[REMNONZERO:%.*]] = icmp ne <4 x i32> [[REMAINDER]], zeroinitializer +; CHECK: [[CONDITION:%.*]] = and <4 x i1> [[REMNONZERO]], [[SIGNDIFF]] + +; CHECK: [[MIN1:%.*]] = sub <4 x i32> [[QUOTIENT]], {{<(i32 1(, )?)+>|splat \(i32 1\)}} +; CHECK: [[PLUSR:%.*]] = add <4 x i32> [[REMAINDER]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; CHECK: [[MUXIDS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[MIN1]], <4 x i32> [[QUOTIENT]] +; CHECK: [[VECELTS:%.*]] = select <4 x i1> [[CONDITION]], <4 x i32> [[PLUSR]], <4 x i32> [[REMAINDER]] + +; CHECK: [[MUXDELTAS:%.*]] = sub <4 x i32> {{%.*}}, [[MUXIDS]] + +; CHECK: [[DELTA0:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 0 +; CHECK: [[SHUFF0:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA0]]) +; CHECK: [[VECIDX0:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 0 +; CHECK: [[ELT0:%.*]] = extractelement <4 x float> [[SHUFF0]], i32 [[VECIDX0]] + +; CHECK: [[DELTA1:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 1 +; CHECK: [[SHUFF1:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA1]]) +; CHECK: [[VECIDX1:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 1 +; CHECK: [[ELT1:%.*]] = extractelement <4 x float> [[SHUFF1]], i32 [[VECIDX1]] + +; CHECK: [[DELTA2:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 2 +; CHECK: [[SHUFF2:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA2]]) +; CHECK: [[VECIDX2:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 2 +; CHECK: [[ELT2:%.*]] = extractelement <4 x float> [[SHUFF2]], i32 [[VECIDX2]] + +; CHECK: [[DELTA3:%.*]] = extractelement <4 x i32> [[MUXDELTAS]], i32 3 +; CHECK: [[SHUFF3:%.*]] = call <4 x float> @__mux_sub_group_shuffle_up_v4f32( +; CHECK-SAME: <4 x float> [[LHS]], <4 x float> [[RHS]], i32 [[DELTA3]]) +; CHECK: [[VECIDX3:%.*]] = extractelement <4 x i32> [[VECELTS]], i32 3 +; CHECK: [[ELT3:%.*]] = extractelement <4 x float> [[SHUFF3]], i32 [[VECIDX3]] +define spir_kernel void @kernel_varying_delta(ptr %lhsptr, ptr %rhsptr, ptr %deltaptr, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.lhs = getelementptr inbounds float, ptr %lhsptr, i64 %gid + %lhs = load float, ptr %arrayidx.lhs, align 4 + %arrayidx.rhs = getelementptr inbounds float, ptr %rhsptr, i64 %gid + %rhs = load float, ptr %arrayidx.rhs, align 4 + %arrayidx.deltas = getelementptr inbounds i32, ptr %deltaptr, i64 %gid + %delta = load i32, ptr %arrayidx.deltas, align 4 + %shuffle_up = call float @__mux_sub_group_shuffle_up_f32(float %lhs, float %rhs, i32 %delta) + %arrayidx.out = getelementptr inbounds float, ptr %out, i64 %gid + store float %shuffle_up, ptr %arrayidx.out, align 8 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare float @__mux_sub_group_shuffle_up_f32(float %prev, float %curr, i32 %delta) +declare <4 x i8> @__mux_sub_group_shuffle_up_v4i8(<4 x i8> %prev, <4 x i8> %curr, i32 %delta) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll 
b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll new file mode 100644 index 0000000000000..c1aaca731d2cd --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/subgroup_shuffle_xor.ll @@ -0,0 +1,231 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=packetizer,verify -S \ +; RUN: --pass-remarks-missed=vecz < %s 2>&1 | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-p:64:64:64-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_data_const_value(ptr %in, ptr %out) +; The XOR'd sub-group local IDs +; CHECK: [[XORIDS:%.*]] = xor <4 x i32> +; Which mux sub-group each of the XOR'd sub-group local IDs correspond to +; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; Which vector group element each of the XOR'd sub-group local IDs correspond to +; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; Extract the first XOR'd vector-local sub-group local ID from the vector of vector indices +; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0 +; Extract the data element that this XOR'd local ID corresponds to +; CHECK: [[ELT0:%.*]] = extractelement <4 x half> [[DATA:%.*]], i32 [[IDXELT0]] +; Extract the first XOR'd mux-local sub-group local ID from the vector of mux indices +; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0 +; Shuffle across any hardware sub-group +; CHECK: [[SHUFF_ELT0:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT0]], i32 [[ID0]]) +; Put that result into the final vector +; CHECK: [[SHUFF_VEC0:%.*]] = insertelement <4 x half> poison, half [[SHUFF_ELT0]], i32 0 + +; And so on for the other shuffle values +; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1 +; CHECK: [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT1]] +; CHECK: [[ID1:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 1 +; CHECK: [[SHUFF_ELT1:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT1]], i32 [[ID1]]) +; CHECK: [[SHUFF_VEC1:%.*]] = insertelement <4 x half> [[SHUFF_VEC0]], half [[SHUFF_ELT1]], i32 1 + +; CHECK: [[IDXELT2:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 2 +; CHECK: [[ELT2:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT2]] +; CHECK: [[ID2:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 2 +; CHECK: [[SHUFF_ELT2:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT2]], i32 [[ID2]]) +; CHECK: [[SHUFF_VEC2:%.*]] = insertelement <4 x half> [[SHUFF_VEC1]], half [[SHUFF_ELT2]], i32 2 + +; CHECK: [[IDXELT3:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 3 +; CHECK: [[ELT3:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT3]] +; CHECK: [[ID3:%.*]] = 
extractelement <4 x i32> [[MUXXORIDS]], i32 3 +; CHECK: [[SHUFF_ELT3:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT3]], i32 [[ID3]]) +; CHECK: [[SHUFF_VEC3:%.*]] = insertelement <4 x half> [[SHUFF_VEC2]], half [[SHUFF_ELT3]], i32 3 + +; CHECK: store <4 x half> [[SHUFF_VEC3]], +define spir_kernel void @kernel_varying_data_const_value(ptr %in, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.in = getelementptr inbounds half, ptr %in, i64 %gid + %data = load half, ptr %arrayidx.in, align 2 + %shuffle1 = call half @__mux_sub_group_shuffle_xor_f16(half %data, i32 4) + %arrayidx.out = getelementptr inbounds half, ptr %out, i64 %gid + store half %shuffle1, ptr %arrayidx.out, align 2 + ret void +} + +; This should just be the same as the previous kernel. The uniform value doesn't change anything. +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_data_uniform_value(ptr %in, i32 %val, ptr %out) +; CHECK: [[XORIDS:%.*]] = xor <4 x i32> +; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0 +; CHECK: [[ELT0:%.*]] = extractelement <4 x half> [[DATA:%.*]], i32 [[IDXELT0]] +; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0 +; CHECK: [[SHUFF_ELT0:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT0]], i32 [[ID0]]) +; CHECK: [[SHUFF_VEC0:%.*]] = insertelement <4 x half> poison, half [[SHUFF_ELT0]], i32 0 +; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1 +; CHECK: [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT1]] +; CHECK: [[ID1:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 1 +; CHECK: [[SHUFF_ELT1:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT1]], i32 [[ID1]]) +; CHECK: [[SHUFF_VEC1:%.*]] = insertelement <4 x half> [[SHUFF_VEC0]], half [[SHUFF_ELT1]], i32 1 +; CHECK: [[IDXELT2:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 2 +; CHECK: [[ELT2:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT2]] +; CHECK: [[ID2:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 2 +; CHECK: [[SHUFF_ELT2:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT2]], i32 [[ID2]]) +; CHECK: [[SHUFF_VEC2:%.*]] = insertelement <4 x half> [[SHUFF_VEC1]], half [[SHUFF_ELT2]], i32 2 +; CHECK: [[IDXELT3:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 3 +; CHECK: [[ELT3:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT3]] +; CHECK: [[ID3:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 3 +; CHECK: [[SHUFF_ELT3:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT3]], i32 [[ID3]]) +; CHECK: [[SHUFF_VEC3:%.*]] = insertelement <4 x half> [[SHUFF_VEC2]], half [[SHUFF_ELT3]], i32 3 +; CHECK: store <4 x half> [[SHUFF_VEC3]], +define spir_kernel void @kernel_varying_data_uniform_value(ptr %in, i32 %val, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.in = getelementptr inbounds half, ptr %in, i64 %gid + %data = load half, ptr %arrayidx.in, align 2 + %shuffle2 = call half @__mux_sub_group_shuffle_xor_f16(half %data, i32 %val) + %arrayidx.out = getelementptr inbounds half, ptr %out, i64 %gid + store half %shuffle2, ptr %arrayidx.out, align 2 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_uniform_data_uniform_value(half %data, i32 %val, ptr %out) +; CHECK: [[SPLATINS:%.*]] = insertelement <4 x half> poison, half %data, 
i64 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x half> [[SPLATINS]], <4 x half> poison, <4 x i32> zeroinitializer +; CHECK: store <4 x half> [[SPLAT]] +define spir_kernel void @kernel_uniform_data_uniform_value(half %data, i32 %val, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %shuffle3 = call half @__mux_sub_group_shuffle_xor_f16(half %data, i32 %val) + %arrayidx.out = getelementptr inbounds half, ptr %out, i64 %gid + store half %shuffle3, ptr %arrayidx.out, align 2 + ret void +} + +; This should just be the same as the previous kernel. The varying value doesn't change anything. +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_data_varying_value(ptr %in, ptr %vals, ptr %out) +; CHECK: [[XORIDS:%.*]] = xor <4 x i32> +; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0 +; CHECK: [[ELT0:%.*]] = extractelement <4 x half> [[DATA:%.*]], i32 [[IDXELT0]] +; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0 +; CHECK: [[SHUFF_ELT0:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT0]], i32 [[ID0]]) +; CHECK: [[SHUFF_VEC0:%.*]] = insertelement <4 x half> poison, half [[SHUFF_ELT0]], i32 0 +; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1 +; CHECK: [[ELT1:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT1]] +; CHECK: [[ID1:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 1 +; CHECK: [[SHUFF_ELT1:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT1]], i32 [[ID1]]) +; CHECK: [[SHUFF_VEC1:%.*]] = insertelement <4 x half> [[SHUFF_VEC0]], half [[SHUFF_ELT1]], i32 1 +; CHECK: [[IDXELT2:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 2 +; CHECK: [[ELT2:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT2]] +; CHECK: [[ID2:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 2 +; CHECK: [[SHUFF_ELT2:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT2]], i32 [[ID2]]) +; CHECK: [[SHUFF_VEC2:%.*]] = insertelement <4 x half> [[SHUFF_VEC1]], half [[SHUFF_ELT2]], i32 2 +; CHECK: [[IDXELT3:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 3 +; CHECK: [[ELT3:%.*]] = extractelement <4 x half> [[DATA]], i32 [[IDXELT3]] +; CHECK: [[ID3:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 3 +; CHECK: [[SHUFF_ELT3:%.*]] = call half @__mux_sub_group_shuffle_f16(half [[ELT3]], i32 [[ID3]]) +; CHECK: [[SHUFF_VEC3:%.*]] = insertelement <4 x half> [[SHUFF_VEC2]], half [[SHUFF_ELT3]], i32 3 +; CHECK: store <4 x half> [[SHUFF_VEC3]], +define spir_kernel void @kernel_varying_data_varying_value(ptr %in, ptr %vals, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.in = getelementptr inbounds half, ptr %in, i64 %gid + %data = load half, ptr %arrayidx.in, align 2 + %arrayidx.vals = getelementptr inbounds i32, ptr %in, i64 %gid + %val = load i32, ptr %arrayidx.vals, align 4 + %shuffle4 = call half @__mux_sub_group_shuffle_xor_f16(half %data, i32 %val) + %arrayidx.out = getelementptr inbounds half, ptr %out, i64 %gid + store half %shuffle4, ptr %arrayidx.out, align 2 + ret void +} + +; CHECK-LABEL: define spir_kernel void @__vecz_v4_kernel_varying_vec_data_varying_value(ptr %in, ptr %vals, ptr %out) +; CHECK: [[XORIDS:%.*]] = xor <4 x i32> +; CHECK-DAG: [[MUXXORIDS:%.*]] = udiv <4 x i32> [[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} +; CHECK-DAG: [[VECXORIDS:%.*]] = urem <4 x i32> 
[[XORIDS]], {{<(i32 4(, )?)+>|splat \(i32 4\)}} + +; CHECK: [[IDXELT0:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 0 +; CHECK: [[MULIDXELT0:%.*]] = mul i32 [[IDXELT0]], 2 +; CHECK: [[MADIDXELT00:%.*]] = add i32 [[MULIDXELT0]], 0 +; CHECK: [[ELT00:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT00]] +; CHECK: [[DATAELT00:%.*]] = insertelement <2 x float> poison, float [[ELT00]], i32 0 +; CHECK: [[MADIDXELT01:%.*]] = add i32 [[MULIDXELT0]], 1 +; CHECK: [[ELT01:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT01]] +; CHECK: [[DATAELT01:%.*]] = insertelement <2 x float> [[DATAELT00]], float [[ELT01]], i32 1 +; CHECK: [[ID0:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 0 +; CHECK: [[SHUFF_ELT0:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[DATAELT01]], i32 [[ID0]]) +; CHECK: [[SHUFF_RES0:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32( +; CHECK-SAME: <8 x float> poison, <2 x float> [[SHUFF_ELT0]], i64 0) + +; CHECK: [[IDXELT1:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 1 +; CHECK: [[MULIDXELT1:%.*]] = mul i32 [[IDXELT1]], 2 +; CHECK: [[MADIDXELT10:%.*]] = add i32 [[MULIDXELT1]], 0 +; CHECK: [[ELT10:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT10]] +; CHECK: [[DATAELT10:%.*]] = insertelement <2 x float> poison, float [[ELT10]], i32 0 +; CHECK: [[MADIDXELT11:%.*]] = add i32 [[MULIDXELT1]], 1 +; CHECK: [[ELT11:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT11]] +; CHECK: [[DATAELT11:%.*]] = insertelement <2 x float> [[DATAELT10]], float [[ELT11]], i32 1 +; CHECK: [[ID1:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 1 +; CHECK: [[SHUFF_ELT1:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[DATAELT11]], i32 [[ID1]]) +; CHECK: [[SHUFF_RES1:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32( +; CHECK-SAME: <8 x float> [[SHUFF_RES0]], <2 x float> [[SHUFF_ELT1]], i64 2) + +; CHECK: [[IDXELT2:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 2 +; CHECK: [[MULIDXELT2:%.*]] = mul i32 [[IDXELT2]], 2 +; CHECK: [[MADIDXELT20:%.*]] = add i32 [[MULIDXELT2]], 0 +; CHECK: [[ELT20:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT20]] +; CHECK: [[DATAELT20:%.*]] = insertelement <2 x float> poison, float [[ELT20]], i32 0 +; CHECK: [[MADIDXELT21:%.*]] = add i32 [[MULIDXELT2]], 1 +; CHECK: [[ELT21:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT21]] +; CHECK: [[DATAELT21:%.*]] = insertelement <2 x float> [[DATAELT20]], float [[ELT21]], i32 1 +; CHECK: [[ID2:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], i32 2 +; CHECK: [[SHUFF_ELT2:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[DATAELT21]], i32 [[ID2]]) +; CHECK: [[SHUFF_RES2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32( +; CHECK-SAME: <8 x float> [[SHUFF_RES1]], <2 x float> [[SHUFF_ELT2]], i64 4) + +; CHECK: [[IDXELT3:%.*]] = extractelement <4 x i32> [[VECXORIDS]], i32 3 +; CHECK: [[MULIDXELT3:%.*]] = mul i32 [[IDXELT3]], 2 +; CHECK: [[MADIDXELT30:%.*]] = add i32 [[MULIDXELT3]], 0 +; CHECK: [[ELT30:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT30]] +; CHECK: [[DATAELT30:%.*]] = insertelement <2 x float> poison, float [[ELT30]], i32 0 +; CHECK: [[MADIDXELT31:%.*]] = add i32 [[MULIDXELT3]], 1 +; CHECK: [[ELT31:%.*]] = extractelement <8 x float> [[DATA:%.*]], i32 [[MADIDXELT31]] +; CHECK: [[DATAELT31:%.*]] = insertelement <2 x float> [[DATAELT30]], float [[ELT31]], i32 1 +; CHECK: [[ID3:%.*]] = extractelement <4 x i32> [[MUXXORIDS]], 
i32 3 +; CHECK: [[SHUFF_ELT3:%.*]] = call <2 x float> @__mux_sub_group_shuffle_v2f32(<2 x float> [[DATAELT31]], i32 [[ID3]]) +; CHECK: [[SHUFF_RES3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32( +; CHECK-SAME: <8 x float> [[SHUFF_RES2]], <2 x float> [[SHUFF_ELT3]], i64 6) + +; CHECK: store <8 x float> [[SHUFF_RES3]] +define spir_kernel void @kernel_varying_vec_data_varying_value(ptr %in, ptr %vals, ptr %out) { + %gid = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx.in = getelementptr inbounds <2 x float>, ptr %in, i64 %gid + %data = load <2 x float>, ptr %arrayidx.in, align 8 + %arrayidx.vals = getelementptr inbounds i32, ptr %in, i64 %gid + %val = load i32, ptr %arrayidx.vals, align 4 + %shuffle5 = call <2 x float> @__mux_sub_group_shuffle_xor_v2f32(<2 x float> %data, i32 %val) + %arrayidx.out = getelementptr inbounds <2 x float>, ptr %out, i64 %gid + store <2 x float> %shuffle5, ptr %arrayidx.out, align 8 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare half @__mux_sub_group_shuffle_xor_f16(half, i32) +declare <2 x float> @__mux_sub_group_shuffle_xor_v2f32(<2 x float>, i32) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll new file mode 100644 index 0000000000000..2a8464528d01d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform.ll @@ -0,0 +1,118 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -vecz-passes=ternary-transform,verify -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_positive(i64 %a, i64 %b, i64* %c) {
+entry:
+  %gid = call i64 @__mux_get_global_id(i32 0)
+  %cond = icmp eq i64 %a, %gid
+  %c0 = getelementptr i64, i64* %c, i64 %gid
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 0
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1
+  %c3 = getelementptr i64, i64* %c2, i64 %gid
+  store i64 1, i64* %c3, align 4
+  ret void
+}
+
+define spir_kernel void @test_positive_gep_different_type(i64 %a, i64 %b, i8* %c) {
+entry:
+  %gid = call i64 @__mux_get_global_id(i32 0)
+  %cond = icmp eq i64 %a, %gid
+  %c0 = getelementptr i64, i64* %c, i64 %gid
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 0
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1
+  %c3 = getelementptr i8, i8* %c2, i64 %gid
+  store i8 1, i8* %c3, align 4
+  ret void
+}
+
+define spir_kernel void @test_negative(i64 %a, i64 %b, i64* %c) {
+entry:
+  %gid = call i64 @__mux_get_global_id(i32 0)
+  %cond = icmp eq i64 %a, %gid
+  %c0 = getelementptr i64, i64* %c, i64 %gid
+  %c1 = getelementptr i64, i64* %c, i64 0
+  %c2 = select i1 %cond, i64* %c0, i64* %c1
+  store i64 %b, i64* %c2, align 4
+  ret void
+}
+
+define spir_kernel void @test_vector_scalar_cond(i64 %a, <2 x i32> %b, <2 x i32>* %c) {
+entry:
+  %gid = call i64 @__mux_get_global_id(i32 0)
+  %cond = icmp eq i64 %a, %gid
+  %c0 = getelementptr <2 x i32>, <2 x i32>* %c, i64 %gid
+  %c1 = getelementptr <2 x i32>, <2 x i32>* %c, i64 0
+  %c2 = select i1 %cond, <2 x i32>* %c0, <2 x i32>* %c1
+  %c3 = getelementptr <2 x i32>, <2 x i32>* %c2, i64 %gid
+  store <2 x i32> <i32 0, i32 1>, <2 x i32>* %c3, align 4
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_positive(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: store i64 %b, ptr %c0, align 4
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 0
+; CHECK: store i64 0, ptr %c1, align 4
+; CHECK: %[[XOR:.+]] = xor i1 %cond, true
+; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid
+; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond)
+; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]])
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_positive_gep_different_type(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: store i64 %b, ptr %c0, align 4
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 0
+; CHECK: store i64 0, ptr %c1, align 4
+; CHECK: %[[XOR:.+]] = xor i1 %cond, true
+; CHECK: %[[GEP1:.+]] = getelementptr i8, ptr %c0, i64 %gid
+; CHECK: %[[GEP2:.+]] = getelementptr i8, ptr %c1, i64 %gid
+; CHECK: call void @__vecz_b_masked_store4_hu3ptrb(i8 1, ptr %[[GEP1]], i1 %cond)
+; CHECK: call void @__vecz_b_masked_store4_hu3ptrb(i8 1, ptr %[[GEP2]], i1 %[[XOR]])
+
+; CHECK-LABEL: define spir_kernel void @__vecz_v4_test_negative(i64 %a, i64 %b, ptr %c)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid
+; CHECK: %c1 = getelementptr i64, ptr %c, i64 0
+; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1
+; CHECK: store i64 %b, ptr %c2, align 4
+
+; Note: we don't perform this transform on vector accesses.
+; CHECK: define spir_kernel void @__vecz_v4_test_vector_scalar_cond(i64 %a, <2 x i32> %b, ptr %c)
+; CHECK: %gid = call i64 @__mux_get_global_id(i32 0)
+; CHECK: %cond = icmp eq i64 %a, %gid
+; CHECK: %c0 = getelementptr <2 x i32>, ptr %c, i64 %gid
+; CHECK: %c1 = getelementptr <2 x i32>, ptr %c, i64 0
+; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1
+; CHECK: %c3 = getelementptr <2 x i32>, ptr %c2, i64 %gid
+; CHECK: store <2 x i32> <i32 0, i32 1>, ptr %c3, align 4
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
new file mode 100644
index 0000000000000..69756d0886cc3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_different_strides.ll
@@ -0,0 +1,54 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) {
+entry:
+  %gid = call i64 @__mux_get_global_id(i32 0)
+  %gid_shift = shl i64 %gid, 1
+  %cond = icmp eq i64 %a, %gid
+  %c0 = getelementptr i64, i64* %c, i64 %gid
+  store i64 %b, i64* %c0, align 4
+  %c1 = getelementptr i64, i64* %c, i64 %gid_shift
+  store i64 0, i64* %c1, align 4
+  %c2 = select i1 %cond, i64* %c0, i64* %c1
+  %c3 = getelementptr i64, i64* %c2, i64 %gid
+  store i64 1, i64* %c3, align 4
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)
+
+; This checks that the ternary transform is applied when the source GEPs have
+; constant strides, even though they are different.
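+;
+; The transform handles each arm of the select separately, so each arm only
+; needs a constant stride of its own: %c0 advances by one element per
+; work-item and %c1 by two. Roughly (the register names below are
+; illustrative, not the literal pass output), the tail of the kernel becomes:
+;
+;   %not = xor i1 %cond, true
+;   %t = getelementptr i64, ptr %c0, i64 %gid
+;   %f = getelementptr i64, ptr %c1, i64 %gid
+;   call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %t, i1 %cond)
+;   call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %f, i1 %not)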
+ +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %gid_shift = shl i64 %gid, 1 +; CHECK: %cond = icmp eq i64 %a, %gid +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_shift +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %[[XOR:.+]] = xor i1 %cond, true +; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid +; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond) +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll new file mode 100644 index 0000000000000..7636e5411a171 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_gep.ll @@ -0,0 +1,54 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %gid_offset = add i64 %gid, 16 + %gid_mashed = xor i64 %gid, 12462 + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 %gid_offset + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid_mashed + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform pass is not applied when the GEP index +; is divergent, which would result in a scatter store regardless. 
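+;
+; Concretely, for global IDs 0..3 the final index %gid_mashed = %gid xor 12462
+; takes the values 12462, 12463, 12460 and 12461, so the lanes do not advance
+; by a constant stride and per-arm masked stores would not help.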
+ +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid_offset = add i64 %gid, 16 +; CHECK: %gid_mashed = xor i64 %gid, 12462 +; CHECK: %cond = icmp eq i64 %a, %gid +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_offset +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1 +; CHECK: %c3 = getelementptr i64, ptr %c2, i64 %gid_mashed +; CHECK: store i64 1, ptr %c3, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll new file mode 100644 index 0000000000000..02573c3ce0b59 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_divergent_source.ll @@ -0,0 +1,54 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %gid_offset = add i64 %gid, 16 + %gid_mashed = xor i64 %gid, 12462 + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 %gid_mashed + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform pass is not applied when a source GEP +; is divergent, which would result in a scatter store regardless. 
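+;
+; Here the trailing GEP index (%gid) is linear, but the false arm %c1 is
+; itself indexed by the non-linear %gid_mashed, so a masked store through
+; that arm would still have to scatter; the transform therefore bails out.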
+ +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid_offset = add i64 %gid, 16 +; CHECK: %gid_mashed = xor i64 %gid, 12462 +; CHECK: %cond = icmp eq i64 %a, %gid +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_mashed +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1 +; CHECK: %c3 = getelementptr i64, ptr %c2, i64 %gid +; CHECK: store i64 1, ptr %c3, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll new file mode 100644 index 0000000000000..fe73640be0612 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_negative.ll @@ -0,0 +1,44 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_negative -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_negative(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 %gid + %c1 = getelementptr i64, i64* %c, i64 0 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + store i64 %b, i64* %c2, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform is not applied when the select is not +; accessed through an additional GEP. + +; CHECK: define spir_kernel void @__vecz_v4_test_negative(i64 %a, i64 %b, ptr %c) +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %cond = icmp eq i64 %a, %gid +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: %c1 = getelementptr i64, ptr %c, i64 0 +; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1 +; CHECK: store i64 %b, ptr %c2, align 4 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll new file mode 100644 index 0000000000000..6eff9b6ad58e4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_positive.ll @@ -0,0 +1,54 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %gid_offset = add i64 %gid, 16 + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 %gid_offset + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform is applied when the source GEPs have +; equal constant strides. + +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %gid_offset = add i64 %gid, 16 +; CHECK: %cond = icmp eq i64 %a, %gid +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_offset +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %[[XOR:.+]] = xor i1 %cond, true +; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid +; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond) +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll new file mode 100644 index 0000000000000..8e88963b75871 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_cond_diff_strides.ll @@ -0,0 +1,54 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %gid_shift = shl i64 %gid, 1 + %cond = icmp eq i64 %a, 0 + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 %gid_shift + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform is applied when the condition is +; uniform, and the source GEPs have different constant strides. + +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %gid_shift = shl i64 %gid, 1 +; CHECK: %cond = icmp eq i64 %a, 0 +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_shift +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %[[XOR:.+]] = xor i1 %cond, true +; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid +; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond) +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll new file mode 100644 index 0000000000000..3cee1ff3eb4b4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition.ll @@ -0,0 +1,52 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %gid_offset = add i64 %gid, 16 + %cond = icmp eq i64 %a, 0 + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 %gid_offset + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform is not applied when the condition is +; uniform, and the two strides are the same. + +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid_offset = add i64 %gid, 16 +; CHECK: %cond = icmp eq i64 %a, 0 +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 %gid_offset +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %c2 = select i1 %cond, ptr %c0, ptr %c1 +; CHECK: %c3 = getelementptr i64, ptr %c2, i64 %gid +; CHECK: store i64 1, ptr %c3, align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll new file mode 100644 index 0000000000000..1f2b59b23456d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_condition_packetized.ll @@ -0,0 +1,46 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform,packetizer -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %gid_offset = add i64 %gid, 16 + %cond = icmp eq i64 %a, 0 + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 %gid_offset + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 0 + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform is not applied when the condition is +; uniform and the two strides are equal, and that the result is a contiguous +; vector store. 
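+;
+; An illustrative sketch, not part of the original test: because %cond is
+; uniform and both arms of the select stride identically, the select resolves
+; to a single lane-invariant base pointer, so no per-lane masking is needed
+; and the packetizer can emit one contiguous vector store, roughly:
+;
+;   %p = select i1 %cond, ptr %c0, ptr %c1
+;   store <4 x i64> splat (i64 1), ptr %p, align 4
+;
+; The CHECK line below accepts both the <i64 1, ...> and the splat spellings
+; of the vector constant, since the printed form differs across LLVM versions.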
+ +; CHECK: %[[SELECT:.+]] = select i1 %cond, ptr %c0, ptr %c1 +; CHECK: %[[BASE:.+]] = getelementptr i64, ptr %[[SELECT]], i64 0 +; CHECK: store <4 x i64> {{<(i64 1(, )?)+>|splat \(i64 1\)}}, ptr %[[BASE]], align 4 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll new file mode 100644 index 0000000000000..a9d1a37b305b8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_source.ll @@ -0,0 +1,52 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 %gid + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 0 + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform is applied when one of the source GEPs +; is uniform + +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %cond = icmp eq i64 %a, %gid +; CHECK: %c0 = getelementptr i64, ptr %c, i64 %gid +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 0 +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %[[XOR:.+]] = xor i1 %cond, true +; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid +; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond) +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll new file mode 100644 index 0000000000000..b577f149f82e3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/ternary_transform_uniform_sources.ll @@ -0,0 +1,52 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_ternary -vecz-passes=ternary-transform -vecz-simd-width=4 -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test_ternary(i64 %a, i64 %b, i64* %c) { +entry: + %gid = call i64 @__mux_get_global_id(i32 0) + %cond = icmp eq i64 %a, %gid + %c0 = getelementptr i64, i64* %c, i64 1 + store i64 %b, i64* %c0, align 4 + %c1 = getelementptr i64, i64* %c, i64 0 + store i64 0, i64* %c1, align 4 + %c2 = select i1 %cond, i64* %c0, i64* %c1 + %c3 = getelementptr i64, i64* %c2, i64 %gid + store i64 1, i64* %c3, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This checks that the ternary transform is applied when the source GEPs are +; both uniform. + +; CHECK: define spir_kernel void @__vecz_v4_test_ternary(i64 %a, i64 %b, ptr %c) +; CHECK: %gid = call i64 @__mux_get_global_id(i32 0) +; CHECK: %cond = icmp eq i64 %a, %gid +; CHECK: %c0 = getelementptr i64, ptr %c, i64 1 +; CHECK: store i64 %b, ptr %c0, align 4 +; CHECK: %c1 = getelementptr i64, ptr %c, i64 0 +; CHECK: store i64 0, ptr %c1, align 4 +; CHECK: %[[XOR:.+]] = xor i1 %cond, true +; CHECK: %[[GEP1:.+]] = getelementptr i64, ptr %c0, i64 %gid +; CHECK: %[[GEP2:.+]] = getelementptr i64, ptr %c1, i64 %gid +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP1]], i1 %cond) +; CHECK: call void @__vecz_b_masked_store4_mu3ptrb(i64 1, ptr %[[GEP2]], i1 %[[XOR]]) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll new file mode 100644 index 0000000000000..8b5d83c3b6835 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_packetization.ll @@ -0,0 +1,118 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -vecz-simd-width=128 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +; CHECK-LABEL: define spir_kernel void @__vecz_v128_add(ptr addrspace(1) %in1, ptr addrspace(1) %in2, ptr addrspace(1) %out) +; CHECK: = load <128 x i32>, ptr addrspace(1) +; CHECK: = load <128 x i32>, ptr addrspace(1) +; CHECK: = add nsw <128 x i32> +; CHECK: store <128 x i32> +define spir_kernel void @add(i32 addrspace(1)* %in1, i32 addrspace(1)* %in2, i32 addrspace(1)* %out) #0 !dbg !4 { +entry: + %in1.addr = alloca i32 addrspace(1)*, align 8 + %in2.addr = alloca i32 addrspace(1)*, align 8 + %out.addr = alloca i32 addrspace(1)*, align 8 + %tid = alloca i64, align 8 + %a = alloca i32, align 4 + %b = alloca i32, align 4 + store i32 addrspace(1)* %in1, i32 addrspace(1)** %in1.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in1.addr, metadata !11, metadata !29), !dbg !30 + store i32 addrspace(1)* %in2, i32 addrspace(1)** %in2.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %in2.addr, metadata !12, metadata !29), !dbg !30 + store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 8 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %out.addr, metadata !13, metadata !29), !dbg !30 + call void @llvm.dbg.declare(metadata i64* %tid, metadata !14, metadata !29), !dbg !31 + %call = call i64 @__mux_get_global_id(i32 0) #3, !dbg !31 + store i64 %call, i64* %tid, align 8, !dbg !31 + call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !29), !dbg !32 + %0 = load i64, i64* %tid, align 8, !dbg !32 + %1 = load i32 addrspace(1)*, i32 addrspace(1)** %in1.addr, align 8, !dbg !32 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 %0, !dbg !32 + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !32 + store i32 %2, i32* %a, align 4, !dbg !32 + call void @llvm.dbg.declare(metadata i32* %b, metadata !20, metadata !29), !dbg !33 + %3 = load i64, i64* %tid, align 8, !dbg !33 + %4 = load i32 addrspace(1)*, i32 addrspace(1)** %in2.addr, align 8, !dbg !33 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %4, i64 %3, !dbg !33 + %5 = load i32, i32 addrspace(1)* %arrayidx1, align 4, !dbg !33 + store i32 %5, i32* %b, align 4, !dbg !33 + %6 = load i32, i32* %a, align 4, !dbg !34 + %7 = load i32, i32* %b, align 4, !dbg !34 + %add = add nsw i32 %6, %7, !dbg !34 + %8 = load i64, i64* %tid, align 8, !dbg !34 + %9 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 8, !dbg !34 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %8, !dbg !34 + store i32 %add, i32 addrspace(1)* %arrayidx2, align 4, !dbg !34 + ret void, !dbg !35 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +declare i64 @__mux_get_global_id(i32) #2 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" 
"unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nobuiltin } + +!llvm.dbg.cu = !{!0} +!opencl.kernels = !{!21} +!llvm.module.flags = !{!27} +!llvm.ident = !{!28} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2) +!1 = !DIFile(filename: "", directory: "/tmp") +!2 = !{} +!3 = !{!4} +!4 = distinct !DISubprogram(name: "add", scope: !5, file: !5, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !10) +!5 = !DIFile(filename: "kernel.opencl", directory: "/tmp") +!6 = !DISubroutineType(types: !7) +!7 = !{null, !8, !8, !8} +!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64, align: 64) +!9 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!10 = !{!11, !12, !13, !14, !19, !20} +!11 = !DILocalVariable(name: "in1", arg: 1, scope: !4, file: !5, line: 1, type: !8) +!12 = !DILocalVariable(name: "in2", arg: 2, scope: !4, file: !5, line: 1, type: !8) +!13 = !DILocalVariable(name: "out", arg: 3, scope: !4, file: !5, line: 1, type: !8) +!14 = !DILocalVariable(name: "tid", scope: !4, file: !5, line: 3, type: !15) +!15 = !DIDerivedType(tag: DW_TAG_typedef, name: "size_t", file: !16, line: 33, baseType: !17) +!16 = !DIFile(filename: "/Aorta/OCL/modules/builtins/include/builtins/builtins.h", directory: "/tmp") +!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "ulong", file: !16, line: 31, baseType: !18) +!18 = !DIBasicType(name: "long unsigned int", size: 64, align: 64, encoding: DW_ATE_unsigned) +!19 = !DILocalVariable(name: "a", scope: !4, file: !5, line: 5, type: !9) +!20 = !DILocalVariable(name: "b", scope: !4, file: !5, line: 6, type: !9) +!21 = !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32 addrspace(1)*)* @add, !22, !23, !24, !25, !26} +!22 = !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1} +!23 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!24 = !{!"kernel_arg_type", !"int*", !"int*", !"int*"} +!25 = !{!"kernel_arg_base_type", !"int*", !"int*", !"int*"} +!26 = !{!"kernel_arg_type_qual", !"", !"", !""} +!27 = !{i32 2, !"Debug Info Version", i32 3} +!28 = !{!"clang version 3.8.0 "} +!29 = !DIExpression() +!30 = !DILocation(line: 1, scope: !4) +!31 = !DILocation(line: 3, scope: !4) +!32 = !DILocation(line: 5, scope: !4) +!33 = !DILocation(line: 6, scope: !4) +!34 = !DILocation(line: 7, scope: !4) +!35 = !DILocation(line: 8, scope: !4) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll new file mode 100644 index 0000000000000..0f667a71134e5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/too_large_simdwidth_scalarization.ll @@ -0,0 +1,40 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -vecz-passes=scalarizer -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +; CHECK-LABEL: define spir_kernel void @__vecz_v4_add(ptr %in1, ptr %in2, ptr %out) +; CHECK-COUNT-128: = extractelement <128 x i32> %in1v, +; CHECK-COUNT-128: insertelement <128 x i32> +define spir_kernel void @add(<128 x i32>* %in1, <128 x i32>* %in2, <128 x i32>* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %in1p = getelementptr inbounds <128 x i32>, <128 x i32>* %in1, i64 %call + %in1v = load <128 x i32>, <128 x i32>* %in1p, align 4 + %in2p = getelementptr inbounds <128 x i32>, <128 x i32>* %in2, i64 %call + %in2v = load <128 x i32>, <128 x i32>* %in2p, align 4 + %add = add nsw <128 x i32> %in1v, %in2v + %outp = getelementptr inbounds <128 x i32>, <128 x i32>* %out, i64 %call + store <128 x i32> %add, <128 x i32>* %outp, align 4 + ret void +} + +declare i64 @__mux_get_global_id(i32) #2 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll new file mode 100644 index 0000000000000..d7b37641357b8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_base.ll @@ -0,0 +1,56 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k uniform_address_index -w 4 -S < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+define spir_kernel void @uniform_address_index(i32 addrspace(1)* nocapture readonly %in, i32 addrspace(1)* nocapture %out, i32 %a, i32 %b) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #2
+  %0 = icmp eq i32 %a, -2147483648
+  %1 = icmp eq i32 %b, -1
+  %2 = and i1 %0, %1
+  %3 = icmp eq i32 %b, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %b
+  %div = sdiv i32 %a, %5
+  %6 = trunc i64 %call to i32
+  %conv1 = add i32 %div, %6
+  %idxprom = sext i32 %conv1 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %7 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %7, i32 addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr #1
+
+; This test ensures that the array index is correctly identified as having a
+; uniform stride, so that plain vector loads and stores are generated rather
+; than gather/scatter builtin calls.
+
+; CHECK: define spir_kernel void @__vecz_v4_uniform_address_index
+; CHECK: entry:
+; CHECK: call i64 @__mux_get_global_id(i32 0)
+; CHECK-DAG: %[[INA:.+]] = getelementptr i32, ptr addrspace(1) %in, i32 %[[X:.+]]
+; CHECK-DAG: %[[LOAD:.+]] = load <4 x i32>, ptr addrspace(1) %[[INA]]
+; CHECK-DAG: %[[OUTA:.+]] = getelementptr i32, ptr addrspace(1) %out, i32 %[[X:.+]]
+; CHECK-DAG: store <4 x i32> %[[LOAD]], ptr addrspace(1) %[[OUTA]]
+; CHECK-NOT: call <4 x i32>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
new file mode 100644
index 0000000000000..d7b37641357b8
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_address_index.ll
@@ -0,0 +1,56 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k uniform_address_index -w 4 -S < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
+target triple = "spir-unknown-unknown"
+
+define spir_kernel void @uniform_address_index(i32 addrspace(1)* nocapture readonly %in, i32 addrspace(1)* nocapture %out, i32 %a, i32 %b) local_unnamed_addr #0 {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0) #2
+  %0 = icmp eq i32 %a, -2147483648
+  %1 = icmp eq i32 %b, -1
+  %2 = and i1 %0, %1
+  %3 = icmp eq i32 %b, 0
+  %4 = or i1 %3, %2
+  %5 = select i1 %4, i32 1, i32 %b
+  %div = sdiv i32 %a, %5
+  %6 = trunc i64 %call to i32
+  %conv1 = add i32 %div, %6
+  %idxprom = sext i32 %conv1 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom
+  %7 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+  store i32 %7, i32 addrspace(1)* %arrayidx3, align 4
+  ret void
+}
+
+; Function Attrs: convergent nounwind readonly
+declare i64 @__mux_get_global_id(i32) local_unnamed_addr #1
+
+; This test ensures that the array index is correctly identified as having a
+; uniform stride, so that plain vector loads and stores are generated rather
+; than gather/scatter builtin calls.
+
+; CHECK: define spir_kernel void @__vecz_v4_uniform_address_index
+; CHECK: entry:
+; CHECK: call i64 @__mux_get_global_id(i32 0)
+; CHECK-DAG: %[[INA:.+]] = getelementptr i32, ptr addrspace(1) %in, i32 %[[X:.+]]
+; CHECK-DAG: %[[LOAD:.+]] = load <4 x i32>, ptr addrspace(1) %[[INA]]
+; CHECK-DAG: %[[OUTA:.+]] = getelementptr i32, ptr addrspace(1) %out, i32 %[[X:.+]]
+; CHECK-DAG: store <4 x i32> %[[LOAD]], ptr addrspace(1) %[[OUTA]]
+; CHECK-NOT: call <4 x i32>
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
new file mode 100644
index 0000000000000..86e3d6145c4c3
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop.ll
@@ -0,0 +1,45 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare i32 @__mux_get_local_size(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %size = call i32 @__mux_get_local_size(i32 0) + br label %loop + +loop: + %index = phi i32 [0, %entry], [%inc, %loop] + %load = load i32, i32 addrspace(1)* %in + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %index + store i32 %load, i32 addrspace(1)* %slot + %inc = add i32 %index, 1 + %cmp = icmp ne i32 %inc, %size + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK-NOT: define spir_kernel void @test +; CHECK: %[[LOAD:load.*]] = load i32, ptr addrspace(1) %in +; CHECK: store i32 %[[LOAD]], ptr addrspace(1) %slot diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll new file mode 100644 index 0000000000000..f6e12b7d83615 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi1.ll @@ -0,0 +1,49 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %id = call i64 @__mux_get_global_id(i64 0) #2 + %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id + %load = load i32, i32 addrspace(1)* %init_addr + br label %loop + +loop: + %index = phi i64 [0, %entry], [%inc, %loop] + %slot = phi i32 addrspace(1)* [%init_addr, %entry], [%inc_addr, %loop] + store i32 %load, i32 addrspace(1)* %slot + %inc_addr = getelementptr inbounds i32, i32 addrspace(1)* %slot, i64 16 + %inc = add i64 %index, 1 + %cmp = icmp ne i64 %inc, 16 + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +declare i64 @__mux_get_global_id(i64) + +; It checks that the stride analysis can tell the store is contiguous through the PHI node. 
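+;
+; A hedged sketch of that reasoning, not part of the original test: the
+; per-work-item pointer recurrence is
+;
+;   slot(0)   = in + gid      ; adjacent work-items differ by one element
+;   slot(n+1) = slot(n) + 16  ; uniform step, identical across lanes
+;
+; so on every iteration the lanes still differ by exactly one i32, which is
+; why a plain <4 x i32> store is expected below rather than a scatter.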
+ +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %[[LD:.+]] = load <4 x i32>, ptr addrspace(1) %init_addr +; CHECK: loop: +; CHECK: store <4 x i32> %[[LD]], ptr addrspace(1) %slot diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll new file mode 100644 index 0000000000000..bc6dc059cb554 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi2.ll @@ -0,0 +1,50 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %id = call i64 @__mux_get_global_id(i64 0) #2 + %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id + %load = load i32, i32 addrspace(1)* %init_addr + br label %loop + +loop: + %index = phi i64 [0, %entry], [%inc, %loop] + %slot = phi i32 addrspace(1)* [%inc_addr, %loop], [%init_addr, %entry] + store i32 %load, i32 addrspace(1)* %slot + %inc_addr = getelementptr inbounds i32, i32 addrspace(1)* %slot, i64 16 + %inc = add i64 %index, 1 + %cmp = icmp ne i64 %inc, 16 + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +declare i64 @__mux_get_global_id(i64) + +; It checks that the stride analysis can tell the store is contiguous through the PHI node. +; Same as uniform_loop_contiguous_phi1.ll except with the PHI node incoming values reversed. + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %[[LD:.+]] = load <4 x i32>, ptr addrspace(1) %init_addr +; CHECK: loop: +; CHECK: store <4 x i32> %[[LD]], ptr addrspace(1) %slot diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll new file mode 100644 index 0000000000000..4baf7d5791f7b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi3.ll @@ -0,0 +1,51 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %id = call i64 @__mux_get_global_id(i64 0) #2 + %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id + %load = load i32, i32 addrspace(1)* %init_addr + br label %loop + +loop: + %count = phi i64 [0, %entry], [%inc, %loop] + %index = phi i64 [%id, %entry], [%inc_index, %loop] + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %index + store i32 %load, i32 addrspace(1)* %slot + %inc_index = add i64 %index, 16 + %inc = add i64 %count, 1 + %cmp = icmp ne i64 %inc, 16 + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +declare i64 @__mux_get_global_id(i64) + +; It checks that the stride analysis can tell the store is contiguous through the PHI node. +; Same as uniform_loop_contiguous_phi1.ll except with the index GEP inside the loop. + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %[[LD:.+]] = load <4 x i32>, ptr addrspace(1) %init_addr +; CHECK: loop: +; CHECK: store <4 x i32> %[[LD]], ptr addrspace(1) %slot diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll new file mode 100644 index 0000000000000..33033bd0d9518 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_contiguous_phi4.ll @@ -0,0 +1,51 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %id = call i64 @__mux_get_global_id(i64 0) #2 + %init_addr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id + %load = load i32, i32 addrspace(1)* %init_addr + br label %loop + +loop: + %count = phi i64 [0, %entry], [%inc, %loop] + %index = phi i64 [%inc_index, %loop], [%id, %entry] + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %index + store i32 %load, i32 addrspace(1)* %slot + %inc_index = add i64 %index, 16 + %inc = add i64 %count, 1 + %cmp = icmp ne i64 %inc, 16 + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +declare i64 @__mux_get_global_id(i64) + +; It checks that the stride analysis can tell the store is contiguous through the PHI node. +; Same as uniform_loop_contiguous_phi3.ll except with the PHI node incoming values reversed. + +; CHECK: define spir_kernel void @__vecz_v4_test +; CHECK: %[[LD:.+]] = load <4 x i32>, ptr addrspace(1) %init_addr +; CHECK: loop: +; CHECK: store <4 x i32> %[[LD]], ptr addrspace(1) %slot diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll new file mode 100644 index 0000000000000..ac8cb69ee5fc5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_loop_metadata.ll @@ -0,0 +1,50 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir-unknown-unknown" + +declare spir_func i32 @__mux_get_local_size(i32); + +define spir_kernel void @test(i32 addrspace(1)* %in) { +entry: + %size = call i32 @__mux_get_local_size(i32 0) + br label %loop + +loop: + %index = phi i32 [0, %entry], [%inc, %loop] + %load = load i32, i32 addrspace(1)* %in + %slot = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %index + store i32 %load, i32 addrspace(1)* %slot + %inc = add i32 %index, 1 + %cmp = icmp ne i32 %inc, %size + br i1 %cmp, label %loop, label %merge + +merge: + ret void +} + +; CHECK: define spir_kernel void @test(ptr addrspace(1) %in) !codeplay_ca_vecz.base !0 +; CHECK: entry: +; CHECK: loop: +; CHECK: define spir_kernel void @__vecz_v4_test(ptr addrspace(1) %in) #0 !codeplay_ca_vecz.derived !2 +; CHECK: entry: +; CHECK: loop: +; CHECK: !0 = !{!1, ptr @__vecz_v4_test} +; CHECK: !1 = !{i32 4, i32 0, i32 0, i32 0} +; CHECK: !2 = !{!1, ptr @test} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll new file mode 100644 index 0000000000000..6ce5f1cfc7ce4 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation1.ll @@ -0,0 +1,58 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k uniform_reassociation -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @uniform_reassociation(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 { +entry: + %x = call i64 @__mux_get_global_id(i32 0) #2 + %y = call i64 @__mux_get_global_id(i32 1) #2 + %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x + %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %y + %c_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %y + %varying = load i32, i32 addrspace(1)* %a_gep + %uniform1 = load i32, i32 addrspace(1)* %b_gep + %uniform2 = load i32, i32 addrspace(1)* %c_gep + %vu = add i32 %varying, %uniform1 + %vuu = add i32 %vu, %uniform2 + %d_gep = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %x + store i32 %vuu, i32 addrspace(1)* %d_gep + ret void +} + +declare i64 @__mux_get_global_id(i32) + +; This test checks that a sum of a varying value with two uniform values +; gets re-associated from (Varying + Uniform) + Uniform +; to Varying + (Uniform + Uniform) +; CHECK: define spir_kernel void @__vecz_v4_uniform_reassociation +; CHECK: load + +; Ensure the two uniforms are added together directly +; CHECK: %[[REASSOC:.+]] = add i32 %uniform1, %uniform2 + +; Ensure there is only one vector splat +; CHECK: %[[SPLATINS:.+]] = insertelement <4 x i32> poison, i32 %[[REASSOC]], {{(i32|i64)}} 0 +; CHECK-NOT: insertelement <4 x i32> poison, i32 %{{.+}}, {{(i32|i64)}} 0 + +; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLATINS]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: %[[RESULT:.+]] = add <4 x i32> %{{.*}}, %[[SPLAT]] +; CHECK: store <4 x i32> %vuu{{.*}}, ptr addrspace(1) %{{.+}} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll new file mode 100644 index 0000000000000..1315a92a7a9d3 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation2.ll @@ -0,0 +1,59 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k uniform_reassociation -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'Unknown buffer'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @uniform_reassociation(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 {
+entry:
+  %x = call i64 @__mux_get_global_id(i32 0) #2
+  %y = call i64 @__mux_get_global_id(i32 1) #2
+  %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x
+  %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %x
+  %c_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %y
+  %varying1 = load i32, i32 addrspace(1)* %a_gep
+  %varying2 = load i32, i32 addrspace(1)* %b_gep
+  %uniform = load i32, i32 addrspace(1)* %c_gep
+  %vu = add i32 %varying1, %uniform
+  %vvu = add i32 %vu, %varying2
+  %d_gep = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %x
+  store i32 %vvu, i32 addrspace(1)* %d_gep
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)

+; This test checks that a sum of two varying values with a uniform value
+; gets re-associated from (Varying + Uniform) + Varying
+; to (Varying + Varying) + Uniform
+; CHECK: define spir_kernel void @__vecz_v4_uniform_reassociation
+
+; CHECK: %[[VARYING1:.+]] = load <4 x i32>
+; CHECK: %[[VARYING2:.+]] = load <4 x i32>
+
+; The splat of the uniform value
+; CHECK: %uniform = load
+; CHECK: %[[SPLATINS:.+]] = insertelement <4 x i32> poison, i32 %uniform, {{(i32|i64)}} 0
+; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLATINS]], <4 x i32> poison, <4 x i32> zeroinitializer
+
+; Ensure the two varyings are added together directly
+; CHECK: %[[REASSOC:.+]] = add <4 x i32> %[[VARYING1]], %[[VARYING2]]
+; CHECK: %[[VVU:.+]] = add <4 x i32> %{{.*}}, %[[SPLAT]]
+; CHECK: store <4 x i32> %[[VVU]], ptr addrspace(1) %{{.+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
new file mode 100644
index 0000000000000..10dab1c06440e
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/uniform_reassociation3.ll
@@ -0,0 +1,59 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k uniform_reassociation -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'Unknown buffer'
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: convergent nounwind
+define spir_kernel void @uniform_reassociation(i32 addrspace(1)* noalias %a, i32 addrspace(1)* noalias %b, i32 addrspace(1)* noalias %d) #0 {
+entry:
+  %x = call i64 @__mux_get_global_id(i32 0) #2
+  %y = call i64 @__mux_get_global_id(i32 1) #2
+  %a_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %x
+  %b_gep = getelementptr inbounds i32, i32 addrspace(1)* %b, i64 %x
+  %c_gep = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %y
+  %varying1 = load i32, i32 addrspace(1)* %a_gep
+  %varying2 = load i32, i32 addrspace(1)* %b_gep
+  %uniform = load i32, i32 addrspace(1)* %c_gep
+  %vu = add i32 %varying1, %uniform
+  %vvu = add i32 %varying2, %vu
+  %d_gep = getelementptr inbounds i32, i32 addrspace(1)* %d, i64 %x
+  store i32 %vvu, i32 addrspace(1)* %d_gep
+  ret void
+}
+
+declare i64 @__mux_get_global_id(i32)

+; This test checks that a sum of two varying values with a uniform value
+; gets re-associated from Varying + (Varying + Uniform)
+; to (Varying + Varying) + Uniform
+; CHECK: define spir_kernel void @__vecz_v4_uniform_reassociation
+
+; CHECK: %[[VARYING1:.+]] = load <4 x i32>
+; CHECK: %[[VARYING2:.+]] = load <4 x i32>
+
+; The splat of the uniform value
+; CHECK: %uniform = load
+; CHECK: %[[SPLATINS:.+]] = insertelement <4 x i32> poison, i32 %uniform, {{(i32|i64)}} 0
+; CHECK: %[[SPLAT:.+]] = shufflevector <4 x i32> %[[SPLATINS]], <4 x i32> poison, <4 x i32> zeroinitializer
+
+; Ensure the two varyings are added together directly
+; CHECK: %[[REASSOC:.+]] = add <4 x i32> %[[VARYING1]], %[[VARYING2]]
+; CHECK: %[[VVU:.+]] = add <4 x i32> %{{.*}}, %[[SPLAT]]
+; CHECK: store <4 x i32> %[[VVU]], ptr addrspace(1) %{{.+}}
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
new file mode 100644
index 0000000000000..e698f17df7339
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/unmangled_builtin_call.ll
@@ -0,0 +1,67 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k k_controlflow_loop_if -S < %s | FileCheck %s + +; ModuleID = 'test.cl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind uwtable +define void @k_controlflow_loop_if(float* nocapture %out, float* nocapture readonly %in1, i32* nocapture readnone %in2) #0 { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) #2 + %sext = shl i64 %call, 32 + %idxprom = ashr exact i64 %sext, 32 + %arrayidx = getelementptr inbounds float, float* %in1, i64 %idxprom + %0 = bitcast float* %arrayidx to i32* + %1 = load i32, i32* %0, align 4, !tbaa !7 + %arrayidx2 = getelementptr inbounds float, float* %out, i64 %idxprom + %2 = bitcast float* %arrayidx2 to i32* + store i32 %1, i32* %2, align 4, !tbaa !7 + ret void +} + +declare i64 @__mux_get_global_id(i32) #1 + +attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nobuiltin nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (float*, float*, i32*)* @k_controlflow_loop_if, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 0, i32 0, i32 0} +!2 = !{!"kernel_arg_access_qual", !"none", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"float*", !"float*", !"int*"} +!4 = !{!"kernel_arg_base_type", !"float*", !"float*", !"int*"} +!5 = !{!"kernel_arg_type_qual", !"", !"", !""} +!6 = !{!"clang version 3.8.0 "} +!7 = !{!8, !8, i64 0} +!8 = !{!"float", !9, i64 0} +!9 = !{!"omnipotent char", !10, i64 0} +!10 = !{!"Simple C/C++ TBAA"} + +; The vectorized function +; CHECK: define void @__vecz_v[[WIDTH:[0-9]+]]_k_controlflow_loop_if( + +; The unmangled __mux_get_global_id call +; CHECK: tail call i64 @__mux_get_global_id(i32 0) + +; The vectorized loads and stores +; CHECK: load <4 x i32>, ptr %arrayidx, align 4 +; CHECK: store <4 x i32> %0, ptr %arrayidx2, align 4 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll new file mode 100644 index 0000000000000..ccc581108605a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/user_calls.ll @@ -0,0 +1,113 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k entry -w 2 -vecz-handle-declaration-only-calls -vecz-passes=cfg-convert,packetizer -S < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024" +target triple = "spir64-unknown-unknown" + +@.str.1 = private unnamed_addr addrspace(2) constant [10 x i8] c"Test %ld\0A\00", align 1 +@.str.2 = private unnamed_addr addrspace(2) constant [6 x i8] c"Test\0A\00", align 1 + +define spir_kernel void @entry(i64* %input, i64* %output) { +entry: + %gid = call i64 @__mux_get_local_id(i32 0) + %i1ptr = getelementptr i64, i64* %output, i64 %gid + call void @__mux_mem_barrier(i32 2, i32 264) + %ii = call i64 @functionD(i64* %input) + %ib = trunc i64 %ii to i1 + call void @functionA(i64* %i1ptr, i1 %ib) + %i1 = load i64, i64* %i1ptr + %i2ptr = getelementptr i64, i64* %input, i64 %gid + %i2 = load i64, i64* %i2ptr + %cond = icmp eq i64 %i1, %i2 + br i1 %cond, label %middle, label %end + +middle: + %ci3ptr = getelementptr i64, i64* %output, i64 %gid + %ci3 = load i64, i64* %ci3ptr + %fc = call i64 @functionB(i64* %ci3ptr, i64 %ci3, i32 16, i1 false) + %call2 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str.1, i64 0, i64 0), i64 %ci3) + br label %end + +end: + %rr = phi i64 [42, %entry], [%fc, %middle] + call void @functionC(i64 %rr) + %nah = call i64 @functionB(i64* %i2ptr, i64 %rr, i32 8, i1 true) + %call3 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(2)* @.str.2, i64 0, i64 0)) + ret void +} + +declare void @functionA(i64*, i1) + +declare i64 @functionB(i64*, i64, i32, i1) + +declare void @functionC(i64) + +define i64 @functionD(i64* %input) { +entry: + %r = load i64, i64* %input + ret i64 %r +} + +declare void @__mux_mem_barrier(i32, i32) + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) 
+
+declare i64 @__mux_get_local_id(i32)
+
+; CHECK: define spir_kernel void @__vecz_v[[WIDTH:[0-9]+]]_entry
+; CHECK: entry:
+; Check that we didn't mask the __mux_get_local_id call
+; CHECK: %gid = call i64 @__mux_get_local_id(i32 0)
+; Check that we didn't mask the mem_fence call
+; CHECK: call void @__mux_mem_barrier(i32 2, i32 264)
+; Check that we instantiated functionA without a mask
+; CHECK: call void @functionA(ptr {{.+}}, i1 %ib)
+; CHECK: call void @functionA(ptr {{.+}}, i1 %ib)
+
+; Get the condition -- this also works as a sanity check for the test
+; CHECK: [[COND:%cond.*]] = icmp eq <[[WIDTH]] x i64>
+
+; Check that we instantiated functionB with a mask
+; CHECK: [[COND1:%[0-9]+]] = extractelement <[[WIDTH]] x i1> [[COND]], {{(i32|i64)}} 0
+; CHECK: [[COND2:%[0-9]+]] = extractelement <[[WIDTH]] x i1> [[COND]], {{(i32|i64)}} 1
+; CHECK: {{.+}} = call i64 @__vecz_b_masked_functionB(ptr {{(nonnull )?}}{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 16, i1 false, i1 [[COND1]])
+; CHECK: {{.+}} = call i64 @__vecz_b_masked_functionB(ptr {{(nonnull )?}}{{%[0-9]+}}, i64 {{%[0-9]+}}, i32 16, i1 false, i1 [[COND2]])
+; CHECK: call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2mb(ptr addrspace(2) @.str.1, i64 {{%[0-9]+}}, i1 [[COND1]])
+; CHECK: call spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2mb(ptr addrspace(2) @.str.1, i64 {{%[0-9]+}}, i1 [[COND2]])
+
+; The following lines check the generated masked version of functionB
+; CHECK: define private i64 @__vecz_b_masked_functionB(ptr{{( %0)?}}, i64{{( %1)?}}, i32{{( %2)?}}, i1{{( %3)?}}, i1{{( %4)?}}) {
+; CHECK: entry:
+; CHECK: br i1 %4, label %active, label %exit
+; CHECK: active:
+; CHECK: [[RES:%[0-9]+]] = call i64 @functionB(ptr {{(nonnull )?}}%0, i64 %1, i32 %2, i1 %3)
+; CHECK: br label %exit
+; CHECK: exit:
+; CHECK: [[RET:%[0-9]+]] = phi i64 [ [[RES]], %active ], [ 0, %entry ]
+; CHECK: ret i64 [[RET]]
+
+; The following lines check the generated masked version of printf
+; CHECK: define private spir_func i32 @__vecz_b_masked_printf_u3ptrU3AS2mb(ptr addrspace(2){{( %0)?}}, i64{{( %1)?}}, i1{{( %2)?}}) {
+; CHECK: entry:
+; CHECK: br i1 %2, label %active, label %exit
+; CHECK: active:
+; CHECK: [[RES:%[0-9]+]] = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) %0, i64 %1)
+; CHECK: br label %exit
+; CHECK: exit:
+; CHECK: [[RET:%[0-9]+]] = phi i32 [ [[RES]], %active ], [ 0, %entry ]
+; CHECK: ret i32 [[RET]]
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
new file mode 100644
index 0000000000000..2f68a9297f6b5
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load1.ll
@@ -0,0 +1,86 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; RUN: veczc -k varying_load1 -vecz-passes=cfg-convert -S < %s | FileCheck %s

+; ModuleID = 'kernel.opencl'
+source_filename = "kernel.opencl"
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir64-unknown-unknown"

+; Function Attrs: convergent nounwind
+define spir_kernel void @varying_load1(i32 addrspace(1)* %out, i32 %n, i32 addrspace(1)* %meta) #0 {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0) #2
+ %conv = trunc i64 %call to i32
+ %cmp = icmp slt i32 %conv, 11
+ br i1 %cmp, label %if.then, label %if.end16

+if.then: ; preds = %entry
+ %0 = load i32, i32 addrspace(1)* %meta, align 4
+ %cmp2 = icmp eq i32 %0, 0
+ br i1 %cmp2, label %if.then4, label %if.end

+if.then4: ; preds = %if.then
+ %mul5 = mul nsw i32 %conv, %n
+ %1 = icmp eq i32 %mul5, -2147483648
+ %2 = icmp eq i32 %n, -1
+ %3 = and i1 %2, %1
+ %4 = icmp eq i32 %n, 0
+ %5 = or i1 %4, %3
+ %6 = select i1 %5, i32 1, i32 %n
+ %div6 = sdiv i32 %mul5, %6
+ %add = add nsw i32 %div6, %conv
+ %shl7 = mul i32 %add, 8
+ %add8 = add nsw i32 %shl7, %mul5
+ %shl9 = shl i32 %add8, 3
+ br label %if.end

+if.end: ; preds = %if.then4, %if.then
+ %sum.0 = phi i32 [ %shl9, %if.then4 ], [ %n, %if.then ]
+ %rem1 = and i32 %conv, 1
+ %cmp10 = icmp eq i32 %rem1, 0
+ br i1 %cmp10, label %if.then12, label %if.end16

+if.then12: ; preds = %if.end
+ %7 = load i32, i32 addrspace(1)* %meta, align 4
+ %add13 = add nsw i32 %7, %n
+ %mul14 = mul nsw i32 %add13, %sum.0
+ br label %if.end16

+if.end16: ; preds = %if.end, %if.then12, %entry
+ %ret.1 = phi i32 [ 0, %entry ], [ %mul14, %if.then12 ], [ 0, %if.end ]
+ %idxprom = sext i32 %conv to i64
+ %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom
+ store i32 %ret.1, i32 addrspace(1)* %arrayidx, align 4
+ ret void
+}

+; Function Attrs: convergent nounwind readonly
+declare i64 @__mux_get_global_id(i32) #1

+attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { convergent nobuiltin nounwind readonly }

+; The purpose of this test is to make sure that if a condition is a use of a
+; uniform load that is control dependent on a varying path, then the load will
+; be considered "mask varying" and so the condition is still uniform.
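+
+; Illustrative OpenCL-C sketch of the control flow above (hypothetical source,
+; not the original kernel):
+;
+;   if (gid < 11) {                  // varying condition
+;     if (meta[0] == 0) { ... }      // uniform load, reached on a varying path
+;     if ((gid & 1) == 0) { ... }
+;   }
+;
+; Not every work-item reaches the load of meta[0], so it must become a masked
+; load, but the loaded value is identical for all active work-items, so the
+; condition computed from it can remain uniform.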
+ +; CHECK: spir_kernel void @__vecz_v4_varying_load1 +; CHECK: if.then: +; CHECK: %{{.+}} = call i32 @__vecz_b_masked_load4 +; CHECK: br i1 diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll new file mode 100644 index 0000000000000..5a90f9cdf0b55 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/varying_load2.ll @@ -0,0 +1,89 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k varying_load2 -vecz-passes=cfg-convert -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +source_filename = "kernel.opencl" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @varying_load2(i32 addrspace(1)* %input, i32 addrspace(1)* %out) #0 { +entry: + %call1 = call i64 @__mux_get_local_size(i32 0) #3 + %call2 = call i64 @__mux_get_local_id(i32 0) #3 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %input, i64 %call2 + %cmp = icmp ne i64 %call2, 0 + br i1 %cmp, label %for.cond.preheader, label %if.end14 + +for.cond.preheader: ; preds = %entry + br label %for.cond + +for.cond: ; preds = %for.cond.preheader, %for.inc + %max.0 = phi i32 [ %max.1, %for.inc ], [ 0, %for.cond.preheader ] + %storemerge = phi i64 [ %inc, %for.inc ], [ 0, %for.cond.preheader ] + %call6 = call i64 @__mux_get_local_size(i32 0) #3 + %cmp7 = icmp ult i64 %storemerge, %call6 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %load1 = load i32, i32 addrspace(1)* %input, align 4 + %cmp9 = icmp ugt i32 %load1, %max.0 + br i1 %cmp9, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %load2 = load i32, i32 addrspace(1)* %input, align 4 + br label %for.inc + +for.inc: ; preds = %if.then, %for.body + %max.1 = phi i32 [ %load2, %if.then ], [ %max.0, %for.body ] + %inc = add i64 %storemerge, 1 + br label %for.cond + +for.end: ; preds = %for.cond + %max.0.lcssa = phi i32 [ %max.0, %for.cond ] + %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call1 + store i32 %max.0.lcssa, i32 addrspace(1)* %arrayidx13, align 4 + br label %if.end14 + +if.end14: ; preds = %for.end, %entry + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_local_id(i32) #1 +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_local_size(i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" 
"no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent noduplicate "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { convergent nobuiltin nounwind readonly } +attributes #4 = { nounwind } + +; The purpose of this test is to make sure that if a condition is a use of a +; uniform load that is control dependent of a varying path, then the load will +; be considered "mask varying" and so the condition is still uniform. + +; CHECK: spir_kernel void @__vecz_v4_varying_load2 +; CHECK: for.body: +; CHECK: %{{.+}} = call i32 @__vecz_b_masked_load4 +; CHECK: br i1 +; CHECK: if.then: +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll new file mode 100644 index 0000000000000..7755913a779a8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_intrinsics_scalarization.ll @@ -0,0 +1,80 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+; Test that calls to vector intrinsics are fully scalarized
+; RUN: veczc -vecz-passes=scalarize -vecz-choices=FullScalarization -S < %s | FileCheck %s

+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"

+define spir_kernel void @fmuladd(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+ %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+ %arrayidx1 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+ %1 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx1, align 32
+ %arrayidx2 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+ %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx2, align 32
+ %arrayidx3 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+ %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx3, align 32
+ %div = fdiv <4 x double> %2, %3
+ %4 = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %div)
+ %arrayidx4 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+ %5 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx4, align 32
+ %sub = fsub <4 x double> %5, %4
+ store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx4, align 32
+ ret void
+}

+; CHECK: define spir_kernel void @__vecz_v[[WIDTH:[0-9]+]]_fmuladd(
+; Check that the scalar fmuladd exists
+; CHECK: call double @llvm.fmuladd.f64(
+; Check that the vector fmuladd doesn't exist
+; CHECK-NOT: call double @llvm.fmuladd.v4f64(
+; CHECK: ret void

+define spir_kernel void @fma(<4 x double> addrspace(1)* %a, <4 x double> addrspace(1)* %b, <4 x double> addrspace(1)* %c, <4 x double> addrspace(1)* %d, <4 x double> addrspace(1)* %e) {
+entry:
+ %call = call i64 @__mux_get_global_id(i32 0)
+ %arrayidx = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %b, i64 %call
+ %0 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx, align 32
+ %arrayidx1 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %c, i64 %call
+ %1 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx1, align 32
+ %arrayidx2 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %d, i64 %call
+ %2 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx2, align 32
+ %arrayidx3 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %e, i64 %call
+ %3 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx3, align 32
+ %div = fdiv <4 x double> %2, %3
+ %4 = call <4 x double> @llvm.fma.v4f64(<4 x double> %0, <4 x double> %1, <4 x double> %div)
+ %arrayidx4 = getelementptr inbounds <4 x double>, <4 x double> addrspace(1)* %a, i64 %call
+ %5 = load <4 x double>, <4 x double> addrspace(1)* %arrayidx4, align 32
+ %sub = fsub <4 x double> %5, %4
+ store <4 x double> %sub, <4 x double> addrspace(1)* %arrayidx4, align 32
+ ret void
+}

+; CHECK: define spir_kernel void @__vecz_v[[WIDTH:[0-9]+]]_fma(
+; Check that the scalar fma exists
+; CHECK: call double @llvm.fma.f64(
+; Check that the vector fma doesn't exist
+; CHECK-NOT: call double @llvm.fma.v4f64(
+; CHECK: ret void

+declare i64 @__mux_get_global_id(i32)

+declare <4 x double> @llvm.fma.v4f64(<4
x double>, <4 x double>, <4 x double>) +declare <4 x double> @llvm.fmuladd.v4f64(<4 x double>, <4 x double>, <4 x double>) diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll new file mode 100644 index 0000000000000..7d9b0385dbb90 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_uniform.ll @@ -0,0 +1,87 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k vector_loop -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %cmp = icmp eq i64 %call, 0 + br i1 %cmp, label %for.end, label %for.cond + +for.cond: ; preds = %entry, %for.body + %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ] + %call1 = call i64 @__mux_get_global_size(i32 0) + %conv = trunc i64 %call1 to i32 + %splat.splatinsert = insertelement <4 x i32> poison, i32 %conv, i32 0 + %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat + %0 = extractelement <4 x i1> %cmp2, i64 0 + br i1 %0, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %3 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom3 = sext i32 %3 to i64 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom3 + store i32 %2, i32 addrspace(1)* %arrayidx4, align 4 + %4 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom5 = sext i32 %4 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom5 + %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %6 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom7 = sext i32 %6 to i64 + %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom7 + store i32 %5, i32 addrspace(1)* %arrayidx8, align 4 + %7 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom9 = sext i32 %7 to i64 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9 + %8 = load i32, i32 addrspace(1)* %arrayidx10, align 4 + %9 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom11 = sext i32 %9 to i64 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 %8, i32 addrspace(1)* %arrayidx12, align 4 + %10 = extractelement 
<4 x i32> %storemerge, i64 3
+ %idxprom13 = sext i32 %10 to i64
+ %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom13
+ %11 = load i32, i32 addrspace(1)* %arrayidx14, align 4
+ %12 = extractelement <4 x i32> %storemerge, i64 3
+ %idxprom15 = sext i32 %12 to i64
+ %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom15
+ store i32 %11, i32 addrspace(1)* %arrayidx16, align 4
+ %inc = add <4 x i32> %storemerge, <i32 1, i32 1, i32 1, i32 1>
+ br label %for.cond

+for.end: ; preds = %entry, %for.cond
+ ret void
+}

+declare i64 @__mux_get_global_id(i32)
+declare i64 @__mux_get_global_size(i32)

+; This test checks that a uniform <4 x i32> phi is not scalarized
+; CHECK: define spir_kernel void @__vecz_v4_vector_loop
+; CHECK: %[[STOREMERGE:.+]] = phi <4 x i32> [ %[[INC:.+]], %for.body ], [ zeroinitializer, %entry.ROSCC ]
+; CHECK: %[[INC]] = add <4 x i32> %storemerge, {{<(i32 1(, )?)+>|splat \(i32 1\)}}
+; CHECK: ret void
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
new file mode 100644
index 0000000000000..998c283a2f46f
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_phi_varying.ll
@@ -0,0 +1,97 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k vector_loop -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @vector_loop(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %call.trunc = trunc i64 %call to i32 + %call.splatinsert = insertelement <4 x i32> poison, i32 %call.trunc, i32 0 + %call.splat = shufflevector <4 x i32> %call.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + %cmp = icmp eq i64 %call, 0 + br i1 %cmp, label %for.end, label %for.cond + +for.cond: ; preds = %entry, %for.body + %storemerge = phi <4 x i32> [ %inc, %for.body ], [ zeroinitializer, %entry ] + %call1 = call i64 @__mux_get_global_size(i32 0) + %conv = trunc i64 %call1 to i32 + %splat.splatinsert = insertelement <4 x i32> poison, i32 %conv, i32 0 + %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer + %cmp2 = icmp slt <4 x i32> %storemerge, %splat.splat + %0 = extractelement <4 x i1> %cmp2, i64 0 + br i1 %0, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom + %2 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %3 = extractelement <4 x i32> %storemerge, i64 0 + %idxprom3 = sext i32 %3 to i64 + %arrayidx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom3 + store i32 %2, i32 addrspace(1)* %arrayidx4, align 4 + %4 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom5 = sext i32 %4 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom5 + %5 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %6 = extractelement <4 x i32> %storemerge, i64 1 + %idxprom7 = sext i32 %6 to i64 + %arrayidx8 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom7 + store i32 %5, i32 addrspace(1)* %arrayidx8, align 4 + %7 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom9 = sext i32 %7 to i64 + %arrayidx10 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom9 + %8 = load i32, i32 addrspace(1)* %arrayidx10, align 4 + %9 = extractelement <4 x i32> %storemerge, i64 2 + %idxprom11 = sext i32 %9 to i64 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 %8, i32 addrspace(1)* %arrayidx12, align 4 + %10 = extractelement <4 x i32> %storemerge, i64 3 + %idxprom13 = sext i32 %10 to i64 + %arrayidx14 = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %idxprom13 + %11 = load i32, i32 addrspace(1)* %arrayidx14, align 4 + %12 = extractelement <4 x i32> %storemerge, i64 3 + %idxprom15 = sext i32 %12 to i64 + %arrayidx16 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom15 + store i32 %11, i32 addrspace(1)* %arrayidx16, align 4 + %inc = add <4 x i32> %storemerge, %call.splat + br label %for.cond + +for.end: ; preds = %entry, %for.cond + ret void +} + +declare i64 @__mux_get_global_id(i32) +declare i64 @__mux_get_global_size(i32) + +; This test checks if a varying <4 x i32> phi is scalarized into 4 i32 phis +; and then re-packetized +; CHECK: define spir_kernel void @__vecz_v4_vector_loop +; CHECK: %[[STOREMERGE1:.+]] = phi <4 x i32> [ zeroinitializer, %entry.ROSCC ], [ %[[INC2:.+]], %for.cond ] +; CHECK: 
%[[STOREMERGE4:.+]] = phi <4 x i32> [ zeroinitializer, %entry.ROSCC ], [ %[[INC5:.+]], %for.cond ] +; CHECK: %[[STOREMERGE6:.+]] = phi <4 x i32> [ zeroinitializer, %entry.ROSCC ], [ %[[INC7:.+]], %for.cond ] +; CHECK: %[[STOREMERGE8:.+]] = phi <4 x i32> [ zeroinitializer, %entry.ROSCC ], [ %[[INC9:.+]], %for.cond ] +; CHECK: %[[INC2]] = add <4 x i32> %[[STOREMERGE1]], [[CALL:.+]] +; CHECK: %[[INC5]] = add <4 x i32> %[[STOREMERGE4]], [[CALL]] +; CHECK: %[[INC7]] = add <4 x i32> %[[STOREMERGE6]], [[CALL]] +; CHECK: %[[INC9]] = add <4 x i32> %[[STOREMERGE8]], [[CALL]] +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll new file mode 100644 index 0000000000000..5582091b8ccd5 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf.ll @@ -0,0 +1,92 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [10 x i8] c"%#4v4hho\0A\00", align 1 +@.str32 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v32hho\0A\00", align 1 +@.str64 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v64hho\0A\00", align 1 +@.strfv = private unnamed_addr addrspace(2) constant [11 x i8] c"%#16v2hlA\0A\00", align 1 + +; Function Attrs: nounwind +define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call + %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call + %1 = load <4 x i8>, <4 x i8>* %arrayidx1, align 4 + %add = add <4 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8>* %out, i64 %call + store <4 x i8> %add, <4 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) 
@printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str, i64 0, i64 0), <4 x i8> %add) + ret void +} + +define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call + %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call + %1 = load <32 x i8>, <32 x i8>* %arrayidx1, align 4 + %add = add <32 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <32 x i8>, <32 x i8>* %out, i64 %call + store <32 x i8> %add, <32 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str32, i64 0, i64 0), <32 x i8> %add) + ret void +} + +define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call + %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call + %1 = load <64 x i8>, <64 x i8>* %arrayidx1, align 4 + %add = add <64 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <64 x i8>, <64 x i8>* %out, i64 %call + store <64 x i8> %add, <64 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str64, i64 0, i64 0), <64 x i8> %add) + ret void +} + +define spir_kernel void @test_float_vectors(<2 x float>* %in) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call + %0 = load <2 x float>, <2 x float>* %arrayidx, align 8 + %mul = fmul <2 x float> %0, %0 + %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.strfv, i64 0, i64 0), <2 x float> %mul) + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) + +; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [29 x i8] c"%#4hho,%#4hho,%#4hho,%#4hho\0A\00" + +; CHECK: define spir_kernel void @__vecz_v4_test( +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll new file mode 100644 index 0000000000000..1e5257625ac75 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf32.ll @@ -0,0 +1,92 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test32 -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [10 x i8] c"%#4v4hho\0A\00", align 1 +@.str32 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v32hho\0A\00", align 1 +@.str64 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v64hho\0A\00", align 1 +@.strfv = private unnamed_addr addrspace(2) constant [11 x i8] c"%#16v2hlA\0A\00", align 1 + +; Function Attrs: nounwind +define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call + %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call + %1 = load <4 x i8>, <4 x i8>* %arrayidx1, align 4 + %add = add <4 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8>* %out, i64 %call + store <4 x i8> %add, <4 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str, i64 0, i64 0), <4 x i8> %add) + ret void +} + +define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call + %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call + %1 = load <32 x i8>, <32 x i8>* %arrayidx1, align 4 + %add = add <32 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <32 x i8>, <32 x i8>* %out, i64 %call + store <32 x i8> %add, <32 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str32, i64 0, i64 0), <32 x i8> %add) + ret void +} + +define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call + %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call + %1 = load <64 x i8>, <64 x i8>* %arrayidx1, align 4 + %add = add <64 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <64 x i8>, <64 x i8>* %out, i64 %call + store <64 x i8> %add, <64 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) 
@printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str64, i64 0, i64 0), <64 x i8> %add) + ret void +} + +define spir_kernel void @test_float_vectors(<2 x float>* %in) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call + %0 = load <2 x float>, <2 x float>* %arrayidx, align 8 + %mul = fmul <2 x float> %0, %0 + %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.strfv, i64 0, i64 0), <2 x float> %mul) + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) + +; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [225 x i8] c"%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho\0A\00" + +; CHECK: define spir_kernel void @__vecz_v4_test32( +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) 
@printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll new file mode 100644 index 0000000000000..d63db033b2971 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf64.ll @@ -0,0 +1,92 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test64 -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [10 x i8] c"%#4v4hho\0A\00", align 1 +@.str32 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v32hho\0A\00", align 1 +@.str64 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v64hho\0A\00", align 1 +@.strfv = private unnamed_addr addrspace(2) constant [11 x i8] c"%#16v2hlA\0A\00", align 1 + +; Function Attrs: nounwind +define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call + %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call + %1 = load <4 x i8>, <4 x i8>* %arrayidx1, align 4 + %add = add <4 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8>* %out, i64 %call + store <4 x i8> %add, <4 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str, i64 0, i64 0), <4 x i8> %add) + ret void +} + +define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call + %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call + %1 = load <32 x i8>, <32 x i8>* %arrayidx1, align 4 + %add = add <32 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <32 x i8>, <32 x i8>* %out, i64 %call + store <32 x i8> %add, <32 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) 
@printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str32, i64 0, i64 0), <32 x i8> %add) + ret void +} + +define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call + %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call + %1 = load <64 x i8>, <64 x i8>* %arrayidx1, align 4 + %add = add <64 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <64 x i8>, <64 x i8>* %out, i64 %call + store <64 x i8> %add, <64 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str64, i64 0, i64 0), <64 x i8> %add) + ret void +} + +define spir_kernel void @test_float_vectors(<2 x float>* %in) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call + %0 = load <2 x float>, <2 x float>* %arrayidx, align 8 + %mul = fmul <2 x float> %0, %0 + %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.strfv, i64 0, i64 0), <2 x float> %mul) + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) + +; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [449 x i8] c"%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho,%#4hho\0A\00" + +; CHECK: define spir_kernel void @__vecz_v4_test64(ptr %out, ptr %in1, ptr %in2) +; CHECK: %call465130 = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: %call465131 = call spir_func i32 (ptr addrspace(2), ...) 
@printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: %call465132 = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: %call465133 = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}, i8 %{{.+}}) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_def.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_def.ll new file mode 100644 index 0000000000000..a426c804c1fe1 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_def.ll @@ -0,0 +1,43 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [4 x i8] c"%p\0A\00", align 1 + +define spir_kernel void @test() { +entry: + %gid = call spir_func i64 @__mux_get_global_id(i32 0) + %printf = call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @.str, i64 %gid) + ret void +} + +declare spir_func i64 @__mux_get_global_id(i32) + +define spir_func i32 @printf(ptr, ...) { + ret i32 0 +} + +; CHECK: define spir_kernel void @__vecz_v4_test( +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @.str, i64 +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @.str, i64 +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @.str, i64 +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @.str, i64 +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll new file mode 100644 index 0000000000000..82b5926f3d280 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats.ll @@ -0,0 +1,102 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_float_vectors -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [10 x i8] c"%#4v4hho\0A\00", align 1 +@.str32 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v32hho\0A\00", align 1 +@.str64 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v64hho\0A\00", align 1 +@.strfv = private unnamed_addr addrspace(2) constant [11 x i8] c"%#16v2hlA\0A\00", align 1 + +; Function Attrs: nounwind +define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call + %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call + %1 = load <4 x i8>, <4 x i8>* %arrayidx1, align 4 + %add = add <4 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8>* %out, i64 %call + store <4 x i8> %add, <4 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) 
@printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str, i64 0, i64 0), <4 x i8> %add) + ret void +} + +define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call + %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call + %1 = load <32 x i8>, <32 x i8>* %arrayidx1, align 4 + %add = add <32 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <32 x i8>, <32 x i8>* %out, i64 %call + store <32 x i8> %add, <32 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str32, i64 0, i64 0), <32 x i8> %add) + ret void +} + +define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call + %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call + %1 = load <64 x i8>, <64 x i8>* %arrayidx1, align 4 + %add = add <64 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <64 x i8>, <64 x i8>* %out, i64 %call + store <64 x i8> %add, <64 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str64, i64 0, i64 0), <64 x i8> %add) + ret void +} + +define spir_kernel void @test_float_vectors(<2 x float>* %in) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call + %0 = load <2 x float>, <2 x float>* %arrayidx, align 8 + %mul = fmul <2 x float> %0, %0 + %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.strfv, i64 0, i64 0), <2 x float> %mul) + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) + +; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [13 x i8] c"%#16A,%#16A\0A\00", align 1 + +; CHECK: define spir_kernel void @__vecz_v4_test_float_vectors +; CHECK: %[[V4:[0-9]+]] = fpext <4 x float> %{{.+}} to <4 x double> +; CHECK: %[[V5:[0-9]+]] = extractelement <4 x double> %[[V4]], {{(i32|i64)}} 0 +; CHECK: %[[V6:[0-9]+]] = extractelement <4 x double> %[[V4]], {{(i32|i64)}} 1 +; CHECK: %[[V7:[0-9]+]] = extractelement <4 x double> %[[V4]], {{(i32|i64)}} 2 +; CHECK: %[[V8:[0-9]+]] = extractelement <4 x double> %[[V4]], {{(i32|i64)}} 3 +; CHECK: %[[V9:[0-9]+]] = fpext <4 x float> %{{.+}} to <4 x double> +; CHECK: %[[V10:[0-9]+]] = extractelement <4 x double> %[[V9]], {{(i32|i64)}} 0 +; CHECK: %[[V11:[0-9]+]] = extractelement <4 x double> %[[V9]], {{(i32|i64)}} 1 +; CHECK: %[[V12:[0-9]+]] = extractelement <4 x double> %[[V9]], {{(i32|i64)}} 2 +; CHECK: %[[V13:[0-9]+]] = extractelement <4 x double> %[[V9]], {{(i32|i64)}} 3 +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V5]], double %[[V10]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V6]], double %[[V11]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) 
@printf(ptr addrspace(2) @[[STR]], double %[[V7]], double %[[V12]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], double %[[V8]], double %[[V13]]) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll new file mode 100644 index 0000000000000..d2010a9e95b6b --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_printf_floats_no_double_support.ll @@ -0,0 +1,100 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test_float_vectors -vecz-simd-width=4 -vecz-double-support=false -vecz-choices=FullScalarization -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +@.str = private unnamed_addr addrspace(2) constant [10 x i8] c"%#4v4hho\0A\00", align 1 +@.str32 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v32hho\0A\00", align 1 +@.str64 = private unnamed_addr addrspace(2) constant [11 x i8] c"%#4v64hho\0A\00", align 1 +@.strfv = private unnamed_addr addrspace(2) constant [11 x i8] c"%#16v2hlA\0A\00", align 1 + +; Function Attrs: nounwind +define spir_kernel void @test(<4 x i8>* %out, <4 x i8>* %in1, <4 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <4 x i8>, <4 x i8>* %in1, i64 %call + %0 = load <4 x i8>, <4 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <4 x i8>, <4 x i8>* %in2, i64 %call + %1 = load <4 x i8>, <4 x i8>* %arrayidx1, align 4 + %add = add <4 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <4 x i8>, <4 x i8>* %out, i64 %call + store <4 x i8> %add, <4 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([10 x i8], [10 x i8] addrspace(2)* @.str, i64 0, i64 0), <4 x i8> %add) + ret void +} + +define spir_kernel void @test32(<32 x i8>* %out, <32 x i8>* %in1, <32 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <32 x i8>, <32 x i8>* %in1, i64 %call + %0 = load <32 x i8>, <32 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <32 x i8>, <32 x i8>* %in2, i64 %call + %1 = load <32 x i8>, <32 x i8>* %arrayidx1, align 4 + %add = add <32 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <32 x i8>, <32 x i8>* %out, i64 %call + store <32 x i8> %add, <32 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) 
@printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str32, i64 0, i64 0), <32 x i8> %add) + ret void +} + +define spir_kernel void @test64(<64 x i8>* %out, <64 x i8>* %in1, <64 x i8>* %in2) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <64 x i8>, <64 x i8>* %in1, i64 %call + %0 = load <64 x i8>, <64 x i8>* %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds <64 x i8>, <64 x i8>* %in2, i64 %call + %1 = load <64 x i8>, <64 x i8>* %arrayidx1, align 4 + %add = add <64 x i8> %1, %0 + %arrayidx2 = getelementptr inbounds <64 x i8>, <64 x i8>* %out, i64 %call + store <64 x i8> %add, <64 x i8>* %arrayidx2, align 4 + %call4 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.str64, i64 0, i64 0), <64 x i8> %add) + ret void +} + +define spir_kernel void @test_float_vectors(<2 x float>* %in) { +entry: + %call = call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds <2 x float>, <2 x float>* %in, i64 %call + %0 = load <2 x float>, <2 x float>* %arrayidx, align 8 + %mul = fmul <2 x float> %0, %0 + %call8 = call spir_func i32 (i8 addrspace(2)*, ...) @printf(i8 addrspace(2)* getelementptr inbounds ([11 x i8], [11 x i8] addrspace(2)* @.strfv, i64 0, i64 0), <2 x float> %mul) + ret void +} + +declare i64 @__mux_get_global_id(i32) + +declare extern_weak spir_func i32 @printf(i8 addrspace(2)*, ...) + +; CHECK: @[[STR:.+]] = private unnamed_addr addrspace(2) constant [13 x i8] c"%#16A,%#16A\0A\00", align 1 + +; CHECK: define spir_kernel void @__vecz_v4_test_float_vectors +; CHECK: %[[V5:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 0 +; CHECK: %[[V6:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 1 +; CHECK: %[[V7:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 2 +; CHECK: %[[V8:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 3 +; CHECK: %[[V10:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 0 +; CHECK: %[[V11:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 1 +; CHECK: %[[V12:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 2 +; CHECK: %[[V13:[0-9]+]] = extractelement <4 x float> %{{.+}}, {{(i32|i64)}} 3 +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], float %[[V5]], float %[[V10]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], float %[[V6]], float %[[V11]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], float %[[V7]], float %[[V12]]) +; CHECK: call spir_func i32 (ptr addrspace(2), ...) @printf(ptr addrspace(2) @[[STR]], float %[[V8]], float %[[V13]]) +; CHECK: ret void diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_size_1.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_size_1.ll new file mode 100644 index 0000000000000..0a121a27a795d --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vector_size_1.ll @@ -0,0 +1,38 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. 
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k test -vecz-simd-width=4 -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +define spir_kernel void @test(ptr %src, ptr %dst) { +entry: + %lid = tail call i32 @__mux_get_sub_group_local_id() + %lid.i64 = zext i32 %lid to i64 + %src.i = getelementptr i64, ptr %src, i64 %lid.i64 + %val = load <1 x i64>, ptr %src.i, align 8 + %vec = shufflevector <1 x i64> %val, <1 x i64> zeroinitializer, <8 x i32> zeroinitializer + %dst.i = getelementptr <8 x i64>, ptr %dst, i64 %lid.i64 + store <8 x i64> %vec, ptr %dst.i, align 16 + ret void +} + +; CHECK-LABEL: define spir_kernel void @test +; CHECK-LABEL: define spir_kernel void @__vecz_v4_test + +declare i32 @__mux_get_sub_group_local_id() diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll new file mode 100644 index 0000000000000..813dcfe9cc94a --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_blend_div_loop.ll @@ -0,0 +1,154 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k blend_div_loop -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind +define spir_kernel void @blend_div_loop(i8 addrspace(1)* %src1ptr, i32 %src1_step, i32 %src1_offset, i8 addrspace(1)* %dstptr, i32 %dst_step, i32 %dst_offset, i32 %dst_rows, i32 %dst_cols, i8 addrspace(1)* %src2ptr, i32 %src2_step, i32 %src2_offset, i8 addrspace(1)* %src3ptr, i32 %src3_step, i32 %src3_offset, i32 %rowsPerWI) #0 { +entry: + %call = call i64 @__mux_get_global_id(i32 0) #2 + %conv = trunc i64 %call to i32 + %call1 = call i64 @__mux_get_global_id(i32 1) #2 + %0 = trunc i64 %call1 to i32 + %conv3 = mul i32 %0, %rowsPerWI + %cmp = icmp slt i32 %conv, %dst_cols + br i1 %cmp, label %if.then, label %if.end62 + +if.then: ; preds = %entry + %call5 = call spir_func i32 @_Z5mad24iii(i32 %conv, i32 1, i32 %src1_offset) #2 + %call6 = call spir_func i32 @_Z5mad24iii(i32 %conv3, i32 %src1_step, i32 %call5) #2 + %call7 = call spir_func i32 @_Z5mad24iii(i32 %conv, i32 1, i32 %dst_offset) #2 + %call8 = call spir_func i32 @_Z5mad24iii(i32 %conv3, i32 %dst_step, i32 %call7) #2 + %call9 = call spir_func i32 @_Z5mad24iii(i32 %conv, i32 1, i32 %src2_offset) #2 + %call10 = call spir_func i32 @_Z5mad24iii(i32 %conv3, i32 %src2_step, i32 %call9) #2 + %call11 = call spir_func i32 @_Z5mad24iii(i32 %conv, i32 1, i32 %src3_offset) #2 + %call12 = call spir_func i32 @_Z5mad24iii(i32 %conv3, i32 %src3_step, i32 %call11) #2 + %add = add nsw i32 %conv3, %rowsPerWI + %call13 = call spir_func i32 @_Z3minii(i32 %dst_rows, i32 %add) #2 + br label %for.cond + +for.cond: ; preds = %for.end54, %if.then + %src1_index.0 = phi i32 [ %call6, %if.then ], [ %add59, %for.end54 ] + %dst_index.0 = phi i32 [ %call8, %if.then ], [ %add60, %for.end54 ] + %src2_index.0 = phi i32 [ %call10, %if.then ], [ %add55, %for.end54 ] + %src3_index.0 = phi i32 [ %call12, %if.then ], [ %add56, %for.end54 ] + %y.0 = phi i32 [ %conv3, %if.then ], [ %inc58, %for.end54 ] + %cmp14 = icmp slt i32 %y.0, %call13 + br i1 %cmp14, label %for.body, label %if.end62 + +for.body: ; preds = %for.cond + %idx.ext = sext i32 %src1_index.0 to i64 + %add.ptr = getelementptr inbounds i8, i8 addrspace(1)* %src1ptr, i64 %idx.ext + %idx.ext16 = sext i32 %dst_index.0 to i64 + %add.ptr17 = getelementptr inbounds i8, i8 addrspace(1)* %dstptr, i64 %idx.ext16 + %idx.ext18 = sext i32 %src2_index.0 to i64 + %add.ptr19 = getelementptr inbounds i8, i8 addrspace(1)* %src2ptr, i64 %idx.ext18 + %idx.ext20 = sext i32 %src3_index.0 to i64 + %add.ptr21 = getelementptr inbounds i8, i8 addrspace(1)* %src3ptr, i64 %idx.ext20 + br label %for.cond22 + +for.cond22: ; preds = %for.inc49, %for.body + %src1.0 = phi i8 addrspace(1)* [ %add.ptr, %for.body ], [ %add.ptr51, %for.inc49 ] + %src2.0 = phi i8 addrspace(1)* [ %add.ptr19, %for.body ], [ %add.ptr52, %for.inc49 ] + %src3.0 = phi i8 addrspace(1)* [ %add.ptr21, %for.body ], [ %add.ptr53, %for.inc49 ] + %px.0 = phi i32 [ 0, %for.body ], [ %inc50, %for.inc49 ] + %cmp23 = icmp eq i32 %px.0, 0 + br i1 %cmp23, label %for.body25, label %for.end54 + +for.body25: ; preds = %for.cond22 + %1 = zext i32 %px.0 to i64 + %arrayidx = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr17, i64 %1 + store i8 -1, i8 addrspace(1)* 
%arrayidx, align 1 + br label %for.cond26 + +for.cond26: ; preds = %for.inc, %for.body25 + %storemerge = phi i32 [ 0, %for.body25 ], [ %inc, %for.inc ] + %cmp27 = icmp eq i32 %storemerge, 0 + br i1 %cmp27, label %for.body29, label %for.inc49 + +for.body29: ; preds = %for.cond26 + %2 = zext i32 %storemerge to i64 + %arrayidx31 = getelementptr inbounds i8, i8 addrspace(1)* %src2.0, i64 %2 + %3 = load i8, i8 addrspace(1)* %arrayidx31, align 1 + %4 = zext i32 %storemerge to i64 + %arrayidx34 = getelementptr inbounds i8, i8 addrspace(1)* %src1.0, i64 %4 + %5 = load i8, i8 addrspace(1)* %arrayidx34, align 1 + %cmp36 = icmp ugt i8 %3, %5 + br i1 %cmp36, label %if.then46, label %lor.lhs.false + +lor.lhs.false: ; preds = %for.body29 + %6 = zext i32 %storemerge to i64 + %arrayidx39 = getelementptr inbounds i8, i8 addrspace(1)* %src3.0, i64 %6 + %7 = load i8, i8 addrspace(1)* %arrayidx39, align 1 + %8 = zext i32 %storemerge to i64 + %arrayidx42 = getelementptr inbounds i8, i8 addrspace(1)* %src1.0, i64 %8 + %9 = load i8, i8 addrspace(1)* %arrayidx42, align 1 + %cmp44 = icmp ult i8 %7, %9 + br i1 %cmp44, label %if.then46, label %for.inc + +if.then46: ; preds = %lor.lhs.false, %for.body29 + %10 = zext i32 %px.0 to i64 + %arrayidx48 = getelementptr inbounds i8, i8 addrspace(1)* %add.ptr17, i64 %10 + store i8 0, i8 addrspace(1)* %arrayidx48, align 1 + br label %for.inc49 + +for.inc: ; preds = %lor.lhs.false + %inc = add nuw nsw i32 %storemerge, 1 + br label %for.cond26 + +for.inc49: ; preds = %if.then46, %for.cond26 + %inc50 = add nuw nsw i32 %px.0, 1 + %add.ptr51 = getelementptr inbounds i8, i8 addrspace(1)* %src1.0, i64 1 + %add.ptr52 = getelementptr inbounds i8, i8 addrspace(1)* %src2.0, i64 1 + %add.ptr53 = getelementptr inbounds i8, i8 addrspace(1)* %src3.0, i64 1 + br label %for.cond22 + +for.end54: ; preds = %for.cond22 + %add55 = add nsw i32 %src2_index.0, %src2_step + %add56 = add nsw i32 %src3_index.0, %src3_step + %inc58 = add nsw i32 %y.0, 1 + %add59 = add nsw i32 %src1_index.0, %src1_step + %add60 = add nsw i32 %dst_index.0, %dst_step + br label %for.cond + +if.end62: ; preds = %for.cond, %entry + ret void +} + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_global_id(i32) #1 + +; Function Attrs: convergent nounwind readonly +declare spir_func i32 @_Z5mad24iii(i32, i32, i32) #1 + +; Function Attrs: convergent nounwind readonly +declare spir_func i32 @_Z3minii(i32, i32) #1 + +attributes #0 = { convergent nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { convergent nobuiltin nounwind readonly } + +; The purpose of this test is to make sure we correctly replace the uses of +; divergent loop update masks outside the 
loop, even in the pure exit. + +; CHECK: spir_kernel void @__vecz_v4_blend_div_loop +; CHECK: for.cond26.pure_exit: +; CHECK: %if.then46.entry_mask{{[0-9]+}} = or i1 %if.then46.loop_exit_mask{{[0-9]+}}.blend, %if.then46.loop_exit_mask.blend diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll new file mode 100644 index 0000000000000..da33e218bbff8 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_gather_load.ll @@ -0,0 +1,111 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k vecz_scalar_gather_load -vecz-passes="function(simplifycfg),mergereturn,vecz-loop-rotate,cfg-convert,cleanup-divergence" -S < %s | FileCheck %s + +; ModuleID = 'Unknown buffer' +source_filename = "kernel.opencl" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_group_id(i32) + +; Function Attrs: convergent nounwind readonly +declare i64 @__mux_get_local_id(i32) + +; Function Attrs: convergent nounwind +define spir_kernel void @vecz_scalar_gather_load(i32 addrspace(1)* %row_indices, i32 addrspace(1)* %row_blocks, float addrspace(1)* %result) { +entry: + %call1 = call i64 @__mux_get_group_id(i32 0) + %call2 = call i64 @__mux_get_local_id(i32 0) + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %row_blocks, i64 %call1 + %load1 = load i32, i32 addrspace(1)* %arrayidx1, align 4 + %add1 = add i64 %call1, 1 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %row_blocks, i64 %add1 + %load2 = load i32, i32 addrspace(1)* %arrayidx2, align 4 + br label %for.cond + +for.cond: ; preds = %entry, %for.inc + %storemerge = phi i32 [ %load1, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ult i32 %storemerge, %load2 + br i1 %cmp1, label %if.then1, label %for.end + +if.then1: ; preds = %for.cond + %storemerge.zext = zext i32 %storemerge to i64 + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %row_indices, i64 %storemerge.zext + %load3 = load i32, i32 addrspace(1)* %gep1, align 4 + %sub1 = sub i32 %load3, %load1 + %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %row_indices, i64 %storemerge.zext + %load4 = load i32, i32 addrspace(1)* %gep2, align 4 + %sub2 = sub i32 %load4, %load1 + %cmp2 = icmp ugt i32 %sub2, %sub1 + br i1 %cmp2, label %if.then2, label %if.else2 + +if.then2: ; preds = %if.then1 + %sub1.zext = zext i32 %sub1 to i64 + %gep3 = getelementptr inbounds float, float addrspace (1)* %result, i64 %sub1.zext + %load5 = load float, float addrspace(1)* %gep3, align 4 + br label %if.else2 + +if.else2: ; preds = %if.then1, %if.then2 + %ret = phi float [ %load5, %if.then2 ], [ 0.000000e+00, %if.then1 ] + %cmp3 = icmp eq i64 
%call2, 0
+  br i1 %cmp3, label %if.then3, label %for.inc
+
+if.then3: ; preds = %if.else2
+  %gep4 = getelementptr inbounds float, float addrspace(1)* %result, i64 %call2
+  store float %ret, float addrspace(1)* %gep4, align 4
+  br label %for.inc
+
+for.inc: ; preds = %if.then3, %if.else2
+  %inc = add i32 %storemerge, 1
+  br label %for.cond
+
+for.end: ; preds = %for.cond
+  ret void
+}
+
+; The purpose of this test is to ensure we don't generate a masked load for a
+; load from a uniform address, even where it is in a divergent control path.
+; It used to be the case that such a load would become a masked load during
+; control flow conversion, therefore causing it to become a varying load due
+; to the varying mask. However, since the introduction of the Mask Varying
+; attribute, it is possible to support a Uniform load with a Varying mask, so
+; it is no longer necessary to mark all loads in divergent paths as Varying.
+; The somewhat circuitous upshot of this is that the load no longer gets a
+; mask at all, since it was previously only considered to be in a divergent
+; path on account of another Mask Varying load!
+
+; CHECK: spir_kernel void @__vecz_v4_vecz_scalar_gather_load
+
+; This load depends only on the uniform loop iterator
+; CHECK: if.then1:
+; CHECK: %[[IND:.+]] = phi i32
+; CHECK: %[[ZIND:.+]] = zext i32 %[[IND]] to i64
+; CHECK: %[[GEP1:.+]] = getelementptr inbounds i32, ptr addrspace(1) %row_indices, i64 %[[ZIND]]
+; CHECK: %{{.+}} = load i32, ptr addrspace(1) %[[GEP1]]
+
+; This load depends only on other uniform loads
+; CHECK: if.then2:
+; CHECK-NOT: declare float @__vecz_b_masked_gather_load4_
+; CHECK-NOT: declare float @__vecz_b_masked_load4_
+; CHECK: %[[GEP2:.+]] = getelementptr inbounds float, ptr addrspace(1) %result
+; CHECK: %{{.+}} = load float, ptr addrspace(1) %[[GEP2]]
+
+; The store instruction is definitely in a divergent path, however, so it
+; needs a mask.
+; CHECK: if.then3:
+; CHECK: call void @__vecz_b_masked_store4_f
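+
+; For reference, a rough OpenCL C equivalent of the kernel above (an
+; illustrative reconstruction from the IR, not the original source;
+; row_indices[i] is deliberately read twice, producing the two uniform loads
+; the CHECK lines above refer to):
+;
+;   kernel void vecz_scalar_gather_load(global int *row_indices,
+;                                       global int *row_blocks,
+;                                       global float *result) {
+;     size_t group = get_group_id(0);
+;     size_t lid = get_local_id(0);
+;     for (uint i = row_blocks[group]; i < row_blocks[group + 1]; i++) {
+;       uint a = row_indices[i] - row_blocks[group];
+;       uint b = row_indices[i] - row_blocks[group];
+;       float r = (b > a) ? result[a] : 0.0f;
+;       if (lid == 0) result[lid] = r;
+;     }
+;   }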
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
new file mode 100644
index 0000000000000..d1085569a5207
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/vecz_scalar_interleaved_load.ll
@@ -0,0 +1,83 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+; License for the specific language governing permissions and limitations
+; under the License.
+;
+; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+; RUN: veczc -k vecz_scalar_interleaved_load -vecz-passes=cfg-convert,packetizer -vecz-simd-width=4 -S < %s | FileCheck %s
+
+; ModuleID = 'Unknown buffer'
+source_filename = "Unknown buffer"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "spir64-unknown-unknown"
+
+; Function Attrs: nounwind readnone
+declare i64 @__mux_get_global_id(i32) #0
+
+define spir_kernel void @vecz_scalar_interleaved_load(float addrspace(1)* %out, i64 %n, float %m) {
+entry:
+  %gid0 = tail call i64 @__mux_get_global_id(i32 0) #0
+  %gid1 = tail call i64 @__mux_get_global_id(i32 1) #0
+  %cmp1 = icmp slt i64 %gid0, %n
+  br i1 %cmp1, label %if.then1, label %end
+
+if.then1: ; preds = %entry
+  %gep1 = getelementptr inbounds float, float addrspace(1)* %out, i64 %gid1
+  %cmp2 = fcmp une float %m, 0.000000e+00
+  br i1 %cmp2, label %if.then2, label %if.else2
+
+if.then2: ; preds = %if.then1
+  %mul1 = mul nsw i64 %gid0, %n
+  %gep2 = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %mul1
+  %cmp3 = icmp slt i64 %gid1, %n
+  %load1 = load float, float addrspace(1)* %gep2, align 4
+  %ie1 = insertelement <4 x float> poison, float %load1, i32 0
+  br i1 %cmp3, label %if.then3, label %if.else3
+
+if.then3: ; preds = %if.then2
+  %load2 = load float, float addrspace(1)* %gep2, align 4
+  br label %if.else3
+
+if.else3: ; preds = %if.then2, %if.then3
+  %phi_load2 = phi float [ %load2, %if.then3 ], [ 0.000000e+00, %if.then2 ]
+  %ie2 = insertelement <4 x float> %ie1, float %phi_load2, i32 1
+  %load3 = load float, float addrspace(1)* %gep2, align 4
+  %ie3 = insertelement <4 x float> %ie2, float %load3, i32 2
+  %load4 = load float, float addrspace(1)* %gep2, align 4
+  %ie4 = insertelement <4 x float> %ie3, float %load4, i32 3
+  br label %if.else2
+
+if.else2: ; preds = %if.else3, %if.then1
+  %ret_vec = phi <4 x float> [ %ie4, %if.else3 ], [ zeroinitializer, %if.then1 ]
+  %ret = extractelement <4 x float> %ret_vec, i32 0
+  %ret_gep = getelementptr inbounds float, float addrspace(1)* %gep1, i64 %gid1
+  store float %ret, float addrspace(1)* %ret_gep, align 4
+  br label %end
+
+end: ; preds = %entry, %if.else2
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+
+; The purpose of this test is to ensure we correctly generate a scalar
+; masked load for a scalar load that has a strided pointer, instead of
+; generating an interleaved masked load for a non-vector load (which is
+; invalid).
+
+; The middle optimizations break this test because, after scalarization,
+; some of the vector elements become dead code and thus an interleaved
+; load is in fact generated (although correctly, in this case).
+
+; CHECK: spir_kernel void @__vecz_v4_vecz_scalar_interleaved_load
+; CHECK: declare float @__vecz_b_masked_load4_fu3ptrU3AS1b
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workgroup_scans.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workgroup_scans.ll
new file mode 100644
index 0000000000000..2496b1b1d675d
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workgroup_scans.ll
@@ -0,0 +1,204 @@
+; Copyright (C) Codeplay Software Limited
+;
+; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+; Exceptions; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. +; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -w 4 -S -vecz-passes=packetizer < %s | FileCheck %s + +target triple = "spir64-unknown-unknown" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +declare i64 @__mux_get_global_id(i32) + +declare i32 @__mux_work_group_scan_inclusive_add_i32(i32, i32) +declare i64 @__mux_work_group_scan_inclusive_add_i64(i32, i64) +declare float @__mux_work_group_scan_inclusive_fadd_f32(i32, float) + +declare i32 @__mux_work_group_scan_inclusive_smin_i32(i32, i32) +declare i32 @__mux_work_group_scan_inclusive_umin_i32(i32, i32) +declare i32 @__mux_work_group_scan_inclusive_smax_i32(i32, i32) +declare i32 @__mux_work_group_scan_inclusive_umax_i32(i32, i32) +declare float @__mux_work_group_scan_inclusive_fmin_f32(i32, float) +declare float @__mux_work_group_scan_inclusive_fmax_f32(i32, float) + +define spir_kernel void @reduce_scan_incl_add_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call i32 @__mux_work_group_scan_inclusive_add_i32(i32 0, i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_add_Dv4_j(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_work_group_scan_exclusive_add_i32(i32 0, i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = add <4 x i32> [[SCAN]], [[SPLAT]] +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_add_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i64, i64 addrspace(1)* %in, i64 %call + %0 = load i64, i64 addrspace(1)* %arrayidx, align 4 + %call1 = tail call i64 @__mux_work_group_scan_inclusive_add_i64(i32 0, i64 %0) + %arrayidx2 = getelementptr inbounds i64, i64 addrspace(1)* %out, i64 %call + store i64 %call1, i64 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_i64( +; CHECK: [[SCAN:%.*]] = call <4 x i64> @__vecz_b_sub_group_scan_inclusive_add_Dv4_m(<4 x i64> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i64 @__mux_work_group_scan_exclusive_add_i64(i32 0, i64 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i64> poison, i64 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i64> [[HEAD]], <4 x i64> poison, <4 x i32> zeroinitializer +; 
CHECK: [[FINAL:%.*]] = add <4 x i64> [[SCAN]], [[SPLAT]]
+; CHECK: store <4 x i64> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_add_f32(float addrspace(1)* %in, float addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call
+  %0 = load float, float addrspace(1)* %arrayidx, align 4
+  %call1 = tail call float @__mux_work_group_scan_inclusive_fadd_f32(i32 0, float %0)
+  %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call
+  store float %call1, float addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_add_f32(
+; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_add_Dv4_f(<4 x float> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.0{{.*}}, <4 x float> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_work_group_scan_exclusive_fadd_f32(i32 0, float [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = fadd <4 x float> [[SCAN]], [[SPLAT]]
+; CHECK: store <4 x float> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_smin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = tail call i32 @__mux_work_group_scan_inclusive_smin_i32(i32 0, i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smin_i32(
+; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smin_Dv4_i(<4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_work_group_scan_exclusive_smin_i32(i32 0, i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]])
+; CHECK: store <4 x i32> [[FINAL]],
+}
+
+define spir_kernel void @reduce_scan_incl_umin_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %call = tail call i64 @__mux_get_global_id(i32 0)
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call
+  %0 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %call1 = tail call i32 @__mux_work_group_scan_inclusive_umin_i32(i32 0, i32 %0)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call
+  store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umin_i32(
+; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umin_Dv4_j(<4 x i32> [[INPUT:%.*]])
+; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[INPUT]])
+; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_work_group_scan_exclusive_umin_i32(i32 0, i32 [[SUM]])
+; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0
+; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison,
<4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]]) +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_smax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call i32 @__mux_work_group_scan_inclusive_smax_i32(i32 0, i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_smax_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_smax_Dv4_i(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_work_group_scan_exclusive_smax_i32(i32 0, i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]]) +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_umax_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %call + %0 = load i32, i32 addrspace(1)* %arrayidx, align 4 + %call1 = tail call i32 @__mux_work_group_scan_inclusive_umax_i32(i32 0, i32 %0) + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %call + store i32 %call1, i32 addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_umax_i32( +; CHECK: [[SCAN:%.*]] = call <4 x i32> @__vecz_b_sub_group_scan_inclusive_umax_Dv4_j(<4 x i32> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call i32 @__mux_work_group_scan_exclusive_umax_i32(i32 0, i32 [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x i32> poison, i32 [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x i32> [[HEAD]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[SCAN]], <4 x i32> [[SPLAT]]) +; CHECK: store <4 x i32> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_fmin_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call float @__mux_work_group_scan_inclusive_fmin_f32(i32 0, float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmin_f32( +; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_min_Dv4_f(<4 x float> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_work_group_scan_exclusive_fmin_f32(i32 0, float [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float 
[[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[SCAN]], <4 x float> [[SPLAT]]) +; CHECK: store <4 x float> [[FINAL]], +} + +define spir_kernel void @reduce_scan_incl_fmax_f32(float addrspace(1)* %in, float addrspace(1)* %out) { +entry: + %call = tail call i64 @__mux_get_global_id(i32 0) + %arrayidx = getelementptr inbounds float, float addrspace(1)* %in, i64 %call + %0 = load float, float addrspace(1)* %arrayidx, align 4 + %call1 = tail call float @__mux_work_group_scan_inclusive_fmax_f32(i32 0, float %0) + %arrayidx2 = getelementptr inbounds float, float addrspace(1)* %out, i64 %call + store float %call1, float addrspace(1)* %arrayidx2, align 4 + ret void +; CHECK-LABEL: @__vecz_v4_reduce_scan_incl_fmax_f32( +; CHECK: [[SCAN:%.*]] = call <4 x float> @__vecz_b_sub_group_scan_inclusive_max_Dv4_f(<4 x float> [[INPUT:%.*]]) +; CHECK: [[SUM:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[INPUT]]) +; CHECK: [[EXCL_SCAN:%.*]] = call float @__mux_work_group_scan_exclusive_fmax_f32(i32 0, float [[SUM]]) +; CHECK: [[HEAD:%.*]] = insertelement <4 x float> poison, float [[EXCL_SCAN]], {{(i32|i64)}} 0 +; CHECK: [[SPLAT:%.*]] = shufflevector <4 x float> [[HEAD]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK: [[FINAL:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[SCAN]], <4 x float> [[SPLAT]]) +; CHECK: store <4 x float> [[FINAL]], +} diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll new file mode 100644 index 0000000000000..3461a335d6845 --- /dev/null +++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/test/lit/llvm/workitem_builtins.ll @@ -0,0 +1,104 @@ +; Copyright (C) Codeplay Software Limited +; +; Licensed under the Apache License, Version 2.0 (the "License") with LLVM +; Exceptions; you may not use this file except in compliance with the License. +; You may obtain a copy of the License at +; +; https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt +; +; Unless required by applicable law or agreed to in writing, software +; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +; License for the specific language governing permissions and limitations +; under the License. 
+; +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +; RUN: veczc -k dont_mask_workitem_builtins -S < %s | FileCheck %s + +; ModuleID = 'kernel.opencl' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "spir64-unknown-unknown" + +; Function Attrs: nounwind +define spir_kernel void @dont_mask_workitem_builtins(i32 addrspace(2)* %in, i32 addrspace(1)* %out) #0 { +entry: + %call = call i64 @__mux_get_local_id(i32 0) #5 + %conv = trunc i64 %call to i32 + %cmp = icmp sgt i32 %conv, 0 + br i1 %cmp, label %if.then, label %if.else + +if.then: ; preds = %entry + fence syncscope("singlethread") acq_rel + %call2 = call i64 @__mux_get_global_id(i32 0) #5 + %conv3 = trunc i64 %call2 to i32 + %idxprom = sext i32 %conv3 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(2)* %in, i64 %idxprom + %0 = load i32, i32 addrspace(2)* %arrayidx, align 4 + %idxprom4 = sext i32 %conv3 to i64 + %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom4 + store i32 %0, i32 addrspace(1)* %arrayidx5, align 4 + br label %if.end + +if.else: ; preds = %entry + %call8 = call i64 @__mux_get_local_size(i32 0) #5 + %call9 = call i64 @__mux_get_group_id(i32 0) #5 + %mul = mul i64 %call9, %call8 + %add = add i64 %mul, %call + %sext = shl i64 %add, 32 + %idxprom11 = ashr exact i64 %sext, 32 + %arrayidx12 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %idxprom11 + store i32 42, i32 addrspace(1)* %arrayidx12, align 4 + br label %if.end + +if.end: ; preds = %if.else, %if.then + ret void +} + +declare i64 @__mux_get_local_id(i32) #1 + +declare i64 @__mux_get_global_id(i32) #1 + +declare i64 @__mux_get_local_size(i32) #1 + +declare i64 @__mux_get_group_id(i32) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="0" "stackrealign" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { noinline } +attributes #3 = { argmemonly nounwind } +attributes #4 = { argmemonly nounwind readonly } +attributes #5 = { nobuiltin nounwind } +attributes #6 = { nounwind } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(2)*, i32 addrspace(1)*)* @dont_mask_workitem_builtins, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 2, i32 1} +!2 = !{!"kernel_arg_access_qual", !"none", !"none"} +!3 = !{!"kernel_arg_type", !"int*", !"int*"} +!4 = !{!"kernel_arg_base_type", !"int*", !"int*"} +!5 = !{!"kernel_arg_type_qual", !"const", !""} +!6 = !{!"clang version 3.8.1 "} + +; The vectorized function +; CHECK: define spir_kernel void @__vecz_v[[WIDTH:[0-9]+]]_dont_mask_workitem_builtins( + +; Check if the builtins are still here +; CHECK: call i64 @__mux_get_local_id(i32 0) +; CHECK: call i64 @__mux_get_local_size(i32 0) +; CHECK: call i64 @__mux_get_group_id(i32 0) +; CHECK: fence syncscope("singlethread") acq_rel +; CHECK: call i64 @__mux_get_global_id(i32 0) +; CHECK-NOT: call spir_func i64 @__vecz_b_masked___mux_get_global_id(i32 +; CHECK-NOT: call spir_func i64 @__vecz_b_masked___mux_get_local_size(i32 +; CHECK-NOT: call spir_func i64 @__vecz_b_masked___mux_get_group_id(i32 + +; Function end +; CHECK: ret void + +; Also 
check that we haven't declared the masked functions
+; CHECK-NOT: define private spir_func i64 @__vecz_b_masked___mux_get_global_id(i32, i1)
+; CHECK-NOT: define private spir_func i64 @__vecz_b_masked___mux_get_local_size(i32, i1)
+; CHECK-NOT: define private spir_func i64 @__vecz_b_masked___mux_get_group_id(i32, i1)
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/CMakeLists.txt
new file mode 100644
index 0000000000000..921204b382aa0
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/CMakeLists.txt
@@ -0,0 +1,14 @@
+llvm_map_components_to_libnames(llvm_libs all ${LLVM_TARGETS_TO_BUILD})
+list(REMOVE_ITEM llvm_libs LTO OptRemarks)
+
+add_llvm_tool(veczc
+  ${CMAKE_CURRENT_SOURCE_DIR}/source/veczc.cpp
+)
+target_compile_options(veczc PRIVATE ${VECZ_COMPILE_OPTIONS})
+target_compile_definitions(veczc PRIVATE ${VECZ_COMPILE_DEFINITIONS})
+target_include_directories(veczc PRIVATE
+  ${CMAKE_CURRENT_SOURCE_DIR}/include
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../compiler_pipeline/include
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../vecz/include
+  )
+target_link_libraries(veczc PUBLIC ${llvm_libs})
diff --git a/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
new file mode 100644
index 0000000000000..5c4a4f228db00
--- /dev/null
+++ b/llvm/lib/SYCLNativeCPUUtils/compiler_passes/vecz/tools/source/veczc.cpp
@@ -0,0 +1,465 @@
+// Copyright (C) Codeplay Software Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License") with LLVM
+// Exceptions; you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://github.com/uxlfoundation/oneapi-construction-kit/blob/main/LICENSE.txt
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <string>
+
+#include <compiler/utils/builtin_info.h>
+#include <compiler/utils/device_info.h>
+#include <compiler/utils/metadata.h>
+#include <compiler/utils/optimal_builtin_replacement_pass.h>
+#include <compiler/utils/pass_machinery.h>
+#include <compiler/utils/sub_group_analysis.h>
+#include <llvm/ADT/DenseMap.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/ADT/Statistic.h>
+#include <llvm/Analysis/CGSCCPassManager.h>
+#include <llvm/Analysis/TargetTransformInfo.h>
+#include <llvm/Bitcode/BitcodeWriterPass.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/PassManager.h>
+#include <llvm/IRPrinter/IRPrintingPasses.h>
+#include <llvm/IRReader/IRReader.h>
+#include <llvm/InitializePasses.h>
+#include <llvm/MC/TargetRegistry.h>
+#include <llvm/PassRegistry.h>
+#include <llvm/Support/CommandLine.h>
+#include <llvm/Support/FileSystem.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/Support/Process.h>
+#include <llvm/Support/SourceMgr.h>
+#include <llvm/Support/TargetSelect.h>
+#include <llvm/Support/ToolOutputFile.h>
+#include <llvm/Target/TargetMachine.h>
+#include <llvm/Target/TargetOptions.h>
+#include <llvm/TargetParser/Triple.h>
+
+#include <multi_llvm/llvm_version.h>
+
+#include "vecz/pass.h"
+#include "vecz/vecz_target_info.h"
+
+static llvm::cl::opt<std::string>
+    InputFilename(llvm::cl::Positional, llvm::cl::desc("<input file>"),
+                  llvm::cl::init("-"));
+
+static llvm::cl::opt<std::string>
+    OutputFilename("o", llvm::cl::desc("Override output filename"),
+                   llvm::cl::value_desc("filename"));
+static llvm::cl::opt<bool>
+    WriteTextual("S", llvm::cl::desc("Write module as text"));
+
+static llvm::cl::list<std::string>
+    KernelNameSpecs("k", llvm::cl::desc("Kernel to vectorize"),
+                    llvm::cl::ZeroOrMore, llvm::cl::value_desc("name"));
+
+static llvm::cl::opt<unsigned>
+    SIMDDimIdx("d", llvm::cl::desc("Dimension index to vectorize on"),
+               llvm::cl::init(0), llvm::cl::value_desc("dimension"));
+
+static llvm::cl::opt<unsigned>
+    SIMDWidth("w", llvm::cl::desc("Width to vectorize to"), llvm::cl::init(0),
+              llvm::cl::value_desc("width"));
+
+static llvm::cl::opt<bool> FailQuietly(
+    "vecz-fail-quietly",
+    llvm::cl::desc("don't return an error code on vectorization failure"));
+
+static llvm::cl::opt<bool>
+    ChoicesHelp("vecz-choices-help",
+                llvm::cl::desc("see information about available choices"));
+
+static llvm::cl::opt<bool>
+    VeczAuto("vecz-auto",
+             llvm::cl::desc("run the vectorizer if it is found to be useful"));
+
+static llvm::cl::opt<unsigned> VeczSimdWidth(
+    "vecz-simd-width",
+    llvm::cl::desc("manually set the SIMD width for the vectorizer"));
+
+static llvm::cl::opt<llvm::cl::boolOrDefault> VeczScalable(
+    "vecz-scalable",
+    llvm::cl::desc("force scalable vectorization for the vectorizer"));
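+
+// Typical invocations look like the following (illustrative; the input files
+// and kernel names are placeholders):
+//
+//   veczc -k add:8 -o out.bc kernel.bc
+//   veczc -w 4 -S -o out.ll kernel.ll
+//   veczc -vecz-simd-width=4 -S < kernel.ll > out.ll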
+
+// Allow the passing of Vecz Choices string on the command line. This is parsed
+// after the choices environment variable, thus overriding it.
+static llvm::cl::opt<std::string>
+    ChoicesString("vecz-choices", llvm::cl::desc("Set vecz choices"));
+
+static llvm::cl::opt<bool>
+    VeczCollectStats("vecz-llvm-stats",
+                     llvm::cl::desc("enable reporting LLVM statistics"));
+
+static llvm::cl::opt<std::string>
+    UserTriple("vecz-target-triple", llvm::cl::desc("the target triple"));
+static llvm::cl::opt<std::string> UserCPU("vecz-target-mcpu",
+                                          llvm::cl::desc("Set the CPU model"));
+static llvm::cl::opt<std::string>
+    CPUFeatures("vecz-target-features",
+                llvm::cl::desc("Set the CPU feature string"));
+static llvm::cl::opt<bool> DoubleSupport(
+    "vecz-double-support", llvm::cl::init(true),
+    llvm::cl::desc(
+        "Assume the target has double-precision floating point support"));
+
+static llvm::cl::list<unsigned>
+    SGSizes("device-sg-sizes",
+            llvm::cl::desc("Comma-separated list of supported sub-group sizes"),
+            llvm::cl::CommaSeparated);
+
+static llvm::TargetMachine *initLLVMTarget(llvm::StringRef triple_string,
+                                           llvm::StringRef cpu_model,
+                                           llvm::StringRef target_features) {
+  const llvm::Triple triple(triple_string);
+  llvm::InitializeAllTargets();
+  llvm::InitializeAllTargetMCs();
+  llvm::InitializeAllAsmPrinters();
+  llvm::InitializeAllAsmParsers();
+  llvm::InitializeAllDisassemblers();
+
+  llvm::TargetOptions opts;
+  opts.DisableIntegratedAS = false;
+  std::string e;
+  const llvm::Target *target =
+      llvm::TargetRegistry::lookupTarget(triple.getTriple(), e);
+  if (!target) {
+    (void)::fprintf(stderr, "can't get target %s:%s\n",
+                    triple.getTriple().c_str(), e.c_str());
+    ::exit(1);
+  }
+  llvm::PassRegistry &registry = *llvm::PassRegistry::getPassRegistry();
+  llvm::initializeAlwaysInlinerLegacyPassPass(registry);
+#if LLVM_VERSION_GREATER_EQUAL(21, 0)
+  return target->createTargetMachine(triple, cpu_model, target_features, opts,
+                                     llvm::Reloc::Model::Static);
+#else
+  return target->createTargetMachine(triple.getTriple(), cpu_model,
+                                     target_features, opts,
+                                     llvm::Reloc::Model::Static);
+#endif
+}
+
+static vecz::VeczPassOptions getDefaultPassOptions() {
+  // Enable/disable Choices from the CODEPLAY_VECZ_CHOICES environment
+  // variable.
+  vecz::VectorizationChoices Choices;
+
+  const char *ptr = std::getenv("CODEPLAY_VECZ_CHOICES");
+  if (ptr && !Choices.parseChoicesString(ptr)) {
+    llvm::errs()
+        << "Failed to parse the CODEPLAY_VECZ_CHOICES env variable.\n"
+           "Use --vecz-choices-help for available choices and usage info.\n";
+    ::exit(1);
+  }
+
+  // Parse the Vecz choices given in the command line
+  const std::string &ch = ChoicesString;
+  if (!ch.empty() && !Choices.parseChoicesString(ch)) {
+    llvm::errs()
+        << "Failed to parse the --vecz-choices command line option.\n"
+           "Use --vecz-choices-help for available choices and usage info.\n";
+    ::exit(1);
+  }
+
+  if (VeczCollectStats) {
+    llvm::EnableStatistics(true);
+  }
+
+  const auto factor = SIMDWidth ? SIMDWidth : 4;
+  auto VF = llvm::ElementCount::get(VeczSimdWidth ? VeczSimdWidth : factor,
+                                    VeczScalable == llvm::cl::BOU_TRUE);
+
+  vecz::VeczPassOptions passOpts;
+  passOpts.choices = Choices;
+  passOpts.factor = VF;
+  passOpts.vecz_auto = VeczAuto;
+  passOpts.vec_dim_idx = SIMDDimIdx;
+  passOpts.local_size = SIMDWidth;
+  return passOpts;
+}
+
+// Parse a command line vectorization specification for a given kernel
+// <spec>        ::= <kernel-name> ':' <option-list>
+// <kernel-name> ::= <identifier>
+// <options>     ::= <width>(opt)<dim>(opt)
+//                   <local-size>(opt)<choices>(opt)
+// <option-list> ::= <options> ',' <option-list>
+// <number>      ::= [0-9]+
+// <identifier>  ::= [a-zA-Z_][a-zA-Z_0-9]+
+// <dim>         ::= '.' [123]
+// <width>       ::= <number>
+// <width>       ::= 'a' // automatic vectorization factor
+// <local-size>  ::= '@' <number>
+// <choices>     ::= 's' // scalable vectorization factor
+// <choices>     ::= 'p' // vector predication
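+//
+// For example (the kernel name 'foo' and the values are illustrative):
+//   -k foo          vectorize 'foo' with the default options
+//   -k foo:8        vectorize by a fixed factor of 8
+//   -k foo:4.2      vectorize by 4 on the dimension selected by '.2'
+//   -k foo:16s      vectorize by a scalable factor of 16
+//   -k foo:a,4@64p  auto-vectorize, plus a 4-wide variant that assumes a
+//                   local size of 64 and enables vector predication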
+static bool
+parsePassOptionsSwitch(const llvm::StringRef spec, llvm::StringRef &name,
+                       llvm::SmallVectorImpl<vecz::VeczPassOptions> &opts) {
+  auto pair = spec.split(':');
+  name = pair.first;
+  auto vals = pair.second;
+  auto defaults = getDefaultPassOptions();
+  if (!name.size()) {
+    return false;
+  }
+  if (!vals.empty()) {
+    do {
+      // HEREBEDRAGONS: The return status of `consumeInteger` and
+      // `consume_front` are "failed" and "succeeded" respectively. It's
+      // opposite day somewhere in llvm land...
+      unsigned vf;
+      auto opt = defaults;
+      if (vals.consume_front("a")) {
+        opt.vecz_auto = true;
+      } else if (!vals.consumeInteger(10, vf)) {
+        opt.factor = llvm::ElementCount::getFixed(vf);
+      }
+      if (vals.consume_front(".")) {
+        unsigned dim;
+        if (vals.consumeInteger(10, dim)) {
+          return false;
+        }
+        if (!dim || dim > 3) {
+          return false;
+        }
+        opt.vec_dim_idx = dim;
+      }
+      if (vals.consume_front("@")) {
+        unsigned simd_width;
+        if (vals.consumeInteger(10, simd_width)) {
+          return false;
+        }
+        opt.local_size = simd_width;
+      }
+      // <choices> ::= 's'
+      if (vals.consume_front("s")) {
+        opt.factor =
+            llvm::ElementCount::getScalable(opt.factor.getKnownMinValue());
+      }
+      // <choices> ::= 'p'
+      if (vals.consume_front("p")) {
+        opt.choices.enableVectorPredication();
+      }
+      opts.push_back(opt);
+    } while (vals.consume_front(",") && !vals.empty());
+    if (!vals.empty()) {
+      return false;
+    }
+  } else {
+    opts.push_back(defaults);
+  }
+  return true;
+}
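+
+// Map from each kernel name given on the command line (or, failing that,
+// each SPIR kernel found in the module) to the vectorization variants
+// requested for it.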
+using KernelOptMap =
+    llvm::SmallDenseMap<llvm::StringRef,
+                        llvm::SmallVector<vecz::VeczPassOptions, 1>, 1>;
+
+int main(const int argc, const char *const argv[]) {
+  llvm::cl::ParseCommandLineOptions(argc, argv);
+
+  if (ChoicesHelp) {
+    const auto &Infos = vecz::VectorizationChoices::queryAvailableChoices();
+    llvm::outs() << "Available Vecz Choices:\n\n";
+    for (const auto &Info : Infos) {
+      llvm::outs() << " * " << Info.name << ":\n";
+      llvm::outs() << "   " << Info.desc << "\n\n";
+    }
+    llvm::outs() << "Separate multiple items with any one of [:;,].\n"
+                    "Prefix any choice with \"no\" to disable that option.\n";
+    return 0;
+  }
+
+  // If the user didn't specify an output filename, but is reading from stdin,
+  // output to stdout. This may be emitting binary, but trust the user to know
+  // what they're doing. We could also emit a warning.
+  if (OutputFilename.empty() && InputFilename == "-") {
+    OutputFilename = "-";
+  }
+
+  if (OutputFilename.empty()) {
+    llvm::errs() << "Error: no output filename was given (use -o <filename>)\n";
+    return 1;
+  }
+
+  llvm::SMDiagnostic err;
+  llvm::LLVMContext context;
+
+  std::unique_ptr<llvm::Module> module =
+      llvm::parseIRFile(InputFilename, err, context);
+
+  if (!module) {
+    auto errorOrInputFile =
+        llvm::MemoryBuffer::getFileOrSTDIN(InputFilename.getValue());
+
+    // If there was an error in getting the input file.
+    if (!errorOrInputFile) {
+      llvm::errs() << "Error: " << errorOrInputFile.getError().message()
+                   << " '" << InputFilename.getValue() << "'\n";
+      return 1;
+    }
+
+    llvm::errs() << "Error: bitcode file was malformed\n";
+    err.print("veczc", llvm::errs(),
+              llvm::sys::Process::StandardErrHasColors());
+    return 1;
+  }
+
+  KernelOptMap kernelOpts;
+  if (KernelNameSpecs.empty()) {
+    auto defaults = getDefaultPassOptions();
+    for (const auto &f : *module) {
+      if (f.getCallingConv() != llvm::CallingConv::SPIR_KERNEL) {
+        continue;
+      }
+      kernelOpts[f.getName()].push_back(defaults);
+    }
+  } else {
+    for (const auto &S : KernelNameSpecs) {
+      llvm::StringRef name;
+      llvm::SmallVector<vecz::VeczPassOptions, 1> opts;
+      if (!parsePassOptionsSwitch(S, name, opts)) {
+        (void)::fprintf(
+            stderr,
+            "failed to parse kernel vectorization specification '%s'\n",
+            name.str().c_str());
+        return 1;
+      }
+      if (!module->getFunction(name)) {
+        llvm::errs() << "Error: no such kernel to vectorize ('" << name
+                     << "')\n";
+        return 1;
+      }
+      kernelOpts[name] = std::move(opts);
+    }
+  }
+
+  // Open the file.
+  std::error_code EC;
+  llvm::sys::fs::OpenFlags OpenFlags = llvm::sys::fs::OF_None;
+  if (WriteTextual) {
+    OpenFlags |= llvm::sys::fs::OF_Text;
+  }
+  auto Out =
+      std::make_unique<llvm::ToolOutputFile>(OutputFilename, EC, OpenFlags);
+  if (EC || !Out) {
+    llvm::errs() << EC.message() << '\n';
+    return 1;
+  }
+
+  std::unique_ptr<llvm::TargetMachine> tm(
+      UserTriple.size() ? initLLVMTarget(UserTriple, UserCPU, CPUFeatures)
+                        : nullptr);
+  assert(!UserTriple.size() || tm);
+  if (tm) {
+#if LLVM_VERSION_GREATER_EQUAL(21, 0)
+    module->setTargetTriple(tm->getTargetTriple());
+#else
+    module->setTargetTriple(tm->getTargetTriple().getTriple());
+#endif
+    module->setDataLayout(tm->createDataLayout());
+  }
+
+  compiler::utils::PassMachinery passMach(context, tm.get());
+
+  auto TICallback = [&](const llvm::Module &) {
+    return vecz::createTargetInfoFromTargetMachine(tm.get());
+  };
+
+  passMach.initializeStart();
+  passMach.getMAM().registerPass(
+      [&] { return vecz::TargetInfoAnalysis(TICallback); });
+  passMach.getMAM().registerPass(
+      [&] { return compiler::utils::BuiltinInfoAnalysis(); });
+  passMach.getMAM().registerPass(
+      [&] { return compiler::utils::SubgroupAnalysis(); });
+  passMach.getFAM().registerPass([] { return llvm::TargetIRAnalysis(); });
+  passMach.getMAM().registerPass([] {
+    compiler::utils::DeviceInfo Info{/*half*/ 0, /*float*/ 0, DoubleSupport,
+                                     /*MaxWorthWidth*/ 64};
+    for (const auto S : SGSizes) {
+      Info.reqd_sub_group_sizes.push_back(S);
+    }
+    return compiler::utils::DeviceInfoAnalysis(Info);
+  });
+  passMach.getMAM().registerPass([&kernelOpts] {
+    return vecz::VeczPassOptionsAnalysis(
+        [&kernelOpts](llvm::Function &F, llvm::ModuleAnalysisManager &,
+                      llvm::SmallVectorImpl<vecz::VeczPassOptions> &Opts) {
+          auto it = kernelOpts.find(F.getName());
+          if (it == kernelOpts.end()) {
+            return false;
+          }
+          Opts.assign(it->second.begin(), it->second.end());
+          return true;
+        });
+  });
+  passMach.initializeFinish();
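+
+  // Build the vectorization pipeline: a builtin-replacement pass over the
+  // call graph first, then the vectorizer itself.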
+  llvm::ModulePassManager PM;
+
+  // Forcibly compute the BuiltinInfoAnalysis so that cached retrievals work.
+  PM.addPass(llvm::RequireAnalysisPass<compiler::utils::BuiltinInfoAnalysis,
+                                       llvm::Module>());
+
+  PM.addPass(llvm::createModuleToPostOrderCGSCCPassAdaptor(
+      compiler::utils::OptimalBuiltinReplacementPass()));
+  PM.addPass(vecz::RunVeczPass());
+  PM.run(*module, passMach.getMAM());
+
+  // If the user has specified a list of kernels to vectorize, we need to
+  // check we've matched their expectations. If they didn't specify, we work
+  // on a "best-effort" basis.
+  if (!KernelNameSpecs.empty()) {
+    for (auto p : kernelOpts) {
+      auto &f = *module->getFunction(p.first);
+      const auto &requested = p.getSecond();
+      llvm::SmallVector<compiler::utils::LinkMetadataResult, 4> results;
+      compiler::utils::parseOrigToVeczFnLinkMetadata(f, results);
+      for (auto &expected : requested) {
+        if (expected.vecz_auto) {
+          continue;
+        }
+        bool found = false;
+        for (auto &result : results) {
+          // FIXME: this is probably not the best way to do this
+          found |= result.second.vf.getKnownMinValue() >=
+                   expected.factor.getKnownMinValue();
+        }
+        if (!found) {
+          llvm::errs() << "Error: Failed to vectorize function '" << f.getName()
+                       << "'\n";
+          return FailQuietly ? 0 : 1;
+        }
+      }
+    }
+  }
+
+  // Write the resulting module.
+  llvm::ModulePassManager printMPM;
+  if (WriteTextual) {
+    printMPM.addPass(llvm::PrintModulePass(Out->os()));
+  } else {
+    printMPM.addPass(llvm::BitcodeWriterPass(Out->os()));
+  }
+  printMPM.run(*module, passMach.getMAM());
+
+  Out->keep();
+
+  if (llvm::AreStatisticsEnabled()) {
+    llvm::PrintStatistics();
+  }
+  return 0;
+}