diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td
index bbaad03a27479..4210a0d150abf 100644
--- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td
+++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td
@@ -82,6 +82,7 @@ def AspectExt_oneapi_limited_graph : Aspect<"ext_oneapi_limited_graph">;
 def AspectExt_oneapi_private_alloca : Aspect<"ext_oneapi_private_alloca">;
 def AspectExt_oneapi_queue_profiling_tag : Aspect<"ext_oneapi_queue_profiling_tag">;
 def AspectExt_oneapi_virtual_mem : Aspect<"ext_oneapi_virtual_mem">;
+def AspectExt_oneapi_cuda_cluster_group : Aspect<"ext_oneapi_cuda_cluster_group">;
 // Deprecated aspects
 def AspectInt64_base_atomics : Aspect<"int64_base_atomics">;
 def AspectInt64_extended_atomics : Aspect<"int64_extended_atomics">;
@@ -139,7 +140,7 @@ def : TargetInfo<"__TestAspectList",
     AspectExt_oneapi_ballot_group, AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group,
     AspectExt_oneapi_tangle_group, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component,
     AspectExt_oneapi_graph, AspectExt_intel_fpga_task_sequence, AspectExt_oneapi_limited_graph,
-    AspectExt_oneapi_private_alloca, AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem],
+    AspectExt_oneapi_private_alloca, AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem, AspectExt_oneapi_cuda_cluster_group],
     []>;
 // This definition serves the only purpose of testing whether the deprecated aspect list defined in here and in SYCL RT
 // match.
@@ -205,9 +206,9 @@ def : CudaTargetInfo<"nvidia_gpu_sm_87", !listconcat(CudaMinAspects, CudaBindles
 def : CudaTargetInfo<"nvidia_gpu_sm_89", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
     [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
 def : CudaTargetInfo<"nvidia_gpu_sm_90", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
-    [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
+    [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_cuda_cluster_group])>;
 def : CudaTargetInfo<"nvidia_gpu_sm_90a", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
-    [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier])>;
+    [AspectFp16, AspectAtomic64, AspectExt_oneapi_cuda_async_barrier, AspectExt_oneapi_cuda_cluster_group])>;
 
 //
 // HIP / AMDGPU device aspects
diff --git a/sycl/doc/design/DeviceConfigFile.md b/sycl/doc/design/DeviceConfigFile.md
index 505c65226a809..35273bbddcc84 100644
--- a/sycl/doc/design/DeviceConfigFile.md
+++ b/sycl/doc/design/DeviceConfigFile.md
@@ -180,6 +180,7 @@ def AspectExt_intel_free_memory : Aspect<"ext_intel_free_memory">;
 def AspectExt_intel_device_id : Aspect<"ext_intel_device_id">;
 def AspectExt_intel_memory_clock_rate : Aspect<"ext_intel_memory_clock_rate">;
 def AspectExt_intel_memory_bus_width : Aspect<"ext_intel_memory_bus_width">;
+def AspectExt_oneapi_cuda_cluster_group : Aspect<"ext_oneapi_cuda_cluster_group">;
 def AspectEmulated : Aspect<"emulated">;
     
 def TargetTable : DynamicTable { 
diff --git a/sycl/include/sycl/detail/cg.hpp b/sycl/include/sycl/detail/cg.hpp
index c1fdc6857b4a0..b6955af412b3a 100644
--- a/sycl/include/sycl/detail/cg.hpp
+++ b/sycl/include/sycl/detail/cg.hpp
@@ -178,6 +178,7 @@ class CGExecKernel : public CG {
   std::vector<std::shared_ptr<const void>> MAuxiliaryResources;
   sycl::detail::pi::PiKernelCacheConfig MKernelCacheConfig;
   bool MKernelIsCooperative = false;
+  bool MKernelUsesClusterLaunch = false;
 
   CGExecKernel(NDRDescT NDRDesc, std::shared_ptr<HostKernelBase> HKernel,
                std::shared_ptr<detail::kernel_impl> SyclKernel,
@@ -188,7 +189,8 @@ class CGExecKernel : public CG {
                std::vector<std::shared_ptr<const void>> AuxiliaryResources,
                CGTYPE Type,
                sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig,
-               bool KernelIsCooperative, detail::code_location loc = {})
+               bool KernelIsCooperative, bool MKernelUsesClusterLaunch,
+               detail::code_location loc = {})
       : CG(Type, std::move(CGData), std::move(loc)),
         MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)),
         MSyclKernel(std::move(SyclKernel)),
@@ -196,7 +198,8 @@ class CGExecKernel : public CG {
         MKernelName(std::move(KernelName)), MStreams(std::move(Streams)),
         MAuxiliaryResources(std::move(AuxiliaryResources)),
         MKernelCacheConfig(std::move(KernelCacheConfig)),
-        MKernelIsCooperative(KernelIsCooperative) {
+        MKernelIsCooperative(KernelIsCooperative),
+        MKernelUsesClusterLaunch(MKernelUsesClusterLaunch) {
     assert(getType() == Kernel && "Wrong type of exec kernel CG.");
   }
 
diff --git a/sycl/include/sycl/detail/cg_types.hpp b/sycl/include/sycl/detail/cg_types.hpp
index 9da1f0b664d46..162510440b661 100644
--- a/sycl/include/sycl/detail/cg_types.hpp
+++ b/sycl/include/sycl/detail/cg_types.hpp
@@ -50,8 +50,8 @@ class ArgDesc {
   int MIndex;
 };
 
-// The structure represents NDRange - global, local sizes, global offset and
-// number of dimensions.
+// The structure represents NDRange - global, local sizes, global offset,
+// number of dimensions, and the cluster dimensions if applicable.
 class NDRDescT {
   // The method initializes all sizes for dimensions greater than the passed one
   // to the default values, so they will not affect execution.
@@ -128,6 +128,17 @@ class NDRDescT {
     Dims = Dims_;
   }
 
+  template <int Dims_> void setClusterDimensions(sycl::range<Dims_> N) {
+    if (Dims_ != Dims) {
+      throw std::runtime_error(
+          "Dimensionality of cluster, global and local ranges must be same");
+    }
+
+    for (int I = 0; I < Dims_; ++I) {
+      ClusterDimensions[I] = N[I];
+    }
+  }
+
   sycl::range<3> GlobalSize;
   sycl::range<3> LocalSize;
   sycl::id<3> GlobalOffset;
@@ -135,6 +146,7 @@ class NDRDescT {
   /// simplest form of parallel_for_work_group. If set, all other fields must be
   /// zero
   sycl::range<3> NumWorkGroups;
+  sycl::range<3> ClusterDimensions{1, 1, 1};
   size_t Dims;
 };
 
diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def
index 3a41c4881febf..d1d76ddd8ed6f 100644
--- a/sycl/include/sycl/detail/pi.def
+++ b/sycl/include/sycl/detail/pi.def
@@ -230,4 +230,7 @@ _PI_API(piextVirtualMemGetInfo)
 // Enqueue native command
 _PI_API(piextEnqueueNativeCommand)
 
+// Kernel Launch Properties
+_PI_API(piextEnqueueKernelLaunchCustom)
+
 #undef _PI_API
diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h
index 66f0a57a3060b..8ead3c6f355e8 100644
--- a/sycl/include/sycl/detail/pi.h
+++ b/sycl/include/sycl/detail/pi.h
@@ -197,9 +197,11 @@
 // pi_virtual_access_flags bit flags.
 // 15.55 Added piextEnqueueNativeCommand as well as associated types and enums
 // 16.56 Replaced piextUSMEnqueueMemset with piextUSMEnqueueFill
+// 16.57 Added mappings to UR launch properties extension
+// (piextEnqueueKernelLaunchCustom)
 
 #define _PI_H_VERSION_MAJOR 16
-#define _PI_H_VERSION_MINOR 56
+#define _PI_H_VERSION_MINOR 57
 
 #define _PI_STRING_HELPER(a) #a
 #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -514,8 +516,12 @@ typedef enum {
 
   // Virtual memory support
   PI_EXT_ONEAPI_DEVICE_INFO_SUPPORTS_VIRTUAL_MEM = 0x2011E,
+
   // Native enqueue
   PI_EXT_ONEAPI_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT = 0x2011F,
+
+  // Return whether cluster launch is supported by device
+  PI_EXT_ONEAPI_DEVICE_INFO_CLUSTER_LAUNCH = 0x2021,
 } _pi_device_info;
 
 typedef enum {
@@ -1317,8 +1323,28 @@ typedef enum {
           ///< P2P link, otherwise such operations are not supported.
 } _pi_peer_attr;
 
+typedef enum {
+  PI_LAUNCH_PROPERTY_IGNORE = 0x0,
+  PI_LAUNCH_PROPERTY_COOPERATIVE = 0x1,
+  PI_LAUNCH_PROPERTY_CLUSTER_DIMENSION = 0x2,
+} _pi_launch_property_id;
+
+typedef union {
+  int cooperative;
+  int32_t cluster_dims[3];
+} _pi_launch_property_value;
+
 using pi_mem_info = _pi_mem_info;
 using pi_peer_attr = _pi_peer_attr;
+using pi_launch_property_id = _pi_launch_property_id;
+using pi_launch_property_value = _pi_launch_property_value;
+
+typedef struct {
+  pi_launch_property_id id;
+  pi_launch_property_value value;
+} _pi_launch_property;
+
+using pi_launch_property = _pi_launch_property;
 
 //
 // Following section contains SYCL RT Plugin Interface (PI) functions.
@@ -1933,6 +1959,14 @@ __SYCL_EXPORT pi_result piextEnqueueCooperativeKernelLaunch(
     const size_t *local_work_size, pi_uint32 num_events_in_wait_list,
     const pi_event *event_wait_list, pi_event *event);
 
+__SYCL_EXPORT pi_result piextEnqueueKernelLaunchCustom(
+    pi_queue queue, pi_kernel kernel, pi_uint32 work_dim,
+    const size_t *global_work_size, const size_t *local_work_size,
+    pi_uint32 num_props_in_launch_prop_list,
+    const pi_launch_property *launch_prop_list,
+    pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
+    pi_event *event);
+
 __SYCL_EXPORT pi_result piEnqueueEventsWait(pi_queue command_queue,
                                             pi_uint32 num_events_in_wait_list,
                                             const pi_event *event_wait_list,
diff --git a/sycl/include/sycl/detail/pi.hpp b/sycl/include/sycl/detail/pi.hpp
index 1fe21d36a8aaa..83fc2996f42e8 100644
--- a/sycl/include/sycl/detail/pi.hpp
+++ b/sycl/include/sycl/detail/pi.hpp
@@ -148,6 +148,7 @@ using PiImageOffset = ::pi_image_offset_struct;
 using PiImageRegion = ::pi_image_region_struct;
 using PiPhysicalMem = ::pi_physical_mem;
 using PiVirtualAccessFlags = ::pi_virtual_access_flags;
+using PiLaunchProperty = ::pi_launch_property;
 
 __SYCL_EXPORT void contextSetExtendedDeleter(const sycl::context &constext,
                                              pi_context_extended_deleter func,
diff --git a/sycl/include/sycl/device_aspect_macros.hpp b/sycl/include/sycl/device_aspect_macros.hpp
index 10f61c1e48435..55ed162863079 100644
--- a/sycl/include/sycl/device_aspect_macros.hpp
+++ b/sycl/include/sycl/device_aspect_macros.hpp
@@ -381,6 +381,11 @@
 #define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_virtual_mem__ 0
 #endif
 
+#ifndef __SYCL_ALL_DEVICES_HAVE_ext_oneapi_cuda_cluster_group__
+// __SYCL_ASPECT(ext_oneapi_cuda_cluster_group, 75)
+#define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_cuda_cluster_group__ 0
+#endif
+
 #ifndef __SYCL_ANY_DEVICE_HAS_host__
 // __SYCL_ASPECT(host, 0)
 #define __SYCL_ANY_DEVICE_HAS_host__ 0
@@ -750,3 +755,8 @@
 // __SYCL_ASPECT(ext_oneapi_virtual_mem, 74)
 #define __SYCL_ANY_DEVICE_HAS_ext_oneapi_virtual_mem__ 0
 #endif
+
+#ifndef __SYCL_ANY_DEVICE_HAS_ext_oneapi_cuda_cluster_group__
+// __SYCL_ASPECT(ext_oneapi_cuda_cluster_group, 75)
+#define __SYCL_ANY_DEVICE_HAS_ext_oneapi_cuda_cluster_group__ 0
+#endif
diff --git a/sycl/include/sycl/ext/oneapi/experimental/cluster_group_prop.hpp b/sycl/include/sycl/ext/oneapi/experimental/cluster_group_prop.hpp
new file mode 100644
index 0000000000000..31487beffd810
--- /dev/null
+++ b/sycl/include/sycl/ext/oneapi/experimental/cluster_group_prop.hpp
@@ -0,0 +1,94 @@
+//==--- cluster_group_prop.hpp --- SYCL extension for cuda cluster group ---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <sycl/ext/oneapi/properties/properties.hpp>
+#include <sycl/range.hpp>
+
+namespace sycl {
+inline namespace _V1 {
+namespace ext::oneapi::experimental {
+
+namespace cuda {
+template <int Dim>
+struct cluster_size
+    : ::sycl::ext::oneapi::experimental::detail::run_time_property_key<
+          ::sycl::ext::oneapi::experimental::detail::ClusterLaunch> {
+  cluster_size(const range<Dim> &size) : size(size) {}
+  sycl::range<Dim> get_cluster_size() { return size; }
+
+private:
+  range<Dim> size;
+};
+
+template <int Dim> using cluster_size_key = cluster_size<Dim>;
+
+} // namespace cuda
+
+template <>
+struct is_property_key<cuda::cluster_size_key<1>> : std::true_type {};
+template <>
+struct is_property_key<cuda::cluster_size_key<2>> : std::true_type {};
+template <>
+struct is_property_key<cuda::cluster_size_key<3>> : std::true_type {};
+
+template <typename T>
+struct is_property_key_of<cuda::cluster_size_key<1>, T> : std::true_type {};
+
+template <typename T>
+struct is_property_key_of<cuda::cluster_size_key<2>, T> : std::true_type {};
+
+template <typename T>
+struct is_property_key_of<cuda::cluster_size_key<3>, T> : std::true_type {};
+
+template <>
+struct is_property_value<cuda::cluster_size_key<1>>
+    : is_property_key<cuda::cluster_size_key<1>> {};
+template <>
+struct is_property_value<cuda::cluster_size_key<2>>
+    : is_property_key<cuda::cluster_size_key<2>> {};
+template <>
+struct is_property_value<cuda::cluster_size_key<3>>
+    : is_property_key<cuda::cluster_size_key<3>> {};
+
+template <typename O>
+struct is_property_value_of<cuda::cluster_size_key<1>, O>
+    : is_property_key_of<cuda::cluster_size_key<1>, O> {};
+
+template <typename O>
+struct is_property_value_of<cuda::cluster_size_key<2>, O>
+    : is_property_key_of<cuda::cluster_size_key<2>, O> {};
+
+template <typename O>
+struct is_property_value_of<cuda::cluster_size_key<3>, O>
+    : is_property_key_of<cuda::cluster_size_key<3>, O> {};
+
+namespace detail {
+template <typename PropertiesT> constexpr std::size_t getClusterDim() {
+  if constexpr (PropertiesT::template has_property<
+                    sycl::ext::oneapi::experimental::cuda::cluster_size_key<
+                        1>>()) {
+    return 1;
+  }
+  if constexpr (PropertiesT::template has_property<
+                    sycl::ext::oneapi::experimental::cuda::cluster_size_key<
+                        2>>()) {
+    return 2;
+  }
+  if constexpr (PropertiesT::template has_property<
+                    sycl::ext::oneapi::experimental::cuda::cluster_size_key<
+                        3>>()) {
+    return 3;
+  }
+  return 0;
+}
+} // namespace detail
+} // namespace ext::oneapi::experimental
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp
index 7f10dd7f79e85..7d463d7c66eca 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp
@@ -149,9 +149,9 @@ void parallel_for(handler &CGH,
   ext::oneapi::experimental::detail::LaunchConfigAccess<range<Dimensions>,
                                                         Properties>
       ConfigAccess(Config);
-  CGH.parallel_for<KernelName>(ConfigAccess.getRange(),
-                               std::forward<ReductionsT>(Reductions)...,
-                               KernelObj);
+  CGH.parallel_for<KernelName>(
+      ConfigAccess.getRange(), ConfigAccess.getProperties(),
+      std::forward<ReductionsT>(Reductions)..., KernelObj);
 }
 
 template <typename KernelName = sycl::detail::auto_name, int Dimensions,
@@ -225,9 +225,9 @@ void nd_launch(handler &CGH,
   ext::oneapi::experimental::detail::LaunchConfigAccess<nd_range<Dimensions>,
                                                         Properties>
       ConfigAccess(Config);
-  CGH.parallel_for<KernelName>(ConfigAccess.getRange(),
-                               std::forward<ReductionsT>(Reductions)...,
-                               KernelObj);
+  CGH.parallel_for<KernelName>(
+      ConfigAccess.getRange(), ConfigAccess.getProperties(),
+      std::forward<ReductionsT>(Reductions)..., KernelObj);
 }
 
 template <typename KernelName = sycl::detail::auto_name, int Dimensions,
diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
index 1b13620d45c3f..62a772eb362d5 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp
@@ -53,7 +53,8 @@ enum class UnsupportedGraphFeatures {
   sycl_ext_oneapi_enqueue_barrier = 4,
   sycl_ext_oneapi_memcpy2d = 5,
   sycl_ext_oneapi_device_global = 6,
-  sycl_ext_oneapi_bindless_images = 7
+  sycl_ext_oneapi_bindless_images = 7,
+  sycl_ext_oneapi_experimental_cuda_cluster_launch = 8
 };
 
 inline const char *
@@ -76,6 +77,8 @@ UnsupportedFeatureToString(UnsupportedGraphFeatures Feature) {
     return "sycl_ext_oneapi_device_global";
   case UGF::sycl_ext_oneapi_bindless_images:
     return "sycl_ext_oneapi_bindless_images";
+  case UGF::sycl_ext_oneapi_experimental_cuda_cluster_launch:
+    return "sycl_ext_oneapi_experimental_cuda_cluster_launch";
   }
 
   assert(false && "Unhandled graphs feature");
diff --git a/sycl/include/sycl/ext/oneapi/properties/property.hpp b/sycl/include/sycl/ext/oneapi/properties/property.hpp
index 50b385f3f0cab..e225928c4cd68 100644
--- a/sycl/include/sycl/ext/oneapi/properties/property.hpp
+++ b/sycl/include/sycl/ext/oneapi/properties/property.hpp
@@ -211,8 +211,9 @@ enum PropKind : uint32_t {
   OutputDataPlacement = 70,
   IncludeFiles = 71,
   RegisteredKernelNames = 72,
+  ClusterLaunch = 73,
   // PropKindSize must always be the last value.
-  PropKindSize = 73,
+  PropKindSize = 74,
 };
 
 struct property_key_base_tag {};
diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
index a26da04193e3b..1a19f26883c7e 100644
--- a/sycl/include/sycl/handler.hpp
+++ b/sycl/include/sycl/handler.hpp
@@ -32,6 +32,7 @@
 #include <sycl/ext/oneapi/bindless_images_mem_handle.hpp>
 #include <sycl/ext/oneapi/device_global/device_global.hpp>
 #include <sycl/ext/oneapi/device_global/properties.hpp>
+#include <sycl/ext/oneapi/experimental/cluster_group_prop.hpp>
 #include <sycl/ext/oneapi/experimental/graph.hpp>
 #include <sycl/ext/oneapi/experimental/raw_kernel_arg.hpp>
 #include <sycl/ext/oneapi/experimental/use_root_sync_prop.hpp>
@@ -958,6 +959,21 @@ class __SYCL_EXPORT handler {
       sycl::ext::oneapi::experimental::execution_scope threadScope,
       sycl::ext::oneapi::experimental::execution_scope coordinationScope);
 
+  template <typename Properties>
+  void checkAndSetClusterRange(const Properties &Props) {
+    namespace syclex = sycl::ext::oneapi::experimental;
+    constexpr std::size_t cluster_dim =
+        syclex::detail::getClusterDim<Properties>();
+    if constexpr (cluster_dim > 0) {
+      setKernelUsesClusterLaunch();
+      MNDRDesc.setClusterDimensions(
+          Props
+              .template get_property<
+                  syclex::cuda::cluster_size_key<cluster_dim>>()
+              .get_cluster_size());
+    }
+  }
+
   /// Process kernel properties.
   ///
   /// Stores information about kernel properties into the handler.
@@ -1026,6 +1042,8 @@ class __SYCL_EXPORT handler {
           sycl::ext::oneapi::experimental::execution_scope::work_item,
           prop.coordinationScope);
     }
+
+    checkAndSetClusterRange(Props);
   }
 
   /// Checks whether it is possible to copy the source shape to the destination
@@ -1409,9 +1427,9 @@ class __SYCL_EXPORT handler {
     kernel_parallel_for_wrapper<NameT, TransformedArgType, KernelType,
                                 PropertiesT>(KernelFunc);
 #ifndef __SYCL_DEVICE_ONLY__
-    processProperties<NameT, PropertiesT>(Props);
     detail::checkValueRange<Dims>(ExecutionRange);
     MNDRDesc.set(std::move(ExecutionRange));
+    processProperties<NameT, PropertiesT>(Props);
     StoreLambda<NameT, KernelType, Dims, TransformedArgType>(
         std::move(KernelFunc));
     setType(detail::CG::Kernel);
@@ -3624,6 +3642,9 @@ class __SYCL_EXPORT handler {
   // Set value of the kernel is cooperative flag
   void setKernelIsCooperative(bool);
 
+  // Set using cuda thread block cluster launch flag true
+  void setKernelUsesClusterLaunch();
+
   template <
       ext::oneapi::experimental::detail::UnsupportedGraphFeatures FeatureT>
   void throwIfGraphAssociated() const {
diff --git a/sycl/include/sycl/info/aspects.def b/sycl/include/sycl/info/aspects.def
index 2151776ddb8e7..f9a14e40dcd3e 100644
--- a/sycl/include/sycl/info/aspects.def
+++ b/sycl/include/sycl/info/aspects.def
@@ -68,3 +68,4 @@ __SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_3d_usm, 71)
 __SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_3d, 72)
 __SYCL_ASPECT(ext_oneapi_queue_profiling_tag, 73)
 __SYCL_ASPECT(ext_oneapi_virtual_mem, 74)
+__SYCL_ASPECT(ext_oneapi_cuda_cluster_group, 75)
diff --git a/sycl/include/sycl/info/device_traits.def b/sycl/include/sycl/info/device_traits.def
index 730ef88e335d4..14126c3ee5e2c 100644
--- a/sycl/include/sycl/info/device_traits.def
+++ b/sycl/include/sycl/info/device_traits.def
@@ -231,6 +231,8 @@ __SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_max_work_groups_3d, id<3>,
                          PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D)
 __SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_max_global_work_groups, size_t,
                          PI_EXT_ONEAPI_DEVICE_INFO_MAX_GLOBAL_WORK_GROUPS)
+__SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_cuda_cluster_group, bool,
+                         PI_EXT_ONEAPI_DEVICE_INFO_CLUSTER_LAUNCH)
 
 #ifdef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF
 #undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC
diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp
index b4b121a2815c4..1bdb816efcb39 100644
--- a/sycl/include/sycl/sycl.hpp
+++ b/sycl/include/sycl/sycl.hpp
@@ -88,6 +88,7 @@
 #include <sycl/ext/oneapi/experimental/ballot_group.hpp>
 #include <sycl/ext/oneapi/experimental/bfloat16_math.hpp>
 #include <sycl/ext/oneapi/experimental/builtins.hpp>
+#include <sycl/ext/oneapi/experimental/cluster_group_prop.hpp>
 #include <sycl/ext/oneapi/experimental/composite_device.hpp>
 #include <sycl/ext/oneapi/experimental/cuda/barrier.hpp>
 #include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index e58d59547c310..92d1eb4e4c825 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -564,6 +564,19 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
       NumEventsInWaitList, EventWaitList, OutEvent);
 }
 
+pi_result piextEnqueueKernelLaunchCustom(
+    pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+    const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
+    pi_uint32 NumPropsInLaunchPropList,
+    const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventsWaitList, pi_event *OutEvent) {
+
+  return pi2ur::piextEnqueueKernelLaunchCustom(
+      Queue, Kernel, WorkDim, GlobalWorkSize, LocalWorkSize,
+      NumPropsInLaunchPropList, LaunchPropList, NumEventsInWaitList,
+      EventsWaitList, OutEvent);
+}
+
 pi_result piextEnqueueCooperativeKernelLaunch(
     pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
     const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp
index 97c1e72fde422..671df35c30ac4 100644
--- a/sycl/plugins/hip/pi_hip.cpp
+++ b/sycl/plugins/hip/pi_hip.cpp
@@ -577,6 +577,26 @@ pi_result piextEnqueueCooperativeKernelLaunch(
       NumEventsInWaitList, EventWaitList, OutEvent);
 }
 
+pi_result piextEnqueueKernelLaunchCustom(
+    pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+    const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
+    pi_uint32 NumPropsInLaunchPropList,
+    const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventsWaitList, pi_event *OutEvent) {
+  (void)Queue;
+  (void)Kernel;
+  (void)WorkDim;
+  (void)GlobalWorkSize;
+  (void)LocalWorkSize;
+  (void)NumPropsInLaunchPropList;
+  (void)LaunchPropList;
+  (void)NumEventsInWaitList;
+  (void)EventsWaitList;
+  (void)OutEvent;
+  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
+  return PI_ERROR_UNSUPPORTED_FEATURE;
+}
+
 pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
                                             pi_context Context,
                                             pi_program Program,
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 93fab8d0fabb8..daff872067f86 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -578,6 +578,26 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
       NumEventsInWaitList, EventWaitList, OutEvent);
 }
 
+pi_result piextEnqueueKernelLaunchCustom(
+    pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+    const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
+    pi_uint32 NumPropsInLaunchPropList,
+    const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventsWaitList, pi_event *OutEvent) {
+  (void)Queue;
+  (void)Kernel;
+  (void)WorkDim;
+  (void)GlobalWorkSize;
+  (void)LocalWorkSize;
+  (void)NumPropsInLaunchPropList;
+  (void)LaunchPropList;
+  (void)NumEventsInWaitList;
+  (void)EventsWaitList;
+  (void)OutEvent;
+  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
+  return PI_ERROR_UNSUPPORTED_FEATURE;
+}
+
 pi_result piextEnqueueCooperativeKernelLaunch(
     pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
     const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
diff --git a/sycl/plugins/native_cpu/pi_native_cpu.cpp b/sycl/plugins/native_cpu/pi_native_cpu.cpp
index 8b577c4c997e3..4e61f0a9da031 100644
--- a/sycl/plugins/native_cpu/pi_native_cpu.cpp
+++ b/sycl/plugins/native_cpu/pi_native_cpu.cpp
@@ -573,6 +573,26 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
       NumEventsInWaitList, EventWaitList, OutEvent);
 }
 
+pi_result piextEnqueueKernelLaunchCustom(
+    pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+    const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
+    pi_uint32 NumPropsInLaunchPropList,
+    const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventsWaitList, pi_event *OutEvent) {
+  (void)Queue;
+  (void)Kernel;
+  (void)WorkDim;
+  (void)GlobalWorkSize;
+  (void)LocalWorkSize;
+  (void)NumPropsInLaunchPropList;
+  (void)LaunchPropList;
+  (void)NumEventsInWaitList;
+  (void)EventsWaitList;
+  (void)OutEvent;
+  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
+  return PI_ERROR_UNSUPPORTED_FEATURE;
+}
+
 pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
                                             pi_context Context,
                                             pi_program Program,
diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp
index 890d0fdecf79b..e3345d4cb5237 100644
--- a/sycl/plugins/opencl/pi_opencl.cpp
+++ b/sycl/plugins/opencl/pi_opencl.cpp
@@ -554,6 +554,26 @@ pi_result piextEnqueueCooperativeKernelLaunch(
       NumEventsInWaitList, EventWaitList, OutEvent);
 }
 
+pi_result piextEnqueueKernelLaunchCustom(
+    pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+    const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
+    pi_uint32 NumPropsInLaunchPropList,
+    const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventsWaitList, pi_event *OutEvent) {
+  (void)Queue;
+  (void)Kernel;
+  (void)WorkDim;
+  (void)GlobalWorkSize;
+  (void)LocalWorkSize;
+  (void)NumPropsInLaunchPropList;
+  (void)LaunchPropList;
+  (void)NumEventsInWaitList;
+  (void)EventsWaitList;
+  (void)OutEvent;
+  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
+  return PI_ERROR_UNSUPPORTED_FEATURE;
+}
+
 pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle,
                                             pi_context Context,
                                             pi_program Program,
diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp
index 7dd2e0bbbac8c..b576e38ee47d4 100644
--- a/sycl/plugins/unified_runtime/pi2ur.hpp
+++ b/sycl/plugins/unified_runtime/pi2ur.hpp
@@ -1341,6 +1341,8 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
         UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP)
     PI_TO_UR_MAP_DEVICE_INFO(PI_EXT_ONEAPI_DEVICE_INFO_SUPPORTS_VIRTUAL_MEM,
                              UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT)
+    PI_TO_UR_MAP_DEVICE_INFO(PI_EXT_ONEAPI_DEVICE_INFO_CLUSTER_LAUNCH,
+                             UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP)
 #undef PI_TO_UR_MAP_DEVICE_INFO
   default:
     return PI_ERROR_UNKNOWN;
@@ -3780,6 +3782,57 @@ inline pi_result piextEnqueueCooperativeKernelLaunch(
   return PI_SUCCESS;
 }
 
+inline pi_result piextEnqueueKernelLaunchCustom(
+    pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+    const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
+    pi_uint32 NumPropsInLaunchPropList,
+    const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventsWaitList, pi_event *OutEvent) {
+  PI_ASSERT(Kernel, PI_ERROR_INVALID_KERNEL);
+  PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
+  PI_ASSERT((WorkDim > 0) && (WorkDim < 4), PI_ERROR_INVALID_WORK_DIMENSION);
+
+  ur_queue_handle_t UrQueue = reinterpret_cast<ur_queue_handle_t>(Queue);
+  ur_kernel_handle_t UrKernel = reinterpret_cast<ur_kernel_handle_t>(Kernel);
+  const ur_event_handle_t *UrEventsWaitList =
+      reinterpret_cast<const ur_event_handle_t *>(EventsWaitList);
+
+  ur_event_handle_t *UREvent = reinterpret_cast<ur_event_handle_t *>(OutEvent);
+
+  std::vector<ur_exp_launch_property_t> props(NumPropsInLaunchPropList);
+  for (pi_uint32 i = 0; i < NumPropsInLaunchPropList; i++) {
+    switch (LaunchPropList[i].id) {
+    case PI_LAUNCH_PROPERTY_IGNORE: {
+      props[i].id = UR_EXP_LAUNCH_PROPERTY_ID_IGNORE;
+      break;
+    }
+    case PI_LAUNCH_PROPERTY_CLUSTER_DIMENSION: {
+
+      props[i].id = UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION;
+      props[i].value.clusterDim[0] = LaunchPropList[i].value.cluster_dims[0];
+      props[i].value.clusterDim[1] = LaunchPropList[i].value.cluster_dims[1];
+      props[i].value.clusterDim[2] = LaunchPropList[i].value.cluster_dims[2];
+      break;
+    }
+    case PI_LAUNCH_PROPERTY_COOPERATIVE: {
+      props[i].id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
+      props[i].value.cooperative = LaunchPropList[i].value.cooperative;
+      break;
+    }
+    default: {
+      return PI_ERROR_INVALID_VALUE;
+    }
+    }
+  }
+
+  HANDLE_ERRORS(urEnqueueKernelLaunchCustomExp(
+      UrQueue, UrKernel, WorkDim, GlobalWorkSize, LocalWorkSize,
+      NumPropsInLaunchPropList, &props[0], NumEventsInWaitList,
+      UrEventsWaitList, UREvent));
+
+  return PI_SUCCESS;
+}
+
 inline pi_result
 piEnqueueMemImageWrite(pi_queue Queue, pi_mem Image, pi_bool BlockingWrite,
                        pi_image_offset Origin, pi_image_region Region,
diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp
index 717fd6c895331..b75a5abd60327 100644
--- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp
+++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp
@@ -273,6 +273,19 @@ __SYCL_EXPORT pi_result piEnqueueKernelLaunch(
       NumEventsInWaitList, EventWaitList, OutEvent);
 }
 
+__SYCL_EXPORT pi_result piextEnqueueKernelLaunchCustom(
+    pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+    const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
+    pi_uint32 NumPropsInLaunchPropList,
+    const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventsWaitList, pi_event *OutEvent) {
+
+  return pi2ur::piextEnqueueKernelLaunchCustom(
+      Queue, Kernel, WorkDim, GlobalWorkSize, LocalWorkSize,
+      NumPropsInLaunchPropList, LaunchPropList, NumEventsInWaitList,
+      EventsWaitList, OutEvent);
+}
+
 __SYCL_EXPORT pi_result piEnqueueMemImageWrite(
     pi_queue Queue, pi_mem Image, pi_bool BlockingWrite, pi_image_offset Origin,
     pi_image_region Region, size_t InputRowPitch, size_t InputSlicePitch,
@@ -1632,6 +1645,9 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) {
   _PI_API(piextDisablePeerAccess)
   _PI_API(piextPeerAccessGetInfo)
 
+  // Launch Properties
+  _PI_API(piextEnqueueKernelLaunchCustom)
+
   _PI_API(piextPluginGetOpaqueData)
   _PI_API(piTearDown)
 
diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp
index e22d87be839ae..c268987bf57f6 100644
--- a/sycl/source/detail/device_impl.cpp
+++ b/sycl/source/detail/device_impl.cpp
@@ -366,6 +366,8 @@ bool device_impl::has(aspect Aspect) const {
     return get_info<info::device::usm_host_allocations>();
   case aspect::ext_intel_mem_channel:
     return get_info<info::device::ext_intel_mem_channel>();
+  case aspect::ext_oneapi_cuda_cluster_group:
+    return get_info<info::device::ext_oneapi_cuda_cluster_group>();
   case aspect::usm_atomic_host_allocations:
     return (get_device_info_impl<pi_usm_capabilities,
                                  info::device::usm_host_allocations>::
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
index 98d9d171f5737..5be5a76e68e69 100644
--- a/sycl/source/detail/device_info.hpp
+++ b/sycl/source/detail/device_info.hpp
@@ -301,6 +301,25 @@ struct get_device_info_impl<std::vector<memory_scope>,
   }
 };
 
+// Specialization for cuda cluster group
+template <>
+struct get_device_info_impl<bool, info::device::ext_oneapi_cuda_cluster_group> {
+  static bool get(const DeviceImplPtr &Dev) {
+    bool result = false;
+    if (Dev->getBackend() == backend::ext_oneapi_cuda) {
+      sycl::detail::pi::PiResult Err =
+          Dev->getPlugin()->call_nocheck<PiApiKind::piDeviceGetInfo>(
+              Dev->getHandleRef(),
+              PiInfoCode<info::device::ext_oneapi_cuda_cluster_group>::value,
+              sizeof(result), &result, nullptr);
+      if (Err != PI_SUCCESS) {
+        return false;
+      }
+    }
+    return result;
+  }
+};
+
 // Specialization for exec_capabilities, OpenCL returns a bitfield
 template <>
 struct get_device_info_impl<std::vector<info::execution_capability>,
diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp
index de4a6fd3bd22e..51d9d65de79f7 100644
--- a/sycl/source/detail/graph_impl.cpp
+++ b/sycl/source/detail/graph_impl.cpp
@@ -973,7 +973,8 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
               // TODO: Pass accessor mem allocations
               nullptr,
               // TODO: Extract from handler
-              PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT, CG->MKernelIsCooperative);
+              PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT, CG->MKernelIsCooperative,
+              CG->MKernelUsesClusterLaunch);
           if (Res != pi_result::PI_SUCCESS) {
             throw sycl::exception(
                 sycl::make_error_code(sycl::errc::kernel),
diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp
index f0df55d5e069b..daea9816deac5 100644
--- a/sycl/source/detail/handler_impl.hpp
+++ b/sycl/source/detail/handler_impl.hpp
@@ -116,6 +116,7 @@ class handler_impl {
       PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT;
 
   bool MKernelIsCooperative = false;
+  bool MKernelUsesClusterLaunch = false;
 
   // Extra information for bindless image copy
   sycl::detail::pi::PiMemImageDesc MImageDesc;
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index 952482e42d79f..51631d0d3e4f6 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -970,7 +970,8 @@ jit_compiler::fuseKernels(QueueImplPtr Queue,
   FusedCG.reset(new detail::CGExecKernel(
       NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr),
       std::move(CGData), std::move(FusedArgs), FusedOrCachedKernelName, {}, {},
-      CG::CGTYPE::Kernel, KernelCacheConfig, false /* KernelIsCooperative */));
+      CG::CGTYPE::Kernel, KernelCacheConfig, false /* KernelIsCooperative */,
+      false /* KernelUsesClusterLaunch*/));
   return FusedCG;
 }
 
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 583a3c5f0c386..563915598c128 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -303,7 +303,8 @@ bool Command::isFusable() const {
   }
   const auto &CG = (static_cast<const ExecCGCommand &>(*this)).getCG();
   return (CG.getType() == CG::CGTYPE::Kernel) &&
-         (!static_cast<const CGExecKernel &>(CG).MKernelIsCooperative);
+         (!static_cast<const CGExecKernel &>(CG).MKernelIsCooperative) &&
+         (!static_cast<const CGExecKernel &>(CG).MKernelUsesClusterLaunch);
 }
 
 static void flushCrossQueueDeps(const std::vector<EventImplPtr> &EventImpls,
@@ -2370,7 +2371,7 @@ static pi_result SetKernelParamsAndLaunch(
     const detail::EventImplPtr &OutEventImpl,
     const KernelArgMask *EliminatedArgMask,
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
-    bool IsCooperative) {
+    bool IsCooperative, bool KernelUsesClusterLaunch) {
   assert(Queue && "Kernel submissions should have an associated queue");
   const PluginPtr &Plugin = Queue->getPlugin();
 
@@ -2408,6 +2409,35 @@ static pi_result SetKernelParamsAndLaunch(
   }
   if (OutEventImpl != nullptr)
     OutEventImpl->setHostEnqueueTime();
+  if (KernelUsesClusterLaunch) {
+    std::vector<pi_launch_property> property_list;
+
+    pi_launch_property_value launch_property_value_cluster_range;
+    launch_property_value_cluster_range.cluster_dims[0] =
+        NDRDesc.ClusterDimensions[0];
+    launch_property_value_cluster_range.cluster_dims[1] =
+        NDRDesc.ClusterDimensions[1];
+    launch_property_value_cluster_range.cluster_dims[2] =
+        NDRDesc.ClusterDimensions[2];
+
+    property_list.push_back(
+        {pi_launch_property_id::PI_LAUNCH_PROPERTY_CLUSTER_DIMENSION,
+         launch_property_value_cluster_range});
+
+    if (IsCooperative) {
+      pi_launch_property_value launch_property_value_cooperative;
+      launch_property_value_cooperative.cooperative = 1;
+      property_list.push_back(
+          {pi_launch_property_id::PI_LAUNCH_PROPERTY_COOPERATIVE,
+           launch_property_value_cooperative});
+    }
+
+    return Plugin->call_nocheck<PiApiKind::piextEnqueueKernelLaunchCustom>(
+        Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalSize[0],
+        LocalSize, property_list.size(), property_list.data(), RawEvents.size(),
+        RawEvents.empty() ? nullptr : &RawEvents[0],
+        OutEventImpl ? &OutEventImpl->getHandleRef() : nullptr);
+  }
   pi_result Error =
       [&](auto... Args) {
         if (IsCooperative) {
@@ -2561,7 +2591,7 @@ pi_int32 enqueueImpKernel(
     const detail::EventImplPtr &OutEventImpl,
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
     sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig,
-    const bool KernelIsCooperative) {
+    const bool KernelIsCooperative, const bool KernelUsesClusterLaunch) {
   assert(Queue && "Kernel submissions should have an associated queue");
   // Run OpenCL kernel
   auto ContextImpl = Queue->getContextImplPtr();
@@ -2650,10 +2680,10 @@ pi_int32 enqueueImpKernel(
           sizeof(sycl::detail::pi::PiKernelCacheConfig), &KernelCacheConfig);
     }
 
-    Error = SetKernelParamsAndLaunch(Queue, Args, DeviceImageImpl, Kernel,
-                                     NDRDesc, EventsWaitList, OutEventImpl,
-                                     EliminatedArgMask, getMemAllocationFunc,
-                                     KernelIsCooperative);
+    Error = SetKernelParamsAndLaunch(
+        Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList,
+        OutEventImpl, EliminatedArgMask, getMemAllocationFunc,
+        KernelIsCooperative, KernelUsesClusterLaunch);
 
     const PluginPtr &Plugin = Queue->getPlugin();
     if (!SyclKernelImpl && !MSyclKernel) {
@@ -3006,7 +3036,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() {
     return enqueueImpKernel(
         MQueue, NDRDesc, Args, ExecKernel->getKernelBundle(), SyclKernel,
         KernelName, RawEvents, EventImpl, getMemAllocationFunc,
-        ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative);
+        ExecKernel->MKernelCacheConfig, ExecKernel->MKernelIsCooperative,
+        ExecKernel->MKernelUsesClusterLaunch);
   }
   case CG::CGTYPE::CopyUSM: {
     CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 471bf66264c54..3ed072b9bb643 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -633,7 +633,7 @@ pi_int32 enqueueImpKernel(
     const detail::EventImplPtr &Event,
     const std::function<void *(Requirement *Req)> &getMemAllocationFunc,
     sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig,
-    bool KernelIsCooperative);
+    bool KernelIsCooperative, const bool KernelUsesClusterLaunch);
 
 class KernelFusionCommand;
 
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 7dae7fdbf5726..d6d488c45f019 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -278,7 +278,8 @@ event handler::finalize() {
         Result = enqueueImpKernel(MQueue, MNDRDesc, MArgs, KernelBundleImpPtr,
                                   MKernel, MKernelName.c_str(), RawEvents,
                                   NewEvent, nullptr, MImpl->MKernelCacheConfig,
-                                  MImpl->MKernelIsCooperative);
+                                  MImpl->MKernelIsCooperative,
+                                  MImpl->MKernelUsesClusterLaunch);
 #ifdef XPTI_ENABLE_INSTRUMENTATION
         // Emit signal only when event is created
         if (NewEvent != nullptr) {
@@ -341,7 +342,8 @@ event handler::finalize() {
         std::move(MImpl->MKernelBundle), std::move(CGData), std::move(MArgs),
         MKernelName.c_str(), std::move(MStreamStorage),
         std::move(MImpl->MAuxiliaryResources), MCGType,
-        MImpl->MKernelCacheConfig, MImpl->MKernelIsCooperative, MCodeLoc));
+        MImpl->MKernelCacheConfig, MImpl->MKernelIsCooperative,
+        MImpl->MKernelUsesClusterLaunch, MCodeLoc));
     break;
   }
   case detail::CG::CopyAccToPtr:
@@ -1728,6 +1730,13 @@ void handler::setKernelIsCooperative(bool KernelIsCooperative) {
   MImpl->MKernelIsCooperative = KernelIsCooperative;
 }
 
+void handler::setKernelUsesClusterLaunch() {
+  throwIfGraphAssociated<
+      syclex::detail::UnsupportedGraphFeatures::
+          sycl_ext_oneapi_experimental_cuda_cluster_launch>();
+  MImpl->MKernelUsesClusterLaunch = true;
+}
+
 void handler::ext_oneapi_graph(
     ext::oneapi::experimental::command_graph<
         ext::oneapi::experimental::graph_state::executable>
diff --git a/sycl/test-e2e/Basic/aspects.cpp b/sycl/test-e2e/Basic/aspects.cpp
index bd47d1b4bf1cb..f9b3187882d47 100644
--- a/sycl/test-e2e/Basic/aspects.cpp
+++ b/sycl/test-e2e/Basic/aspects.cpp
@@ -84,6 +84,9 @@ int main() {
     if (plt.has(aspect::usm_system_allocations)) {
       std::cout << "  USM system allocations" << std::endl;
     }
+    if (plt.has(aspect::ext_oneapi_cuda_cluster_group)) {
+      std::cout << " ext_oneapi_cuda_cluster_group" << std::endl;
+    }
   }
   std::cout << "Passed." << std::endl;
   return 0;
diff --git a/sycl/test-e2e/ClusterLaunch/cluster_launch_enqueue_functions.cpp b/sycl/test-e2e/ClusterLaunch/cluster_launch_enqueue_functions.cpp
new file mode 100644
index 0000000000000..e639260be5714
--- /dev/null
+++ b/sycl/test-e2e/ClusterLaunch/cluster_launch_enqueue_functions.cpp
@@ -0,0 +1,85 @@
+// Tests whether or not cluster launch was successful, with the correct ranges
+// that were passed via enqueue functions extension
+// REQUIRES: aspect-ext_oneapi_cuda_cluster_group
+// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_90 -o %t.out
+// RUN: %{run} %t.out
+
+#include <sycl/ext/oneapi/experimental/cluster_group_prop.hpp>
+#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
+#include <sycl/queue.hpp>
+#include <sycl/usm.hpp>
+
+#include <string>
+
+template <int Dim>
+int test_cluster_launch_enqueue_functions(sycl::queue &Queue,
+                                          sycl::range<Dim> GlobalRange,
+                                          sycl::range<Dim> LocalRange,
+                                          sycl::range<Dim> ClusterRange) {
+  using namespace sycl::ext::oneapi::experimental;
+
+  cuda::cluster_size ClusterDims(ClusterRange);
+  properties ClusterLaunchProperty{ClusterDims};
+
+  int *CorrectResultFlag = sycl::malloc_device<int>(1, Queue);
+  Queue.memset(CorrectResultFlag, 0, sizeof(int)).wait();
+
+  submit_with_event(Queue, [&](sycl::handler &CGH) {
+    nd_launch(CGH,
+              launch_config(sycl::nd_range<Dim>(GlobalRange, LocalRange),
+                            ClusterLaunchProperty),
+              [=](sycl::nd_item<Dim> It) {
+                uint32_t ClusterDimX, ClusterDimY, ClusterDimZ;
+// Temporary solution till cluster group class is implemented
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_CUDA_ARCH__) &&            \
+    (__SYCL_CUDA_ARCH__ >= 900)
+                asm volatile("\n\t"
+                             "mov.u32 %0, %%cluster_nctaid.x; \n\t"
+                             "mov.u32 %1, %%cluster_nctaid.y; \n\t"
+                             "mov.u32 %2, %%cluster_nctaid.z; \n\t"
+                             : "=r"(ClusterDimZ), "=r"(ClusterDimY),
+                               "=r"(ClusterDimX));
+#endif
+                if constexpr (Dim == 1) {
+                  if (ClusterDimZ == ClusterRange[0] && ClusterDimY == 1 &&
+                      ClusterDimX == 1) {
+                    *CorrectResultFlag = 1;
+                  }
+                } else if constexpr (Dim == 2) {
+                  if (ClusterDimZ == ClusterRange[1] &&
+                      ClusterDimY == ClusterRange[0] && ClusterDimX == 1) {
+                    *CorrectResultFlag = 1;
+                  }
+                } else {
+                  if (ClusterDimZ == ClusterRange[2] &&
+                      ClusterDimY == ClusterRange[1] &&
+                      ClusterDimX == ClusterRange[0]) {
+                    *CorrectResultFlag = 1;
+                  }
+                }
+              });
+  }).wait_and_throw();
+
+  int CorrectResultFlagHost = 0;
+  Queue.copy(CorrectResultFlag, &CorrectResultFlagHost, 1).wait();
+  return CorrectResultFlagHost;
+}
+
+int main() {
+
+  sycl::queue Queue;
+
+  int HostCorrectFlag =
+      test_cluster_launch_enqueue_functions(Queue, sycl::range{128, 128, 128},
+                                            sycl::range{16, 16, 2},
+                                            sycl::range{2, 4, 1}) &&
+      test_cluster_launch_enqueue_functions(Queue, sycl::range{512, 1024},
+                                            sycl::range{32, 32},
+                                            sycl::range{4, 2}) &&
+      test_cluster_launch_enqueue_functions(Queue, sycl::range{128},
+                                            sycl::range{32}, sycl::range{2}) &&
+      test_cluster_launch_enqueue_functions(Queue, sycl::range{16384},
+                                            sycl::range{32}, sycl::range{16});
+
+  return !HostCorrectFlag;
+}
diff --git a/sycl/test-e2e/ClusterLaunch/cluster_launch_parallel_for.cpp b/sycl/test-e2e/ClusterLaunch/cluster_launch_parallel_for.cpp
new file mode 100644
index 0000000000000..e37d4ea1f1fb3
--- /dev/null
+++ b/sycl/test-e2e/ClusterLaunch/cluster_launch_parallel_for.cpp
@@ -0,0 +1,85 @@
+// Tests whether or not cluster launch was successful, with the correct ranges
+// that were passed via parallel for overload
+// REQUIRES: aspect-ext_oneapi_cuda_cluster_group
+// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_90 -o %t.out
+// RUN: %{run} %t.out
+
+#include <sycl/ext/oneapi/experimental/cluster_group_prop.hpp>
+#include <sycl/queue.hpp>
+#include <sycl/usm.hpp>
+
+#include <string>
+
+template <int Dim>
+int test_cluster_launch_parallel_for(sycl::queue &Queue,
+                                     sycl::range<Dim> GlobalRange,
+                                     sycl::range<Dim> LocalRange,
+                                     sycl::range<Dim> ClusterRange) {
+  using namespace sycl::ext::oneapi::experimental;
+
+  cuda::cluster_size ClusterDims(ClusterRange);
+  properties ClusterLaunchProperty{ClusterDims};
+
+  int *CorrectResultFlag = sycl::malloc_device<int>(1, Queue);
+  Queue.memset(CorrectResultFlag, 0, sizeof(int)).wait();
+
+  Queue
+      .submit([&](sycl::handler &CGH) {
+        CGH.parallel_for(sycl::nd_range<Dim>(GlobalRange, LocalRange),
+                         ClusterLaunchProperty, [=](sycl::nd_item<Dim> It) {
+                           uint32_t ClusterDimX, ClusterDimY, ClusterDimZ;
+// Temporary solution till cluster group class is implemented
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_CUDA_ARCH__) &&            \
+    (__SYCL_CUDA_ARCH__ >= 900)
+                           asm volatile("\n\t"
+                                        "mov.u32 %0, %%cluster_nctaid.x; \n\t"
+                                        "mov.u32 %1, %%cluster_nctaid.y; \n\t"
+                                        "mov.u32 %2, %%cluster_nctaid.z; \n\t"
+                                        : "=r"(ClusterDimZ), "=r"(ClusterDimY),
+                                          "=r"(ClusterDimX));
+#endif
+                           if constexpr (Dim == 1) {
+                             if (ClusterDimZ == ClusterRange[0] &&
+                                 ClusterDimY == 1 && ClusterDimX == 1) {
+                               *CorrectResultFlag = 1;
+                             }
+                           } else if constexpr (Dim == 2) {
+                             if (ClusterDimZ == ClusterRange[1] &&
+                                 ClusterDimY == ClusterRange[0] &&
+                                 ClusterDimX == 1) {
+                               *CorrectResultFlag = 1;
+                             }
+                           } else {
+                             if (ClusterDimZ == ClusterRange[2] &&
+                                 ClusterDimY == ClusterRange[1] &&
+                                 ClusterDimX == ClusterRange[0]) {
+                               *CorrectResultFlag = 1;
+                             }
+                           }
+                         });
+      })
+      .wait_and_throw();
+
+  int CorrectResultFlagHost = 0;
+  Queue.copy(CorrectResultFlag, &CorrectResultFlagHost, 1).wait();
+  return CorrectResultFlagHost;
+}
+
+int main() {
+
+  sycl::queue Queue;
+
+  int HostCorrectFlag =
+      test_cluster_launch_parallel_for(Queue, sycl::range{128, 128, 128},
+                                       sycl::range{16, 16, 2},
+                                       sycl::range{2, 4, 1}) &&
+      test_cluster_launch_parallel_for(Queue, sycl::range{512, 1024},
+                                       sycl::range{32, 32},
+                                       sycl::range{4, 2}) &&
+      test_cluster_launch_parallel_for(Queue, sycl::range{128}, sycl::range{32},
+                                       sycl::range{2}) &&
+      test_cluster_launch_parallel_for(Queue, sycl::range{16384},
+                                       sycl::range{32}, sycl::range{16});
+
+  return !HostCorrectFlag;
+}
diff --git a/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp b/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp
new file mode 100644
index 0000000000000..e8bc71f4fc465
--- /dev/null
+++ b/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp
@@ -0,0 +1,69 @@
+// Checks whether or not event Dependencies are honored by
+// piExtEnqueueLaunchKernelCustom
+// REQUIRES: aspect-ext_oneapi_cuda_cluster_group
+// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_90 -o %t.out
+// RUN: %{run} %t.out
+
+#include <sycl/accessor.hpp>
+#include <sycl/buffer.hpp>
+#include <sycl/ext/oneapi/experimental/cluster_group_prop.hpp>
+#include <sycl/queue.hpp>
+#include <sycl/usm.hpp>
+
+#include <vector>
+
+template <typename T> void dummy_kernel(T *Input, int N, sycl::nd_item<1> It) {
+#if defined(__SYCL_CUDA_ARCH__) && (__SYCL_CUDA_ARCH__ >= 900)
+  auto ID = It.get_global_linear_id();
+  uint32_t ClusterDim;
+  asm volatile("mov.u32 %0, %%cluster_nctaid.x;" : "=r"(ClusterDim));
+
+  if (ID < N) {
+    Input[ID] += static_cast<T>(ClusterDim);
+  }
+#endif
+}
+
+int main() {
+
+  std::vector<int> HostArray(4096, -20);
+  sycl::queue Queue;
+
+  if (!Queue.get_device().has(sycl::aspect::ext_oneapi_cuda_cluster_group)) {
+    printf("Cluster group not supported on this arch, exiting...\n");
+    return 0;
+  }
+
+  {
+    sycl::buffer<int, 1> Buff(HostArray.data(), 4096);
+    Queue.submit([&](sycl::handler &CGH) {
+      auto Acc = Buff.template get_access<sycl::access::mode::write>(CGH);
+      CGH.parallel_for(4096, [=](auto i) { Acc[i] = 1; });
+    });
+    Queue.submit([&](sycl::handler &CGH) {
+      using namespace sycl::ext::oneapi::experimental;
+
+      cuda::cluster_size ClusterDims(sycl::range{2});
+      properties ClusterLaunchProperty{ClusterDims};
+      auto Acc = Buff.template get_access<sycl::access::mode::read_write>(CGH);
+      CGH.parallel_for(
+          sycl::nd_range({4096}, {32}), ClusterLaunchProperty,
+          [=](sycl::nd_item<1> It) {
+            dummy_kernel(
+                Acc.get_multi_ptr<sycl::access::decorated::yes>().get(), 4096,
+                It);
+          });
+    });
+    Queue.submit([&](sycl::handler &CGH) {
+      auto Acc = Buff.template get_access<sycl::access::mode::read_write>(CGH);
+      CGH.parallel_for(4096, [=](auto i) { Acc[i] *= 5; });
+    });
+  }
+
+  for (const auto &V : HostArray) {
+    if (V != 15) {
+      return 1;
+    }
+  }
+  return 0;
+}
diff --git a/sycl/test/abi/layout_handler.cpp b/sycl/test/abi/layout_handler.cpp
index cc073064c724c..dc1e3f82eddaa 100644
--- a/sycl/test/abi/layout_handler.cpp
+++ b/sycl/test/abi/layout_handler.cpp
@@ -103,85 +103,89 @@ void foo() {
 // CHECK-NEXT: 320 |     class sycl::range<3> NumWorkGroups
 // CHECK-NEXT: 320 |       class sycl::detail::array<3> (base)
 // CHECK-NEXT: 320 |         size_t[3] common_array
-// CHECK-NEXT: 344 |     size_t Dims
-// CHECK-NEXT: 352 |   class sycl::detail::string MKernelName
-// CHECK-NEXT: 352 |     char * str
-// CHECK-NEXT: 360 |   class std::shared_ptr<class sycl::detail::kernel_impl> MKernel
-// CHECK-NEXT: 360 |     class std::__shared_ptr<class sycl::detail::kernel_impl> (base)
-// CHECK-NEXT: 360 |       class std::__shared_ptr_access<class sycl::detail::kernel_impl, __gnu_cxx::_S_atomic> (base) (empty)
-// CHECK-NEXT: 360 |       element_type * _M_ptr
-// CHECK-NEXT: 368 |       class std::__shared_count<> _M_refcount
-// CHECK-NEXT: 368 |         _Sp_counted_base<(_Lock_policy)2U> * _M_pi
-// CHECK-NEXT: 376 |   detail::class CG::CGTYPE MCGType
-// CHECK-NEXT: 384 |   void * MSrcPtr
-// CHECK-NEXT: 392 |   void * MDstPtr
-// CHECK-NEXT: 400 |   size_t MLength
-// CHECK-NEXT: 408 |   class std::vector<unsigned char> MPattern
-// CHECK-NEXT: 408 |     struct std::_Vector_base<unsigned char, class std::allocator<unsigned char> > (base)
-// CHECK-NEXT: 408 |       struct std::_Vector_base<unsigned char, class std::allocator<unsigned char> >::_Vector_impl _M_impl
-// CHECK-NEXT: 408 |         class std::allocator<unsigned char> (base) (empty)
-// CHECK:      408 |           pointer _M_start
-// CHECK-NEXT: 416 |           pointer _M_finish
-// CHECK-NEXT: 424 |           pointer _M_end_of_storage
-// CHECK-NEXT: 432 |   class std::unique_ptr<class sycl::detail::HostKernelBase> MHostKernel
-// CHECK:      432 |       class std::__uniq_ptr_impl<class sycl::detail::HostKernelBase, struct std::default_delete<class sycl::detail::HostKernelBase> >
-// CHECK-NEXT: 432 |         class std::tuple<class sycl::detail::HostKernelBase *, struct std::default_delete<class sycl::detail::HostKernelBase> > _M_t
-// CHECK-NEXT: 432 |           struct std::_Tuple_impl<0, class sycl::detail::HostKernelBase *, struct std::default_delete<class sycl::detail::HostKernelBase> > (base)
-// CHECK-NEXT: 432 |             struct std::_Tuple_impl<1, struct std::default_delete<class sycl::detail::HostKernelBase> > (base) (empty)
-// CHECK:      432 |             struct std::_Head_base<0, class sycl::detail::HostKernelBase *> (base)
-// CHECK-NEXT: 432 |               class sycl::detail::HostKernelBase * _M_head_impl
-// CHECK-NEXT: 440 |   class std::unique_ptr<class sycl::detail::HostTask> MHostTask
-// CHECK:      440 |       class std::__uniq_ptr_impl<class sycl::detail::HostTask, struct std::default_delete<class sycl::detail::HostTask> >
-// CHECK-NEXT: 440 |         class std::tuple<class sycl::detail::HostTask *, struct std::default_delete<class sycl::detail::HostTask> > _M_t
-// CHECK-NEXT: 440 |           struct std::_Tuple_impl<0, class sycl::detail::HostTask *, struct std::default_delete<class sycl::detail::HostTask> > (base)
-// CHECK-NEXT: 440 |             struct std::_Tuple_impl<1, struct std::default_delete<class sycl::detail::HostTask> > (base) (empty)
-// CHECK:      440 |             struct std::_Head_base<0, class sycl::detail::HostTask *> (base)
-// CHECK-NEXT: 440 |               class sycl::detail::HostTask * _M_head_impl
-// CHECK-NEXT: 448 |   class std::vector<class std::shared_ptr<class sycl::detail::event_impl> > MEventsWaitWithBarrier
-// CHECK-NEXT: 448 |     struct std::_Vector_base<class std::shared_ptr<class sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class sycl::detail::event_impl> > > (base)
-// CHECK-NEXT: 448 |       struct std::_Vector_base<class std::shared_ptr<class sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class sycl::detail::event_impl> > >::_Vector_impl _M_impl
-// CHECK-NEXT: 448 |         class std::allocator<class std::shared_ptr<class sycl::detail::event_impl> > (base) (empty)
-// CHECK:      448 |           pointer _M_start
-// CHECK-NEXT: 456 |           pointer _M_finish
-// CHECK-NEXT: 464 |           pointer _M_end_of_storage
-// CHECK-NEXT: 472 |   class std::shared_ptr<class sycl::ext::oneapi::experimental::detail::graph_impl> MGraph
-// CHECK-NEXT: 472 |     class std::__shared_ptr<class sycl::ext::oneapi::experimental::detail::graph_impl> (base)
-// CHECK-NEXT: 472 |       class std::__shared_ptr_access<class sycl::ext::oneapi::experimental::detail::graph_impl, __gnu_cxx::_S_atomic> (base) (empty)
-// CHECK-NEXT: 472 |       element_type * _M_ptr
-// CHECK-NEXT: 480 |       class std::__shared_count<> _M_refcount
-// CHECK-NEXT: 480 |         _Sp_counted_base<(_Lock_policy)2U> * _M_pi
-// CHECK-NEXT: 488 |   class std::shared_ptr<class sycl::ext::oneapi::experimental::detail::exec_graph_impl> MExecGraph
-// CHECK-NEXT: 488 |     class std::__shared_ptr<class sycl::ext::oneapi::experimental::detail::exec_graph_impl> (base)
-// CHECK-NEXT: 488 |       class std::__shared_ptr_access<class sycl::ext::oneapi::experimental::detail::exec_graph_impl, __gnu_cxx::_S_atomic> (base) (empty)
-// CHECK-NEXT: 488 |       element_type * _M_ptr
-// CHECK-NEXT: 496 |       class std::__shared_count<> _M_refcount
-// CHECK-NEXT: 496 |         _Sp_counted_base<(_Lock_policy)2U> * _M_pi
-// CHECK-NEXT: 504 |   class std::shared_ptr<class sycl::ext::oneapi::experimental::detail::node_impl> MSubgraphNode
-// CHECK-NEXT: 504 |     class std::__shared_ptr<class sycl::ext::oneapi::experimental::detail::node_impl> (base)
-// CHECK-NEXT: 504 |       class std::__shared_ptr_access<class sycl::ext::oneapi::experimental::detail::node_impl, __gnu_cxx::_S_atomic> (base) (empty)
-// CHECK-NEXT: 504 |       element_type * _M_ptr
-// CHECK-NEXT: 512 |       class std::__shared_count<> _M_refcount
-// CHECK-NEXT: 512 |         _Sp_counted_base<(_Lock_policy)2U> * _M_pi
-// CHECK-NEXT: 520 |   class std::unique_ptr<class sycl::detail::CG> MGraphNodeCG
-// CHECK:      520 |       class std::__uniq_ptr_impl<class sycl::detail::CG, struct std::default_delete<class sycl::detail::CG> >
-// CHECK-NEXT: 520 |         class std::tuple<class sycl::detail::CG *, struct std::default_delete<class sycl::detail::CG> > _M_t
-// CHECK-NEXT: 520 |           struct std::_Tuple_impl<0, class sycl::detail::CG *, struct std::default_delete<class sycl::detail::CG> > (base)
-// CHECK-NEXT: 520 |             struct std::_Tuple_impl<1, struct std::default_delete<class sycl::detail::CG> > (base) (empty)
-// CHECK:      520 |             struct std::_Head_base<0, class sycl::detail::CG *> (base)
-// CHECK-NEXT: 520 |               class sycl::detail::CG * _M_head_impl
-// CHECK-NEXT: 528 |   struct sycl::detail::code_location MCodeLoc
-// CHECK-NEXT: 528 |     const char * MFileName
-// CHECK-NEXT: 536 |     const char * MFunctionName
-// CHECK-NEXT: 544 |     unsigned long MLineNo
-// CHECK-NEXT: 552 |     unsigned long MColumnNo
-// CHECK-NEXT: 560 |   _Bool MIsFinalized
-// CHECK-NEXT: 568 |   class sycl::event MLastEvent
-// CHECK-NEXT: 568 |     class sycl::detail::OwnerLessBase<class sycl::event> (base) (empty)
-// CHECK-NEXT: 568 |     class std::shared_ptr<class sycl::detail::event_impl> impl
-// CHECK-NEXT: 568 |       class std::__shared_ptr<class sycl::detail::event_impl> (base)
-// CHECK-NEXT: 568 |         class std::__shared_ptr_access<class sycl::detail::event_impl, __gnu_cxx::_S_atomic> (base) (empty)
-// CHECK-NEXT: 568 |         element_type * _M_ptr
-// CHECK-NEXT: 576 |         class std::__shared_count<> _M_refcount
-// CHECK-NEXT: 576 |           _Sp_counted_base<(_Lock_policy)2U> * _M_pi
-// CHECK-NEXT:     | [sizeof=584, dsize=584, align=8,
-// CHECK-NEXT:     |  nvsize=584, nvalign=8]
+// CHECK-NEXT: 344 |     class sycl::range<3> ClusterDimensions
+// CHECK-NEXT: 344 |       class sycl::detail::array<3> (base)
+// CHECK-NEXT: 344 |         size_t[3] common_array
+// CHECK-NEXT: 368 |     size_t Dims
+// CHECK-NEXT: 376 |   class sycl::detail::string MKernelName
+// CHECK-NEXT: 376 |     char * str
+// CHECK-NEXT: 384 |   class std::shared_ptr<class sycl::detail::kernel_impl> MKernel
+// CHECK-NEXT: 384 |     class std::__shared_ptr<class sycl::detail::kernel_impl> (base)
+// CHECK-NEXT: 384 |       class std::__shared_ptr_access<class sycl::detail::kernel_impl, __gnu_cxx::_S_atomic> (base) (empty)
+// CHECK-NEXT: 384 |       element_type * _M_ptr
+// CHECK-NEXT: 392 |       class std::__shared_count<> _M_refcount
+// CHECK-NEXT: 392 |         _Sp_counted_base<(_Lock_policy)2U> * _M_pi
+// CHECK-NEXT: 400 |   detail::class CG::CGTYPE MCGType
+// CHECK-NEXT: 408 |   void * MSrcPtr
+// CHECK-NEXT: 416 |   void * MDstPtr
+// CHECK-NEXT: 424 |   size_t MLength
+// CHECK-NEXT: 432 |   class std::vector<unsigned char> MPattern
+// CHECK-NEXT: 432 |     struct std::_Vector_base<unsigned char, class std::allocator<unsigned char> > (base)
+// CHECK-NEXT: 432 |       struct std::_Vector_base<unsigned char, class std::allocator<unsigned char> >::_Vector_impl _M_impl
+// CHECK-NEXT: 432 |         class std::allocator<unsigned char> (base) (empty)
+// CHECK:      432 |           pointer _M_start
+// CHECK-NEXT: 440 |           pointer _M_finish
+// CHECK-NEXT: 448 |           pointer _M_end_of_storage
+// CHECK-NEXT: 456 |   class std::unique_ptr<class sycl::detail::HostKernelBase> MHostKernel
+// CHECK:      456 |       class std::__uniq_ptr_impl<class sycl::detail::HostKernelBase, struct std::default_delete<class sycl::detail::HostKernelBase> >
+// CHECK-NEXT: 456 |         class std::tuple<class sycl::detail::HostKernelBase *, struct std::default_delete<class sycl::detail::HostKernelBase> > _M_t
+// CHECK-NEXT: 456 |           struct std::_Tuple_impl<0, class sycl::detail::HostKernelBase *, struct std::default_delete<class sycl::detail::HostKernelBase> > (base)
+// CHECK-NEXT: 456 |             struct std::_Tuple_impl<1, struct std::default_delete<class sycl::detail::HostKernelBase> > (base) (empty)
+// CHECK:      456 |             struct std::_Head_base<0, class sycl::detail::HostKernelBase *> (base)
+// CHECK-NEXT: 456 |               class sycl::detail::HostKernelBase * _M_head_impl
+// CHECK-NEXT: 464 |   class std::unique_ptr<class sycl::detail::HostTask> MHostTask
+// CHECK:      464 |       class std::__uniq_ptr_impl<class sycl::detail::HostTask, struct std::default_delete<class sycl::detail::HostTask> >
+// CHECK-NEXT: 464 |         class std::tuple<class sycl::detail::HostTask *, struct std::default_delete<class sycl::detail::HostTask> > _M_t
+// CHECK-NEXT: 464 |           struct std::_Tuple_impl<0, class sycl::detail::HostTask *, struct std::default_delete<class sycl::detail::HostTask> > (base)
+// CHECK-NEXT: 464 |             struct std::_Tuple_impl<1, struct std::default_delete<class sycl::detail::HostTask> > (base) (empty)
+// CHECK:      464 |             struct std::_Head_base<0, class sycl::detail::HostTask *> (base)
+// CHECK-NEXT: 464 |               class sycl::detail::HostTask * _M_head_impl
+// CHECK-NEXT: 472 |   class std::vector<class std::shared_ptr<class sycl::detail::event_impl> > MEventsWaitWithBarrier
+// CHECK-NEXT: 472 |     struct std::_Vector_base<class std::shared_ptr<class sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class sycl::detail::event_impl> > > (base)
+// CHECK-NEXT: 472 |       struct std::_Vector_base<class std::shared_ptr<class sycl::detail::event_impl>, class std::allocator<class std::shared_ptr<class sycl::detail::event_impl> > >::_Vector_impl _M_impl
+// CHECK-NEXT: 472 |         class std::allocator<class std::shared_ptr<class sycl::detail::event_impl> > (base) (empty)
+// CHECK:      472 |           pointer _M_start
+// CHECK-NEXT: 480 |           pointer _M_finish
+// CHECK-NEXT: 488 |           pointer _M_end_of_storage
+// CHECK-NEXT: 496 |   class std::shared_ptr<class sycl::ext::oneapi::experimental::detail::graph_impl> MGraph
+// CHECK-NEXT: 496 |     class std::__shared_ptr<class sycl::ext::oneapi::experimental::detail::graph_impl> (base)
+// CHECK-NEXT: 496 |       class std::__shared_ptr_access<class sycl::ext::oneapi::experimental::detail::graph_impl, __gnu_cxx::_S_atomic> (base) (empty)
+// CHECK-NEXT: 496 |       element_type * _M_ptr
+// CHECK-NEXT: 504 |       class std::__shared_count<> _M_refcount
+// CHECK-NEXT: 504 |         _Sp_counted_base<(_Lock_policy)2U> * _M_pi
+// CHECK-NEXT: 512 |   class std::shared_ptr<class sycl::ext::oneapi::experimental::detail::exec_graph_impl> MExecGraph
+// CHECK-NEXT: 512 |     class std::__shared_ptr<class sycl::ext::oneapi::experimental::detail::exec_graph_impl> (base)
+// CHECK-NEXT: 512 |       class std::__shared_ptr_access<class sycl::ext::oneapi::experimental::detail::exec_graph_impl, __gnu_cxx::_S_atomic> (base) (empty)
+// CHECK-NEXT: 512 |       element_type * _M_ptr
+// CHECK-NEXT: 520 |       class std::__shared_count<> _M_refcount
+// CHECK-NEXT: 520 |         _Sp_counted_base<(_Lock_policy)2U> * _M_pi
+// CHECK-NEXT: 528 |   class std::shared_ptr<class sycl::ext::oneapi::experimental::detail::node_impl> MSubgraphNode
+// CHECK-NEXT: 528 |     class std::__shared_ptr<class sycl::ext::oneapi::experimental::detail::node_impl> (base)
+// CHECK-NEXT: 528 |       class std::__shared_ptr_access<class sycl::ext::oneapi::experimental::detail::node_impl, __gnu_cxx::_S_atomic> (base) (empty)
+// CHECK-NEXT: 528 |       element_type * _M_ptr
+// CHECK-NEXT: 536 |       class std::__shared_count<> _M_refcount
+// CHECK-NEXT: 536 |         _Sp_counted_base<(_Lock_policy)2U> * _M_pi
+// CHECK-NEXT: 544 |   class std::unique_ptr<class sycl::detail::CG> MGraphNodeCG
+// CHECK:      544 |       class std::__uniq_ptr_impl<class sycl::detail::CG, struct std::default_delete<class sycl::detail::CG> >
+// CHECK-NEXT: 544 |         class std::tuple<class sycl::detail::CG *, struct std::default_delete<class sycl::detail::CG> > _M_t
+// CHECK-NEXT: 544 |           struct std::_Tuple_impl<0, class sycl::detail::CG *, struct std::default_delete<class sycl::detail::CG> > (base)
+// CHECK-NEXT: 544 |             struct std::_Tuple_impl<1, struct std::default_delete<class sycl::detail::CG> > (base) (empty)
+// CHECK:      544 |             struct std::_Head_base<0, class sycl::detail::CG *> (base)
+// CHECK-NEXT: 544 |               class sycl::detail::CG * _M_head_impl
+// CHECK-NEXT: 552 |   struct sycl::detail::code_location MCodeLoc
+// CHECK-NEXT: 552 |     const char * MFileName
+// CHECK-NEXT: 560 |     const char * MFunctionName
+// CHECK-NEXT: 568 |     unsigned long MLineNo
+// CHECK-NEXT: 576 |     unsigned long MColumnNo
+// CHECK-NEXT: 584 |   _Bool MIsFinalized
+// CHECK-NEXT: 592 |   class sycl::event MLastEvent
+// CHECK-NEXT: 592 |     class sycl::detail::OwnerLessBase<class sycl::event> (base) (empty)
+// CHECK-NEXT: 592 |     class std::shared_ptr<class sycl::detail::event_impl> impl
+// CHECK-NEXT: 592 |       class std::__shared_ptr<class sycl::detail::event_impl> (base)
+// CHECK-NEXT: 592 |         class std::__shared_ptr_access<class sycl::detail::event_impl, __gnu_cxx::_S_atomic> (base) (empty)
+// CHECK-NEXT: 592 |         element_type * _M_ptr
+// CHECK-NEXT: 600 |         class std::__shared_count<> _M_refcount
+// CHECK-NEXT: 600 |           _Sp_counted_base<(_Lock_policy)2U> * _M_pi
+// CHECK-NEXT:     | [sizeof=608, dsize=608, align=8,
+// CHECK-NEXT:     |  nvsize=608, nvalign=8]
+
diff --git a/sycl/test/abi/pi_cuda_symbol_check.dump b/sycl/test/abi/pi_cuda_symbol_check.dump
index 7ec2569938519..f85c8a8b1b55f 100644
--- a/sycl/test/abi/pi_cuda_symbol_check.dump
+++ b/sycl/test/abi/pi_cuda_symbol_check.dump
@@ -183,3 +183,4 @@ piextVirtualMemReserve
 piextVirtualMemSetAccess
 piextVirtualMemUnmap
 piextWaitExternalSemaphore
+piextEnqueueKernelLaunchCustom
diff --git a/sycl/test/abi/pi_hip_symbol_check.dump b/sycl/test/abi/pi_hip_symbol_check.dump
index a75fc9af455c9..f3706312ae9d6 100644
--- a/sycl/test/abi/pi_hip_symbol_check.dump
+++ b/sycl/test/abi/pi_hip_symbol_check.dump
@@ -183,3 +183,4 @@ piextVirtualMemReserve
 piextVirtualMemSetAccess
 piextVirtualMemUnmap
 piextWaitExternalSemaphore
+piextEnqueueKernelLaunchCustom
diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump
index 53c7f0e2ed531..88fc7c1e80cf7 100644
--- a/sycl/test/abi/pi_level_zero_symbol_check.dump
+++ b/sycl/test/abi/pi_level_zero_symbol_check.dump
@@ -182,3 +182,4 @@ piextVirtualMemReserve
 piextVirtualMemSetAccess
 piextVirtualMemUnmap
 piextWaitExternalSemaphore
+piextEnqueueKernelLaunchCustom
diff --git a/sycl/test/abi/pi_nativecpu_symbol_check.dump b/sycl/test/abi/pi_nativecpu_symbol_check.dump
index 3d347196f757d..760f222f9a11e 100644
--- a/sycl/test/abi/pi_nativecpu_symbol_check.dump
+++ b/sycl/test/abi/pi_nativecpu_symbol_check.dump
@@ -183,3 +183,4 @@ piextVirtualMemReserve
 piextVirtualMemSetAccess
 piextVirtualMemUnmap
 piextWaitExternalSemaphore
+piextEnqueueKernelLaunchCustom
diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump
index 7e174ffbb2a4c..75fba0db0809a 100644
--- a/sycl/test/abi/pi_opencl_symbol_check.dump
+++ b/sycl/test/abi/pi_opencl_symbol_check.dump
@@ -182,3 +182,4 @@ piextVirtualMemReserve
 piextVirtualMemSetAccess
 piextVirtualMemUnmap
 piextWaitExternalSemaphore
+piextEnqueueKernelLaunchCustom
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index e96b3ec995331..137275aab3795 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -3515,6 +3515,7 @@ _ZN4sycl3_V17handler22memcpyFromDeviceGlobalEPvPKvbmm
 _ZN4sycl3_V17handler22setHandlerKernelBundleENS0_6kernelE
 _ZN4sycl3_V17handler22setHandlerKernelBundleERKSt10shared_ptrINS0_6detail18kernel_bundle_implEE
 _ZN4sycl3_V17handler22setKernelIsCooperativeEb
+_ZN4sycl3_V17handler26setKernelUsesClusterLaunchEv
 _ZN4sycl3_V17handler24GetRangeRoundingSettingsERmS2_S2_
 _ZN4sycl3_V17handler24ext_intel_read_host_pipeENS0_6detail11string_viewEPvmb
 _ZN4sycl3_V17handler24ext_oneapi_memcpy2d_implEPvmPKvmmm
@@ -3979,6 +3980,7 @@ _ZNK4sycl3_V16device13get_info_implINS0_4info6device33ext_oneapi_max_global_work
 _ZNK4sycl3_V16device13get_info_implINS0_4info6device33usm_restricted_shared_allocationsEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv
 _ZNK4sycl3_V16device13get_info_implINS0_4info6device35ext_intel_gpu_eu_count_per_subsliceEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv
 _ZNK4sycl3_V16device13get_info_implINS0_4info6device38sub_group_independent_forward_progressEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv
+_ZNK4sycl3_V16device13get_info_implINS0_4info6device29ext_oneapi_cuda_cluster_groupEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv
 _ZNK4sycl3_V16device13get_info_implINS0_4info6device4nameEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv
 _ZNK4sycl3_V16device13get_info_implINS0_4info6device6vendorEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv
 _ZNK4sycl3_V16device13get_info_implINS0_4info6device7aspectsEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv
diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump
index 0bc26c55b8a73..d82585c9c8112 100644
--- a/sycl/test/abi/sycl_symbols_windows.dump
+++ b/sycl/test/abi/sycl_symbols_windows.dump
@@ -142,6 +142,7 @@
 ??$get_info_impl@Uext_oneapi_max_work_groups_2d@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$id@$01@12@XZ
 ??$get_info_impl@Uext_oneapi_max_work_groups_3d@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$id@$02@12@XZ
 ??$get_info_impl@Uext_oneapi_srgb@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA_NXZ
+??$get_info_impl@Uext_oneapi_cuda_cluster_group@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA_NXZ
 ??$get_info_impl@Uextensions@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@Vstring@detail@_V1@sycl@@V?$allocator@Vstring@detail@_V1@sycl@@@std@@@std@@XZ
 ??$get_info_impl@Uextensions@platform@info@_V1@sycl@@@platform@_V1@sycl@@AEBA?AV?$vector@Vstring@detail@_V1@sycl@@V?$allocator@Vstring@detail@_V1@sycl@@@std@@@std@@XZ
 ??$get_info_impl@Ufree_memory@device@info@intel@ext@_V1@sycl@@@device@_V1@sycl@@AEBA_KXZ
@@ -4410,6 +4411,7 @@
 ?setHandlerKernelBundle@handler@_V1@sycl@@AEAAXVkernel@23@@Z
 ?setKernelCacheConfig@handler@_V1@sycl@@AEAAXW4_pi_kernel_cache_config@@@Z
 ?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z
+?setKernelUsesClusterLaunch@handler@_V1@sycl@@AEAAXXZ
 ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z
 ?setNDRangeUsed@handler@_V1@sycl@@AEAAX_N@Z
 ?setStateExplicitKernelBundle@handler@_V1@sycl@@AEAAXXZ
diff --git a/sycl/test/abi/symbol_size_alignment.cpp b/sycl/test/abi/symbol_size_alignment.cpp
index 0d3d43faf3f35..db310b6291924 100644
--- a/sycl/test/abi/symbol_size_alignment.cpp
+++ b/sycl/test/abi/symbol_size_alignment.cpp
@@ -52,9 +52,9 @@ int main() {
   check<event, 16, 8>();
   check<gpu_selector, 8, 8>();
 #ifdef _MSC_VER
-  check<handler, 576, 8>();
+  check<handler, 600, 8>();
 #else
-  check<handler, 584, 8>();
+  check<handler, 608, 8>();
 #endif
   check<image<1>, 16, 8>();
   check<kernel, 16, 8>();
diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp
index 2e3718035d5d3..db06078bc5ba5 100644
--- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp
+++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp
@@ -178,6 +178,7 @@
 // CHECK-NEXT: ext/intel/experimental/kernel_execution_properties.hpp
 // CHECK-NEXT: ext/oneapi/bindless_images_interop.hpp
 // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp
+// CHECK-NEXT: ext/oneapi/experimental/cluster_group_prop.hpp
 // CHECK-NEXT: ext/oneapi/experimental/raw_kernel_arg.hpp
 // CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp
 // CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp
diff --git a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp
index 04f306dc0cfb8..1b76c23fe64c5 100644
--- a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp
+++ b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp
@@ -606,3 +606,25 @@ TEST_F(CommandGraphTest, ProfilingExceptionProperty) {
   }
   ASSERT_EQ(Success, false);
 }
+
+TEST_F(CommandGraphTest, ClusterLaunchException) {
+  namespace syclex = sycl::ext::oneapi::experimental;
+
+  syclex::properties cluster_launch_property{
+      syclex::cuda::cluster_size<1>(sycl::range<1>{4})};
+
+  std::error_code ExceptionCode = make_error_code(sycl::errc::success);
+  try {
+    Graph.begin_recording(Queue);
+    auto Event1 = Queue.submit([&](sycl::handler &cgh) {
+      cgh.parallel_for<TestKernel<>>(sycl::nd_range<1>({4096}, {32}),
+                                     cluster_launch_property,
+                                     [&](sycl::nd_item<1> it) {});
+    });
+    Queue.wait();
+    Graph.end_recording(Queue);
+  } catch (exception &Exception) {
+    ExceptionCode = Exception.code();
+  }
+  ASSERT_EQ(ExceptionCode, sycl::errc::invalid);
+}
diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp
index 5541db5688c53..24a33ee98d97b 100644
--- a/sycl/unittests/helpers/PiMockPlugin.hpp
+++ b/sycl/unittests/helpers/PiMockPlugin.hpp
@@ -1658,3 +1658,13 @@ inline pi_result mock_piextUSMImport(const void *HostPtr, size_t Size,
 inline pi_result mock_piextUSMRelease(const void *HostPtr, pi_context Context) {
   return PI_SUCCESS;
 }
+
+inline pi_result mock_piextEnqueueKernelLaunchCustom(
+    pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
+    const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
+    pi_uint32 NumPropsInLaunchPropList,
+    const pi_launch_property *LaunchPropList, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventsWaitList, pi_event *OutEvent) {
+
+  return PI_SUCCESS;
+}
diff --git a/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp b/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp
index 649e7a6126729..27708bb2efba5 100644
--- a/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp
+++ b/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp
@@ -124,7 +124,8 @@ class MockHandler : public sycl::handler {
           std::move(CGH->CGData), std::move(CGH->MArgs),
           CGH->MKernelName.c_str(), std::move(CGH->MStreamStorage),
           std::move(MImpl->MAuxiliaryResources), CGH->MCGType, {},
-          MImpl->MKernelIsCooperative, CGH->MCodeLoc));
+          MImpl->MKernelIsCooperative, MImpl->MKernelUsesClusterLaunch,
+          CGH->MCodeLoc));
       break;
     }
     default:
diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
index 9d89a970c9a09..ab20050f250a6 100644
--- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp
+++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
@@ -307,7 +307,8 @@ class MockHandlerCustomFinalize : public MockHandler {
           getNDRDesc(), std::move(getHostKernel()), getKernel(),
           std::move(MImpl->MKernelBundle), std::move(CGData), getArgs(),
           getKernelName(), getStreamStorage(), MImpl->MAuxiliaryResources,
-          getCGType(), {}, MImpl->MKernelIsCooperative, getCodeLoc()));
+          getCGType(), {}, MImpl->MKernelIsCooperative,
+          MImpl->MKernelUsesClusterLaunch, getCodeLoc()));
       break;
     }
     case sycl::detail::CG::CodeplayHostTask: {
diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
index d4f65caddf56c..29b54fc31dad2 100644
--- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
+++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp
@@ -36,7 +36,8 @@ class MockHandlerStreamInit : public MockHandler {
                                         getRequirements(), getEvents()),
           getArgs(), getKernelName(), getStreamStorage(),
           std::move(MImpl->MAuxiliaryResources), getCGType(), {},
-          MImpl->MKernelIsCooperative, getCodeLoc()));
+          MImpl->MKernelIsCooperative, MImpl->MKernelUsesClusterLaunch,
+          getCodeLoc()));
       break;
     }
     default: